diff --git a/third_party/xla/xla/backends/gpu/runtime/BUILD b/third_party/xla/xla/backends/gpu/runtime/BUILD
index d4c47e96cee..5d7dff2ad7f 100644
--- a/third_party/xla/xla/backends/gpu/runtime/BUILD
+++ b/third_party/xla/xla/backends/gpu/runtime/BUILD
@@ -932,6 +932,8 @@ cc_library(
         "@com_google_absl//absl/synchronization",
         "@com_google_absl//absl/types:span",
         "@llvm-project//llvm:Support",
+        "@local_tsl//tsl/profiler/lib:traceme",
+        "@local_tsl//tsl/profiler/lib:traceme_encode",
     ],
 )
 
diff --git a/third_party/xla/xla/backends/gpu/runtime/kernel_thunk.cc b/third_party/xla/xla/backends/gpu/runtime/kernel_thunk.cc
index e180049fa00..2e9b2dafcfb 100644
--- a/third_party/xla/xla/backends/gpu/runtime/kernel_thunk.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/kernel_thunk.cc
@@ -50,6 +50,12 @@ limitations under the License.
 #include "xla/stream_executor/stream.h"
 #include "xla/stream_executor/stream_executor.h"
 #include "xla/tsl/platform/statusor.h"
+#include "tsl/profiler/lib/traceme.h"
+#include "tsl/profiler/lib/traceme_encode.h"
+
+using tsl::profiler::TraceMe;
+using tsl::profiler::TraceMeEncode;
+using tsl::profiler::TraceMeLevel;
 
 namespace xla {
 namespace gpu {
@@ -223,43 +229,71 @@ static void PrintBufferContents(
 }
 
 absl::Status KernelThunk::ExecuteOnStream(const ExecuteParams& params) {
+  TraceMe trace(
+      [] { return TraceMeEncode("KernelThunk::ExecuteOnStream", {}); },
+      /*level=*/TraceMeLevel::kVerbose);
+
   // Load the kernel.
   se::StreamExecutor* executor = params.stream->parent();
   se::Kernel* kernel = nullptr;
-  TF_ASSIGN_OR_RETURN(
-      se::Stream * stream,
-      GetStreamForExecution(Thunk::execution_stream_id(), params));
+  se::Stream* stream = nullptr;
+  {
+    TraceMe trace(
+        [] {
+          return TraceMeEncode(
+              "KernelThunk::ExecuteOnStream/GetStreamForExecution", {});
+        },
+        /*level=*/TraceMeLevel::kVerbose);
+    TF_ASSIGN_OR_RETURN(
+        stream, GetStreamForExecution(Thunk::execution_stream_id(), params));
+  }
 
   {
+    TraceMe trace(
+        [] { return TraceMeEncode("KernelThunk::ExecuteOnStream/mutex", {}); },
+        /*level=*/TraceMeLevel::kVerbose);
     absl::MutexLock lock(mutex_);
+    TraceMe trace_find(
+        [] {
+          return TraceMeEncode("KernelThunk::ExecuteOnStream/mutex/find", {});
+        },
+        /*level=*/TraceMeLevel::kVerbose);
     auto it = kernel_cache_.find(executor);
     CHECK(it != kernel_cache_.end())
         << "Initialize() not called for StreamExecutor " << executor;
     kernel = it->second.get();
   }
 
-  int device_ordinal = executor->device_ordinal();
-  VLOG(3) << "[" << device_ordinal << "] Launching " << kernel->name();
   absl::InlinedVector kernel_args;
-  for (const auto& [idx, arg] : llvm::enumerate(args_)) {
-    se::DeviceMemoryBase buf = params.buffer_allocations->GetDeviceAddress(arg);
-    VLOG(3) << "[" << device_ordinal << "] Arg: alloc #" << arg.index()
-            << ", offset: " << arg.offset() << ": " << buf.opaque() << " ("
-            << buf.size() << "B)";
+  {
+    TraceMe trace(
+        [] {
+          return TraceMeEncode("KernelThunk::ExecuteOnStream/kernel_args", {});
+        },
+        /*level=*/TraceMeLevel::kVerbose);
+    int device_ordinal = executor->device_ordinal();
+    VLOG(3) << "[" << device_ordinal << "] Launching " << kernel->name();
+    for (const auto& [idx, arg] : llvm::enumerate(args_)) {
+      se::DeviceMemoryBase buf =
+          params.buffer_allocations->GetDeviceAddress(arg);
+      VLOG(3) << "[" << device_ordinal << "] Arg: alloc #" << arg.index()
+              << ", offset: " << arg.offset() << ": " << buf.opaque() << " ("
+              << buf.size() << "B)";
 
-    if (auto it = tma_metadata_.arg_index_to_tma_info.find(idx);
-        it != tma_metadata_.arg_index_to_tma_info.end()) {
-      // TMA descriptor argument.
-      const se::gpu::TmaDescriptor& tma_desc = it->second;
-      TF_ASSIGN_OR_RETURN(se::TensorMap tensor_map,
-                          executor->CreateTensorMap(tma_desc, buf.opaque()));
-      VLOG(3) << "[" << device_ordinal << "] Using TensorMap for arg #" << idx
-              << ": " << tma_desc.ToString();
-      kernel_args.push_back(std::move(tensor_map));
-    } else {
-      // Buffer argument.
-      kernel_args.push_back(buf);
+      if (auto it = tma_metadata_.arg_index_to_tma_info.find(idx);
+          it != tma_metadata_.arg_index_to_tma_info.end()) {
+        // TMA descriptor argument.
+        const se::gpu::TmaDescriptor& tma_desc = it->second;
+        TF_ASSIGN_OR_RETURN(se::TensorMap tensor_map,
+                            executor->CreateTensorMap(tma_desc, buf.opaque()));
+        VLOG(3) << "[" << device_ordinal << "] Using TensorMap for arg #"
+                << idx << ": " << tma_desc.ToString();
+        kernel_args.push_back(std::move(tensor_map));
+      } else {
+        // Buffer argument.
+        kernel_args.push_back(buf);
+      }
     }
   }
diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index b7c983e49ab..715b409bf8b 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -2625,6 +2625,8 @@ cc_library(
         "@local_tsl//tsl/platform:ml_dtypes",
         "@local_tsl//tsl/platform:status",
         "@local_tsl//tsl/platform:statusor",
+        "@local_tsl//tsl/profiler/lib:traceme",
+        "@local_tsl//tsl/profiler/lib:traceme_encode",
     ],
 )
 
diff --git a/third_party/xla/xla/service/gpu/stream_executor_util.cc b/third_party/xla/xla/service/gpu/stream_executor_util.cc
index 64164d527e6..53f6f72afc5 100644
--- a/third_party/xla/xla/service/gpu/stream_executor_util.cc
+++ b/third_party/xla/xla/service/gpu/stream_executor_util.cc
@@ -67,6 +67,12 @@ limitations under the License.
 #include "xla/util.h"
 #include "xla/xla_data.pb.h"
 #include "tsl/platform/ml_dtypes.h"
+#include "tsl/profiler/lib/traceme.h"
+#include "tsl/profiler/lib/traceme_encode.h"
+
+using tsl::profiler::TraceMe;
+using tsl::profiler::TraceMeEncode;
+using tsl::profiler::TraceMeLevel;
 
 namespace xla {
 namespace gpu {
@@ -405,9 +411,19 @@ absl::Status ExecuteKernelOnStream(
     se::Kernel& kernel, absl::Span args,
     const LaunchDimensions& dims,
     const std::optional<se::ClusterDim>& cluster_dim, se::Stream* stream) {
-  TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<se::KernelArgsPackedArrayBase> kernel_args,
-      se::PackKernelArgs(args, kernel.metadata()));
+  TraceMe trace([] { return TraceMeEncode("ExecuteKernelOnStream", {}); },
+                /*level=*/TraceMeLevel::kVerbose);
+
+  std::unique_ptr<se::KernelArgsPackedArrayBase> kernel_args;
+  {
+    TraceMe trace(
+        [] {
+          return TraceMeEncode("ExecuteKernelOnStream/PackKernelArgs", {});
+        },
+        /*level=*/TraceMeLevel::kVerbose);
+    TF_ASSIGN_OR_RETURN(kernel_args,
+                        se::PackKernelArgs(args, kernel.metadata()));
+  }
 
   return kernel.Launch(dims.thread_counts_per_block(), dims.block_counts(),
                        cluster_dim, stream, *kernel_args);
diff --git a/third_party/xla/xla/stream_executor/cuda/BUILD b/third_party/xla/xla/stream_executor/cuda/BUILD
index cdcdbfcb0b8..573889d3379 100644
--- a/third_party/xla/xla/stream_executor/cuda/BUILD
+++ b/third_party/xla/xla/stream_executor/cuda/BUILD
@@ -699,6 +699,8 @@ cc_library(
         "@com_google_absl//absl/strings:str_format",
         "@local_config_cuda//cuda:cuda_headers",
         "@local_tsl//tsl/platform:logging",
+        "@local_tsl//tsl/profiler/lib:traceme",
+        "@local_tsl//tsl/profiler/lib:traceme_encode",
     ],
 )
 
@@ -1380,6 +1382,8 @@ cc_library(
         "@com_google_absl//absl/synchronization",
         "@local_config_cuda//cuda:cuda_headers",
        "@local_tsl//tsl/profiler/lib:nvtx_utils",
+        "@local_tsl//tsl/profiler/lib:traceme",
+        "@local_tsl//tsl/profiler/lib:traceme_encode",
     ],
 )
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_kernel.cc b/third_party/xla/xla/stream_executor/cuda/cuda_kernel.cc
index 1ab6d767d3a..367c6aff4ff 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_kernel.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_kernel.cc
@@ -34,6 +34,12 @@ limitations under the License.
 #include "xla/stream_executor/stream.h"
 #include "xla/tsl/platform/errors.h"
 #include "xla/tsl/platform/statusor.h"
+#include "tsl/profiler/lib/traceme.h"
+#include "tsl/profiler/lib/traceme_encode.h"
+
+using tsl::profiler::TraceMe;
+using tsl::profiler::TraceMeEncode;
+using tsl::profiler::TraceMeLevel;
 
 namespace stream_executor {
 namespace gpu {
@@ -84,11 +90,17 @@ absl::Status CudaKernel::Launch(const ThreadDim& thread_dims,
                                 const BlockDim& block_dims,
                                 const std::optional<ClusterDim>& cluster_dims,
                                 Stream* stream, const KernelArgs& args) {
+  TraceMe trace([] { return TraceMeEncode("CudaKernel::Launch", {}); },
+                /*level=*/TraceMeLevel::kVerbose);
+
   CUfunction function = gpu_function();
 
   // Launch kernels with packed arguments.
   auto launch = [this, stream, &cluster_dims, &thread_dims, &block_dims,
                  function](const KernelArgsPackedArrayBase& packed) {
+    TraceMe trace([] { return TraceMeEncode("CudaKernel::Launch/launch", {}); },
+                  /*level=*/TraceMeLevel::kVerbose);
+
     int32_t expected_number_of_arguments =
         Arity() + (packed.number_of_shared_bytes() > 0);
 
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_stream.cc b/third_party/xla/xla/stream_executor/cuda/cuda_stream.cc
index edfaca09491..3ecb91a79d7 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_stream.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_stream.cc
@@ -48,6 +48,12 @@ limitations under the License.
 #include "xla/tsl/platform/errors.h"
 #include "xla/tsl/platform/statusor.h"
 #include "tsl/profiler/lib/nvtx_utils.h"
+#include "tsl/profiler/lib/traceme.h"
+#include "tsl/profiler/lib/traceme_encode.h"
+
+using tsl::profiler::TraceMe;
+using tsl::profiler::TraceMeEncode;
+using tsl::profiler::TraceMeLevel;
 
 namespace stream_executor {
 namespace gpu {
@@ -363,6 +369,9 @@ absl::Status LaunchCudaKernel(
     unsigned int grid_dim_z, unsigned int block_dim_x, unsigned int block_dim_y,
     unsigned int block_dim_z, unsigned int shared_mem_bytes, CUstream stream,
     void** kernel_params, void** extra) {
+  TraceMe trace([] { return TraceMeEncode("LaunchCudaKernel", {}); },
+                /*level=*/TraceMeLevel::kVerbose);
+
   std::unique_ptr<ActivateContext> activation = executor->Activate();
   VLOG(2) << "launching kernel: " << kernel_name << "; gdx: " << grid_dim_x
           << " gdy: " << grid_dim_y << " gdz: " << grid_dim_z
@@ -383,15 +392,20 @@ absl::Status LaunchCudaKernel(
         cuFuncSetCacheConfig(function, CU_FUNC_CACHE_PREFER_SHARED)));
   }
 
-  return cuda::ToStatus(
-      cuLaunchKernel(function, grid_dim_x, grid_dim_y, grid_dim_z, block_dim_x,
-                     block_dim_y, block_dim_z, shared_mem_bytes, stream,
-                     kernel_params, extra),
-      absl::StrCat("Failed to launch CUDA kernel: ", kernel_name,
-                   "; block dims: ", block_dim_x, "x", block_dim_y, "x",
-                   block_dim_z, "; grid dims: ", grid_dim_x, "x", grid_dim_y,
-                   "x", grid_dim_z,
-                   "; shared memory size: ", shared_mem_bytes));
+  {
+    TraceMe trace(
+        [&] { return TraceMeEncode("LaunchCudaKernel/cuLaunchKernel", {}); },
+        /*level=*/TraceMeLevel::kVerbose);
+    return cuda::ToStatus(
+        cuLaunchKernel(function, grid_dim_x, grid_dim_y, grid_dim_z,
+                       block_dim_x, block_dim_y, block_dim_z, shared_mem_bytes,
+                       stream, kernel_params, extra),
+        absl::StrCat("Failed to launch CUDA kernel: ", kernel_name,
+                     "; block dims: ", block_dim_x, "x", block_dim_y, "x",
+                     block_dim_z, "; grid dims: ", grid_dim_x, "x", grid_dim_y,
+                     "x", grid_dim_z,
+                     "; shared memory size: ", shared_mem_bytes));
+  }
 }
 
 absl::Status LaunchCudaKernel(
@@ -402,6 +416,8 @@
     unsigned int block_dim_y, unsigned int block_dim_z,
     unsigned int shared_mem_bytes, CUstream stream, void** kernel_params,
     void** extra) {
+  TraceMe trace([] { return TraceMeEncode("LaunchCudaKernel", {}); },
+                /*level=*/TraceMeLevel::kVerbose);
   std::unique_ptr<ActivateContext> activation = executor->Activate();
   VLOG(2) << "launching kernel: " << kernel_name << "; cdx: " << cluster_dim_x
           << " cdy: " << cluster_dim_y << " cdz: " << cluster_dim_z
@@ -444,14 +460,19 @@ absl::Status LaunchCudaKernel(
   launch_config.attrs = &cluster_dims;
   launch_config.numAttrs = 1;
 
-  return cuda::ToStatus(
-      cuLaunchKernelEx(&launch_config, function, kernel_params, extra),
-      absl::StrCat("Failed to launch CUDA kernel: ", kernel_name,
-                   "; cluster dims: ", cluster_dim_x, "x", cluster_dim_y, "x",
-                   cluster_dim_z, "; block dims: ", block_dim_x, "x",
-                   block_dim_y, "x", block_dim_z, "; grid dims: ", grid_dim_x,
-                   "x", grid_dim_y, "x", grid_dim_z,
-                   "; shared memory size: ", shared_mem_bytes));
+  {
+    TraceMe trace(
+        [] { return TraceMeEncode("LaunchCudaKernel/cuLaunchKernelEx", {}); },
+        /*level=*/TraceMeLevel::kVerbose);
+    return cuda::ToStatus(
+        cuLaunchKernelEx(&launch_config, function, kernel_params, extra),
+        absl::StrCat("Failed to launch CUDA kernel: ", kernel_name,
+                     "; cluster dims: ", cluster_dim_x, "x", cluster_dim_y, "x",
+                     cluster_dim_z, "; block dims: ", block_dim_x, "x",
+                     block_dim_y, "x", block_dim_z, "; grid dims: ", grid_dim_x,
+                     "x", grid_dim_y, "x", grid_dim_z,
+                     "; shared memory size: ", shared_mem_bytes));
+  }
 }
 
 }  // namespace
@@ -460,6 +481,9 @@ absl::Status CudaStream::LaunchKernel(
     const ThreadDim& thread_dims, const BlockDim& block_dims,
     const std::optional<ClusterDim>& cluster_dims, void* function,
     absl::string_view name, void** args, int64_t shmem_bytes) {
+  TraceMe trace([] { return TraceMeEncode("CudaStream::LaunchKernel", {}); },
+                /*level=*/TraceMeLevel::kVerbose);
+
   if (cluster_dims.has_value()) {
     return LaunchCudaKernel(executor_, name, static_cast<CUfunction>(function),
                             cluster_dims->x, cluster_dims->y, cluster_dims->z,