Merge pull request #48645 from ROCmSoftwarePlatform/google_upstream_r25_port_pr_47508

[r2.5 port][ROCm] Port PR 47508 to r2.5
2025-12-06 12:20:11 +01:00 · 2021-04-22 15:25:51 -07:00 · 2021-04-22 15:25:51 -07:00 · 3aba8ab14d
commit 3aba8ab14d
parent b95a352416 a90a1dcfff
21 changed files with 2554 additions and 51 deletions
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@ -625,7 +625,10 @@ def tf_protos_grappler():
    )

 def tf_additional_device_tracer_srcs():
-    return ["device_tracer.cc"]
+    return [
+        "device_tracer_cuda.cc",
+        "device_tracer_rocm.cc",
+    ]

 def tf_additional_cupti_utils_cuda_deps():
    return []
--- a/tensorflow/core/profiler/internal/gpu/BUILD
+++ b/tensorflow/core/profiler/internal/gpu/BUILD
@ -1,4 +1,5 @@
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
+load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm")
 load(
    "//tensorflow:tensorflow.bzl",
    "tf_copts",
@ -33,6 +34,7 @@ tf_cuda_library(
    cuda_deps = [
        "//tensorflow/core/profiler/internal/gpu:cupti_tracer",
        "//tensorflow/core/profiler/internal/gpu:cupti_wrapper",
+        "//tensorflow/core/profiler/internal/gpu:rocm_tracer",
    ],
    deps = [
        ":cupti_utils",
@ -138,6 +140,25 @@ tf_cuda_library(
    ],
 )

+tf_cuda_library(
+    name = "rocm_tracer",
+    srcs = if_rocm(["rocm_tracer.cc"]),
+    hdrs = if_rocm(["rocm_tracer.h"]),
+    copts = tf_profiler_copts() + tf_copts(),
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:lib",
+        "//tensorflow/core/profiler/internal/cpu:annotation_stack",
+        "//tensorflow/core/profiler/utils:time_utils",
+        "//tensorflow/stream_executor/rocm:roctracer_wrapper",
+        "@com_google_absl//absl/container:fixed_array",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:node_hash_map",
+        "@com_google_absl//absl/container:node_hash_set",
+        "@com_google_absl//absl/types:optional",
+    ],
+)
+
 tf_cuda_library(
    name = "nvtx_utils",
    srcs = if_cuda(["nvtx_utils.cc"]),
--- a/tensorflow/core/profiler/internal/gpu/device_tracer_cuda.cc
+++ b/tensorflow/core/profiler/internal/gpu/device_tracer_cuda.cc
--- a/tensorflow/core/profiler/internal/gpu/device_tracer_rocm.cc
+++ b/tensorflow/core/profiler/internal/gpu/device_tracer_rocm.cc
@ -0,0 +1,833 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if TENSORFLOW_USE_ROCM
+
+#include <memory>
+#include <utility>
+
+#include "absl/container/fixed_array.h"
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
+#include "absl/strings/str_join.h"
+#include "tensorflow/core/framework/step_stats.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/abi.h"
+#include "tensorflow/core/platform/env_time.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/profiler/internal/cpu/annotation_stack.h"
+#include "tensorflow/core/profiler/internal/gpu/rocm_tracer.h"
+#include "tensorflow/core/profiler/lib/profiler_factory.h"
+#include "tensorflow/core/profiler/lib/profiler_interface.h"
+#include "tensorflow/core/profiler/utils/parse_annotation.h"
+#include "tensorflow/core/profiler/utils/xplane_builder.h"
+#include "tensorflow/core/profiler/utils/xplane_schema.h"
+#include "tensorflow/core/profiler/utils/xplane_utils.h"
+#include "tensorflow/core/util/env_var.h"
+
+namespace tensorflow {
+namespace profiler {
+
+namespace {
+// Set the all XLines of specified XPlane to starting walltime.
+// Events time in both host and device planes are CUTPI timestamps.
+// We set initial RocmTracer timestamp as start time for all lines to reflect
+// this fact. Eventually we change line start time to corresponding
+// start_walltime_ns to normalize with CPU wall time.
+static void NormalizeTimeStamps(XPlaneBuilder* plane,
+                                uint64_t start_walltime_ns) {
+  plane->ForEachLine([&](tensorflow::profiler::XLineBuilder line) {
+    line.SetTimestampNs(start_walltime_ns);
+  });
+}
+
+void GetDeviceCapabilities(int32_t device_ordinal,
+                           XPlaneBuilder* device_plane) {
+  // TODO(rocm)
+}
+
+bool IsHostEvent(const RocmTracerEvent& event) {
+  // TODO(rocm)
+  // Classify all events as GPU events for now
+  return false;
+}
+
+std::string GetDeviceXLineName(
+    int64_t stream_id, absl::flat_hash_set<RocmTracerEventType>& event_types) {
+  std::string line_name = absl::StrCat("Stream #", stream_id);
+  event_types.erase(RocmTracerEventType::Unsupported);
+  if (event_types.empty()) return line_name;
+  std::vector<const char*> type_names;
+  for (const auto event_type : event_types) {
+    type_names.emplace_back(GetRocmTracerEventTypeName(event_type));
+  }
+  return absl::StrCat(line_name, "(", absl::StrJoin(type_names, ","), ")");
+}
+
+}  // namespace
+
+class RocmTraceCollectorImpl : public profiler::RocmTraceCollector {
+ public:
+  RocmTraceCollectorImpl(const RocmTraceCollectorOptions& options,
+                         uint64_t start_walltime_ns, uint64_t start_gputime_ns)
+      : RocmTraceCollector(options),
+        num_callback_events_(0),
+        num_activity_events_(0),
+        start_walltime_ns_(start_walltime_ns),
+        start_gputime_ns_(start_gputime_ns),
+        next_logical_device_id_(0),
+        per_device_collector_(options.num_gpus) {
+    // in the physical -> logical device_id map, add an explicit entry for
+    // RocmTracerEvent::kInvalidDeviceId -> RocmTracerEvent::kInvalidDeviceId
+    // event with this device_id are events for which we were not able to
+    // determine the correct device_id via the API+Activity callbacks
+    // we will special case such events in the Flush routine
+    device_id_map_[RocmTracerEvent::kInvalidDeviceId] =
+        RocmTracerEvent::kInvalidDeviceId;
+  }
+
+  void AddEvent(RocmTracerEvent&& event) override {
+    mutex_lock lock(aggregated_events_mutex_);
+
+    if (event.source == RocmTracerEventSource::ApiCallback) {
+      if (num_callback_events_ > options_.max_callback_api_events) {
+        OnEventsDropped("max callback event capacity reached",
+                        event.correlation_id);
+        DumpRocmTracerEvent(event, 0, 0);
+        return;
+      }
+      num_callback_events_++;
+    }
+    if (event.source == RocmTracerEventSource::Activity) {
+      if (num_activity_events_ > options_.max_activity_api_events) {
+        OnEventsDropped("max activity event capacity reached",
+                        event.correlation_id);
+        DumpRocmTracerEvent(event, 0, 0);
+        return;
+      }
+      num_activity_events_++;
+    }
+
+    auto iter = aggregated_events_.find(event.correlation_id);
+    if (iter != aggregated_events_.end()) {
+      // event with this correlation id already present
+      // agrregate this event with the existing one
+      switch (event.domain) {
+        case RocmTracerEventDomain::HIP_API:
+          switch (event.source) {
+            case RocmTracerEventSource::ApiCallback:
+              break;
+            case RocmTracerEventSource::Activity:
+              // Use the start/stop time from the HCC_OPS domain
+              // unless this is one of those events for which we do not
+              // receive any HCC activity record callback
+              if (IsEventTypeWithoutHCCActivityRecordCallback(event.type)) {
+                iter->second.start_time_ns = event.start_time_ns;
+                iter->second.end_time_ns = event.end_time_ns;
+              }
+              iter->second.annotation = event.annotation;
+              break;
+          }
+          break;
+        case RocmTracerEventDomain::HCC_OPS:
+          switch (event.source) {
+            case RocmTracerEventSource::ApiCallback:
+              break;
+            case RocmTracerEventSource::Activity:
+              iter->second.device_id = event.device_id;
+              iter->second.stream_id = event.stream_id;
+              iter->second.start_time_ns = event.start_time_ns;
+              iter->second.end_time_ns = event.end_time_ns;
+              // Use the annotation from the HIP_API domain
+              // iter->second.annotation = event.annotation;
+              break;
+          }
+          break;
+      }
+    } else {
+      switch (event.source) {
+        case RocmTracerEventSource::ApiCallback:
+          aggregated_events_.emplace(event.correlation_id, std::move(event));
+          break;
+        case RocmTracerEventSource::Activity:
+          // you would think that this cannot happen, but it does
+          // This is primarily because the call "roctracer_flush_activity" does
+          // not work as it should. Imagine a sequence where we enable/disable
+          // tracing more than once in a single TF session.
+          // If the "flush" that happens during disable, does not flush out all
+          // the activity records, then they will show up during the subsequent
+          // call to enable, and we will end up here!
+          OnEventsDropped(
+              "Activity event encountered before a corresponding API event",
+              event.correlation_id);
+          DumpRocmTracerEvent(event, 0, 0);
+          break;
+      }
+    }
+  }
+
+  void OnEventsDropped(const std::string& reason,
+                       uint32_t correlation_id) override {
+    LOG(INFO) << "RocmTracerEvent dropped (correlation_id=" << correlation_id
+              << ",) : " << reason << ".";
+  }
+
+  void Flush() override {
+    mutex_lock lock(aggregated_events_mutex_);
+
+    VLOG(3) << "RocmTraceCollector collected " << num_callback_events_
+            << " callback events, " << num_activity_events_
+            << " activity events, and aggregated them into "
+            << aggregated_events_.size() << " events.";
+
+    for (auto& iter : aggregated_events_) {
+      auto& event = iter.second;
+
+      // For some hip API events, we never get a corresponding HCC
+      // activity record callback and hence we currently do not have a way
+      // of associating a valid device_id and stream_id with those events.
+      // For such events, explcitly set those id sto 0 for now
+      if (IsEventTypeWithoutHCCActivityRecordCallback(event.type)) {
+        DumpRocmTracerEvent(event, 0, 0);
+        if (event.device_id == RocmTracerEvent::kInvalidDeviceId) {
+          VLOG(3) << "Explicitly setting device_id to 0 for "
+                     "event with correlation_id="
+                  << event.correlation_id << ",";
+          event.device_id = 0;
+        } else {
+          VLOG(3) << "Unexpectedly found a non-default "
+                     "device_id for event with correlation_id="
+                  << event.correlation_id << ",";
+        }
+        if (event.stream_id == RocmTracerEvent::kInvalidStreamId) {
+          VLOG(3) << "Explicitly setting stream_id to 0 for "
+                     "event with correlation_id="
+                  << event.correlation_id << ",";
+          event.stream_id = 0;
+        } else {
+          VLOG(3) << "Unexpectedly found a non-default "
+                     "stream_id for event with correlation_id="
+                  << event.correlation_id << ",";
+        }
+      }
+
+      // determine the logical device id
+      uint32_t physical_id = event.device_id;
+      uint32_t logical_id = options_.num_gpus;
+      auto kv_pair = device_id_map_.find(physical_id);
+      if (kv_pair == device_id_map_.end()) {
+        logical_id = next_logical_device_id_++;
+        VLOG(3) << "Mapping physical device id " << physical_id
+                << " to logical device id " << logical_id;
+        device_id_map_[physical_id] = logical_id;
+      } else {
+        logical_id = kv_pair->second;
+      }
+      event.device_id = logical_id;
+
+      if (event.device_id >= options_.num_gpus) {
+        OnEventsDropped("logical device id >= num gpus", event.correlation_id);
+        DumpRocmTracerEvent(event, 0, 0);
+        continue;
+      }
+
+      if (event.stream_id == RocmTracerEvent::kInvalidStreamId) {
+        OnEventsDropped("invalid stream id", event.correlation_id);
+        DumpRocmTracerEvent(event, 0, 0);
+        continue;
+      }
+
+      per_device_collector_[logical_id].AddEvent(event);
+    }
+    aggregated_events_.clear();
+
+    for (int i = 0; i < options_.num_gpus; ++i) {
+      per_device_collector_[i].SortByStartTime();
+    }
+  }
+
+  void Export(StepStats* step_stats) {
+    for (int i = 0; i < options_.num_gpus; ++i) {
+      per_device_collector_[i].Export(i, start_walltime_ns_, start_gputime_ns_,
+                                      step_stats);
+    }
+  }
+
+  void Export(XSpace* space) {
+    uint64_t end_gputime_ns = RocmTracer::GetTimestamp();
+    XPlaneBuilder host_plane(
+        FindOrAddMutablePlaneWithName(space, kRoctracerApiPlaneName));
+    for (int i = 0; i < options_.num_gpus; ++i) {
+      std::string name = GpuPlaneName(i);
+      XPlaneBuilder device_plane(FindOrAddMutablePlaneWithName(space, name));
+      device_plane.SetId(i);
+      per_device_collector_[i].Export(start_walltime_ns_, start_gputime_ns_,
+                                      end_gputime_ns, &device_plane,
+                                      &host_plane);
+      GetDeviceCapabilities(i, &device_plane);
+      NormalizeTimeStamps(&device_plane, start_walltime_ns_);
+    }
+    NormalizeTimeStamps(&host_plane, start_walltime_ns_);
+  }
+
+ private:
+  std::atomic<int> num_callback_events_;
+  std::atomic<int> num_activity_events_;
+  uint64_t start_walltime_ns_;
+  uint64_t start_gputime_ns_;
+
+  mutex aggregated_events_mutex_;
+  absl::flat_hash_map<uint32_t, RocmTracerEvent> aggregated_events_
+      TF_GUARDED_BY(aggregated_events_mutex_);
+
+  // We need to create a map of
+  //  event.device_id -> index into per_device_collector_ array
+  // The event.device_id returned by the RocmTracer is the physical
+  // device_id and not the logical device_id. Say for example we are
+  // running on a node with 8 GPUs. The expected physical device_id(s)
+  // for those 8 GPUs would be 0,1,2,3,4,5,6,7. On such a node, if we
+  // run a test with HIP_VISIBLE_DEVICES=5, then "options.num_gpus_ == 1",
+  // but the event.device_id field will have 5 in it!
+  // So the event.device_id can be thought of as the physical device id
+  // and the index can be thought of as the logical device id.
+  // We cannot determine the actual phsyical device id logical device id
+  // mapping here, so we determine it empirically
+  std::map<uint32_t, uint32_t> device_id_map_;
+  uint32_t next_logical_device_id_;
+
+  bool IsEventTypeWithoutHCCActivityRecordCallback(RocmTracerEventType type) {
+    switch (type) {
+      case RocmTracerEventType::MemoryAlloc:
+        return true;
+        break;
+      default:
+        break;
+    }
+    return false;
+  }
+
+  struct PerDeviceCollector {
+    void AddEvent(const RocmTracerEvent& event) {
+      mutex_lock lock(events_mutex);
+      events.emplace_back(event);
+    }
+
+    void SortByStartTime() {
+      mutex_lock lock(events_mutex);
+      std::sort(
+          events.begin(), events.end(),
+          [](const RocmTracerEvent& event1, const RocmTracerEvent& event2) {
+            return event1.start_time_ns < event2.start_time_ns;
+          });
+    }
+
+    void Export(int32_t device_ordinal, uint64_t start_walltime_ns,
+                uint64_t start_gputime_ns, StepStats* step_stats) {
+      mutex_lock lock(events_mutex);
+      absl::flat_hash_map<
+          std::pair<uint64_t /*stream_id*/, RocmTracerEventType>,
+          DeviceStepStats*>
+          per_stream_dev_stats;
+
+      DeviceStepStats* generic_stream_dev_stats = nullptr;
+      DeviceStepStats* all_streams_dev_stats = nullptr;
+      DeviceStepStats* memcpy_dev_stats = nullptr;
+      DeviceStepStats* sync_dev_stats = nullptr;
+
+      for (const RocmTracerEvent& event : events) {
+        DumpRocmTracerEvent(event, start_walltime_ns, start_gputime_ns);
+
+        std::unique_ptr<NodeExecStats> ns(new NodeExecStats);
+
+        ns->set_all_start_micros(
+            (start_walltime_ns + (event.start_time_ns - start_gputime_ns)) /
+            1000);
+        ns->set_op_start_rel_micros(0);
+        uint64_t elapsed_ns = event.end_time_ns - event.start_time_ns;
+        ns->set_op_end_rel_micros(
+            tensorflow::profiler::NanosToMicros(elapsed_ns));
+        ns->set_all_end_rel_micros(
+            tensorflow::profiler::NanosToMicros(elapsed_ns));
+
+        auto annotation_stack = ParseAnnotationStack(event.annotation);
+        std::string kernel_name = port::MaybeAbiDemangle(event.name.c_str());
+        std::string activity_name =
+            !annotation_stack.empty()
+                ? std::string(annotation_stack.back().name)
+                : kernel_name;
+        ns->set_node_name(activity_name);
+
+        ns->set_thread_id(event.thread_id);
+
+        switch (event.type) {
+          case RocmTracerEventType::Kernel: {
+            ns->set_timeline_label(absl::StrFormat(
+                "%s regs:%u shm:%u grid:%u,%u,%u block:%u,%u,%u@@%s",
+                kernel_name, event.kernel_info.registers_per_thread,
+                event.kernel_info.static_shared_memory_usage,
+                event.kernel_info.grid_x, event.kernel_info.grid_y,
+                event.kernel_info.grid_z, event.kernel_info.block_x,
+                event.kernel_info.block_y, event.kernel_info.block_z,
+                event.annotation));
+            DeviceStepStats*& stream_dev_stats =
+                per_stream_dev_stats[std::make_pair(event.stream_id,
+                                                    event.type)];
+            if (stream_dev_stats == nullptr) {
+              stream_dev_stats = step_stats->add_dev_stats();
+              stream_dev_stats->set_device(absl::StrCat(
+                  "/device:GPU:", device_ordinal, "/stream:", event.stream_id,
+                  "<", GetRocmTracerEventTypeName(event.type), ">"));
+            }
+            *stream_dev_stats->add_node_stats() = *ns;
+            if (all_streams_dev_stats == nullptr) {
+              all_streams_dev_stats = step_stats->add_dev_stats();
+              all_streams_dev_stats->set_device(
+                  absl::StrCat("/device:GPU:", device_ordinal, "/stream:all"));
+            }
+            all_streams_dev_stats->add_node_stats()->Swap(ns.release());
+          } break;
+          case RocmTracerEventType::MemcpyD2H:
+          case RocmTracerEventType::MemcpyH2D:
+          case RocmTracerEventType::MemcpyD2D:
+          case RocmTracerEventType::MemcpyP2P: {
+            std::string details = absl::StrCat(
+                event.name, " bytes:", event.memcpy_info.num_bytes);
+            if (event.memcpy_info.async) {
+              absl::StrAppend(&details, " async");
+            }
+            if (event.memcpy_info.destination != event.device_id) {
+              absl::StrAppend(&details,
+                              " to device:", event.memcpy_info.destination);
+            }
+            ns->set_timeline_label(std::move(details));
+
+            DeviceStepStats*& stream_dev_stats =
+                per_stream_dev_stats[std::make_pair(event.stream_id,
+                                                    event.type)];
+            if (stream_dev_stats == nullptr) {
+              stream_dev_stats = step_stats->add_dev_stats();
+              stream_dev_stats->set_device(absl::StrCat(
+                  "/device:GPU:", device_ordinal, "/stream:", event.stream_id,
+                  "<", GetRocmTracerEventTypeName(event.type), ">"));
+            }
+            *stream_dev_stats->add_node_stats() = *ns;
+            if (memcpy_dev_stats == nullptr) {
+              memcpy_dev_stats = step_stats->add_dev_stats();
+              memcpy_dev_stats->set_device(
+                  absl::StrCat("/device:GPU:", device_ordinal, "/memcpy"));
+            }
+            memcpy_dev_stats->add_node_stats()->Swap(ns.release());
+          } break;
+          case RocmTracerEventType::MemoryAlloc: {
+            std::string details = absl::StrCat(
+                event.name, " bytes:", event.memalloc_info.num_bytes);
+            ns->set_timeline_label(std::move(details));
+
+            DeviceStepStats*& stream_dev_stats =
+                per_stream_dev_stats[std::make_pair(event.stream_id,
+                                                    event.type)];
+            if (stream_dev_stats == nullptr) {
+              stream_dev_stats = step_stats->add_dev_stats();
+              stream_dev_stats->set_device(absl::StrCat(
+                  "/device:GPU:", device_ordinal, "/stream:", event.stream_id,
+                  "<", GetRocmTracerEventTypeName(event.type), ">"));
+            }
+            *stream_dev_stats->add_node_stats() = *ns;
+          } break;
+          case RocmTracerEventType::StreamSynchronize: {
+            std::string details = event.name;
+            ns->set_timeline_label(std::move(details));
+
+            if (sync_dev_stats == nullptr) {
+              sync_dev_stats = step_stats->add_dev_stats();
+              sync_dev_stats->set_device(
+                  absl::StrCat("/device:GPU:", device_ordinal, "/sync"));
+            }
+            sync_dev_stats->add_node_stats()->Swap(ns.release());
+          } break;
+          case RocmTracerEventType::Generic: {
+            std::string details = event.name;
+            ns->set_timeline_label(std::move(details));
+
+            if (generic_stream_dev_stats == nullptr) {
+              generic_stream_dev_stats = step_stats->add_dev_stats();
+              generic_stream_dev_stats->set_device(
+                  absl::StrCat("/device:GPU:", device_ordinal, "/stream:"));
+            }
+            generic_stream_dev_stats->add_node_stats()->Swap(ns.release());
+          } break;
+          default:
+            DCHECK(false);
+            break;
+        }
+      }
+      events.clear();
+    }
+
+    void CreateXEvent(const RocmTracerEvent& event, XPlaneBuilder* plane,
+                      uint64_t start_gpu_ns, uint64_t end_gpu_ns,
+                      XLineBuilder* line) {
+      if (event.start_time_ns < start_gpu_ns ||
+          event.end_time_ns > end_gpu_ns ||
+          event.start_time_ns > event.end_time_ns) {
+        VLOG(2) << "events have abnormal timestamps:" << event.name
+                << " start time(ns): " << event.start_time_ns
+                << " end time(ns): " << event.end_time_ns;
+        return;
+      }
+      std::string kernel_name = port::MaybeAbiDemangle(event.name.c_str());
+      if (kernel_name.empty()) {
+        kernel_name = GetRocmTracerEventTypeName(event.type);
+      }
+      XEventMetadata* event_metadata =
+          plane->GetOrCreateEventMetadata(std::move(kernel_name));
+      XEventBuilder xevent = line->AddEvent(*event_metadata);
+      xevent.SetTimestampNs(event.start_time_ns);
+      xevent.SetEndTimestampNs(event.end_time_ns);
+      if (event.correlation_id != RocmTracerEvent::kInvalidCorrelationId) {
+        xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
+                                GetStatTypeStr(StatType::kCorrelationId)),
+                            event.correlation_id);
+      }
+      if (!event.annotation.empty()) {
+        xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
+                                GetStatTypeStr(StatType::kKernelAnnotation)),
+                            event.annotation);
+      }
+      switch (event.type) {
+        case RocmTracerEventType::Kernel: {
+          const std::string kernel_details = absl::StrFormat(
+              "regs:%u shm:%u grid:%u,%u,%u block:%u,%u,%u",
+              event.kernel_info.registers_per_thread,
+              event.kernel_info.static_shared_memory_usage,
+              event.kernel_info.grid_x, event.kernel_info.grid_y,
+              event.kernel_info.grid_z, event.kernel_info.block_x,
+              event.kernel_info.block_y, event.kernel_info.block_z);
+          xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
+                                  GetStatTypeStr(StatType::kKernelDetails)),
+                              kernel_details);
+        } break;
+
+        case RocmTracerEventType::MemcpyD2H:
+        case RocmTracerEventType::MemcpyH2D:
+        case RocmTracerEventType::MemcpyD2D:
+        case RocmTracerEventType::MemcpyP2P:
+        case RocmTracerEventType::MemcpyOther: {
+          const auto& memcpy_info = event.memcpy_info;
+          std::string memcpy_details =
+              absl::StrFormat("size:%u dest:%u async:%u", memcpy_info.num_bytes,
+                              memcpy_info.destination, memcpy_info.async);
+          xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
+                                  GetStatTypeStr(StatType::kMemcpyDetails)),
+                              memcpy_details);
+        } break;
+        case RocmTracerEventType::MemoryAlloc: {
+          std::string memalloc_details =
+              absl::StrFormat("num_bytes:%u", event.memalloc_info.num_bytes);
+          xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
+                                  GetStatTypeStr(StatType::kMemallocDetails)),
+                              memalloc_details);
+        } break;
+        case RocmTracerEventType::StreamSynchronize: {
+          // TODO(rocm)
+          // Don't yet know what to do here
+        } break;
+        case RocmTracerEventType::Generic: {
+          // TODO(rocm)
+          // Don't yet know what to do here
+        } break;
+        default:
+          DCHECK(false);
+          break;
+      }
+
+      std::vector<Annotation> annotation_stack =
+          ParseAnnotationStack(event.annotation);
+      // If multiple metadata have the same key name, show the values from the
+      // top of the stack (innermost annotation). Concatenate the values from
+      // "hlo_op".
+      absl::flat_hash_set<absl::string_view> key_set;
+      std::vector<absl::string_view> hlo_op_names;
+      for (auto annotation = annotation_stack.rbegin();
+           annotation != annotation_stack.rend(); ++annotation) {
+        for (const Annotation::Metadata& metadata : annotation->metadata) {
+          if (metadata.key == "tf_op") {
+            continue;  // ignored, obtained from HLO proto via DebugInfoMap
+          } else if (key_set.insert(metadata.key).second) {
+            xevent.ParseAndAddStatValue(
+                *plane->GetOrCreateStatMetadata(metadata.key), metadata.value);
+          }
+        }
+      }
+      // TODO(profiler): we should get rid of kLevel0, it is based on the
+      // assumption that those op-related ScopedAnnotation are at the very TOP
+      // level.
+      if (!annotation_stack.empty()) {
+        xevent.AddStatValue(
+            *plane->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kLevel0)),
+            annotation_stack.begin()->name);
+      }
+    }
+
+    void Export(uint64_t start_walltime_ns, uint64_t start_gputime_ns,
+                uint64_t end_gputime_ns, XPlaneBuilder* device_plane,
+                XPlaneBuilder* host_plane) {
+      mutex_lock lock(events_mutex);
+      // Tracking event types per line.
+      absl::flat_hash_map<int64, absl::flat_hash_set<RocmTracerEventType>>
+          events_types_per_line;
+      for (const RocmTracerEvent& event : events) {
+        DumpRocmTracerEvent(event, start_walltime_ns, start_gputime_ns);
+        bool is_host_event = IsHostEvent(event);
+        int64_t line_id = is_host_event ? static_cast<int64>(event.thread_id)
+                                        : event.stream_id;
+        if (line_id == RocmTracerEvent::kInvalidThreadId ||
+            line_id == RocmTracerEvent::kInvalidStreamId)
+          continue;
+        auto* plane = is_host_event ? host_plane : device_plane;
+        XLineBuilder line = plane->GetOrCreateLine(line_id);
+        line.SetTimestampNs(start_gputime_ns);
+        CreateXEvent(event, plane, start_gputime_ns, end_gputime_ns, &line);
+        events_types_per_line[line_id].emplace(event.type);
+      }
+      device_plane->ForEachLine([&](tensorflow::profiler::XLineBuilder line) {
+        line.SetName(
+            GetDeviceXLineName(line.Id(), events_types_per_line[line.Id()]));
+      });
+      events.clear();
+    }
+
+    mutex events_mutex;
+    std::vector<RocmTracerEvent> events TF_GUARDED_BY(events_mutex);
+  };
+
+  absl::FixedArray<PerDeviceCollector> per_device_collector_;
+};
+
+// GpuTracer for ROCm GPU.
+class GpuTracer : public profiler::ProfilerInterface {
+ public:
+  GpuTracer(RocmTracer* rocm_tracer) : rocm_tracer_(rocm_tracer) {
+    LOG(INFO) << "GpuTracer created.";
+  }
+  ~GpuTracer() override {}
+
+  // GpuTracer interface:
+  Status Start() override;
+  Status Stop() override;
+  Status CollectData(RunMetadata* run_metadata) override;
+  Status CollectData(XSpace* space) override;
+
+ private:
+  Status DoStart();
+  Status DoStop();
+  Status DoCollectData(StepStats* step_stats);
+  Status DoCollectData(XSpace* space);
+
+  RocmTracerOptions GetRocmTracerOptions();
+
+  RocmTraceCollectorOptions GetRocmTraceCollectorOptions(uint32_t num_gpus);
+
+  enum State {
+    kNotStarted,
+    kStartedOk,
+    kStartedError,
+    kStoppedOk,
+    kStoppedError
+  };
+  State profiling_state_ = State::kNotStarted;
+
+  RocmTracer* rocm_tracer_;
+  std::unique_ptr<RocmTraceCollectorImpl> rocm_trace_collector_;
+};
+
+RocmTracerOptions GpuTracer::GetRocmTracerOptions() {
+  RocmTracerOptions options;
+  std::vector<uint32_t> empty_vec;
+
+  // clang formatting does not preserve one entry per line
+  // clang-format off
+  std::vector<uint32_t> hip_api_domain_ops{
+      HIP_API_ID_hipExtModuleLaunchKernel,
+      HIP_API_ID_hipFree,
+      HIP_API_ID_hipHccModuleLaunchKernel,
+      HIP_API_ID_hipLaunchKernel,
+      HIP_API_ID_hipMalloc,
+      HIP_API_ID_hipMemcpyAsync,
+      HIP_API_ID_hipMemcpyDtoD,
+      HIP_API_ID_hipMemcpyDtoDAsync,
+      HIP_API_ID_hipMemcpyDtoH,
+      HIP_API_ID_hipMemcpyDtoHAsync,
+      HIP_API_ID_hipMemcpyHtoD,
+      HIP_API_ID_hipMemcpyHtoDAsync,
+      HIP_API_ID_hipMemsetD32,
+      HIP_API_ID_hipMemsetD32Async,
+      HIP_API_ID_hipMemsetD8,
+      HIP_API_ID_hipMemsetD8Async,
+      HIP_API_ID_hipModuleLaunchKernel,
+      HIP_API_ID_hipStreamSynchronize,
+  };
+  // clang-format on
+
+  options.api_callbacks.emplace(ACTIVITY_DOMAIN_HIP_API, hip_api_domain_ops);
+  // options.api_callbacks.emplace(ACTIVITY_DOMAIN_HIP_API, empty_vec);
+
+  // options.activity_tracing.emplace(ACTIVITY_DOMAIN_HIP_API,
+  // hip_api_domain_ops);
+  options.activity_tracing.emplace(ACTIVITY_DOMAIN_HIP_API, empty_vec);
+  options.activity_tracing.emplace(ACTIVITY_DOMAIN_HCC_OPS, empty_vec);
+
+  return options;
+}
+
+RocmTraceCollectorOptions GpuTracer::GetRocmTraceCollectorOptions(
+    uint32_t num_gpus) {
+  RocmTraceCollectorOptions options;
+  options.max_callback_api_events = 2 * 1024 * 1024;
+  options.max_activity_api_events = 2 * 1024 * 1024;
+  options.max_annotation_strings = 1024 * 1024;
+  options.num_gpus = num_gpus;
+  return options;
+}
+
+Status GpuTracer::DoStart() {
+  if (!rocm_tracer_->IsAvailable()) {
+    return errors::Unavailable("Another profile session running.");
+  }
+
+  AnnotationStack::Enable(true);
+
+  RocmTraceCollectorOptions trace_collector_options =
+      GetRocmTraceCollectorOptions(rocm_tracer_->NumGpus());
+  uint64_t start_gputime_ns = RocmTracer::GetTimestamp();
+  uint64_t start_walltime_ns = tensorflow::EnvTime::NowNanos();
+  rocm_trace_collector_ = std::make_unique<RocmTraceCollectorImpl>(
+      trace_collector_options, start_walltime_ns, start_gputime_ns);
+
+  RocmTracerOptions tracer_options = GetRocmTracerOptions();
+  rocm_tracer_->Enable(tracer_options, rocm_trace_collector_.get());
+
+  return Status::OK();
+}
+
+Status GpuTracer::Start() {
+  Status status = DoStart();
+  if (status.ok()) {
+    profiling_state_ = State::kStartedOk;
+    return Status::OK();
+  } else {
+    profiling_state_ = State::kStartedError;
+    return status;
+  }
+}
+
+Status GpuTracer::DoStop() {
+  rocm_tracer_->Disable();
+  AnnotationStack::Enable(false);
+  return Status::OK();
+}
+
+Status GpuTracer::Stop() {
+  if (profiling_state_ == State::kStartedOk) {
+    Status status = DoStop();
+    profiling_state_ = status.ok() ? State::kStoppedOk : State::kStoppedError;
+  }
+  return Status::OK();
+}
+
+Status GpuTracer::DoCollectData(StepStats* step_stats) {
+  if (rocm_trace_collector_) rocm_trace_collector_->Export(step_stats);
+  return Status::OK();
+}
+
+Status GpuTracer::CollectData(RunMetadata* run_metadata) {
+  switch (profiling_state_) {
+    case State::kNotStarted:
+      VLOG(3) << "No trace data collected, session wasn't started";
+      return Status::OK();
+    case State::kStartedOk:
+      return errors::FailedPrecondition("Cannot collect trace before stopping");
+    case State::kStartedError:
+      LOG(ERROR) << "Cannot collect, roctracer failed to start";
+      return Status::OK();
+    case State::kStoppedError:
+      VLOG(3) << "No trace data collected";
+      return Status::OK();
+    case State::kStoppedOk: {
+      // Input run_metadata is shared by profiler interfaces, we need append.
+      StepStats step_stats;
+      DoCollectData(&step_stats);
+      for (auto& dev_stats : *step_stats.mutable_dev_stats()) {
+        run_metadata->mutable_step_stats()->add_dev_stats()->Swap(&dev_stats);
+      }
+      return Status::OK();
+    }
+  }
+  return errors::Internal("Invalid profiling state: ", profiling_state_);
+}
+
+Status GpuTracer::DoCollectData(XSpace* space) {
+  if (rocm_trace_collector_) rocm_trace_collector_->Export(space);
+  return Status::OK();
+}
+
+Status GpuTracer::CollectData(XSpace* space) {
+  switch (profiling_state_) {
+    case State::kNotStarted:
+      VLOG(3) << "No trace data collected, session wasn't started";
+      return Status::OK();
+    case State::kStartedOk:
+      return errors::FailedPrecondition("Cannot collect trace before stopping");
+    case State::kStartedError:
+      LOG(ERROR) << "Cannot collect, roctracer failed to start";
+      return Status::OK();
+    case State::kStoppedError:
+      VLOG(3) << "No trace data collected";
+      return Status::OK();
+    case State::kStoppedOk: {
+      DoCollectData(space);
+      return Status::OK();
+    }
+  }
+  return errors::Internal("Invalid profiling state: ", profiling_state_);
+}
+
+// Not in anonymous namespace for testing purposes.
+std::unique_ptr<profiler::ProfilerInterface> CreateGpuTracer(
+    const ProfileOptions& options) {
+  if (options.device_type() != ProfileOptions::GPU &&
+      options.device_type() != ProfileOptions::UNSPECIFIED)
+    return nullptr;
+
+  profiler::RocmTracer* rocm_tracer =
+      profiler::RocmTracer::GetRocmTracerSingleton();
+  if (!rocm_tracer->IsAvailable()) return nullptr;
+
+  return absl::make_unique<profiler::GpuTracer>(rocm_tracer);
+}
+
+auto register_rocm_gpu_tracer_factory = [] {
+  RegisterProfilerFactory(&CreateGpuTracer);
+  return 0;
+}();
+
+}  // namespace profiler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_USE_ROCM
--- a/tensorflow/core/profiler/internal/gpu/device_tracer_test.cc
+++ b/tensorflow/core/profiler/internal/gpu/device_tracer_test.cc
@ -49,7 +49,7 @@ limitations under the License.
 namespace tensorflow {
 namespace profiler {

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 extern std::unique_ptr<ProfilerInterface> CreateGpuTracer(
    const ProfileOptions& options);
 std::unique_ptr<ProfilerInterface> CreateGpuTracer() {
@ -243,6 +243,54 @@ TEST_F(DeviceTracerTest, RunWithTraceOption) {
  EXPECT_GE(run_metadata.step_stats().dev_stats_size(), 1);
 }

+#if TENSORFLOW_USE_ROCM
+TEST_F(DeviceTracerTest, TraceToXSpace) {
+  auto tracer = CreateGpuTracer();
+  if (!tracer) return;
+
+  Initialize({3, 2, -1, 0});
+  auto session = CreateSession();
+  ASSERT_TRUE(session != nullptr);
+  TF_ASSERT_OK(session->Create(def_));
+  std::vector<std::pair<string, Tensor>> inputs;
+
+  // Request two targets: one fetch output and one non-fetched output.
+  std::vector<string> output_names = {y_ + ":0"};
+  std::vector<string> target_nodes = {y_neg_};
+  std::vector<Tensor> outputs;
+
+  TF_ASSERT_OK(tracer->Start());
+  Status s = session->Run(inputs, output_names, target_nodes, &outputs);
+  TF_ASSERT_OK(s);
+
+  TF_ASSERT_OK(tracer->Stop());
+  XSpace space;
+  TF_ASSERT_OK(tracer->CollectData(&space));
+  // At least one gpu plane and one host plane for launching events.
+  const XPlane* host_plane = FindPlaneWithName(space, kRoctracerApiPlaneName);
+  ASSERT_NE(host_plane, nullptr);
+
+  const XPlane* device_plane =
+      FindPlaneWithName(space, strings::StrCat(kGpuPlanePrefix, 0));
+  ASSERT_NE(device_plane, nullptr);  // Check if device plane is serialized.
+  // one for MemcpyH2D, one for MemcpyD2H, two for Matmul (one from Eigen, one
+  // from cudnn), one for memset.
+  EXPECT_EQ(device_plane->event_metadata_size(), 5);
+  // Check if device capacity is serialized.
+  XPlaneVisitor plane = CreateTfXPlaneVisitor(device_plane);
+
+  // Check if the device events timestamps are set.
+  int total_events = 0;
+  plane.ForEachLine([&](const tensorflow::profiler::XLineVisitor& line) {
+    line.ForEachEvent([&](const tensorflow::profiler::XEventVisitor& event) {
+      EXPECT_GT(event.TimestampNs(), 0);
+      EXPECT_GT(event.DurationNs(), 0);
+      ++total_events;
+    });
+  });
+  EXPECT_GE(total_events, 5);
+}
+#else   // TENSORFLOW_USE_ROCM
 TEST_F(DeviceTracerTest, TraceToXSpace) {
  auto tracer = CreateGpuTracer();
  if (!tracer) return;
@ -295,6 +343,7 @@ TEST_F(DeviceTracerTest, TraceToXSpace) {
  });
  EXPECT_GE(total_events, 5);
 }
+#endif  // TENSORFLOW_USE_ROCM

 #if GOOGLE_CUDA
 TEST_F(DeviceTracerTest, CudaRuntimeResource) {
--- a/tensorflow/core/profiler/internal/gpu/rocm_tracer.cc
+++ b/tensorflow/core/profiler/internal/gpu/rocm_tracer.cc
--- a/tensorflow/core/profiler/internal/gpu/rocm_tracer.h
+++ b/tensorflow/core/profiler/internal/gpu/rocm_tracer.h
@ -0,0 +1,355 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_ROCM_TRACER_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_ROCM_TRACER_H_
+
+#include "absl/container/fixed_array.h"
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
+#include "absl/container/node_hash_set.h"
+#include "absl/types/optional.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/stream_executor/rocm/roctracer_wrapper.h"
+
+namespace tensorflow {
+namespace profiler {
+
+struct MemcpyDetails {
+  // The amount of data copied for memcpy events.
+  size_t num_bytes;
+  // The destination device for peer-2-peer communication (memcpy). The source
+  // device is implicit: its the current device.
+  uint32_t destination;
+  // Whether or not the memcpy is asynchronous.
+  bool async;
+};
+
+struct MemsetDetails {
+  // The number of memory elements getting set
+  size_t num_elements;
+  // Whether or not the memset is asynchronous.
+  bool async;
+};
+
+struct MemAllocDetails {
+  // The amount of data requested for cudaMalloc events.
+  uint64_t num_bytes;
+};
+
+struct KernelDetails {
+  // The number of registers used in this kernel.
+  uint32_t registers_per_thread;
+  // The amount of shared memory space used by a thread block.
+  uint32_t static_shared_memory_usage;
+  // The amount of dynamic memory space used by a thread block.
+  uint32_t dynamic_shared_memory_usage;
+  // X-dimension of a thread block.
+  uint32_t block_x;
+  // Y-dimension of a thread block.
+  uint32_t block_y;
+  // Z-dimension of a thread block.
+  uint32_t block_z;
+  // X-dimension of a grid.
+  uint32_t grid_x;
+  // Y-dimension of a grid.
+  uint32_t grid_y;
+  // Z-dimension of a grid.
+  uint32_t grid_z;
+};
+
+enum class RocmTracerEventType {
+  Unsupported = 0,
+  Kernel,
+  MemcpyH2D,
+  MemcpyD2H,
+  MemcpyD2D,
+  MemcpyP2P,
+  MemcpyOther,
+  MemoryAlloc,
+  Memset,
+  StreamSynchronize,
+  Generic,
+};
+
+const char* GetRocmTracerEventTypeName(const RocmTracerEventType& type);
+
+enum class RocmTracerEventSource {
+  ApiCallback = 0,
+  Activity,
+};
+
+const char* GetRocmTracerEventSourceName(const RocmTracerEventSource& source);
+
+enum class RocmTracerEventDomain {
+  HIP_API = 0,
+  HCC_OPS,
+};
+
+const char* GetRocmTracerEventDomainName(const RocmTracerEventDomain& domain);
+
+struct RocmTracerEvent {
+  static constexpr uint32_t kInvalidDeviceId =
+      std::numeric_limits<uint32_t>::max();
+  static constexpr uint32_t kInvalidThreadId =
+      std::numeric_limits<uint32_t>::max();
+  static constexpr uint32_t kInvalidCorrelationId =
+      std::numeric_limits<uint32_t>::max();
+  static constexpr uint64_t kInvalidStreamId =
+      std::numeric_limits<uint64_t>::max();
+  RocmTracerEventType type;
+  RocmTracerEventSource source;
+  RocmTracerEventDomain domain;
+  std::string name;
+  // This points to strings in AnnotationMap, which should outlive the point
+  // where serialization happens.
+  absl::string_view annotation;
+  uint64_t start_time_ns;
+  uint64_t end_time_ns;
+  uint32_t device_id = kInvalidDeviceId;
+  uint32_t correlation_id = kInvalidCorrelationId;
+  uint32_t thread_id = kInvalidThreadId;
+  int64_t stream_id = kInvalidStreamId;
+  union {
+    MemcpyDetails memcpy_info;      // If type == Memcpy*
+    MemsetDetails memset_info;      // If type == Memset*
+    MemAllocDetails memalloc_info;  // If type == MemoryAlloc
+    KernelDetails kernel_info;      // If type == Kernel
+  };
+};
+
+void DumpRocmTracerEvent(const RocmTracerEvent& event,
+                         uint64_t start_walltime_ns, uint64_t start_gputime_ns);
+
+struct RocmTracerOptions {
+  // map of domain --> ops for which we need to enable the API callbacks
+  // If the ops vector is empty, then enable API callbacks for entire domain
+  absl::flat_hash_map<activity_domain_t, std::vector<uint32_t> > api_callbacks;
+
+  // map of domain --> ops for which we need to enable the Activity records
+  // If the ops vector is empty, then enable Activity records for entire domain
+  absl::flat_hash_map<activity_domain_t, std::vector<uint32_t> >
+      activity_tracing;
+};
+
+struct RocmTraceCollectorOptions {
+  // Maximum number of events to collect from callback API; if -1, no limit.
+  // if 0, the callback API is enabled to build a correlation map, but no
+  // events are collected.
+  uint64_t max_callback_api_events;
+  // Maximum number of events to collect from activity API; if -1, no limit.
+  uint64_t max_activity_api_events;
+  // Maximum number of annotation strings that we can accommodate.
+  uint64_t max_annotation_strings;
+  // Number of GPUs involved.
+  uint32_t num_gpus;
+};
+
+class AnnotationMap {
+ public:
+  explicit AnnotationMap(uint64_t max_size) : max_size_(max_size) {}
+  void Add(uint32_t correlation_id, const std::string& annotation);
+  absl::string_view LookUp(uint32_t correlation_id);
+
+ private:
+  struct AnnotationMapImpl {
+    // The population/consumption of annotations might happen from multiple
+    // callback/activity api related threads.
+    absl::Mutex mutex;
+    // Annotation tends to be repetitive, use a hash_set to store the strings,
+    // an use the reference to the string in the map.
+    absl::node_hash_set<std::string> annotations;
+    absl::flat_hash_map<uint32_t, absl::string_view> correlation_map;
+  };
+  const uint64_t max_size_;
+  AnnotationMapImpl map_;
+
+ public:
+  // Disable copy and move.
+  AnnotationMap(const AnnotationMap&) = delete;
+  AnnotationMap& operator=(const AnnotationMap&) = delete;
+};
+
+class RocmTraceCollector {
+ public:
+  explicit RocmTraceCollector(const RocmTraceCollectorOptions& options)
+      : options_(options), annotation_map_(options.max_annotation_strings) {}
+  virtual ~RocmTraceCollector() {}
+
+  virtual void AddEvent(RocmTracerEvent&& event) = 0;
+  virtual void OnEventsDropped(const std::string& reason,
+                               uint32_t num_events) = 0;
+  virtual void Flush() = 0;
+
+  AnnotationMap* annotation_map() { return &annotation_map_; }
+
+ protected:
+  RocmTraceCollectorOptions options_;
+
+ private:
+  AnnotationMap annotation_map_;
+
+ public:
+  // Disable copy and move.
+  RocmTraceCollector(const RocmTraceCollector&) = delete;
+  RocmTraceCollector& operator=(const RocmTraceCollector&) = delete;
+};
+
+class RocmTracer;
+
+class RocmApiCallbackImpl {
+ public:
+  RocmApiCallbackImpl(const RocmTracerOptions& options, RocmTracer* tracer,
+                      RocmTraceCollector* collector)
+      : options_(options), tracer_(tracer), collector_(collector) {}
+
+  Status operator()(uint32_t domain, uint32_t cbid, const void* cbdata);
+
+ private:
+  void AddKernelEventUponApiExit(uint32_t cbid, const hip_api_data_t* data);
+  void AddMemcpyEventUponApiExit(uint32_t cbid, const hip_api_data_t* data);
+  void AddMemsetEventUponApiExit(uint32_t cbid, const hip_api_data_t* data);
+  void AddMallocEventUponApiExit(uint32_t cbid, const hip_api_data_t* data);
+  void AddStreamSynchronizeEventUponApiExit(uint32_t cbid,
+                                            const hip_api_data_t* data);
+  void AddGenericEventUponApiExit(uint32_t cbid, const hip_api_data_t* data);
+
+  RocmTracerOptions options_;
+  RocmTracer* tracer_ = nullptr;
+  RocmTraceCollector* collector_ = nullptr;
+};
+
+class RocmActivityCallbackImpl {
+ public:
+  RocmActivityCallbackImpl(const RocmTracerOptions& options, RocmTracer* tracer,
+                           RocmTraceCollector* collector)
+      : options_(options), tracer_(tracer), collector_(collector) {}
+
+  Status operator()(const char* begin, const char* end);
+
+ private:
+  void AddHipKernelActivityEvent(const roctracer_record_t* record);
+  void AddHipMemcpyActivityEvent(const roctracer_record_t* record);
+  void AddHipMemsetActivityEvent(const roctracer_record_t* record);
+  void AddHipMallocEvent(const roctracer_record_t* record);
+  void AddHipStreamSynchronizeEvent(const roctracer_record_t* record);
+  void AddHccKernelActivityEvent(const roctracer_record_t* record);
+  void AddHccMemcpyActivityEvent(const roctracer_record_t* record);
+
+  RocmTracerOptions options_;
+  RocmTracer* tracer_ = nullptr;
+  RocmTraceCollector* collector_ = nullptr;
+};
+
+// The class use to enable cupti callback/activity API and forward the collected
+// trace events to RocmTraceCollector. There should be only one RocmTracer
+// per process.
+class RocmTracer {
+ public:
+  // Returns a pointer to singleton RocmTracer.
+  static RocmTracer* GetRocmTracerSingleton();
+
+  // Only one profile session can be live in the same time.
+  bool IsAvailable() const;
+
+  void Enable(const RocmTracerOptions& options, RocmTraceCollector* collector);
+  void Disable();
+
+  void ApiCallbackHandler(uint32_t domain, uint32_t cbid, const void* cbdata);
+  void ActivityCallbackHandler(const char* begin, const char* end);
+
+  static uint64_t GetTimestamp();
+  static int NumGpus();
+
+  void AddToPendingActivityRecords(uint32_t correlation_id) {
+    pending_activity_records_.Add(correlation_id);
+  }
+
+  void RemoveFromPendingActivityRecords(uint32_t correlation_id) {
+    pending_activity_records_.Remove(correlation_id);
+  }
+
+  void ClearPendingActivityRecordsCount() { pending_activity_records_.Clear(); }
+
+  size_t GetPendingActivityRecordsCount() {
+    return pending_activity_records_.Count();
+  }
+
+ protected:
+  // protected constructor for injecting mock cupti interface for testing.
+  explicit RocmTracer() : num_gpus_(NumGpus()) {}
+
+ private:
+  Status EnableApiTracing();
+  Status DisableApiTracing();
+
+  Status EnableActivityTracing();
+  Status DisableActivityTracing();
+
+  int num_gpus_;
+  absl::optional<RocmTracerOptions> options_;
+  RocmTraceCollector* collector_ = nullptr;
+
+  bool api_tracing_enabled_ = false;
+  bool activity_tracing_enabled_ = false;
+
+  RocmApiCallbackImpl* api_cb_impl_;
+  RocmActivityCallbackImpl* activity_cb_impl_;
+
+  class PendingActivityRecords {
+   public:
+    // add a correlation id to the pending set
+    void Add(uint32_t correlation_id) {
+      absl::MutexLock lock(&mutex);
+      pending_set.insert(correlation_id);
+    }
+    // remove a correlation id from the pending set
+    void Remove(uint32_t correlation_id) {
+      absl::MutexLock lock(&mutex);
+      pending_set.erase(correlation_id);
+    }
+    // clear the pending set
+    void Clear() {
+      absl::MutexLock lock(&mutex);
+      pending_set.clear();
+    }
+    // count the number of correlation ids in the pending set
+    size_t Count() {
+      absl::MutexLock lock(&mutex);
+      return pending_set.size();
+    }
+
+   private:
+    // set of co-relation ids for which the hcc activity record is pending
+    absl::flat_hash_set<uint32_t> pending_set;
+    // the callback which processes the activity records (and consequently
+    // removes items from the pending set) is called in a separate thread
+    // from the one that adds item to the list.
+    absl::Mutex mutex;
+  };
+  PendingActivityRecords pending_activity_records_;
+
+ public:
+  // Disable copy and move.
+  RocmTracer(const RocmTracer&) = delete;
+  RocmTracer& operator=(const RocmTracer&) = delete;
+};
+
+}  // namespace profiler
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_ROCM_TRACER_H_
--- a/tensorflow/core/profiler/utils/xplane_schema.cc
+++ b/tensorflow/core/profiler/utils/xplane_schema.cc
@ -31,6 +31,7 @@ const absl::string_view kGpuPlanePrefix = "/device:GPU:";
 const absl::string_view kTpuPlanePrefix = "/device:TPU:";
 const absl::string_view kTpuRuntimePlaneName = "/host:TPU-runtime";
 const absl::string_view kCuptiDriverApiPlaneName = "/host:CUPTI";
+const absl::string_view kRoctracerApiPlaneName = "/host:ROCTRACER";
 const absl::string_view kMetadataPlaneName = "/host:metadata";
 const absl::string_view kTFStreamzPlaneName = "/host:tfstreamz";
 const absl::string_view kPythonTracerPlaneName = "/host:python-tracer";
--- a/tensorflow/core/profiler/utils/xplane_schema.h
+++ b/tensorflow/core/profiler/utils/xplane_schema.h
@ -37,6 +37,8 @@ TF_CONST_INIT extern const absl::string_view kTpuPlanePrefix;
 TF_CONST_INIT extern const absl::string_view kTpuRuntimePlaneName;
 // Name of XPlane that contains CUPTI driver API generated events.
 TF_CONST_INIT extern const absl::string_view kCuptiDriverApiPlaneName;
+// Name of XPlane that contains Roctracer API generated events.
+TF_CONST_INIT extern const absl::string_view kRoctracerApiPlaneName;
 // Name of XPlane that contains profile metadata such as XLA debug info.
 TF_CONST_INIT extern const absl::string_view kMetadataPlaneName;
 // Name of XPlane that contains kpi related metrics.
--- a/tensorflow/python/client/timeline.py
+++ b/tensorflow/python/client/timeline.py
@ -26,7 +26,9 @@ import re
 # The timeline target is usually imported as part of BUILD target
 # "platform_test", which includes also includes the "platform"
 # dependency.  This is why the logging import here is okay.
+from tensorflow.python.platform import build_info
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.platform import build_info


 class AllocationMaximum(collections.namedtuple(
@ -448,6 +450,8 @@ class Timeline(object):
    else:
      _, op, inputs = self._parse_op_label(nodestats.timeline_label)
    args = {'name': node_name, 'op': op}
+    if build_info.build_info['is_rocm_build']:
+      args['kernel'] = nodestats.timeline_label.split('@@')[0]
    for i, iname in enumerate(inputs):
      args['input%d' % i] = iname
    self._chrome_trace.emit_region(start, duration, pid, tid, 'Op', op, args)
--- a/tensorflow/python/client/timeline_test.py
+++ b/tensorflow/python/client/timeline_test.py
@ -104,10 +104,7 @@ class TimelineTest(test.TestCase):
    step_stats = run_metadata.step_stats
    devices = [d.device for d in step_stats.dev_stats]
    self.assertTrue('/job:localhost/replica:0/task:0/device:GPU:0' in devices)
-    if not test.is_built_with_rocm():
-      # skip this check for the ROCm platform
-      # stream level tracing is not yet supported on the ROCm platform
-      self.assertTrue('/device:GPU:0/stream:all' in devices)
+    self.assertIn('/device:GPU:0/stream:all', devices)
    tl = timeline.Timeline(step_stats)
    ctf = tl.generate_chrome_trace_format()
    self._validateTrace(ctf)
--- a/tensorflow/python/eager/profiler_test.py
+++ b/tensorflow/python/eager/profiler_test.py
@ -47,8 +47,7 @@ class ProfilerTest(test_util.TensorFlowTestCase):
    profile_pb.ParseFromString(profile_result)
    devices = frozenset(device.name for device in profile_pb.devices.values())
    self.assertIn('/host:CPU', devices)
-    if not test_util.IsBuiltWithROCm() and config.list_physical_devices('GPU'):
-      # device tracing is not yet supported on the ROCm platform
+    if config.list_physical_devices('GPU'):
      self.assertIn('/device:GPU:0', devices)
    events = frozenset(event.name for event in profile_pb.trace_events)
    self.assertIn('three_times_five', events)
--- a/tensorflow/python/profiler/BUILD
+++ b/tensorflow/python/profiler/BUILD
@ -61,7 +61,6 @@ cuda_py_test(
    python_version = "PY3",
    tags = [
        "no_pip",
-        "no_rocm",
    ],
    deps = [
        ":profiler_v2",
--- a/tensorflow/python/profiler/internal/run_metadata_test.py
+++ b/tensorflow/python/profiler/internal/run_metadata_test.py
@ -77,14 +77,13 @@ def _run_model():
    opts['min_bytes'] = 0
    opts['order_by'] = 'name'
    opts['output'] = 'none'
-    _ = sess.run(y,
-                 options=config_pb2.RunOptions(
-                     trace_level=config_pb2.RunOptions.SOFTWARE_TRACE),
-                 run_metadata=run_metadata)
+    _ = sess.run(
+        y,
+        options=config_pb2.RunOptions(
+            trace_level=config_pb2.RunOptions.SOFTWARE_TRACE),
+        run_metadata=run_metadata)
    tfprof_node = model_analyzer.profile(
-        sess.graph,
-        run_meta=run_metadata,
-        options=opts)
+        sess.graph, run_meta=run_metadata, options=opts)

    return tfprof_node, run_metadata

@ -99,17 +98,17 @@ def _run_loop_model():

    sess.run(variables.global_variables_initializer())
    run_meta = config_pb2.RunMetadata()
-    _ = sess.run(x,
-                 options=config_pb2.RunOptions(
-                     trace_level=config_pb2.RunOptions.SOFTWARE_TRACE),
-                 run_metadata=run_meta)
+    _ = sess.run(
+        x,
+        options=config_pb2.RunOptions(
+            trace_level=config_pb2.RunOptions.SOFTWARE_TRACE),
+        run_metadata=run_meta)

    opts = builder.time_and_memory()
    opts['order_by'] = 'name'
    opts['output'] = 'none'

-    tfprof_node = model_analyzer.profile(
-        sess.graph, run_meta, options=opts)
+    tfprof_node = model_analyzer.profile(sess.graph, run_meta, options=opts)
    return tfprof_node, run_meta


@ -136,10 +135,6 @@ class RunMetadataTest(test.TestCase):

    ret = _extract_node(run_meta, 'MatMul')
    self.assertEqual(len(ret['gpu:0']), 1)
-    if not test.is_built_with_rocm():
-      # skip this check for the ROCm platform
-      # stream level tracing is not yet supported on the ROCm platform
-      self.assertEqual(len(ret['gpu:0/stream:all']), 1, '%s' % run_meta)

  @test_util.run_deprecated_v1
  def testAllocationHistory(self):
@ -163,8 +158,8 @@ class RunMetadataTest(test.TestCase):
    # All memory deallocated.
    self.assertEqual(mm_allocs[0].alloc_bytes + mm_allocs[1].alloc_bytes, 0)

-    rand = _extract_node(
-        run_meta, 'random_normal/RandomStandardNormal')['gpu:0'][0]
+    rand = _extract_node(run_meta,
+                         'random_normal/RandomStandardNormal')['gpu:0'][0]
    random_allocs = rand.memory[0].allocation_records
    # random normal must allocated first since matmul depends on it.
    self.assertLess(random_allocs[0].alloc_micros, mm.all_start_micros)
@ -191,17 +186,15 @@ class RunMetadataTest(test.TestCase):
    with ops.device('/cpu:0'):
      tfprof_node, run_meta = _run_loop_model()
      # The while-loop caused a node to appear 4 times in scheduling.
-      ret = _extract_node(run_meta,
-                          'rnn/while/basic_rnn_cell/MatMul')
+      ret = _extract_node(run_meta, 'rnn/while/basic_rnn_cell/MatMul')
      self.assertEqual(len(ret['cpu:0']), 4)

      total_cpu_execs = 0
      for node in ret['cpu:0']:
        total_cpu_execs += node.op_end_rel_micros

-      mm_node = lib.SearchTFProfNode(
-          tfprof_node,
-          'rnn/while/basic_rnn_cell/MatMul')
+      mm_node = lib.SearchTFProfNode(tfprof_node,
+                                     'rnn/while/basic_rnn_cell/MatMul')

      self.assertEqual(mm_node.run_count, 4)
      self.assertEqual(mm_node.cpu_exec_micros, total_cpu_execs)
@ -243,17 +236,13 @@ class RunMetadataTest(test.TestCase):
    with ops.device('/device:GPU:0'):
      _, run_meta = _run_loop_model()
      # The while-loop caused a node to appear 4 times in scheduling.
-      ret = _extract_node(run_meta,
-                          'rnn/while/basic_rnn_cell/MatMul')
+      ret = _extract_node(run_meta, 'rnn/while/basic_rnn_cell/MatMul')
      self.assertEqual(len(ret['gpu:0']), 4, '%s' % run_meta)

      total_cpu_execs = 0
      for node in ret['gpu:0']:
        total_cpu_execs += node.op_end_rel_micros

-      if not test.is_built_with_rocm():
-        # skip this check for the ROCm platform
-        # stream level tracing is not yet supported on the ROCm platform
        self.assertGreaterEqual(
            len(ret['gpu:0/stream:all']), 4, '%s' % run_meta)

--- a/tensorflow/python/profiler/profile_context_test.py
+++ b/tensorflow/python/profiler/profile_context_test.py
@ -40,8 +40,7 @@ class ProfilerContextTest(test.TestCase):
  def testBasics(self):
    ops.reset_default_graph()
    outfile = os.path.join(test.get_temp_dir(), "dump")
-    opts = builder(builder.time_and_memory()
-                  ).with_file_output(outfile).build()
+    opts = builder(builder.time_and_memory()).with_file_output(outfile).build()

    x = lib.BuildFullModel()

@ -65,17 +64,10 @@ class ProfilerContextTest(test.TestCase):

      self.assertEqual(set([15, 50, 100]), set(pctx.get_profiles("op").keys()))

-    with lib.ProfilerFromFile(
-        os.path.join(test.get_temp_dir(), "profile_100")) as profiler:
+    with lib.ProfilerFromFile(os.path.join(test.get_temp_dir(),
+                                           "profile_100")) as profiler:
      profiler.profile_operations(options=opts)
      with gfile.Open(outfile, "r") as f:
-
-        if test.is_built_with_rocm():
-          # The profiler output for ROCm mode, includes an extra warning
-          # related to the lack of stream tracing in ROCm mode.
-          # Need to skip this warning when doing the diff
-          profile_str = "\n".join(profile_str.split("\n")[7:])
-
        self.assertEqual(profile_str, f.read())

  @test_util.run_deprecated_v1
@ -104,8 +96,8 @@ class ProfilerContextTest(test.TestCase):
  def testDisabled(self):
    ops.reset_default_graph()
    x = lib.BuildFullModel()
-    with profile_context.ProfileContext(test.get_temp_dir(),
-                                        enabled=False) as pctx:
+    with profile_context.ProfileContext(
+        test.get_temp_dir(), enabled=False) as pctx:
      with session.Session() as sess:
        self.evaluate(variables.global_variables_initializer())
        for _ in range(10):
--- a/tensorflow/stream_executor/platform/default/dso_loader.cc
+++ b/tensorflow/stream_executor/platform/default/dso_loader.cc
@ -141,6 +141,10 @@ port::StatusOr<void*> GetRocrandDsoHandle() {
  return GetDsoHandle("rocrand", "");
 }

+port::StatusOr<void*> GetRoctracerDsoHandle() {
+  return GetDsoHandle("roctracer64", "");
+}
+
 port::StatusOr<void*> GetHipsparseDsoHandle() {
  return GetDsoHandle("hipsparse", "");
 }
@ -220,6 +224,11 @@ port::StatusOr<void*> GetRocrandDsoHandle() {
  return *result;
 }

+port::StatusOr<void*> GetRoctracerDsoHandle() {
+  static auto result = new auto(DsoLoader::GetRoctracerDsoHandle());
+  return *result;
+}
+
 port::StatusOr<void*> GetHipsparseDsoHandle() {
  static auto result = new auto(DsoLoader::GetHipsparseDsoHandle());
  return *result;
--- a/tensorflow/stream_executor/platform/default/dso_loader.h
+++ b/tensorflow/stream_executor/platform/default/dso_loader.h
@ -51,6 +51,7 @@ port::StatusOr<void*> GetRocblasDsoHandle();
 port::StatusOr<void*> GetMiopenDsoHandle();
 port::StatusOr<void*> GetRocfftDsoHandle();
 port::StatusOr<void*> GetRocrandDsoHandle();
+port::StatusOr<void*> GetRoctracerDsoHandle();
 port::StatusOr<void*> GetHipsparseDsoHandle();
 port::StatusOr<void*> GetHipDsoHandle();

@ -85,6 +86,7 @@ port::StatusOr<void*> GetRocblasDsoHandle();
 port::StatusOr<void*> GetMiopenDsoHandle();
 port::StatusOr<void*> GetRocfftDsoHandle();
 port::StatusOr<void*> GetRocrandDsoHandle();
+port::StatusOr<void*> GetRoctracerDsoHandle();
 port::StatusOr<void*> GetHipsparseDsoHandle();
 port::StatusOr<void*> GetHipDsoHandle();
 }  // namespace CachedDsoLoader
--- a/tensorflow/stream_executor/rocm/BUILD
+++ b/tensorflow/stream_executor/rocm/BUILD
@ -327,6 +327,29 @@ cc_library(
    alwayslink = True,
 )

+cc_library(
+    name = "roctracer_if_static",
+    deps = if_static([
+        "@local_config_rocm//rocm:roctracer",
+    ]),
+)
+
+cc_library(
+    name = "roctracer_wrapper",
+    srcs = if_rocm_is_configured(["roctracer_wrapper.h"]),
+    hdrs = if_rocm_is_configured(["roctracer_wrapper.h"]),
+    deps = if_rocm_is_configured([
+        ":rocm_gpu_executor",
+        ":rocm_platform_id",
+        ":roctracer_if_static",
+        "@local_config_rocm//rocm:rocm_headers",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "//tensorflow/stream_executor/platform:dso_loader",
+    ]),
+    alwayslink = True,
+)
+
 cc_library(
    name = "all_runtime",
    copts = tf_copts(),
--- a/tensorflow/stream_executor/rocm/roctracer_wrapper.h
+++ b/tensorflow/stream_executor/rocm/roctracer_wrapper.h
@ -0,0 +1,88 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This file wraps roctracer API calls with dso loader so that we don't need to
+// have explicit linking to libroctracer. All TF hipsarse API usage should route
+// through this wrapper.
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCTRACER_WRAPPER_H_
+#define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCTRACER_WRAPPER_H_
+
+#include "rocm/include/roctracer/roctracer.h"
+#include "rocm/include/roctracer/roctracer_hcc.h"
+#include "rocm/include/roctracer/roctracer_hip.h"
+#include "tensorflow/stream_executor/lib/env.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
+#include "tensorflow/stream_executor/platform/port.h"
+
+namespace tensorflow {
+namespace wrap {
+
+#ifdef PLATFORM_GOOGLE
+
+#define ROCTRACER_API_WRAPPER(API_NAME)                          \
+  template <typename... Args>                                    \
+  auto API_NAME()(Args... args)->decltype(::API_NAME(args...)) { \
+    return ::API_NAME(args...);                                  \
+  }
+
+#else
+
+#define ROCTRACER_API_WRAPPER(API_NAME)                                       \
+  template <typename... Args>                                                 \
+  auto API_NAME(Args... args)->decltype(::API_NAME(args...)) {                \
+    using FuncPtrT = std::add_pointer<decltype(::API_NAME)>::type;            \
+    static FuncPtrT loaded = []() -> FuncPtrT {                               \
+      static const char* kName = #API_NAME;                                   \
+      void* f;                                                                \
+      auto s = Env::Default()->GetSymbolFromLibrary(                          \
+          stream_executor::internal::CachedDsoLoader::GetRoctracerDsoHandle() \
+              .ValueOrDie(),                                                  \
+          kName, &f);                                                         \
+      CHECK(s.ok()) << "could not find " << kName                             \
+                    << " in roctracer DSO; dlerror: " << s.error_message();   \
+      return reinterpret_cast<FuncPtrT>(f);                                   \
+    }();                                                                      \
+    return loaded(args...);                                                   \
+  }
+
+#endif  // PLATFORM_GOOGLE
+
+#define FOREACH_ROCTRACER_API(DO_FUNC)           \
+  DO_FUNC(roctracer_default_pool_expl)           \
+  DO_FUNC(roctracer_disable_domain_activity)     \
+  DO_FUNC(roctracer_disable_domain_callback)     \
+  DO_FUNC(roctracer_disable_op_activity)         \
+  DO_FUNC(roctracer_disable_op_callback)         \
+  DO_FUNC(roctracer_enable_domain_activity_expl) \
+  DO_FUNC(roctracer_enable_domain_callback)      \
+  DO_FUNC(roctracer_enable_op_activity)          \
+  DO_FUNC(roctracer_enable_op_callback)          \
+  DO_FUNC(roctracer_error_string)                \
+  DO_FUNC(roctracer_flush_activity_expl)         \
+  DO_FUNC(roctracer_get_timestamp)               \
+  DO_FUNC(roctracer_op_string)                   \
+  DO_FUNC(roctracer_open_pool_expl)              \
+  DO_FUNC(roctracer_set_properties)
+
+FOREACH_ROCTRACER_API(ROCTRACER_API_WRAPPER)
+
+#undef FOREACH_ROCTRACER_API
+#undef ROCTRACER_API_WRAPPER
+
+}  // namespace wrap
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCTRACER_WRAPPER_H_
--- a/third_party/gpus/rocm/BUILD.tpl
+++ b/third_party/gpus/rocm/BUILD.tpl
@ -21,6 +21,7 @@ cc_library(
        ".",
        "rocm/include",
        "rocm/include/rocrand",
+        "rocm/include/roctracer",
    ],
    visibility = ["//visibility:public"],
 )
@ -109,6 +110,7 @@ cc_library(
        ":hiprand",
        ":miopen",
        ":hipsparse",
+        ":roctracer",
        ":rocsolver",
    ],
 )
@ -144,6 +146,11 @@ cc_library(
    data = ["rocm/lib/%{hipsparse_lib}"],
 )

+cc_library(
+    name = "roctracer",
+    data = ["rocm/lib/%{roctracer_lib}"],
+)
+
 cc_library(
    name = "rocsolver",
    srcs = ["rocm/lib/%{rocsolver_lib}"],
--- a/third_party/gpus/rocm_configure.bzl
+++ b/third_party/gpus/rocm_configure.bzl
@ -332,6 +332,7 @@ def _find_libs(repository_ctx, rocm_config, bash_bin):
            ("MIOpen", rocm_config.rocm_toolkit_path + "/miopen"),
            ("rccl", rocm_config.rocm_toolkit_path + "/rccl"),
            ("hipsparse", rocm_config.rocm_toolkit_path + "/hipsparse"),
+            ("roctracer64", rocm_config.rocm_toolkit_path + "/roctracer"),
            ("rocsolver", rocm_config.rocm_toolkit_path + "/rocsolver"),
        ]
    ]
@ -458,6 +459,7 @@ def _create_dummy_repository(repository_ctx):
            "%{rocfft_lib}": _lib_name("rocfft"),
            "%{hiprand_lib}": _lib_name("hiprand"),
            "%{hipsparse_lib}": _lib_name("hipsparse"),
+            "%{roctracer_lib}": _lib_name("roctracer64"),
            "%{rocsolver_lib}": _lib_name("rocsolver"),
            "%{copy_rules}": "",
            "%{rocm_headers}": "",
@ -635,6 +637,7 @@ def _create_local_rocm_repository(repository_ctx):
            "%{miopen_lib}": rocm_libs["MIOpen"].file_name,
            "%{rccl_lib}": rocm_libs["rccl"].file_name,
            "%{hipsparse_lib}": rocm_libs["hipsparse"].file_name,
+            "%{roctracer_lib}": rocm_libs["roctracer64"].file_name,
            "%{rocsolver_lib}": rocm_libs["rocsolver"].file_name,
            "%{copy_rules}": "\n".join(copy_rules),
            "%{rocm_headers}": ('":rocm-include",\n' +