mirror of
https://github.com/zebrajr/tensorflow.git
synced 2025-12-06 12:20:11 +01:00
Merge pull request #48645 from ROCmSoftwarePlatform/google_upstream_r25_port_pr_47508
[r2.5 port][ROCm] Port PR 47508 to r2.5
This commit is contained in:
commit
3aba8ab14d
|
|
@ -625,7 +625,10 @@ def tf_protos_grappler():
|
|||
)
|
||||
|
||||
def tf_additional_device_tracer_srcs():
|
||||
return ["device_tracer.cc"]
|
||||
return [
|
||||
"device_tracer_cuda.cc",
|
||||
"device_tracer_rocm.cc",
|
||||
]
|
||||
|
||||
def tf_additional_cupti_utils_cuda_deps():
|
||||
return []
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
|
||||
load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm")
|
||||
load(
|
||||
"//tensorflow:tensorflow.bzl",
|
||||
"tf_copts",
|
||||
|
|
@ -33,6 +34,7 @@ tf_cuda_library(
|
|||
cuda_deps = [
|
||||
"//tensorflow/core/profiler/internal/gpu:cupti_tracer",
|
||||
"//tensorflow/core/profiler/internal/gpu:cupti_wrapper",
|
||||
"//tensorflow/core/profiler/internal/gpu:rocm_tracer",
|
||||
],
|
||||
deps = [
|
||||
":cupti_utils",
|
||||
|
|
@ -138,6 +140,25 @@ tf_cuda_library(
|
|||
],
|
||||
)
|
||||
|
||||
tf_cuda_library(
|
||||
name = "rocm_tracer",
|
||||
srcs = if_rocm(["rocm_tracer.cc"]),
|
||||
hdrs = if_rocm(["rocm_tracer.h"]),
|
||||
copts = tf_profiler_copts() + tf_copts(),
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [
|
||||
"//tensorflow/core:lib",
|
||||
"//tensorflow/core/profiler/internal/cpu:annotation_stack",
|
||||
"//tensorflow/core/profiler/utils:time_utils",
|
||||
"//tensorflow/stream_executor/rocm:roctracer_wrapper",
|
||||
"@com_google_absl//absl/container:fixed_array",
|
||||
"@com_google_absl//absl/container:flat_hash_map",
|
||||
"@com_google_absl//absl/container:node_hash_map",
|
||||
"@com_google_absl//absl/container:node_hash_set",
|
||||
"@com_google_absl//absl/types:optional",
|
||||
],
|
||||
)
|
||||
|
||||
tf_cuda_library(
|
||||
name = "nvtx_utils",
|
||||
srcs = if_cuda(["nvtx_utils.cc"]),
|
||||
|
|
|
|||
833
tensorflow/core/profiler/internal/gpu/device_tracer_rocm.cc
Normal file
833
tensorflow/core/profiler/internal/gpu/device_tracer_rocm.cc
Normal file
|
|
@ -0,0 +1,833 @@
|
|||
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#if TENSORFLOW_USE_ROCM
|
||||
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
|
||||
#include "absl/container/fixed_array.h"
|
||||
#include "absl/container/flat_hash_map.h"
|
||||
#include "absl/container/flat_hash_set.h"
|
||||
#include "absl/strings/str_cat.h"
|
||||
#include "absl/strings/str_format.h"
|
||||
#include "absl/strings/str_join.h"
|
||||
#include "tensorflow/core/framework/step_stats.pb.h"
|
||||
#include "tensorflow/core/lib/core/errors.h"
|
||||
#include "tensorflow/core/platform/abi.h"
|
||||
#include "tensorflow/core/platform/env_time.h"
|
||||
#include "tensorflow/core/platform/macros.h"
|
||||
#include "tensorflow/core/platform/mutex.h"
|
||||
#include "tensorflow/core/platform/thread_annotations.h"
|
||||
#include "tensorflow/core/profiler/internal/cpu/annotation_stack.h"
|
||||
#include "tensorflow/core/profiler/internal/gpu/rocm_tracer.h"
|
||||
#include "tensorflow/core/profiler/lib/profiler_factory.h"
|
||||
#include "tensorflow/core/profiler/lib/profiler_interface.h"
|
||||
#include "tensorflow/core/profiler/utils/parse_annotation.h"
|
||||
#include "tensorflow/core/profiler/utils/xplane_builder.h"
|
||||
#include "tensorflow/core/profiler/utils/xplane_schema.h"
|
||||
#include "tensorflow/core/profiler/utils/xplane_utils.h"
|
||||
#include "tensorflow/core/util/env_var.h"
|
||||
|
||||
namespace tensorflow {
|
||||
namespace profiler {
|
||||
|
||||
namespace {
|
||||
// Set the all XLines of specified XPlane to starting walltime.
|
||||
// Events time in both host and device planes are CUTPI timestamps.
|
||||
// We set initial RocmTracer timestamp as start time for all lines to reflect
|
||||
// this fact. Eventually we change line start time to corresponding
|
||||
// start_walltime_ns to normalize with CPU wall time.
|
||||
static void NormalizeTimeStamps(XPlaneBuilder* plane,
|
||||
uint64_t start_walltime_ns) {
|
||||
plane->ForEachLine([&](tensorflow::profiler::XLineBuilder line) {
|
||||
line.SetTimestampNs(start_walltime_ns);
|
||||
});
|
||||
}
|
||||
|
||||
void GetDeviceCapabilities(int32_t device_ordinal,
|
||||
XPlaneBuilder* device_plane) {
|
||||
// TODO(rocm)
|
||||
}
|
||||
|
||||
bool IsHostEvent(const RocmTracerEvent& event) {
|
||||
// TODO(rocm)
|
||||
// Classify all events as GPU events for now
|
||||
return false;
|
||||
}
|
||||
|
||||
std::string GetDeviceXLineName(
|
||||
int64_t stream_id, absl::flat_hash_set<RocmTracerEventType>& event_types) {
|
||||
std::string line_name = absl::StrCat("Stream #", stream_id);
|
||||
event_types.erase(RocmTracerEventType::Unsupported);
|
||||
if (event_types.empty()) return line_name;
|
||||
std::vector<const char*> type_names;
|
||||
for (const auto event_type : event_types) {
|
||||
type_names.emplace_back(GetRocmTracerEventTypeName(event_type));
|
||||
}
|
||||
return absl::StrCat(line_name, "(", absl::StrJoin(type_names, ","), ")");
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
class RocmTraceCollectorImpl : public profiler::RocmTraceCollector {
|
||||
public:
|
||||
RocmTraceCollectorImpl(const RocmTraceCollectorOptions& options,
|
||||
uint64_t start_walltime_ns, uint64_t start_gputime_ns)
|
||||
: RocmTraceCollector(options),
|
||||
num_callback_events_(0),
|
||||
num_activity_events_(0),
|
||||
start_walltime_ns_(start_walltime_ns),
|
||||
start_gputime_ns_(start_gputime_ns),
|
||||
next_logical_device_id_(0),
|
||||
per_device_collector_(options.num_gpus) {
|
||||
// in the physical -> logical device_id map, add an explicit entry for
|
||||
// RocmTracerEvent::kInvalidDeviceId -> RocmTracerEvent::kInvalidDeviceId
|
||||
// event with this device_id are events for which we were not able to
|
||||
// determine the correct device_id via the API+Activity callbacks
|
||||
// we will special case such events in the Flush routine
|
||||
device_id_map_[RocmTracerEvent::kInvalidDeviceId] =
|
||||
RocmTracerEvent::kInvalidDeviceId;
|
||||
}
|
||||
|
||||
void AddEvent(RocmTracerEvent&& event) override {
|
||||
mutex_lock lock(aggregated_events_mutex_);
|
||||
|
||||
if (event.source == RocmTracerEventSource::ApiCallback) {
|
||||
if (num_callback_events_ > options_.max_callback_api_events) {
|
||||
OnEventsDropped("max callback event capacity reached",
|
||||
event.correlation_id);
|
||||
DumpRocmTracerEvent(event, 0, 0);
|
||||
return;
|
||||
}
|
||||
num_callback_events_++;
|
||||
}
|
||||
if (event.source == RocmTracerEventSource::Activity) {
|
||||
if (num_activity_events_ > options_.max_activity_api_events) {
|
||||
OnEventsDropped("max activity event capacity reached",
|
||||
event.correlation_id);
|
||||
DumpRocmTracerEvent(event, 0, 0);
|
||||
return;
|
||||
}
|
||||
num_activity_events_++;
|
||||
}
|
||||
|
||||
auto iter = aggregated_events_.find(event.correlation_id);
|
||||
if (iter != aggregated_events_.end()) {
|
||||
// event with this correlation id already present
|
||||
// agrregate this event with the existing one
|
||||
switch (event.domain) {
|
||||
case RocmTracerEventDomain::HIP_API:
|
||||
switch (event.source) {
|
||||
case RocmTracerEventSource::ApiCallback:
|
||||
break;
|
||||
case RocmTracerEventSource::Activity:
|
||||
// Use the start/stop time from the HCC_OPS domain
|
||||
// unless this is one of those events for which we do not
|
||||
// receive any HCC activity record callback
|
||||
if (IsEventTypeWithoutHCCActivityRecordCallback(event.type)) {
|
||||
iter->second.start_time_ns = event.start_time_ns;
|
||||
iter->second.end_time_ns = event.end_time_ns;
|
||||
}
|
||||
iter->second.annotation = event.annotation;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case RocmTracerEventDomain::HCC_OPS:
|
||||
switch (event.source) {
|
||||
case RocmTracerEventSource::ApiCallback:
|
||||
break;
|
||||
case RocmTracerEventSource::Activity:
|
||||
iter->second.device_id = event.device_id;
|
||||
iter->second.stream_id = event.stream_id;
|
||||
iter->second.start_time_ns = event.start_time_ns;
|
||||
iter->second.end_time_ns = event.end_time_ns;
|
||||
// Use the annotation from the HIP_API domain
|
||||
// iter->second.annotation = event.annotation;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
switch (event.source) {
|
||||
case RocmTracerEventSource::ApiCallback:
|
||||
aggregated_events_.emplace(event.correlation_id, std::move(event));
|
||||
break;
|
||||
case RocmTracerEventSource::Activity:
|
||||
// you would think that this cannot happen, but it does
|
||||
// This is primarily because the call "roctracer_flush_activity" does
|
||||
// not work as it should. Imagine a sequence where we enable/disable
|
||||
// tracing more than once in a single TF session.
|
||||
// If the "flush" that happens during disable, does not flush out all
|
||||
// the activity records, then they will show up during the subsequent
|
||||
// call to enable, and we will end up here!
|
||||
OnEventsDropped(
|
||||
"Activity event encountered before a corresponding API event",
|
||||
event.correlation_id);
|
||||
DumpRocmTracerEvent(event, 0, 0);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void OnEventsDropped(const std::string& reason,
|
||||
uint32_t correlation_id) override {
|
||||
LOG(INFO) << "RocmTracerEvent dropped (correlation_id=" << correlation_id
|
||||
<< ",) : " << reason << ".";
|
||||
}
|
||||
|
||||
void Flush() override {
|
||||
mutex_lock lock(aggregated_events_mutex_);
|
||||
|
||||
VLOG(3) << "RocmTraceCollector collected " << num_callback_events_
|
||||
<< " callback events, " << num_activity_events_
|
||||
<< " activity events, and aggregated them into "
|
||||
<< aggregated_events_.size() << " events.";
|
||||
|
||||
for (auto& iter : aggregated_events_) {
|
||||
auto& event = iter.second;
|
||||
|
||||
// For some hip API events, we never get a corresponding HCC
|
||||
// activity record callback and hence we currently do not have a way
|
||||
// of associating a valid device_id and stream_id with those events.
|
||||
// For such events, explcitly set those id sto 0 for now
|
||||
if (IsEventTypeWithoutHCCActivityRecordCallback(event.type)) {
|
||||
DumpRocmTracerEvent(event, 0, 0);
|
||||
if (event.device_id == RocmTracerEvent::kInvalidDeviceId) {
|
||||
VLOG(3) << "Explicitly setting device_id to 0 for "
|
||||
"event with correlation_id="
|
||||
<< event.correlation_id << ",";
|
||||
event.device_id = 0;
|
||||
} else {
|
||||
VLOG(3) << "Unexpectedly found a non-default "
|
||||
"device_id for event with correlation_id="
|
||||
<< event.correlation_id << ",";
|
||||
}
|
||||
if (event.stream_id == RocmTracerEvent::kInvalidStreamId) {
|
||||
VLOG(3) << "Explicitly setting stream_id to 0 for "
|
||||
"event with correlation_id="
|
||||
<< event.correlation_id << ",";
|
||||
event.stream_id = 0;
|
||||
} else {
|
||||
VLOG(3) << "Unexpectedly found a non-default "
|
||||
"stream_id for event with correlation_id="
|
||||
<< event.correlation_id << ",";
|
||||
}
|
||||
}
|
||||
|
||||
// determine the logical device id
|
||||
uint32_t physical_id = event.device_id;
|
||||
uint32_t logical_id = options_.num_gpus;
|
||||
auto kv_pair = device_id_map_.find(physical_id);
|
||||
if (kv_pair == device_id_map_.end()) {
|
||||
logical_id = next_logical_device_id_++;
|
||||
VLOG(3) << "Mapping physical device id " << physical_id
|
||||
<< " to logical device id " << logical_id;
|
||||
device_id_map_[physical_id] = logical_id;
|
||||
} else {
|
||||
logical_id = kv_pair->second;
|
||||
}
|
||||
event.device_id = logical_id;
|
||||
|
||||
if (event.device_id >= options_.num_gpus) {
|
||||
OnEventsDropped("logical device id >= num gpus", event.correlation_id);
|
||||
DumpRocmTracerEvent(event, 0, 0);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (event.stream_id == RocmTracerEvent::kInvalidStreamId) {
|
||||
OnEventsDropped("invalid stream id", event.correlation_id);
|
||||
DumpRocmTracerEvent(event, 0, 0);
|
||||
continue;
|
||||
}
|
||||
|
||||
per_device_collector_[logical_id].AddEvent(event);
|
||||
}
|
||||
aggregated_events_.clear();
|
||||
|
||||
for (int i = 0; i < options_.num_gpus; ++i) {
|
||||
per_device_collector_[i].SortByStartTime();
|
||||
}
|
||||
}
|
||||
|
||||
void Export(StepStats* step_stats) {
|
||||
for (int i = 0; i < options_.num_gpus; ++i) {
|
||||
per_device_collector_[i].Export(i, start_walltime_ns_, start_gputime_ns_,
|
||||
step_stats);
|
||||
}
|
||||
}
|
||||
|
||||
void Export(XSpace* space) {
|
||||
uint64_t end_gputime_ns = RocmTracer::GetTimestamp();
|
||||
XPlaneBuilder host_plane(
|
||||
FindOrAddMutablePlaneWithName(space, kRoctracerApiPlaneName));
|
||||
for (int i = 0; i < options_.num_gpus; ++i) {
|
||||
std::string name = GpuPlaneName(i);
|
||||
XPlaneBuilder device_plane(FindOrAddMutablePlaneWithName(space, name));
|
||||
device_plane.SetId(i);
|
||||
per_device_collector_[i].Export(start_walltime_ns_, start_gputime_ns_,
|
||||
end_gputime_ns, &device_plane,
|
||||
&host_plane);
|
||||
GetDeviceCapabilities(i, &device_plane);
|
||||
NormalizeTimeStamps(&device_plane, start_walltime_ns_);
|
||||
}
|
||||
NormalizeTimeStamps(&host_plane, start_walltime_ns_);
|
||||
}
|
||||
|
||||
private:
|
||||
std::atomic<int> num_callback_events_;
|
||||
std::atomic<int> num_activity_events_;
|
||||
uint64_t start_walltime_ns_;
|
||||
uint64_t start_gputime_ns_;
|
||||
|
||||
mutex aggregated_events_mutex_;
|
||||
absl::flat_hash_map<uint32_t, RocmTracerEvent> aggregated_events_
|
||||
TF_GUARDED_BY(aggregated_events_mutex_);
|
||||
|
||||
// We need to create a map of
|
||||
// event.device_id -> index into per_device_collector_ array
|
||||
// The event.device_id returned by the RocmTracer is the physical
|
||||
// device_id and not the logical device_id. Say for example we are
|
||||
// running on a node with 8 GPUs. The expected physical device_id(s)
|
||||
// for those 8 GPUs would be 0,1,2,3,4,5,6,7. On such a node, if we
|
||||
// run a test with HIP_VISIBLE_DEVICES=5, then "options.num_gpus_ == 1",
|
||||
// but the event.device_id field will have 5 in it!
|
||||
// So the event.device_id can be thought of as the physical device id
|
||||
// and the index can be thought of as the logical device id.
|
||||
// We cannot determine the actual phsyical device id logical device id
|
||||
// mapping here, so we determine it empirically
|
||||
std::map<uint32_t, uint32_t> device_id_map_;
|
||||
uint32_t next_logical_device_id_;
|
||||
|
||||
bool IsEventTypeWithoutHCCActivityRecordCallback(RocmTracerEventType type) {
|
||||
switch (type) {
|
||||
case RocmTracerEventType::MemoryAlloc:
|
||||
return true;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
struct PerDeviceCollector {
|
||||
void AddEvent(const RocmTracerEvent& event) {
|
||||
mutex_lock lock(events_mutex);
|
||||
events.emplace_back(event);
|
||||
}
|
||||
|
||||
void SortByStartTime() {
|
||||
mutex_lock lock(events_mutex);
|
||||
std::sort(
|
||||
events.begin(), events.end(),
|
||||
[](const RocmTracerEvent& event1, const RocmTracerEvent& event2) {
|
||||
return event1.start_time_ns < event2.start_time_ns;
|
||||
});
|
||||
}
|
||||
|
||||
void Export(int32_t device_ordinal, uint64_t start_walltime_ns,
|
||||
uint64_t start_gputime_ns, StepStats* step_stats) {
|
||||
mutex_lock lock(events_mutex);
|
||||
absl::flat_hash_map<
|
||||
std::pair<uint64_t /*stream_id*/, RocmTracerEventType>,
|
||||
DeviceStepStats*>
|
||||
per_stream_dev_stats;
|
||||
|
||||
DeviceStepStats* generic_stream_dev_stats = nullptr;
|
||||
DeviceStepStats* all_streams_dev_stats = nullptr;
|
||||
DeviceStepStats* memcpy_dev_stats = nullptr;
|
||||
DeviceStepStats* sync_dev_stats = nullptr;
|
||||
|
||||
for (const RocmTracerEvent& event : events) {
|
||||
DumpRocmTracerEvent(event, start_walltime_ns, start_gputime_ns);
|
||||
|
||||
std::unique_ptr<NodeExecStats> ns(new NodeExecStats);
|
||||
|
||||
ns->set_all_start_micros(
|
||||
(start_walltime_ns + (event.start_time_ns - start_gputime_ns)) /
|
||||
1000);
|
||||
ns->set_op_start_rel_micros(0);
|
||||
uint64_t elapsed_ns = event.end_time_ns - event.start_time_ns;
|
||||
ns->set_op_end_rel_micros(
|
||||
tensorflow::profiler::NanosToMicros(elapsed_ns));
|
||||
ns->set_all_end_rel_micros(
|
||||
tensorflow::profiler::NanosToMicros(elapsed_ns));
|
||||
|
||||
auto annotation_stack = ParseAnnotationStack(event.annotation);
|
||||
std::string kernel_name = port::MaybeAbiDemangle(event.name.c_str());
|
||||
std::string activity_name =
|
||||
!annotation_stack.empty()
|
||||
? std::string(annotation_stack.back().name)
|
||||
: kernel_name;
|
||||
ns->set_node_name(activity_name);
|
||||
|
||||
ns->set_thread_id(event.thread_id);
|
||||
|
||||
switch (event.type) {
|
||||
case RocmTracerEventType::Kernel: {
|
||||
ns->set_timeline_label(absl::StrFormat(
|
||||
"%s regs:%u shm:%u grid:%u,%u,%u block:%u,%u,%u@@%s",
|
||||
kernel_name, event.kernel_info.registers_per_thread,
|
||||
event.kernel_info.static_shared_memory_usage,
|
||||
event.kernel_info.grid_x, event.kernel_info.grid_y,
|
||||
event.kernel_info.grid_z, event.kernel_info.block_x,
|
||||
event.kernel_info.block_y, event.kernel_info.block_z,
|
||||
event.annotation));
|
||||
DeviceStepStats*& stream_dev_stats =
|
||||
per_stream_dev_stats[std::make_pair(event.stream_id,
|
||||
event.type)];
|
||||
if (stream_dev_stats == nullptr) {
|
||||
stream_dev_stats = step_stats->add_dev_stats();
|
||||
stream_dev_stats->set_device(absl::StrCat(
|
||||
"/device:GPU:", device_ordinal, "/stream:", event.stream_id,
|
||||
"<", GetRocmTracerEventTypeName(event.type), ">"));
|
||||
}
|
||||
*stream_dev_stats->add_node_stats() = *ns;
|
||||
if (all_streams_dev_stats == nullptr) {
|
||||
all_streams_dev_stats = step_stats->add_dev_stats();
|
||||
all_streams_dev_stats->set_device(
|
||||
absl::StrCat("/device:GPU:", device_ordinal, "/stream:all"));
|
||||
}
|
||||
all_streams_dev_stats->add_node_stats()->Swap(ns.release());
|
||||
} break;
|
||||
case RocmTracerEventType::MemcpyD2H:
|
||||
case RocmTracerEventType::MemcpyH2D:
|
||||
case RocmTracerEventType::MemcpyD2D:
|
||||
case RocmTracerEventType::MemcpyP2P: {
|
||||
std::string details = absl::StrCat(
|
||||
event.name, " bytes:", event.memcpy_info.num_bytes);
|
||||
if (event.memcpy_info.async) {
|
||||
absl::StrAppend(&details, " async");
|
||||
}
|
||||
if (event.memcpy_info.destination != event.device_id) {
|
||||
absl::StrAppend(&details,
|
||||
" to device:", event.memcpy_info.destination);
|
||||
}
|
||||
ns->set_timeline_label(std::move(details));
|
||||
|
||||
DeviceStepStats*& stream_dev_stats =
|
||||
per_stream_dev_stats[std::make_pair(event.stream_id,
|
||||
event.type)];
|
||||
if (stream_dev_stats == nullptr) {
|
||||
stream_dev_stats = step_stats->add_dev_stats();
|
||||
stream_dev_stats->set_device(absl::StrCat(
|
||||
"/device:GPU:", device_ordinal, "/stream:", event.stream_id,
|
||||
"<", GetRocmTracerEventTypeName(event.type), ">"));
|
||||
}
|
||||
*stream_dev_stats->add_node_stats() = *ns;
|
||||
if (memcpy_dev_stats == nullptr) {
|
||||
memcpy_dev_stats = step_stats->add_dev_stats();
|
||||
memcpy_dev_stats->set_device(
|
||||
absl::StrCat("/device:GPU:", device_ordinal, "/memcpy"));
|
||||
}
|
||||
memcpy_dev_stats->add_node_stats()->Swap(ns.release());
|
||||
} break;
|
||||
case RocmTracerEventType::MemoryAlloc: {
|
||||
std::string details = absl::StrCat(
|
||||
event.name, " bytes:", event.memalloc_info.num_bytes);
|
||||
ns->set_timeline_label(std::move(details));
|
||||
|
||||
DeviceStepStats*& stream_dev_stats =
|
||||
per_stream_dev_stats[std::make_pair(event.stream_id,
|
||||
event.type)];
|
||||
if (stream_dev_stats == nullptr) {
|
||||
stream_dev_stats = step_stats->add_dev_stats();
|
||||
stream_dev_stats->set_device(absl::StrCat(
|
||||
"/device:GPU:", device_ordinal, "/stream:", event.stream_id,
|
||||
"<", GetRocmTracerEventTypeName(event.type), ">"));
|
||||
}
|
||||
*stream_dev_stats->add_node_stats() = *ns;
|
||||
} break;
|
||||
case RocmTracerEventType::StreamSynchronize: {
|
||||
std::string details = event.name;
|
||||
ns->set_timeline_label(std::move(details));
|
||||
|
||||
if (sync_dev_stats == nullptr) {
|
||||
sync_dev_stats = step_stats->add_dev_stats();
|
||||
sync_dev_stats->set_device(
|
||||
absl::StrCat("/device:GPU:", device_ordinal, "/sync"));
|
||||
}
|
||||
sync_dev_stats->add_node_stats()->Swap(ns.release());
|
||||
} break;
|
||||
case RocmTracerEventType::Generic: {
|
||||
std::string details = event.name;
|
||||
ns->set_timeline_label(std::move(details));
|
||||
|
||||
if (generic_stream_dev_stats == nullptr) {
|
||||
generic_stream_dev_stats = step_stats->add_dev_stats();
|
||||
generic_stream_dev_stats->set_device(
|
||||
absl::StrCat("/device:GPU:", device_ordinal, "/stream:"));
|
||||
}
|
||||
generic_stream_dev_stats->add_node_stats()->Swap(ns.release());
|
||||
} break;
|
||||
default:
|
||||
DCHECK(false);
|
||||
break;
|
||||
}
|
||||
}
|
||||
events.clear();
|
||||
}
|
||||
|
||||
void CreateXEvent(const RocmTracerEvent& event, XPlaneBuilder* plane,
|
||||
uint64_t start_gpu_ns, uint64_t end_gpu_ns,
|
||||
XLineBuilder* line) {
|
||||
if (event.start_time_ns < start_gpu_ns ||
|
||||
event.end_time_ns > end_gpu_ns ||
|
||||
event.start_time_ns > event.end_time_ns) {
|
||||
VLOG(2) << "events have abnormal timestamps:" << event.name
|
||||
<< " start time(ns): " << event.start_time_ns
|
||||
<< " end time(ns): " << event.end_time_ns;
|
||||
return;
|
||||
}
|
||||
std::string kernel_name = port::MaybeAbiDemangle(event.name.c_str());
|
||||
if (kernel_name.empty()) {
|
||||
kernel_name = GetRocmTracerEventTypeName(event.type);
|
||||
}
|
||||
XEventMetadata* event_metadata =
|
||||
plane->GetOrCreateEventMetadata(std::move(kernel_name));
|
||||
XEventBuilder xevent = line->AddEvent(*event_metadata);
|
||||
xevent.SetTimestampNs(event.start_time_ns);
|
||||
xevent.SetEndTimestampNs(event.end_time_ns);
|
||||
if (event.correlation_id != RocmTracerEvent::kInvalidCorrelationId) {
|
||||
xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
|
||||
GetStatTypeStr(StatType::kCorrelationId)),
|
||||
event.correlation_id);
|
||||
}
|
||||
if (!event.annotation.empty()) {
|
||||
xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
|
||||
GetStatTypeStr(StatType::kKernelAnnotation)),
|
||||
event.annotation);
|
||||
}
|
||||
switch (event.type) {
|
||||
case RocmTracerEventType::Kernel: {
|
||||
const std::string kernel_details = absl::StrFormat(
|
||||
"regs:%u shm:%u grid:%u,%u,%u block:%u,%u,%u",
|
||||
event.kernel_info.registers_per_thread,
|
||||
event.kernel_info.static_shared_memory_usage,
|
||||
event.kernel_info.grid_x, event.kernel_info.grid_y,
|
||||
event.kernel_info.grid_z, event.kernel_info.block_x,
|
||||
event.kernel_info.block_y, event.kernel_info.block_z);
|
||||
xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
|
||||
GetStatTypeStr(StatType::kKernelDetails)),
|
||||
kernel_details);
|
||||
} break;
|
||||
|
||||
case RocmTracerEventType::MemcpyD2H:
|
||||
case RocmTracerEventType::MemcpyH2D:
|
||||
case RocmTracerEventType::MemcpyD2D:
|
||||
case RocmTracerEventType::MemcpyP2P:
|
||||
case RocmTracerEventType::MemcpyOther: {
|
||||
const auto& memcpy_info = event.memcpy_info;
|
||||
std::string memcpy_details =
|
||||
absl::StrFormat("size:%u dest:%u async:%u", memcpy_info.num_bytes,
|
||||
memcpy_info.destination, memcpy_info.async);
|
||||
xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
|
||||
GetStatTypeStr(StatType::kMemcpyDetails)),
|
||||
memcpy_details);
|
||||
} break;
|
||||
case RocmTracerEventType::MemoryAlloc: {
|
||||
std::string memalloc_details =
|
||||
absl::StrFormat("num_bytes:%u", event.memalloc_info.num_bytes);
|
||||
xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
|
||||
GetStatTypeStr(StatType::kMemallocDetails)),
|
||||
memalloc_details);
|
||||
} break;
|
||||
case RocmTracerEventType::StreamSynchronize: {
|
||||
// TODO(rocm)
|
||||
// Don't yet know what to do here
|
||||
} break;
|
||||
case RocmTracerEventType::Generic: {
|
||||
// TODO(rocm)
|
||||
// Don't yet know what to do here
|
||||
} break;
|
||||
default:
|
||||
DCHECK(false);
|
||||
break;
|
||||
}
|
||||
|
||||
std::vector<Annotation> annotation_stack =
|
||||
ParseAnnotationStack(event.annotation);
|
||||
// If multiple metadata have the same key name, show the values from the
|
||||
// top of the stack (innermost annotation). Concatenate the values from
|
||||
// "hlo_op".
|
||||
absl::flat_hash_set<absl::string_view> key_set;
|
||||
std::vector<absl::string_view> hlo_op_names;
|
||||
for (auto annotation = annotation_stack.rbegin();
|
||||
annotation != annotation_stack.rend(); ++annotation) {
|
||||
for (const Annotation::Metadata& metadata : annotation->metadata) {
|
||||
if (metadata.key == "tf_op") {
|
||||
continue; // ignored, obtained from HLO proto via DebugInfoMap
|
||||
} else if (key_set.insert(metadata.key).second) {
|
||||
xevent.ParseAndAddStatValue(
|
||||
*plane->GetOrCreateStatMetadata(metadata.key), metadata.value);
|
||||
}
|
||||
}
|
||||
}
|
||||
// TODO(profiler): we should get rid of kLevel0, it is based on the
|
||||
// assumption that those op-related ScopedAnnotation are at the very TOP
|
||||
// level.
|
||||
if (!annotation_stack.empty()) {
|
||||
xevent.AddStatValue(
|
||||
*plane->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kLevel0)),
|
||||
annotation_stack.begin()->name);
|
||||
}
|
||||
}
|
||||
|
||||
void Export(uint64_t start_walltime_ns, uint64_t start_gputime_ns,
|
||||
uint64_t end_gputime_ns, XPlaneBuilder* device_plane,
|
||||
XPlaneBuilder* host_plane) {
|
||||
mutex_lock lock(events_mutex);
|
||||
// Tracking event types per line.
|
||||
absl::flat_hash_map<int64, absl::flat_hash_set<RocmTracerEventType>>
|
||||
events_types_per_line;
|
||||
for (const RocmTracerEvent& event : events) {
|
||||
DumpRocmTracerEvent(event, start_walltime_ns, start_gputime_ns);
|
||||
bool is_host_event = IsHostEvent(event);
|
||||
int64_t line_id = is_host_event ? static_cast<int64>(event.thread_id)
|
||||
: event.stream_id;
|
||||
if (line_id == RocmTracerEvent::kInvalidThreadId ||
|
||||
line_id == RocmTracerEvent::kInvalidStreamId)
|
||||
continue;
|
||||
auto* plane = is_host_event ? host_plane : device_plane;
|
||||
XLineBuilder line = plane->GetOrCreateLine(line_id);
|
||||
line.SetTimestampNs(start_gputime_ns);
|
||||
CreateXEvent(event, plane, start_gputime_ns, end_gputime_ns, &line);
|
||||
events_types_per_line[line_id].emplace(event.type);
|
||||
}
|
||||
device_plane->ForEachLine([&](tensorflow::profiler::XLineBuilder line) {
|
||||
line.SetName(
|
||||
GetDeviceXLineName(line.Id(), events_types_per_line[line.Id()]));
|
||||
});
|
||||
events.clear();
|
||||
}
|
||||
|
||||
mutex events_mutex;
|
||||
std::vector<RocmTracerEvent> events TF_GUARDED_BY(events_mutex);
|
||||
};
|
||||
|
||||
absl::FixedArray<PerDeviceCollector> per_device_collector_;
|
||||
};
|
||||
|
||||
// GpuTracer for ROCm GPU.
|
||||
class GpuTracer : public profiler::ProfilerInterface {
|
||||
public:
|
||||
GpuTracer(RocmTracer* rocm_tracer) : rocm_tracer_(rocm_tracer) {
|
||||
LOG(INFO) << "GpuTracer created.";
|
||||
}
|
||||
~GpuTracer() override {}
|
||||
|
||||
// GpuTracer interface:
|
||||
Status Start() override;
|
||||
Status Stop() override;
|
||||
Status CollectData(RunMetadata* run_metadata) override;
|
||||
Status CollectData(XSpace* space) override;
|
||||
|
||||
private:
|
||||
Status DoStart();
|
||||
Status DoStop();
|
||||
Status DoCollectData(StepStats* step_stats);
|
||||
Status DoCollectData(XSpace* space);
|
||||
|
||||
RocmTracerOptions GetRocmTracerOptions();
|
||||
|
||||
RocmTraceCollectorOptions GetRocmTraceCollectorOptions(uint32_t num_gpus);
|
||||
|
||||
enum State {
|
||||
kNotStarted,
|
||||
kStartedOk,
|
||||
kStartedError,
|
||||
kStoppedOk,
|
||||
kStoppedError
|
||||
};
|
||||
State profiling_state_ = State::kNotStarted;
|
||||
|
||||
RocmTracer* rocm_tracer_;
|
||||
std::unique_ptr<RocmTraceCollectorImpl> rocm_trace_collector_;
|
||||
};
|
||||
|
||||
RocmTracerOptions GpuTracer::GetRocmTracerOptions() {
|
||||
RocmTracerOptions options;
|
||||
std::vector<uint32_t> empty_vec;
|
||||
|
||||
// clang formatting does not preserve one entry per line
|
||||
// clang-format off
|
||||
std::vector<uint32_t> hip_api_domain_ops{
|
||||
HIP_API_ID_hipExtModuleLaunchKernel,
|
||||
HIP_API_ID_hipFree,
|
||||
HIP_API_ID_hipHccModuleLaunchKernel,
|
||||
HIP_API_ID_hipLaunchKernel,
|
||||
HIP_API_ID_hipMalloc,
|
||||
HIP_API_ID_hipMemcpyAsync,
|
||||
HIP_API_ID_hipMemcpyDtoD,
|
||||
HIP_API_ID_hipMemcpyDtoDAsync,
|
||||
HIP_API_ID_hipMemcpyDtoH,
|
||||
HIP_API_ID_hipMemcpyDtoHAsync,
|
||||
HIP_API_ID_hipMemcpyHtoD,
|
||||
HIP_API_ID_hipMemcpyHtoDAsync,
|
||||
HIP_API_ID_hipMemsetD32,
|
||||
HIP_API_ID_hipMemsetD32Async,
|
||||
HIP_API_ID_hipMemsetD8,
|
||||
HIP_API_ID_hipMemsetD8Async,
|
||||
HIP_API_ID_hipModuleLaunchKernel,
|
||||
HIP_API_ID_hipStreamSynchronize,
|
||||
};
|
||||
// clang-format on
|
||||
|
||||
options.api_callbacks.emplace(ACTIVITY_DOMAIN_HIP_API, hip_api_domain_ops);
|
||||
// options.api_callbacks.emplace(ACTIVITY_DOMAIN_HIP_API, empty_vec);
|
||||
|
||||
// options.activity_tracing.emplace(ACTIVITY_DOMAIN_HIP_API,
|
||||
// hip_api_domain_ops);
|
||||
options.activity_tracing.emplace(ACTIVITY_DOMAIN_HIP_API, empty_vec);
|
||||
options.activity_tracing.emplace(ACTIVITY_DOMAIN_HCC_OPS, empty_vec);
|
||||
|
||||
return options;
|
||||
}
|
||||
|
||||
RocmTraceCollectorOptions GpuTracer::GetRocmTraceCollectorOptions(
|
||||
uint32_t num_gpus) {
|
||||
RocmTraceCollectorOptions options;
|
||||
options.max_callback_api_events = 2 * 1024 * 1024;
|
||||
options.max_activity_api_events = 2 * 1024 * 1024;
|
||||
options.max_annotation_strings = 1024 * 1024;
|
||||
options.num_gpus = num_gpus;
|
||||
return options;
|
||||
}
|
||||
|
||||
Status GpuTracer::DoStart() {
|
||||
if (!rocm_tracer_->IsAvailable()) {
|
||||
return errors::Unavailable("Another profile session running.");
|
||||
}
|
||||
|
||||
AnnotationStack::Enable(true);
|
||||
|
||||
RocmTraceCollectorOptions trace_collector_options =
|
||||
GetRocmTraceCollectorOptions(rocm_tracer_->NumGpus());
|
||||
uint64_t start_gputime_ns = RocmTracer::GetTimestamp();
|
||||
uint64_t start_walltime_ns = tensorflow::EnvTime::NowNanos();
|
||||
rocm_trace_collector_ = std::make_unique<RocmTraceCollectorImpl>(
|
||||
trace_collector_options, start_walltime_ns, start_gputime_ns);
|
||||
|
||||
RocmTracerOptions tracer_options = GetRocmTracerOptions();
|
||||
rocm_tracer_->Enable(tracer_options, rocm_trace_collector_.get());
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status GpuTracer::Start() {
|
||||
Status status = DoStart();
|
||||
if (status.ok()) {
|
||||
profiling_state_ = State::kStartedOk;
|
||||
return Status::OK();
|
||||
} else {
|
||||
profiling_state_ = State::kStartedError;
|
||||
return status;
|
||||
}
|
||||
}
|
||||
|
||||
Status GpuTracer::DoStop() {
|
||||
rocm_tracer_->Disable();
|
||||
AnnotationStack::Enable(false);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status GpuTracer::Stop() {
|
||||
if (profiling_state_ == State::kStartedOk) {
|
||||
Status status = DoStop();
|
||||
profiling_state_ = status.ok() ? State::kStoppedOk : State::kStoppedError;
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status GpuTracer::DoCollectData(StepStats* step_stats) {
|
||||
if (rocm_trace_collector_) rocm_trace_collector_->Export(step_stats);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status GpuTracer::CollectData(RunMetadata* run_metadata) {
|
||||
switch (profiling_state_) {
|
||||
case State::kNotStarted:
|
||||
VLOG(3) << "No trace data collected, session wasn't started";
|
||||
return Status::OK();
|
||||
case State::kStartedOk:
|
||||
return errors::FailedPrecondition("Cannot collect trace before stopping");
|
||||
case State::kStartedError:
|
||||
LOG(ERROR) << "Cannot collect, roctracer failed to start";
|
||||
return Status::OK();
|
||||
case State::kStoppedError:
|
||||
VLOG(3) << "No trace data collected";
|
||||
return Status::OK();
|
||||
case State::kStoppedOk: {
|
||||
// Input run_metadata is shared by profiler interfaces, we need append.
|
||||
StepStats step_stats;
|
||||
DoCollectData(&step_stats);
|
||||
for (auto& dev_stats : *step_stats.mutable_dev_stats()) {
|
||||
run_metadata->mutable_step_stats()->add_dev_stats()->Swap(&dev_stats);
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
}
|
||||
return errors::Internal("Invalid profiling state: ", profiling_state_);
|
||||
}
|
||||
|
||||
Status GpuTracer::DoCollectData(XSpace* space) {
|
||||
if (rocm_trace_collector_) rocm_trace_collector_->Export(space);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status GpuTracer::CollectData(XSpace* space) {
|
||||
switch (profiling_state_) {
|
||||
case State::kNotStarted:
|
||||
VLOG(3) << "No trace data collected, session wasn't started";
|
||||
return Status::OK();
|
||||
case State::kStartedOk:
|
||||
return errors::FailedPrecondition("Cannot collect trace before stopping");
|
||||
case State::kStartedError:
|
||||
LOG(ERROR) << "Cannot collect, roctracer failed to start";
|
||||
return Status::OK();
|
||||
case State::kStoppedError:
|
||||
VLOG(3) << "No trace data collected";
|
||||
return Status::OK();
|
||||
case State::kStoppedOk: {
|
||||
DoCollectData(space);
|
||||
return Status::OK();
|
||||
}
|
||||
}
|
||||
return errors::Internal("Invalid profiling state: ", profiling_state_);
|
||||
}
|
||||
|
||||
// Not in anonymous namespace for testing purposes.
|
||||
std::unique_ptr<profiler::ProfilerInterface> CreateGpuTracer(
|
||||
const ProfileOptions& options) {
|
||||
if (options.device_type() != ProfileOptions::GPU &&
|
||||
options.device_type() != ProfileOptions::UNSPECIFIED)
|
||||
return nullptr;
|
||||
|
||||
profiler::RocmTracer* rocm_tracer =
|
||||
profiler::RocmTracer::GetRocmTracerSingleton();
|
||||
if (!rocm_tracer->IsAvailable()) return nullptr;
|
||||
|
||||
return absl::make_unique<profiler::GpuTracer>(rocm_tracer);
|
||||
}
|
||||
|
||||
auto register_rocm_gpu_tracer_factory = [] {
|
||||
RegisterProfilerFactory(&CreateGpuTracer);
|
||||
return 0;
|
||||
}();
|
||||
|
||||
} // namespace profiler
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // TENSORFLOW_USE_ROCM
|
||||
|
|
@ -49,7 +49,7 @@ limitations under the License.
|
|||
namespace tensorflow {
|
||||
namespace profiler {
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
extern std::unique_ptr<ProfilerInterface> CreateGpuTracer(
|
||||
const ProfileOptions& options);
|
||||
std::unique_ptr<ProfilerInterface> CreateGpuTracer() {
|
||||
|
|
@ -243,6 +243,54 @@ TEST_F(DeviceTracerTest, RunWithTraceOption) {
|
|||
EXPECT_GE(run_metadata.step_stats().dev_stats_size(), 1);
|
||||
}
|
||||
|
||||
#if TENSORFLOW_USE_ROCM
|
||||
TEST_F(DeviceTracerTest, TraceToXSpace) {
|
||||
auto tracer = CreateGpuTracer();
|
||||
if (!tracer) return;
|
||||
|
||||
Initialize({3, 2, -1, 0});
|
||||
auto session = CreateSession();
|
||||
ASSERT_TRUE(session != nullptr);
|
||||
TF_ASSERT_OK(session->Create(def_));
|
||||
std::vector<std::pair<string, Tensor>> inputs;
|
||||
|
||||
// Request two targets: one fetch output and one non-fetched output.
|
||||
std::vector<string> output_names = {y_ + ":0"};
|
||||
std::vector<string> target_nodes = {y_neg_};
|
||||
std::vector<Tensor> outputs;
|
||||
|
||||
TF_ASSERT_OK(tracer->Start());
|
||||
Status s = session->Run(inputs, output_names, target_nodes, &outputs);
|
||||
TF_ASSERT_OK(s);
|
||||
|
||||
TF_ASSERT_OK(tracer->Stop());
|
||||
XSpace space;
|
||||
TF_ASSERT_OK(tracer->CollectData(&space));
|
||||
// At least one gpu plane and one host plane for launching events.
|
||||
const XPlane* host_plane = FindPlaneWithName(space, kRoctracerApiPlaneName);
|
||||
ASSERT_NE(host_plane, nullptr);
|
||||
|
||||
const XPlane* device_plane =
|
||||
FindPlaneWithName(space, strings::StrCat(kGpuPlanePrefix, 0));
|
||||
ASSERT_NE(device_plane, nullptr); // Check if device plane is serialized.
|
||||
// one for MemcpyH2D, one for MemcpyD2H, two for Matmul (one from Eigen, one
|
||||
// from cudnn), one for memset.
|
||||
EXPECT_EQ(device_plane->event_metadata_size(), 5);
|
||||
// Check if device capacity is serialized.
|
||||
XPlaneVisitor plane = CreateTfXPlaneVisitor(device_plane);
|
||||
|
||||
// Check if the device events timestamps are set.
|
||||
int total_events = 0;
|
||||
plane.ForEachLine([&](const tensorflow::profiler::XLineVisitor& line) {
|
||||
line.ForEachEvent([&](const tensorflow::profiler::XEventVisitor& event) {
|
||||
EXPECT_GT(event.TimestampNs(), 0);
|
||||
EXPECT_GT(event.DurationNs(), 0);
|
||||
++total_events;
|
||||
});
|
||||
});
|
||||
EXPECT_GE(total_events, 5);
|
||||
}
|
||||
#else // TENSORFLOW_USE_ROCM
|
||||
TEST_F(DeviceTracerTest, TraceToXSpace) {
|
||||
auto tracer = CreateGpuTracer();
|
||||
if (!tracer) return;
|
||||
|
|
@ -295,6 +343,7 @@ TEST_F(DeviceTracerTest, TraceToXSpace) {
|
|||
});
|
||||
EXPECT_GE(total_events, 5);
|
||||
}
|
||||
#endif // TENSORFLOW_USE_ROCM
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
TEST_F(DeviceTracerTest, CudaRuntimeResource) {
|
||||
|
|
|
|||
1127
tensorflow/core/profiler/internal/gpu/rocm_tracer.cc
Normal file
1127
tensorflow/core/profiler/internal/gpu/rocm_tracer.cc
Normal file
File diff suppressed because it is too large
Load Diff
355
tensorflow/core/profiler/internal/gpu/rocm_tracer.h
Normal file
355
tensorflow/core/profiler/internal/gpu/rocm_tracer.h
Normal file
|
|
@ -0,0 +1,355 @@
|
|||
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_ROCM_TRACER_H_
|
||||
#define TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_ROCM_TRACER_H_
|
||||
|
||||
#include "absl/container/fixed_array.h"
|
||||
#include "absl/container/flat_hash_map.h"
|
||||
#include "absl/container/flat_hash_set.h"
|
||||
#include "absl/container/node_hash_set.h"
|
||||
#include "absl/types/optional.h"
|
||||
#include "tensorflow/core/lib/core/errors.h"
|
||||
#include "tensorflow/core/lib/core/status.h"
|
||||
#include "tensorflow/core/platform/macros.h"
|
||||
#include "tensorflow/core/platform/types.h"
|
||||
#include "tensorflow/stream_executor/rocm/roctracer_wrapper.h"
|
||||
|
||||
namespace tensorflow {
|
||||
namespace profiler {
|
||||
|
||||
struct MemcpyDetails {
|
||||
// The amount of data copied for memcpy events.
|
||||
size_t num_bytes;
|
||||
// The destination device for peer-2-peer communication (memcpy). The source
|
||||
// device is implicit: its the current device.
|
||||
uint32_t destination;
|
||||
// Whether or not the memcpy is asynchronous.
|
||||
bool async;
|
||||
};
|
||||
|
||||
struct MemsetDetails {
|
||||
// The number of memory elements getting set
|
||||
size_t num_elements;
|
||||
// Whether or not the memset is asynchronous.
|
||||
bool async;
|
||||
};
|
||||
|
||||
struct MemAllocDetails {
|
||||
// The amount of data requested for cudaMalloc events.
|
||||
uint64_t num_bytes;
|
||||
};
|
||||
|
||||
struct KernelDetails {
|
||||
// The number of registers used in this kernel.
|
||||
uint32_t registers_per_thread;
|
||||
// The amount of shared memory space used by a thread block.
|
||||
uint32_t static_shared_memory_usage;
|
||||
// The amount of dynamic memory space used by a thread block.
|
||||
uint32_t dynamic_shared_memory_usage;
|
||||
// X-dimension of a thread block.
|
||||
uint32_t block_x;
|
||||
// Y-dimension of a thread block.
|
||||
uint32_t block_y;
|
||||
// Z-dimension of a thread block.
|
||||
uint32_t block_z;
|
||||
// X-dimension of a grid.
|
||||
uint32_t grid_x;
|
||||
// Y-dimension of a grid.
|
||||
uint32_t grid_y;
|
||||
// Z-dimension of a grid.
|
||||
uint32_t grid_z;
|
||||
};
|
||||
|
||||
enum class RocmTracerEventType {
|
||||
Unsupported = 0,
|
||||
Kernel,
|
||||
MemcpyH2D,
|
||||
MemcpyD2H,
|
||||
MemcpyD2D,
|
||||
MemcpyP2P,
|
||||
MemcpyOther,
|
||||
MemoryAlloc,
|
||||
Memset,
|
||||
StreamSynchronize,
|
||||
Generic,
|
||||
};
|
||||
|
||||
const char* GetRocmTracerEventTypeName(const RocmTracerEventType& type);
|
||||
|
||||
enum class RocmTracerEventSource {
|
||||
ApiCallback = 0,
|
||||
Activity,
|
||||
};
|
||||
|
||||
const char* GetRocmTracerEventSourceName(const RocmTracerEventSource& source);
|
||||
|
||||
enum class RocmTracerEventDomain {
|
||||
HIP_API = 0,
|
||||
HCC_OPS,
|
||||
};
|
||||
|
||||
const char* GetRocmTracerEventDomainName(const RocmTracerEventDomain& domain);
|
||||
|
||||
struct RocmTracerEvent {
|
||||
static constexpr uint32_t kInvalidDeviceId =
|
||||
std::numeric_limits<uint32_t>::max();
|
||||
static constexpr uint32_t kInvalidThreadId =
|
||||
std::numeric_limits<uint32_t>::max();
|
||||
static constexpr uint32_t kInvalidCorrelationId =
|
||||
std::numeric_limits<uint32_t>::max();
|
||||
static constexpr uint64_t kInvalidStreamId =
|
||||
std::numeric_limits<uint64_t>::max();
|
||||
RocmTracerEventType type;
|
||||
RocmTracerEventSource source;
|
||||
RocmTracerEventDomain domain;
|
||||
std::string name;
|
||||
// This points to strings in AnnotationMap, which should outlive the point
|
||||
// where serialization happens.
|
||||
absl::string_view annotation;
|
||||
uint64_t start_time_ns;
|
||||
uint64_t end_time_ns;
|
||||
uint32_t device_id = kInvalidDeviceId;
|
||||
uint32_t correlation_id = kInvalidCorrelationId;
|
||||
uint32_t thread_id = kInvalidThreadId;
|
||||
int64_t stream_id = kInvalidStreamId;
|
||||
union {
|
||||
MemcpyDetails memcpy_info; // If type == Memcpy*
|
||||
MemsetDetails memset_info; // If type == Memset*
|
||||
MemAllocDetails memalloc_info; // If type == MemoryAlloc
|
||||
KernelDetails kernel_info; // If type == Kernel
|
||||
};
|
||||
};
|
||||
|
||||
void DumpRocmTracerEvent(const RocmTracerEvent& event,
|
||||
uint64_t start_walltime_ns, uint64_t start_gputime_ns);
|
||||
|
||||
struct RocmTracerOptions {
|
||||
// map of domain --> ops for which we need to enable the API callbacks
|
||||
// If the ops vector is empty, then enable API callbacks for entire domain
|
||||
absl::flat_hash_map<activity_domain_t, std::vector<uint32_t> > api_callbacks;
|
||||
|
||||
// map of domain --> ops for which we need to enable the Activity records
|
||||
// If the ops vector is empty, then enable Activity records for entire domain
|
||||
absl::flat_hash_map<activity_domain_t, std::vector<uint32_t> >
|
||||
activity_tracing;
|
||||
};
|
||||
|
||||
struct RocmTraceCollectorOptions {
|
||||
// Maximum number of events to collect from callback API; if -1, no limit.
|
||||
// if 0, the callback API is enabled to build a correlation map, but no
|
||||
// events are collected.
|
||||
uint64_t max_callback_api_events;
|
||||
// Maximum number of events to collect from activity API; if -1, no limit.
|
||||
uint64_t max_activity_api_events;
|
||||
// Maximum number of annotation strings that we can accommodate.
|
||||
uint64_t max_annotation_strings;
|
||||
// Number of GPUs involved.
|
||||
uint32_t num_gpus;
|
||||
};
|
||||
|
||||
class AnnotationMap {
|
||||
public:
|
||||
explicit AnnotationMap(uint64_t max_size) : max_size_(max_size) {}
|
||||
void Add(uint32_t correlation_id, const std::string& annotation);
|
||||
absl::string_view LookUp(uint32_t correlation_id);
|
||||
|
||||
private:
|
||||
struct AnnotationMapImpl {
|
||||
// The population/consumption of annotations might happen from multiple
|
||||
// callback/activity api related threads.
|
||||
absl::Mutex mutex;
|
||||
// Annotation tends to be repetitive, use a hash_set to store the strings,
|
||||
// an use the reference to the string in the map.
|
||||
absl::node_hash_set<std::string> annotations;
|
||||
absl::flat_hash_map<uint32_t, absl::string_view> correlation_map;
|
||||
};
|
||||
const uint64_t max_size_;
|
||||
AnnotationMapImpl map_;
|
||||
|
||||
public:
|
||||
// Disable copy and move.
|
||||
AnnotationMap(const AnnotationMap&) = delete;
|
||||
AnnotationMap& operator=(const AnnotationMap&) = delete;
|
||||
};
|
||||
|
||||
class RocmTraceCollector {
|
||||
public:
|
||||
explicit RocmTraceCollector(const RocmTraceCollectorOptions& options)
|
||||
: options_(options), annotation_map_(options.max_annotation_strings) {}
|
||||
virtual ~RocmTraceCollector() {}
|
||||
|
||||
virtual void AddEvent(RocmTracerEvent&& event) = 0;
|
||||
virtual void OnEventsDropped(const std::string& reason,
|
||||
uint32_t num_events) = 0;
|
||||
virtual void Flush() = 0;
|
||||
|
||||
AnnotationMap* annotation_map() { return &annotation_map_; }
|
||||
|
||||
protected:
|
||||
RocmTraceCollectorOptions options_;
|
||||
|
||||
private:
|
||||
AnnotationMap annotation_map_;
|
||||
|
||||
public:
|
||||
// Disable copy and move.
|
||||
RocmTraceCollector(const RocmTraceCollector&) = delete;
|
||||
RocmTraceCollector& operator=(const RocmTraceCollector&) = delete;
|
||||
};
|
||||
|
||||
class RocmTracer;
|
||||
|
||||
class RocmApiCallbackImpl {
|
||||
public:
|
||||
RocmApiCallbackImpl(const RocmTracerOptions& options, RocmTracer* tracer,
|
||||
RocmTraceCollector* collector)
|
||||
: options_(options), tracer_(tracer), collector_(collector) {}
|
||||
|
||||
Status operator()(uint32_t domain, uint32_t cbid, const void* cbdata);
|
||||
|
||||
private:
|
||||
void AddKernelEventUponApiExit(uint32_t cbid, const hip_api_data_t* data);
|
||||
void AddMemcpyEventUponApiExit(uint32_t cbid, const hip_api_data_t* data);
|
||||
void AddMemsetEventUponApiExit(uint32_t cbid, const hip_api_data_t* data);
|
||||
void AddMallocEventUponApiExit(uint32_t cbid, const hip_api_data_t* data);
|
||||
void AddStreamSynchronizeEventUponApiExit(uint32_t cbid,
|
||||
const hip_api_data_t* data);
|
||||
void AddGenericEventUponApiExit(uint32_t cbid, const hip_api_data_t* data);
|
||||
|
||||
RocmTracerOptions options_;
|
||||
RocmTracer* tracer_ = nullptr;
|
||||
RocmTraceCollector* collector_ = nullptr;
|
||||
};
|
||||
|
||||
class RocmActivityCallbackImpl {
|
||||
public:
|
||||
RocmActivityCallbackImpl(const RocmTracerOptions& options, RocmTracer* tracer,
|
||||
RocmTraceCollector* collector)
|
||||
: options_(options), tracer_(tracer), collector_(collector) {}
|
||||
|
||||
Status operator()(const char* begin, const char* end);
|
||||
|
||||
private:
|
||||
void AddHipKernelActivityEvent(const roctracer_record_t* record);
|
||||
void AddHipMemcpyActivityEvent(const roctracer_record_t* record);
|
||||
void AddHipMemsetActivityEvent(const roctracer_record_t* record);
|
||||
void AddHipMallocEvent(const roctracer_record_t* record);
|
||||
void AddHipStreamSynchronizeEvent(const roctracer_record_t* record);
|
||||
void AddHccKernelActivityEvent(const roctracer_record_t* record);
|
||||
void AddHccMemcpyActivityEvent(const roctracer_record_t* record);
|
||||
|
||||
RocmTracerOptions options_;
|
||||
RocmTracer* tracer_ = nullptr;
|
||||
RocmTraceCollector* collector_ = nullptr;
|
||||
};
|
||||
|
||||
// The class use to enable cupti callback/activity API and forward the collected
|
||||
// trace events to RocmTraceCollector. There should be only one RocmTracer
|
||||
// per process.
|
||||
class RocmTracer {
|
||||
public:
|
||||
// Returns a pointer to singleton RocmTracer.
|
||||
static RocmTracer* GetRocmTracerSingleton();
|
||||
|
||||
// Only one profile session can be live in the same time.
|
||||
bool IsAvailable() const;
|
||||
|
||||
void Enable(const RocmTracerOptions& options, RocmTraceCollector* collector);
|
||||
void Disable();
|
||||
|
||||
void ApiCallbackHandler(uint32_t domain, uint32_t cbid, const void* cbdata);
|
||||
void ActivityCallbackHandler(const char* begin, const char* end);
|
||||
|
||||
static uint64_t GetTimestamp();
|
||||
static int NumGpus();
|
||||
|
||||
void AddToPendingActivityRecords(uint32_t correlation_id) {
|
||||
pending_activity_records_.Add(correlation_id);
|
||||
}
|
||||
|
||||
void RemoveFromPendingActivityRecords(uint32_t correlation_id) {
|
||||
pending_activity_records_.Remove(correlation_id);
|
||||
}
|
||||
|
||||
void ClearPendingActivityRecordsCount() { pending_activity_records_.Clear(); }
|
||||
|
||||
size_t GetPendingActivityRecordsCount() {
|
||||
return pending_activity_records_.Count();
|
||||
}
|
||||
|
||||
protected:
|
||||
// protected constructor for injecting mock cupti interface for testing.
|
||||
explicit RocmTracer() : num_gpus_(NumGpus()) {}
|
||||
|
||||
private:
|
||||
Status EnableApiTracing();
|
||||
Status DisableApiTracing();
|
||||
|
||||
Status EnableActivityTracing();
|
||||
Status DisableActivityTracing();
|
||||
|
||||
int num_gpus_;
|
||||
absl::optional<RocmTracerOptions> options_;
|
||||
RocmTraceCollector* collector_ = nullptr;
|
||||
|
||||
bool api_tracing_enabled_ = false;
|
||||
bool activity_tracing_enabled_ = false;
|
||||
|
||||
RocmApiCallbackImpl* api_cb_impl_;
|
||||
RocmActivityCallbackImpl* activity_cb_impl_;
|
||||
|
||||
class PendingActivityRecords {
|
||||
public:
|
||||
// add a correlation id to the pending set
|
||||
void Add(uint32_t correlation_id) {
|
||||
absl::MutexLock lock(&mutex);
|
||||
pending_set.insert(correlation_id);
|
||||
}
|
||||
// remove a correlation id from the pending set
|
||||
void Remove(uint32_t correlation_id) {
|
||||
absl::MutexLock lock(&mutex);
|
||||
pending_set.erase(correlation_id);
|
||||
}
|
||||
// clear the pending set
|
||||
void Clear() {
|
||||
absl::MutexLock lock(&mutex);
|
||||
pending_set.clear();
|
||||
}
|
||||
// count the number of correlation ids in the pending set
|
||||
size_t Count() {
|
||||
absl::MutexLock lock(&mutex);
|
||||
return pending_set.size();
|
||||
}
|
||||
|
||||
private:
|
||||
// set of co-relation ids for which the hcc activity record is pending
|
||||
absl::flat_hash_set<uint32_t> pending_set;
|
||||
// the callback which processes the activity records (and consequently
|
||||
// removes items from the pending set) is called in a separate thread
|
||||
// from the one that adds item to the list.
|
||||
absl::Mutex mutex;
|
||||
};
|
||||
PendingActivityRecords pending_activity_records_;
|
||||
|
||||
public:
|
||||
// Disable copy and move.
|
||||
RocmTracer(const RocmTracer&) = delete;
|
||||
RocmTracer& operator=(const RocmTracer&) = delete;
|
||||
};
|
||||
|
||||
} // namespace profiler
|
||||
} // namespace tensorflow
|
||||
#endif // TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_ROCM_TRACER_H_
|
||||
|
|
@ -31,6 +31,7 @@ const absl::string_view kGpuPlanePrefix = "/device:GPU:";
|
|||
const absl::string_view kTpuPlanePrefix = "/device:TPU:";
|
||||
const absl::string_view kTpuRuntimePlaneName = "/host:TPU-runtime";
|
||||
const absl::string_view kCuptiDriverApiPlaneName = "/host:CUPTI";
|
||||
const absl::string_view kRoctracerApiPlaneName = "/host:ROCTRACER";
|
||||
const absl::string_view kMetadataPlaneName = "/host:metadata";
|
||||
const absl::string_view kTFStreamzPlaneName = "/host:tfstreamz";
|
||||
const absl::string_view kPythonTracerPlaneName = "/host:python-tracer";
|
||||
|
|
|
|||
|
|
@ -37,6 +37,8 @@ TF_CONST_INIT extern const absl::string_view kTpuPlanePrefix;
|
|||
TF_CONST_INIT extern const absl::string_view kTpuRuntimePlaneName;
|
||||
// Name of XPlane that contains CUPTI driver API generated events.
|
||||
TF_CONST_INIT extern const absl::string_view kCuptiDriverApiPlaneName;
|
||||
// Name of XPlane that contains Roctracer API generated events.
|
||||
TF_CONST_INIT extern const absl::string_view kRoctracerApiPlaneName;
|
||||
// Name of XPlane that contains profile metadata such as XLA debug info.
|
||||
TF_CONST_INIT extern const absl::string_view kMetadataPlaneName;
|
||||
// Name of XPlane that contains kpi related metrics.
|
||||
|
|
|
|||
|
|
@ -26,7 +26,9 @@ import re
|
|||
# The timeline target is usually imported as part of BUILD target
|
||||
# "platform_test", which includes also includes the "platform"
|
||||
# dependency. This is why the logging import here is okay.
|
||||
from tensorflow.python.platform import build_info
|
||||
from tensorflow.python.platform import tf_logging as logging
|
||||
from tensorflow.python.platform import build_info
|
||||
|
||||
|
||||
class AllocationMaximum(collections.namedtuple(
|
||||
|
|
@ -448,6 +450,8 @@ class Timeline(object):
|
|||
else:
|
||||
_, op, inputs = self._parse_op_label(nodestats.timeline_label)
|
||||
args = {'name': node_name, 'op': op}
|
||||
if build_info.build_info['is_rocm_build']:
|
||||
args['kernel'] = nodestats.timeline_label.split('@@')[0]
|
||||
for i, iname in enumerate(inputs):
|
||||
args['input%d' % i] = iname
|
||||
self._chrome_trace.emit_region(start, duration, pid, tid, 'Op', op, args)
|
||||
|
|
|
|||
|
|
@ -104,10 +104,7 @@ class TimelineTest(test.TestCase):
|
|||
step_stats = run_metadata.step_stats
|
||||
devices = [d.device for d in step_stats.dev_stats]
|
||||
self.assertTrue('/job:localhost/replica:0/task:0/device:GPU:0' in devices)
|
||||
if not test.is_built_with_rocm():
|
||||
# skip this check for the ROCm platform
|
||||
# stream level tracing is not yet supported on the ROCm platform
|
||||
self.assertTrue('/device:GPU:0/stream:all' in devices)
|
||||
self.assertIn('/device:GPU:0/stream:all', devices)
|
||||
tl = timeline.Timeline(step_stats)
|
||||
ctf = tl.generate_chrome_trace_format()
|
||||
self._validateTrace(ctf)
|
||||
|
|
|
|||
|
|
@ -47,8 +47,7 @@ class ProfilerTest(test_util.TensorFlowTestCase):
|
|||
profile_pb.ParseFromString(profile_result)
|
||||
devices = frozenset(device.name for device in profile_pb.devices.values())
|
||||
self.assertIn('/host:CPU', devices)
|
||||
if not test_util.IsBuiltWithROCm() and config.list_physical_devices('GPU'):
|
||||
# device tracing is not yet supported on the ROCm platform
|
||||
if config.list_physical_devices('GPU'):
|
||||
self.assertIn('/device:GPU:0', devices)
|
||||
events = frozenset(event.name for event in profile_pb.trace_events)
|
||||
self.assertIn('three_times_five', events)
|
||||
|
|
|
|||
|
|
@ -61,7 +61,6 @@ cuda_py_test(
|
|||
python_version = "PY3",
|
||||
tags = [
|
||||
"no_pip",
|
||||
"no_rocm",
|
||||
],
|
||||
deps = [
|
||||
":profiler_v2",
|
||||
|
|
|
|||
|
|
@ -77,14 +77,13 @@ def _run_model():
|
|||
opts['min_bytes'] = 0
|
||||
opts['order_by'] = 'name'
|
||||
opts['output'] = 'none'
|
||||
_ = sess.run(y,
|
||||
options=config_pb2.RunOptions(
|
||||
trace_level=config_pb2.RunOptions.SOFTWARE_TRACE),
|
||||
run_metadata=run_metadata)
|
||||
_ = sess.run(
|
||||
y,
|
||||
options=config_pb2.RunOptions(
|
||||
trace_level=config_pb2.RunOptions.SOFTWARE_TRACE),
|
||||
run_metadata=run_metadata)
|
||||
tfprof_node = model_analyzer.profile(
|
||||
sess.graph,
|
||||
run_meta=run_metadata,
|
||||
options=opts)
|
||||
sess.graph, run_meta=run_metadata, options=opts)
|
||||
|
||||
return tfprof_node, run_metadata
|
||||
|
||||
|
|
@ -99,17 +98,17 @@ def _run_loop_model():
|
|||
|
||||
sess.run(variables.global_variables_initializer())
|
||||
run_meta = config_pb2.RunMetadata()
|
||||
_ = sess.run(x,
|
||||
options=config_pb2.RunOptions(
|
||||
trace_level=config_pb2.RunOptions.SOFTWARE_TRACE),
|
||||
run_metadata=run_meta)
|
||||
_ = sess.run(
|
||||
x,
|
||||
options=config_pb2.RunOptions(
|
||||
trace_level=config_pb2.RunOptions.SOFTWARE_TRACE),
|
||||
run_metadata=run_meta)
|
||||
|
||||
opts = builder.time_and_memory()
|
||||
opts['order_by'] = 'name'
|
||||
opts['output'] = 'none'
|
||||
|
||||
tfprof_node = model_analyzer.profile(
|
||||
sess.graph, run_meta, options=opts)
|
||||
tfprof_node = model_analyzer.profile(sess.graph, run_meta, options=opts)
|
||||
return tfprof_node, run_meta
|
||||
|
||||
|
||||
|
|
@ -136,10 +135,6 @@ class RunMetadataTest(test.TestCase):
|
|||
|
||||
ret = _extract_node(run_meta, 'MatMul')
|
||||
self.assertEqual(len(ret['gpu:0']), 1)
|
||||
if not test.is_built_with_rocm():
|
||||
# skip this check for the ROCm platform
|
||||
# stream level tracing is not yet supported on the ROCm platform
|
||||
self.assertEqual(len(ret['gpu:0/stream:all']), 1, '%s' % run_meta)
|
||||
|
||||
@test_util.run_deprecated_v1
|
||||
def testAllocationHistory(self):
|
||||
|
|
@ -163,8 +158,8 @@ class RunMetadataTest(test.TestCase):
|
|||
# All memory deallocated.
|
||||
self.assertEqual(mm_allocs[0].alloc_bytes + mm_allocs[1].alloc_bytes, 0)
|
||||
|
||||
rand = _extract_node(
|
||||
run_meta, 'random_normal/RandomStandardNormal')['gpu:0'][0]
|
||||
rand = _extract_node(run_meta,
|
||||
'random_normal/RandomStandardNormal')['gpu:0'][0]
|
||||
random_allocs = rand.memory[0].allocation_records
|
||||
# random normal must allocated first since matmul depends on it.
|
||||
self.assertLess(random_allocs[0].alloc_micros, mm.all_start_micros)
|
||||
|
|
@ -191,17 +186,15 @@ class RunMetadataTest(test.TestCase):
|
|||
with ops.device('/cpu:0'):
|
||||
tfprof_node, run_meta = _run_loop_model()
|
||||
# The while-loop caused a node to appear 4 times in scheduling.
|
||||
ret = _extract_node(run_meta,
|
||||
'rnn/while/basic_rnn_cell/MatMul')
|
||||
ret = _extract_node(run_meta, 'rnn/while/basic_rnn_cell/MatMul')
|
||||
self.assertEqual(len(ret['cpu:0']), 4)
|
||||
|
||||
total_cpu_execs = 0
|
||||
for node in ret['cpu:0']:
|
||||
total_cpu_execs += node.op_end_rel_micros
|
||||
|
||||
mm_node = lib.SearchTFProfNode(
|
||||
tfprof_node,
|
||||
'rnn/while/basic_rnn_cell/MatMul')
|
||||
mm_node = lib.SearchTFProfNode(tfprof_node,
|
||||
'rnn/while/basic_rnn_cell/MatMul')
|
||||
|
||||
self.assertEqual(mm_node.run_count, 4)
|
||||
self.assertEqual(mm_node.cpu_exec_micros, total_cpu_execs)
|
||||
|
|
@ -243,17 +236,13 @@ class RunMetadataTest(test.TestCase):
|
|||
with ops.device('/device:GPU:0'):
|
||||
_, run_meta = _run_loop_model()
|
||||
# The while-loop caused a node to appear 4 times in scheduling.
|
||||
ret = _extract_node(run_meta,
|
||||
'rnn/while/basic_rnn_cell/MatMul')
|
||||
ret = _extract_node(run_meta, 'rnn/while/basic_rnn_cell/MatMul')
|
||||
self.assertEqual(len(ret['gpu:0']), 4, '%s' % run_meta)
|
||||
|
||||
total_cpu_execs = 0
|
||||
for node in ret['gpu:0']:
|
||||
total_cpu_execs += node.op_end_rel_micros
|
||||
|
||||
if not test.is_built_with_rocm():
|
||||
# skip this check for the ROCm platform
|
||||
# stream level tracing is not yet supported on the ROCm platform
|
||||
self.assertGreaterEqual(
|
||||
len(ret['gpu:0/stream:all']), 4, '%s' % run_meta)
|
||||
|
||||
|
|
|
|||
|
|
@ -40,8 +40,7 @@ class ProfilerContextTest(test.TestCase):
|
|||
def testBasics(self):
|
||||
ops.reset_default_graph()
|
||||
outfile = os.path.join(test.get_temp_dir(), "dump")
|
||||
opts = builder(builder.time_and_memory()
|
||||
).with_file_output(outfile).build()
|
||||
opts = builder(builder.time_and_memory()).with_file_output(outfile).build()
|
||||
|
||||
x = lib.BuildFullModel()
|
||||
|
||||
|
|
@ -65,17 +64,10 @@ class ProfilerContextTest(test.TestCase):
|
|||
|
||||
self.assertEqual(set([15, 50, 100]), set(pctx.get_profiles("op").keys()))
|
||||
|
||||
with lib.ProfilerFromFile(
|
||||
os.path.join(test.get_temp_dir(), "profile_100")) as profiler:
|
||||
with lib.ProfilerFromFile(os.path.join(test.get_temp_dir(),
|
||||
"profile_100")) as profiler:
|
||||
profiler.profile_operations(options=opts)
|
||||
with gfile.Open(outfile, "r") as f:
|
||||
|
||||
if test.is_built_with_rocm():
|
||||
# The profiler output for ROCm mode, includes an extra warning
|
||||
# related to the lack of stream tracing in ROCm mode.
|
||||
# Need to skip this warning when doing the diff
|
||||
profile_str = "\n".join(profile_str.split("\n")[7:])
|
||||
|
||||
self.assertEqual(profile_str, f.read())
|
||||
|
||||
@test_util.run_deprecated_v1
|
||||
|
|
@ -104,8 +96,8 @@ class ProfilerContextTest(test.TestCase):
|
|||
def testDisabled(self):
|
||||
ops.reset_default_graph()
|
||||
x = lib.BuildFullModel()
|
||||
with profile_context.ProfileContext(test.get_temp_dir(),
|
||||
enabled=False) as pctx:
|
||||
with profile_context.ProfileContext(
|
||||
test.get_temp_dir(), enabled=False) as pctx:
|
||||
with session.Session() as sess:
|
||||
self.evaluate(variables.global_variables_initializer())
|
||||
for _ in range(10):
|
||||
|
|
|
|||
|
|
@ -141,6 +141,10 @@ port::StatusOr<void*> GetRocrandDsoHandle() {
|
|||
return GetDsoHandle("rocrand", "");
|
||||
}
|
||||
|
||||
port::StatusOr<void*> GetRoctracerDsoHandle() {
|
||||
return GetDsoHandle("roctracer64", "");
|
||||
}
|
||||
|
||||
port::StatusOr<void*> GetHipsparseDsoHandle() {
|
||||
return GetDsoHandle("hipsparse", "");
|
||||
}
|
||||
|
|
@ -220,6 +224,11 @@ port::StatusOr<void*> GetRocrandDsoHandle() {
|
|||
return *result;
|
||||
}
|
||||
|
||||
port::StatusOr<void*> GetRoctracerDsoHandle() {
|
||||
static auto result = new auto(DsoLoader::GetRoctracerDsoHandle());
|
||||
return *result;
|
||||
}
|
||||
|
||||
port::StatusOr<void*> GetHipsparseDsoHandle() {
|
||||
static auto result = new auto(DsoLoader::GetHipsparseDsoHandle());
|
||||
return *result;
|
||||
|
|
|
|||
|
|
@ -51,6 +51,7 @@ port::StatusOr<void*> GetRocblasDsoHandle();
|
|||
port::StatusOr<void*> GetMiopenDsoHandle();
|
||||
port::StatusOr<void*> GetRocfftDsoHandle();
|
||||
port::StatusOr<void*> GetRocrandDsoHandle();
|
||||
port::StatusOr<void*> GetRoctracerDsoHandle();
|
||||
port::StatusOr<void*> GetHipsparseDsoHandle();
|
||||
port::StatusOr<void*> GetHipDsoHandle();
|
||||
|
||||
|
|
@ -85,6 +86,7 @@ port::StatusOr<void*> GetRocblasDsoHandle();
|
|||
port::StatusOr<void*> GetMiopenDsoHandle();
|
||||
port::StatusOr<void*> GetRocfftDsoHandle();
|
||||
port::StatusOr<void*> GetRocrandDsoHandle();
|
||||
port::StatusOr<void*> GetRoctracerDsoHandle();
|
||||
port::StatusOr<void*> GetHipsparseDsoHandle();
|
||||
port::StatusOr<void*> GetHipDsoHandle();
|
||||
} // namespace CachedDsoLoader
|
||||
|
|
|
|||
|
|
@ -327,6 +327,29 @@ cc_library(
|
|||
alwayslink = True,
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "roctracer_if_static",
|
||||
deps = if_static([
|
||||
"@local_config_rocm//rocm:roctracer",
|
||||
]),
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "roctracer_wrapper",
|
||||
srcs = if_rocm_is_configured(["roctracer_wrapper.h"]),
|
||||
hdrs = if_rocm_is_configured(["roctracer_wrapper.h"]),
|
||||
deps = if_rocm_is_configured([
|
||||
":rocm_gpu_executor",
|
||||
":rocm_platform_id",
|
||||
":roctracer_if_static",
|
||||
"@local_config_rocm//rocm:rocm_headers",
|
||||
"//tensorflow/stream_executor/lib",
|
||||
"//tensorflow/stream_executor/platform",
|
||||
"//tensorflow/stream_executor/platform:dso_loader",
|
||||
]),
|
||||
alwayslink = True,
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "all_runtime",
|
||||
copts = tf_copts(),
|
||||
|
|
|
|||
88
tensorflow/stream_executor/rocm/roctracer_wrapper.h
Normal file
88
tensorflow/stream_executor/rocm/roctracer_wrapper.h
Normal file
|
|
@ -0,0 +1,88 @@
|
|||
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
// This file wraps roctracer API calls with dso loader so that we don't need to
|
||||
// have explicit linking to libroctracer. All TF hipsarse API usage should route
|
||||
// through this wrapper.
|
||||
|
||||
#ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCTRACER_WRAPPER_H_
|
||||
#define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCTRACER_WRAPPER_H_
|
||||
|
||||
#include "rocm/include/roctracer/roctracer.h"
|
||||
#include "rocm/include/roctracer/roctracer_hcc.h"
|
||||
#include "rocm/include/roctracer/roctracer_hip.h"
|
||||
#include "tensorflow/stream_executor/lib/env.h"
|
||||
#include "tensorflow/stream_executor/platform/dso_loader.h"
|
||||
#include "tensorflow/stream_executor/platform/port.h"
|
||||
|
||||
namespace tensorflow {
|
||||
namespace wrap {
|
||||
|
||||
#ifdef PLATFORM_GOOGLE
|
||||
|
||||
#define ROCTRACER_API_WRAPPER(API_NAME) \
|
||||
template <typename... Args> \
|
||||
auto API_NAME()(Args... args)->decltype(::API_NAME(args...)) { \
|
||||
return ::API_NAME(args...); \
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define ROCTRACER_API_WRAPPER(API_NAME) \
|
||||
template <typename... Args> \
|
||||
auto API_NAME(Args... args)->decltype(::API_NAME(args...)) { \
|
||||
using FuncPtrT = std::add_pointer<decltype(::API_NAME)>::type; \
|
||||
static FuncPtrT loaded = []() -> FuncPtrT { \
|
||||
static const char* kName = #API_NAME; \
|
||||
void* f; \
|
||||
auto s = Env::Default()->GetSymbolFromLibrary( \
|
||||
stream_executor::internal::CachedDsoLoader::GetRoctracerDsoHandle() \
|
||||
.ValueOrDie(), \
|
||||
kName, &f); \
|
||||
CHECK(s.ok()) << "could not find " << kName \
|
||||
<< " in roctracer DSO; dlerror: " << s.error_message(); \
|
||||
return reinterpret_cast<FuncPtrT>(f); \
|
||||
}(); \
|
||||
return loaded(args...); \
|
||||
}
|
||||
|
||||
#endif // PLATFORM_GOOGLE
|
||||
|
||||
#define FOREACH_ROCTRACER_API(DO_FUNC) \
|
||||
DO_FUNC(roctracer_default_pool_expl) \
|
||||
DO_FUNC(roctracer_disable_domain_activity) \
|
||||
DO_FUNC(roctracer_disable_domain_callback) \
|
||||
DO_FUNC(roctracer_disable_op_activity) \
|
||||
DO_FUNC(roctracer_disable_op_callback) \
|
||||
DO_FUNC(roctracer_enable_domain_activity_expl) \
|
||||
DO_FUNC(roctracer_enable_domain_callback) \
|
||||
DO_FUNC(roctracer_enable_op_activity) \
|
||||
DO_FUNC(roctracer_enable_op_callback) \
|
||||
DO_FUNC(roctracer_error_string) \
|
||||
DO_FUNC(roctracer_flush_activity_expl) \
|
||||
DO_FUNC(roctracer_get_timestamp) \
|
||||
DO_FUNC(roctracer_op_string) \
|
||||
DO_FUNC(roctracer_open_pool_expl) \
|
||||
DO_FUNC(roctracer_set_properties)
|
||||
|
||||
FOREACH_ROCTRACER_API(ROCTRACER_API_WRAPPER)
|
||||
|
||||
#undef FOREACH_ROCTRACER_API
|
||||
#undef ROCTRACER_API_WRAPPER
|
||||
|
||||
} // namespace wrap
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCTRACER_WRAPPER_H_
|
||||
7
third_party/gpus/rocm/BUILD.tpl
vendored
7
third_party/gpus/rocm/BUILD.tpl
vendored
|
|
@ -21,6 +21,7 @@ cc_library(
|
|||
".",
|
||||
"rocm/include",
|
||||
"rocm/include/rocrand",
|
||||
"rocm/include/roctracer",
|
||||
],
|
||||
visibility = ["//visibility:public"],
|
||||
)
|
||||
|
|
@ -109,6 +110,7 @@ cc_library(
|
|||
":hiprand",
|
||||
":miopen",
|
||||
":hipsparse",
|
||||
":roctracer",
|
||||
":rocsolver",
|
||||
],
|
||||
)
|
||||
|
|
@ -144,6 +146,11 @@ cc_library(
|
|||
data = ["rocm/lib/%{hipsparse_lib}"],
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "roctracer",
|
||||
data = ["rocm/lib/%{roctracer_lib}"],
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "rocsolver",
|
||||
srcs = ["rocm/lib/%{rocsolver_lib}"],
|
||||
|
|
|
|||
3
third_party/gpus/rocm_configure.bzl
vendored
3
third_party/gpus/rocm_configure.bzl
vendored
|
|
@ -332,6 +332,7 @@ def _find_libs(repository_ctx, rocm_config, bash_bin):
|
|||
("MIOpen", rocm_config.rocm_toolkit_path + "/miopen"),
|
||||
("rccl", rocm_config.rocm_toolkit_path + "/rccl"),
|
||||
("hipsparse", rocm_config.rocm_toolkit_path + "/hipsparse"),
|
||||
("roctracer64", rocm_config.rocm_toolkit_path + "/roctracer"),
|
||||
("rocsolver", rocm_config.rocm_toolkit_path + "/rocsolver"),
|
||||
]
|
||||
]
|
||||
|
|
@ -458,6 +459,7 @@ def _create_dummy_repository(repository_ctx):
|
|||
"%{rocfft_lib}": _lib_name("rocfft"),
|
||||
"%{hiprand_lib}": _lib_name("hiprand"),
|
||||
"%{hipsparse_lib}": _lib_name("hipsparse"),
|
||||
"%{roctracer_lib}": _lib_name("roctracer64"),
|
||||
"%{rocsolver_lib}": _lib_name("rocsolver"),
|
||||
"%{copy_rules}": "",
|
||||
"%{rocm_headers}": "",
|
||||
|
|
@ -635,6 +637,7 @@ def _create_local_rocm_repository(repository_ctx):
|
|||
"%{miopen_lib}": rocm_libs["MIOpen"].file_name,
|
||||
"%{rccl_lib}": rocm_libs["rccl"].file_name,
|
||||
"%{hipsparse_lib}": rocm_libs["hipsparse"].file_name,
|
||||
"%{roctracer_lib}": rocm_libs["roctracer64"].file_name,
|
||||
"%{rocsolver_lib}": rocm_libs["rocsolver"].file_name,
|
||||
"%{copy_rules}": "\n".join(copy_rules),
|
||||
"%{rocm_headers}": ('":rocm-include",\n' +
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user