Merge pull request #48645 from ROCmSoftwarePlatform/google_upstream_r25_port_pr_47508

[r2.5 port][ROCm] Port PR 47508 to r2.5
This commit is contained in:
Mihai Maruseac 2021-04-22 15:25:51 -07:00 committed by GitHub
commit 3aba8ab14d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
21 changed files with 2554 additions and 51 deletions

View File

@ -625,7 +625,10 @@ def tf_protos_grappler():
)
def tf_additional_device_tracer_srcs():
return ["device_tracer.cc"]
return [
"device_tracer_cuda.cc",
"device_tracer_rocm.cc",
]
def tf_additional_cupti_utils_cuda_deps():
return []

View File

@ -1,4 +1,5 @@
load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm")
load(
"//tensorflow:tensorflow.bzl",
"tf_copts",
@ -33,6 +34,7 @@ tf_cuda_library(
cuda_deps = [
"//tensorflow/core/profiler/internal/gpu:cupti_tracer",
"//tensorflow/core/profiler/internal/gpu:cupti_wrapper",
"//tensorflow/core/profiler/internal/gpu:rocm_tracer",
],
deps = [
":cupti_utils",
@ -138,6 +140,25 @@ tf_cuda_library(
],
)
tf_cuda_library(
name = "rocm_tracer",
srcs = if_rocm(["rocm_tracer.cc"]),
hdrs = if_rocm(["rocm_tracer.h"]),
copts = tf_profiler_copts() + tf_copts(),
visibility = ["//visibility:public"],
deps = [
"//tensorflow/core:lib",
"//tensorflow/core/profiler/internal/cpu:annotation_stack",
"//tensorflow/core/profiler/utils:time_utils",
"//tensorflow/stream_executor/rocm:roctracer_wrapper",
"@com_google_absl//absl/container:fixed_array",
"@com_google_absl//absl/container:flat_hash_map",
"@com_google_absl//absl/container:node_hash_map",
"@com_google_absl//absl/container:node_hash_set",
"@com_google_absl//absl/types:optional",
],
)
tf_cuda_library(
name = "nvtx_utils",
srcs = if_cuda(["nvtx_utils.cc"]),

View File

@ -0,0 +1,833 @@
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#if TENSORFLOW_USE_ROCM
#include <memory>
#include <utility>
#include "absl/container/fixed_array.h"
#include "absl/container/flat_hash_map.h"
#include "absl/container/flat_hash_set.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/strings/str_join.h"
#include "tensorflow/core/framework/step_stats.pb.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/platform/abi.h"
#include "tensorflow/core/platform/env_time.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/thread_annotations.h"
#include "tensorflow/core/profiler/internal/cpu/annotation_stack.h"
#include "tensorflow/core/profiler/internal/gpu/rocm_tracer.h"
#include "tensorflow/core/profiler/lib/profiler_factory.h"
#include "tensorflow/core/profiler/lib/profiler_interface.h"
#include "tensorflow/core/profiler/utils/parse_annotation.h"
#include "tensorflow/core/profiler/utils/xplane_builder.h"
#include "tensorflow/core/profiler/utils/xplane_schema.h"
#include "tensorflow/core/profiler/utils/xplane_utils.h"
#include "tensorflow/core/util/env_var.h"
namespace tensorflow {
namespace profiler {
namespace {
// Set the all XLines of specified XPlane to starting walltime.
// Events time in both host and device planes are CUTPI timestamps.
// We set initial RocmTracer timestamp as start time for all lines to reflect
// this fact. Eventually we change line start time to corresponding
// start_walltime_ns to normalize with CPU wall time.
static void NormalizeTimeStamps(XPlaneBuilder* plane,
uint64_t start_walltime_ns) {
plane->ForEachLine([&](tensorflow::profiler::XLineBuilder line) {
line.SetTimestampNs(start_walltime_ns);
});
}
void GetDeviceCapabilities(int32_t device_ordinal,
XPlaneBuilder* device_plane) {
// TODO(rocm)
}
bool IsHostEvent(const RocmTracerEvent& event) {
// TODO(rocm)
// Classify all events as GPU events for now
return false;
}
std::string GetDeviceXLineName(
int64_t stream_id, absl::flat_hash_set<RocmTracerEventType>& event_types) {
std::string line_name = absl::StrCat("Stream #", stream_id);
event_types.erase(RocmTracerEventType::Unsupported);
if (event_types.empty()) return line_name;
std::vector<const char*> type_names;
for (const auto event_type : event_types) {
type_names.emplace_back(GetRocmTracerEventTypeName(event_type));
}
return absl::StrCat(line_name, "(", absl::StrJoin(type_names, ","), ")");
}
} // namespace
class RocmTraceCollectorImpl : public profiler::RocmTraceCollector {
public:
RocmTraceCollectorImpl(const RocmTraceCollectorOptions& options,
uint64_t start_walltime_ns, uint64_t start_gputime_ns)
: RocmTraceCollector(options),
num_callback_events_(0),
num_activity_events_(0),
start_walltime_ns_(start_walltime_ns),
start_gputime_ns_(start_gputime_ns),
next_logical_device_id_(0),
per_device_collector_(options.num_gpus) {
// in the physical -> logical device_id map, add an explicit entry for
// RocmTracerEvent::kInvalidDeviceId -> RocmTracerEvent::kInvalidDeviceId
// event with this device_id are events for which we were not able to
// determine the correct device_id via the API+Activity callbacks
// we will special case such events in the Flush routine
device_id_map_[RocmTracerEvent::kInvalidDeviceId] =
RocmTracerEvent::kInvalidDeviceId;
}
void AddEvent(RocmTracerEvent&& event) override {
mutex_lock lock(aggregated_events_mutex_);
if (event.source == RocmTracerEventSource::ApiCallback) {
if (num_callback_events_ > options_.max_callback_api_events) {
OnEventsDropped("max callback event capacity reached",
event.correlation_id);
DumpRocmTracerEvent(event, 0, 0);
return;
}
num_callback_events_++;
}
if (event.source == RocmTracerEventSource::Activity) {
if (num_activity_events_ > options_.max_activity_api_events) {
OnEventsDropped("max activity event capacity reached",
event.correlation_id);
DumpRocmTracerEvent(event, 0, 0);
return;
}
num_activity_events_++;
}
auto iter = aggregated_events_.find(event.correlation_id);
if (iter != aggregated_events_.end()) {
// event with this correlation id already present
// agrregate this event with the existing one
switch (event.domain) {
case RocmTracerEventDomain::HIP_API:
switch (event.source) {
case RocmTracerEventSource::ApiCallback:
break;
case RocmTracerEventSource::Activity:
// Use the start/stop time from the HCC_OPS domain
// unless this is one of those events for which we do not
// receive any HCC activity record callback
if (IsEventTypeWithoutHCCActivityRecordCallback(event.type)) {
iter->second.start_time_ns = event.start_time_ns;
iter->second.end_time_ns = event.end_time_ns;
}
iter->second.annotation = event.annotation;
break;
}
break;
case RocmTracerEventDomain::HCC_OPS:
switch (event.source) {
case RocmTracerEventSource::ApiCallback:
break;
case RocmTracerEventSource::Activity:
iter->second.device_id = event.device_id;
iter->second.stream_id = event.stream_id;
iter->second.start_time_ns = event.start_time_ns;
iter->second.end_time_ns = event.end_time_ns;
// Use the annotation from the HIP_API domain
// iter->second.annotation = event.annotation;
break;
}
break;
}
} else {
switch (event.source) {
case RocmTracerEventSource::ApiCallback:
aggregated_events_.emplace(event.correlation_id, std::move(event));
break;
case RocmTracerEventSource::Activity:
// you would think that this cannot happen, but it does
// This is primarily because the call "roctracer_flush_activity" does
// not work as it should. Imagine a sequence where we enable/disable
// tracing more than once in a single TF session.
// If the "flush" that happens during disable, does not flush out all
// the activity records, then they will show up during the subsequent
// call to enable, and we will end up here!
OnEventsDropped(
"Activity event encountered before a corresponding API event",
event.correlation_id);
DumpRocmTracerEvent(event, 0, 0);
break;
}
}
}
void OnEventsDropped(const std::string& reason,
uint32_t correlation_id) override {
LOG(INFO) << "RocmTracerEvent dropped (correlation_id=" << correlation_id
<< ",) : " << reason << ".";
}
void Flush() override {
mutex_lock lock(aggregated_events_mutex_);
VLOG(3) << "RocmTraceCollector collected " << num_callback_events_
<< " callback events, " << num_activity_events_
<< " activity events, and aggregated them into "
<< aggregated_events_.size() << " events.";
for (auto& iter : aggregated_events_) {
auto& event = iter.second;
// For some hip API events, we never get a corresponding HCC
// activity record callback and hence we currently do not have a way
// of associating a valid device_id and stream_id with those events.
// For such events, explcitly set those id sto 0 for now
if (IsEventTypeWithoutHCCActivityRecordCallback(event.type)) {
DumpRocmTracerEvent(event, 0, 0);
if (event.device_id == RocmTracerEvent::kInvalidDeviceId) {
VLOG(3) << "Explicitly setting device_id to 0 for "
"event with correlation_id="
<< event.correlation_id << ",";
event.device_id = 0;
} else {
VLOG(3) << "Unexpectedly found a non-default "
"device_id for event with correlation_id="
<< event.correlation_id << ",";
}
if (event.stream_id == RocmTracerEvent::kInvalidStreamId) {
VLOG(3) << "Explicitly setting stream_id to 0 for "
"event with correlation_id="
<< event.correlation_id << ",";
event.stream_id = 0;
} else {
VLOG(3) << "Unexpectedly found a non-default "
"stream_id for event with correlation_id="
<< event.correlation_id << ",";
}
}
// determine the logical device id
uint32_t physical_id = event.device_id;
uint32_t logical_id = options_.num_gpus;
auto kv_pair = device_id_map_.find(physical_id);
if (kv_pair == device_id_map_.end()) {
logical_id = next_logical_device_id_++;
VLOG(3) << "Mapping physical device id " << physical_id
<< " to logical device id " << logical_id;
device_id_map_[physical_id] = logical_id;
} else {
logical_id = kv_pair->second;
}
event.device_id = logical_id;
if (event.device_id >= options_.num_gpus) {
OnEventsDropped("logical device id >= num gpus", event.correlation_id);
DumpRocmTracerEvent(event, 0, 0);
continue;
}
if (event.stream_id == RocmTracerEvent::kInvalidStreamId) {
OnEventsDropped("invalid stream id", event.correlation_id);
DumpRocmTracerEvent(event, 0, 0);
continue;
}
per_device_collector_[logical_id].AddEvent(event);
}
aggregated_events_.clear();
for (int i = 0; i < options_.num_gpus; ++i) {
per_device_collector_[i].SortByStartTime();
}
}
void Export(StepStats* step_stats) {
for (int i = 0; i < options_.num_gpus; ++i) {
per_device_collector_[i].Export(i, start_walltime_ns_, start_gputime_ns_,
step_stats);
}
}
void Export(XSpace* space) {
uint64_t end_gputime_ns = RocmTracer::GetTimestamp();
XPlaneBuilder host_plane(
FindOrAddMutablePlaneWithName(space, kRoctracerApiPlaneName));
for (int i = 0; i < options_.num_gpus; ++i) {
std::string name = GpuPlaneName(i);
XPlaneBuilder device_plane(FindOrAddMutablePlaneWithName(space, name));
device_plane.SetId(i);
per_device_collector_[i].Export(start_walltime_ns_, start_gputime_ns_,
end_gputime_ns, &device_plane,
&host_plane);
GetDeviceCapabilities(i, &device_plane);
NormalizeTimeStamps(&device_plane, start_walltime_ns_);
}
NormalizeTimeStamps(&host_plane, start_walltime_ns_);
}
private:
std::atomic<int> num_callback_events_;
std::atomic<int> num_activity_events_;
uint64_t start_walltime_ns_;
uint64_t start_gputime_ns_;
mutex aggregated_events_mutex_;
absl::flat_hash_map<uint32_t, RocmTracerEvent> aggregated_events_
TF_GUARDED_BY(aggregated_events_mutex_);
// We need to create a map of
// event.device_id -> index into per_device_collector_ array
// The event.device_id returned by the RocmTracer is the physical
// device_id and not the logical device_id. Say for example we are
// running on a node with 8 GPUs. The expected physical device_id(s)
// for those 8 GPUs would be 0,1,2,3,4,5,6,7. On such a node, if we
// run a test with HIP_VISIBLE_DEVICES=5, then "options.num_gpus_ == 1",
// but the event.device_id field will have 5 in it!
// So the event.device_id can be thought of as the physical device id
// and the index can be thought of as the logical device id.
// We cannot determine the actual phsyical device id logical device id
// mapping here, so we determine it empirically
std::map<uint32_t, uint32_t> device_id_map_;
uint32_t next_logical_device_id_;
bool IsEventTypeWithoutHCCActivityRecordCallback(RocmTracerEventType type) {
switch (type) {
case RocmTracerEventType::MemoryAlloc:
return true;
break;
default:
break;
}
return false;
}
struct PerDeviceCollector {
void AddEvent(const RocmTracerEvent& event) {
mutex_lock lock(events_mutex);
events.emplace_back(event);
}
void SortByStartTime() {
mutex_lock lock(events_mutex);
std::sort(
events.begin(), events.end(),
[](const RocmTracerEvent& event1, const RocmTracerEvent& event2) {
return event1.start_time_ns < event2.start_time_ns;
});
}
void Export(int32_t device_ordinal, uint64_t start_walltime_ns,
uint64_t start_gputime_ns, StepStats* step_stats) {
mutex_lock lock(events_mutex);
absl::flat_hash_map<
std::pair<uint64_t /*stream_id*/, RocmTracerEventType>,
DeviceStepStats*>
per_stream_dev_stats;
DeviceStepStats* generic_stream_dev_stats = nullptr;
DeviceStepStats* all_streams_dev_stats = nullptr;
DeviceStepStats* memcpy_dev_stats = nullptr;
DeviceStepStats* sync_dev_stats = nullptr;
for (const RocmTracerEvent& event : events) {
DumpRocmTracerEvent(event, start_walltime_ns, start_gputime_ns);
std::unique_ptr<NodeExecStats> ns(new NodeExecStats);
ns->set_all_start_micros(
(start_walltime_ns + (event.start_time_ns - start_gputime_ns)) /
1000);
ns->set_op_start_rel_micros(0);
uint64_t elapsed_ns = event.end_time_ns - event.start_time_ns;
ns->set_op_end_rel_micros(
tensorflow::profiler::NanosToMicros(elapsed_ns));
ns->set_all_end_rel_micros(
tensorflow::profiler::NanosToMicros(elapsed_ns));
auto annotation_stack = ParseAnnotationStack(event.annotation);
std::string kernel_name = port::MaybeAbiDemangle(event.name.c_str());
std::string activity_name =
!annotation_stack.empty()
? std::string(annotation_stack.back().name)
: kernel_name;
ns->set_node_name(activity_name);
ns->set_thread_id(event.thread_id);
switch (event.type) {
case RocmTracerEventType::Kernel: {
ns->set_timeline_label(absl::StrFormat(
"%s regs:%u shm:%u grid:%u,%u,%u block:%u,%u,%u@@%s",
kernel_name, event.kernel_info.registers_per_thread,
event.kernel_info.static_shared_memory_usage,
event.kernel_info.grid_x, event.kernel_info.grid_y,
event.kernel_info.grid_z, event.kernel_info.block_x,
event.kernel_info.block_y, event.kernel_info.block_z,
event.annotation));
DeviceStepStats*& stream_dev_stats =
per_stream_dev_stats[std::make_pair(event.stream_id,
event.type)];
if (stream_dev_stats == nullptr) {
stream_dev_stats = step_stats->add_dev_stats();
stream_dev_stats->set_device(absl::StrCat(
"/device:GPU:", device_ordinal, "/stream:", event.stream_id,
"<", GetRocmTracerEventTypeName(event.type), ">"));
}
*stream_dev_stats->add_node_stats() = *ns;
if (all_streams_dev_stats == nullptr) {
all_streams_dev_stats = step_stats->add_dev_stats();
all_streams_dev_stats->set_device(
absl::StrCat("/device:GPU:", device_ordinal, "/stream:all"));
}
all_streams_dev_stats->add_node_stats()->Swap(ns.release());
} break;
case RocmTracerEventType::MemcpyD2H:
case RocmTracerEventType::MemcpyH2D:
case RocmTracerEventType::MemcpyD2D:
case RocmTracerEventType::MemcpyP2P: {
std::string details = absl::StrCat(
event.name, " bytes:", event.memcpy_info.num_bytes);
if (event.memcpy_info.async) {
absl::StrAppend(&details, " async");
}
if (event.memcpy_info.destination != event.device_id) {
absl::StrAppend(&details,
" to device:", event.memcpy_info.destination);
}
ns->set_timeline_label(std::move(details));
DeviceStepStats*& stream_dev_stats =
per_stream_dev_stats[std::make_pair(event.stream_id,
event.type)];
if (stream_dev_stats == nullptr) {
stream_dev_stats = step_stats->add_dev_stats();
stream_dev_stats->set_device(absl::StrCat(
"/device:GPU:", device_ordinal, "/stream:", event.stream_id,
"<", GetRocmTracerEventTypeName(event.type), ">"));
}
*stream_dev_stats->add_node_stats() = *ns;
if (memcpy_dev_stats == nullptr) {
memcpy_dev_stats = step_stats->add_dev_stats();
memcpy_dev_stats->set_device(
absl::StrCat("/device:GPU:", device_ordinal, "/memcpy"));
}
memcpy_dev_stats->add_node_stats()->Swap(ns.release());
} break;
case RocmTracerEventType::MemoryAlloc: {
std::string details = absl::StrCat(
event.name, " bytes:", event.memalloc_info.num_bytes);
ns->set_timeline_label(std::move(details));
DeviceStepStats*& stream_dev_stats =
per_stream_dev_stats[std::make_pair(event.stream_id,
event.type)];
if (stream_dev_stats == nullptr) {
stream_dev_stats = step_stats->add_dev_stats();
stream_dev_stats->set_device(absl::StrCat(
"/device:GPU:", device_ordinal, "/stream:", event.stream_id,
"<", GetRocmTracerEventTypeName(event.type), ">"));
}
*stream_dev_stats->add_node_stats() = *ns;
} break;
case RocmTracerEventType::StreamSynchronize: {
std::string details = event.name;
ns->set_timeline_label(std::move(details));
if (sync_dev_stats == nullptr) {
sync_dev_stats = step_stats->add_dev_stats();
sync_dev_stats->set_device(
absl::StrCat("/device:GPU:", device_ordinal, "/sync"));
}
sync_dev_stats->add_node_stats()->Swap(ns.release());
} break;
case RocmTracerEventType::Generic: {
std::string details = event.name;
ns->set_timeline_label(std::move(details));
if (generic_stream_dev_stats == nullptr) {
generic_stream_dev_stats = step_stats->add_dev_stats();
generic_stream_dev_stats->set_device(
absl::StrCat("/device:GPU:", device_ordinal, "/stream:"));
}
generic_stream_dev_stats->add_node_stats()->Swap(ns.release());
} break;
default:
DCHECK(false);
break;
}
}
events.clear();
}
void CreateXEvent(const RocmTracerEvent& event, XPlaneBuilder* plane,
uint64_t start_gpu_ns, uint64_t end_gpu_ns,
XLineBuilder* line) {
if (event.start_time_ns < start_gpu_ns ||
event.end_time_ns > end_gpu_ns ||
event.start_time_ns > event.end_time_ns) {
VLOG(2) << "events have abnormal timestamps:" << event.name
<< " start time(ns): " << event.start_time_ns
<< " end time(ns): " << event.end_time_ns;
return;
}
std::string kernel_name = port::MaybeAbiDemangle(event.name.c_str());
if (kernel_name.empty()) {
kernel_name = GetRocmTracerEventTypeName(event.type);
}
XEventMetadata* event_metadata =
plane->GetOrCreateEventMetadata(std::move(kernel_name));
XEventBuilder xevent = line->AddEvent(*event_metadata);
xevent.SetTimestampNs(event.start_time_ns);
xevent.SetEndTimestampNs(event.end_time_ns);
if (event.correlation_id != RocmTracerEvent::kInvalidCorrelationId) {
xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
GetStatTypeStr(StatType::kCorrelationId)),
event.correlation_id);
}
if (!event.annotation.empty()) {
xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
GetStatTypeStr(StatType::kKernelAnnotation)),
event.annotation);
}
switch (event.type) {
case RocmTracerEventType::Kernel: {
const std::string kernel_details = absl::StrFormat(
"regs:%u shm:%u grid:%u,%u,%u block:%u,%u,%u",
event.kernel_info.registers_per_thread,
event.kernel_info.static_shared_memory_usage,
event.kernel_info.grid_x, event.kernel_info.grid_y,
event.kernel_info.grid_z, event.kernel_info.block_x,
event.kernel_info.block_y, event.kernel_info.block_z);
xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
GetStatTypeStr(StatType::kKernelDetails)),
kernel_details);
} break;
case RocmTracerEventType::MemcpyD2H:
case RocmTracerEventType::MemcpyH2D:
case RocmTracerEventType::MemcpyD2D:
case RocmTracerEventType::MemcpyP2P:
case RocmTracerEventType::MemcpyOther: {
const auto& memcpy_info = event.memcpy_info;
std::string memcpy_details =
absl::StrFormat("size:%u dest:%u async:%u", memcpy_info.num_bytes,
memcpy_info.destination, memcpy_info.async);
xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
GetStatTypeStr(StatType::kMemcpyDetails)),
memcpy_details);
} break;
case RocmTracerEventType::MemoryAlloc: {
std::string memalloc_details =
absl::StrFormat("num_bytes:%u", event.memalloc_info.num_bytes);
xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
GetStatTypeStr(StatType::kMemallocDetails)),
memalloc_details);
} break;
case RocmTracerEventType::StreamSynchronize: {
// TODO(rocm)
// Don't yet know what to do here
} break;
case RocmTracerEventType::Generic: {
// TODO(rocm)
// Don't yet know what to do here
} break;
default:
DCHECK(false);
break;
}
std::vector<Annotation> annotation_stack =
ParseAnnotationStack(event.annotation);
// If multiple metadata have the same key name, show the values from the
// top of the stack (innermost annotation). Concatenate the values from
// "hlo_op".
absl::flat_hash_set<absl::string_view> key_set;
std::vector<absl::string_view> hlo_op_names;
for (auto annotation = annotation_stack.rbegin();
annotation != annotation_stack.rend(); ++annotation) {
for (const Annotation::Metadata& metadata : annotation->metadata) {
if (metadata.key == "tf_op") {
continue; // ignored, obtained from HLO proto via DebugInfoMap
} else if (key_set.insert(metadata.key).second) {
xevent.ParseAndAddStatValue(
*plane->GetOrCreateStatMetadata(metadata.key), metadata.value);
}
}
}
// TODO(profiler): we should get rid of kLevel0, it is based on the
// assumption that those op-related ScopedAnnotation are at the very TOP
// level.
if (!annotation_stack.empty()) {
xevent.AddStatValue(
*plane->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kLevel0)),
annotation_stack.begin()->name);
}
}
void Export(uint64_t start_walltime_ns, uint64_t start_gputime_ns,
uint64_t end_gputime_ns, XPlaneBuilder* device_plane,
XPlaneBuilder* host_plane) {
mutex_lock lock(events_mutex);
// Tracking event types per line.
absl::flat_hash_map<int64, absl::flat_hash_set<RocmTracerEventType>>
events_types_per_line;
for (const RocmTracerEvent& event : events) {
DumpRocmTracerEvent(event, start_walltime_ns, start_gputime_ns);
bool is_host_event = IsHostEvent(event);
int64_t line_id = is_host_event ? static_cast<int64>(event.thread_id)
: event.stream_id;
if (line_id == RocmTracerEvent::kInvalidThreadId ||
line_id == RocmTracerEvent::kInvalidStreamId)
continue;
auto* plane = is_host_event ? host_plane : device_plane;
XLineBuilder line = plane->GetOrCreateLine(line_id);
line.SetTimestampNs(start_gputime_ns);
CreateXEvent(event, plane, start_gputime_ns, end_gputime_ns, &line);
events_types_per_line[line_id].emplace(event.type);
}
device_plane->ForEachLine([&](tensorflow::profiler::XLineBuilder line) {
line.SetName(
GetDeviceXLineName(line.Id(), events_types_per_line[line.Id()]));
});
events.clear();
}
mutex events_mutex;
std::vector<RocmTracerEvent> events TF_GUARDED_BY(events_mutex);
};
absl::FixedArray<PerDeviceCollector> per_device_collector_;
};
// GpuTracer for ROCm GPU.
class GpuTracer : public profiler::ProfilerInterface {
public:
GpuTracer(RocmTracer* rocm_tracer) : rocm_tracer_(rocm_tracer) {
LOG(INFO) << "GpuTracer created.";
}
~GpuTracer() override {}
// GpuTracer interface:
Status Start() override;
Status Stop() override;
Status CollectData(RunMetadata* run_metadata) override;
Status CollectData(XSpace* space) override;
private:
Status DoStart();
Status DoStop();
Status DoCollectData(StepStats* step_stats);
Status DoCollectData(XSpace* space);
RocmTracerOptions GetRocmTracerOptions();
RocmTraceCollectorOptions GetRocmTraceCollectorOptions(uint32_t num_gpus);
enum State {
kNotStarted,
kStartedOk,
kStartedError,
kStoppedOk,
kStoppedError
};
State profiling_state_ = State::kNotStarted;
RocmTracer* rocm_tracer_;
std::unique_ptr<RocmTraceCollectorImpl> rocm_trace_collector_;
};
RocmTracerOptions GpuTracer::GetRocmTracerOptions() {
RocmTracerOptions options;
std::vector<uint32_t> empty_vec;
// clang formatting does not preserve one entry per line
// clang-format off
std::vector<uint32_t> hip_api_domain_ops{
HIP_API_ID_hipExtModuleLaunchKernel,
HIP_API_ID_hipFree,
HIP_API_ID_hipHccModuleLaunchKernel,
HIP_API_ID_hipLaunchKernel,
HIP_API_ID_hipMalloc,
HIP_API_ID_hipMemcpyAsync,
HIP_API_ID_hipMemcpyDtoD,
HIP_API_ID_hipMemcpyDtoDAsync,
HIP_API_ID_hipMemcpyDtoH,
HIP_API_ID_hipMemcpyDtoHAsync,
HIP_API_ID_hipMemcpyHtoD,
HIP_API_ID_hipMemcpyHtoDAsync,
HIP_API_ID_hipMemsetD32,
HIP_API_ID_hipMemsetD32Async,
HIP_API_ID_hipMemsetD8,
HIP_API_ID_hipMemsetD8Async,
HIP_API_ID_hipModuleLaunchKernel,
HIP_API_ID_hipStreamSynchronize,
};
// clang-format on
options.api_callbacks.emplace(ACTIVITY_DOMAIN_HIP_API, hip_api_domain_ops);
// options.api_callbacks.emplace(ACTIVITY_DOMAIN_HIP_API, empty_vec);
// options.activity_tracing.emplace(ACTIVITY_DOMAIN_HIP_API,
// hip_api_domain_ops);
options.activity_tracing.emplace(ACTIVITY_DOMAIN_HIP_API, empty_vec);
options.activity_tracing.emplace(ACTIVITY_DOMAIN_HCC_OPS, empty_vec);
return options;
}
RocmTraceCollectorOptions GpuTracer::GetRocmTraceCollectorOptions(
uint32_t num_gpus) {
RocmTraceCollectorOptions options;
options.max_callback_api_events = 2 * 1024 * 1024;
options.max_activity_api_events = 2 * 1024 * 1024;
options.max_annotation_strings = 1024 * 1024;
options.num_gpus = num_gpus;
return options;
}
Status GpuTracer::DoStart() {
if (!rocm_tracer_->IsAvailable()) {
return errors::Unavailable("Another profile session running.");
}
AnnotationStack::Enable(true);
RocmTraceCollectorOptions trace_collector_options =
GetRocmTraceCollectorOptions(rocm_tracer_->NumGpus());
uint64_t start_gputime_ns = RocmTracer::GetTimestamp();
uint64_t start_walltime_ns = tensorflow::EnvTime::NowNanos();
rocm_trace_collector_ = std::make_unique<RocmTraceCollectorImpl>(
trace_collector_options, start_walltime_ns, start_gputime_ns);
RocmTracerOptions tracer_options = GetRocmTracerOptions();
rocm_tracer_->Enable(tracer_options, rocm_trace_collector_.get());
return Status::OK();
}
Status GpuTracer::Start() {
Status status = DoStart();
if (status.ok()) {
profiling_state_ = State::kStartedOk;
return Status::OK();
} else {
profiling_state_ = State::kStartedError;
return status;
}
}
Status GpuTracer::DoStop() {
rocm_tracer_->Disable();
AnnotationStack::Enable(false);
return Status::OK();
}
Status GpuTracer::Stop() {
if (profiling_state_ == State::kStartedOk) {
Status status = DoStop();
profiling_state_ = status.ok() ? State::kStoppedOk : State::kStoppedError;
}
return Status::OK();
}
Status GpuTracer::DoCollectData(StepStats* step_stats) {
if (rocm_trace_collector_) rocm_trace_collector_->Export(step_stats);
return Status::OK();
}
Status GpuTracer::CollectData(RunMetadata* run_metadata) {
switch (profiling_state_) {
case State::kNotStarted:
VLOG(3) << "No trace data collected, session wasn't started";
return Status::OK();
case State::kStartedOk:
return errors::FailedPrecondition("Cannot collect trace before stopping");
case State::kStartedError:
LOG(ERROR) << "Cannot collect, roctracer failed to start";
return Status::OK();
case State::kStoppedError:
VLOG(3) << "No trace data collected";
return Status::OK();
case State::kStoppedOk: {
// Input run_metadata is shared by profiler interfaces, we need append.
StepStats step_stats;
DoCollectData(&step_stats);
for (auto& dev_stats : *step_stats.mutable_dev_stats()) {
run_metadata->mutable_step_stats()->add_dev_stats()->Swap(&dev_stats);
}
return Status::OK();
}
}
return errors::Internal("Invalid profiling state: ", profiling_state_);
}
Status GpuTracer::DoCollectData(XSpace* space) {
if (rocm_trace_collector_) rocm_trace_collector_->Export(space);
return Status::OK();
}
Status GpuTracer::CollectData(XSpace* space) {
switch (profiling_state_) {
case State::kNotStarted:
VLOG(3) << "No trace data collected, session wasn't started";
return Status::OK();
case State::kStartedOk:
return errors::FailedPrecondition("Cannot collect trace before stopping");
case State::kStartedError:
LOG(ERROR) << "Cannot collect, roctracer failed to start";
return Status::OK();
case State::kStoppedError:
VLOG(3) << "No trace data collected";
return Status::OK();
case State::kStoppedOk: {
DoCollectData(space);
return Status::OK();
}
}
return errors::Internal("Invalid profiling state: ", profiling_state_);
}
// Not in anonymous namespace for testing purposes.
std::unique_ptr<profiler::ProfilerInterface> CreateGpuTracer(
const ProfileOptions& options) {
if (options.device_type() != ProfileOptions::GPU &&
options.device_type() != ProfileOptions::UNSPECIFIED)
return nullptr;
profiler::RocmTracer* rocm_tracer =
profiler::RocmTracer::GetRocmTracerSingleton();
if (!rocm_tracer->IsAvailable()) return nullptr;
return absl::make_unique<profiler::GpuTracer>(rocm_tracer);
}
auto register_rocm_gpu_tracer_factory = [] {
RegisterProfilerFactory(&CreateGpuTracer);
return 0;
}();
} // namespace profiler
} // namespace tensorflow
#endif // TENSORFLOW_USE_ROCM

View File

@ -49,7 +49,7 @@ limitations under the License.
namespace tensorflow {
namespace profiler {
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
extern std::unique_ptr<ProfilerInterface> CreateGpuTracer(
const ProfileOptions& options);
std::unique_ptr<ProfilerInterface> CreateGpuTracer() {
@ -243,6 +243,54 @@ TEST_F(DeviceTracerTest, RunWithTraceOption) {
EXPECT_GE(run_metadata.step_stats().dev_stats_size(), 1);
}
#if TENSORFLOW_USE_ROCM
TEST_F(DeviceTracerTest, TraceToXSpace) {
auto tracer = CreateGpuTracer();
if (!tracer) return;
Initialize({3, 2, -1, 0});
auto session = CreateSession();
ASSERT_TRUE(session != nullptr);
TF_ASSERT_OK(session->Create(def_));
std::vector<std::pair<string, Tensor>> inputs;
// Request two targets: one fetch output and one non-fetched output.
std::vector<string> output_names = {y_ + ":0"};
std::vector<string> target_nodes = {y_neg_};
std::vector<Tensor> outputs;
TF_ASSERT_OK(tracer->Start());
Status s = session->Run(inputs, output_names, target_nodes, &outputs);
TF_ASSERT_OK(s);
TF_ASSERT_OK(tracer->Stop());
XSpace space;
TF_ASSERT_OK(tracer->CollectData(&space));
// At least one gpu plane and one host plane for launching events.
const XPlane* host_plane = FindPlaneWithName(space, kRoctracerApiPlaneName);
ASSERT_NE(host_plane, nullptr);
const XPlane* device_plane =
FindPlaneWithName(space, strings::StrCat(kGpuPlanePrefix, 0));
ASSERT_NE(device_plane, nullptr); // Check if device plane is serialized.
// one for MemcpyH2D, one for MemcpyD2H, two for Matmul (one from Eigen, one
// from cudnn), one for memset.
EXPECT_EQ(device_plane->event_metadata_size(), 5);
// Check if device capacity is serialized.
XPlaneVisitor plane = CreateTfXPlaneVisitor(device_plane);
// Check if the device events timestamps are set.
int total_events = 0;
plane.ForEachLine([&](const tensorflow::profiler::XLineVisitor& line) {
line.ForEachEvent([&](const tensorflow::profiler::XEventVisitor& event) {
EXPECT_GT(event.TimestampNs(), 0);
EXPECT_GT(event.DurationNs(), 0);
++total_events;
});
});
EXPECT_GE(total_events, 5);
}
#else // TENSORFLOW_USE_ROCM
TEST_F(DeviceTracerTest, TraceToXSpace) {
auto tracer = CreateGpuTracer();
if (!tracer) return;
@ -295,6 +343,7 @@ TEST_F(DeviceTracerTest, TraceToXSpace) {
});
EXPECT_GE(total_events, 5);
}
#endif // TENSORFLOW_USE_ROCM
#if GOOGLE_CUDA
TEST_F(DeviceTracerTest, CudaRuntimeResource) {

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,355 @@
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_ROCM_TRACER_H_
#define TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_ROCM_TRACER_H_
#include "absl/container/fixed_array.h"
#include "absl/container/flat_hash_map.h"
#include "absl/container/flat_hash_set.h"
#include "absl/container/node_hash_set.h"
#include "absl/types/optional.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/stream_executor/rocm/roctracer_wrapper.h"
namespace tensorflow {
namespace profiler {
struct MemcpyDetails {
// The amount of data copied for memcpy events.
size_t num_bytes;
// The destination device for peer-2-peer communication (memcpy). The source
// device is implicit: its the current device.
uint32_t destination;
// Whether or not the memcpy is asynchronous.
bool async;
};
struct MemsetDetails {
// The number of memory elements getting set
size_t num_elements;
// Whether or not the memset is asynchronous.
bool async;
};
struct MemAllocDetails {
// The amount of data requested for cudaMalloc events.
uint64_t num_bytes;
};
struct KernelDetails {
// The number of registers used in this kernel.
uint32_t registers_per_thread;
// The amount of shared memory space used by a thread block.
uint32_t static_shared_memory_usage;
// The amount of dynamic memory space used by a thread block.
uint32_t dynamic_shared_memory_usage;
// X-dimension of a thread block.
uint32_t block_x;
// Y-dimension of a thread block.
uint32_t block_y;
// Z-dimension of a thread block.
uint32_t block_z;
// X-dimension of a grid.
uint32_t grid_x;
// Y-dimension of a grid.
uint32_t grid_y;
// Z-dimension of a grid.
uint32_t grid_z;
};
enum class RocmTracerEventType {
Unsupported = 0,
Kernel,
MemcpyH2D,
MemcpyD2H,
MemcpyD2D,
MemcpyP2P,
MemcpyOther,
MemoryAlloc,
Memset,
StreamSynchronize,
Generic,
};
const char* GetRocmTracerEventTypeName(const RocmTracerEventType& type);
enum class RocmTracerEventSource {
ApiCallback = 0,
Activity,
};
const char* GetRocmTracerEventSourceName(const RocmTracerEventSource& source);
enum class RocmTracerEventDomain {
HIP_API = 0,
HCC_OPS,
};
const char* GetRocmTracerEventDomainName(const RocmTracerEventDomain& domain);
struct RocmTracerEvent {
static constexpr uint32_t kInvalidDeviceId =
std::numeric_limits<uint32_t>::max();
static constexpr uint32_t kInvalidThreadId =
std::numeric_limits<uint32_t>::max();
static constexpr uint32_t kInvalidCorrelationId =
std::numeric_limits<uint32_t>::max();
static constexpr uint64_t kInvalidStreamId =
std::numeric_limits<uint64_t>::max();
RocmTracerEventType type;
RocmTracerEventSource source;
RocmTracerEventDomain domain;
std::string name;
// This points to strings in AnnotationMap, which should outlive the point
// where serialization happens.
absl::string_view annotation;
uint64_t start_time_ns;
uint64_t end_time_ns;
uint32_t device_id = kInvalidDeviceId;
uint32_t correlation_id = kInvalidCorrelationId;
uint32_t thread_id = kInvalidThreadId;
int64_t stream_id = kInvalidStreamId;
union {
MemcpyDetails memcpy_info; // If type == Memcpy*
MemsetDetails memset_info; // If type == Memset*
MemAllocDetails memalloc_info; // If type == MemoryAlloc
KernelDetails kernel_info; // If type == Kernel
};
};
void DumpRocmTracerEvent(const RocmTracerEvent& event,
uint64_t start_walltime_ns, uint64_t start_gputime_ns);
struct RocmTracerOptions {
// map of domain --> ops for which we need to enable the API callbacks
// If the ops vector is empty, then enable API callbacks for entire domain
absl::flat_hash_map<activity_domain_t, std::vector<uint32_t> > api_callbacks;
// map of domain --> ops for which we need to enable the Activity records
// If the ops vector is empty, then enable Activity records for entire domain
absl::flat_hash_map<activity_domain_t, std::vector<uint32_t> >
activity_tracing;
};
struct RocmTraceCollectorOptions {
// Maximum number of events to collect from callback API; if -1, no limit.
// if 0, the callback API is enabled to build a correlation map, but no
// events are collected.
uint64_t max_callback_api_events;
// Maximum number of events to collect from activity API; if -1, no limit.
uint64_t max_activity_api_events;
// Maximum number of annotation strings that we can accommodate.
uint64_t max_annotation_strings;
// Number of GPUs involved.
uint32_t num_gpus;
};
class AnnotationMap {
public:
explicit AnnotationMap(uint64_t max_size) : max_size_(max_size) {}
void Add(uint32_t correlation_id, const std::string& annotation);
absl::string_view LookUp(uint32_t correlation_id);
private:
struct AnnotationMapImpl {
// The population/consumption of annotations might happen from multiple
// callback/activity api related threads.
absl::Mutex mutex;
// Annotation tends to be repetitive, use a hash_set to store the strings,
// an use the reference to the string in the map.
absl::node_hash_set<std::string> annotations;
absl::flat_hash_map<uint32_t, absl::string_view> correlation_map;
};
const uint64_t max_size_;
AnnotationMapImpl map_;
public:
// Disable copy and move.
AnnotationMap(const AnnotationMap&) = delete;
AnnotationMap& operator=(const AnnotationMap&) = delete;
};
class RocmTraceCollector {
public:
explicit RocmTraceCollector(const RocmTraceCollectorOptions& options)
: options_(options), annotation_map_(options.max_annotation_strings) {}
virtual ~RocmTraceCollector() {}
virtual void AddEvent(RocmTracerEvent&& event) = 0;
virtual void OnEventsDropped(const std::string& reason,
uint32_t num_events) = 0;
virtual void Flush() = 0;
AnnotationMap* annotation_map() { return &annotation_map_; }
protected:
RocmTraceCollectorOptions options_;
private:
AnnotationMap annotation_map_;
public:
// Disable copy and move.
RocmTraceCollector(const RocmTraceCollector&) = delete;
RocmTraceCollector& operator=(const RocmTraceCollector&) = delete;
};
class RocmTracer;
class RocmApiCallbackImpl {
public:
RocmApiCallbackImpl(const RocmTracerOptions& options, RocmTracer* tracer,
RocmTraceCollector* collector)
: options_(options), tracer_(tracer), collector_(collector) {}
Status operator()(uint32_t domain, uint32_t cbid, const void* cbdata);
private:
void AddKernelEventUponApiExit(uint32_t cbid, const hip_api_data_t* data);
void AddMemcpyEventUponApiExit(uint32_t cbid, const hip_api_data_t* data);
void AddMemsetEventUponApiExit(uint32_t cbid, const hip_api_data_t* data);
void AddMallocEventUponApiExit(uint32_t cbid, const hip_api_data_t* data);
void AddStreamSynchronizeEventUponApiExit(uint32_t cbid,
const hip_api_data_t* data);
void AddGenericEventUponApiExit(uint32_t cbid, const hip_api_data_t* data);
RocmTracerOptions options_;
RocmTracer* tracer_ = nullptr;
RocmTraceCollector* collector_ = nullptr;
};
class RocmActivityCallbackImpl {
public:
RocmActivityCallbackImpl(const RocmTracerOptions& options, RocmTracer* tracer,
RocmTraceCollector* collector)
: options_(options), tracer_(tracer), collector_(collector) {}
Status operator()(const char* begin, const char* end);
private:
void AddHipKernelActivityEvent(const roctracer_record_t* record);
void AddHipMemcpyActivityEvent(const roctracer_record_t* record);
void AddHipMemsetActivityEvent(const roctracer_record_t* record);
void AddHipMallocEvent(const roctracer_record_t* record);
void AddHipStreamSynchronizeEvent(const roctracer_record_t* record);
void AddHccKernelActivityEvent(const roctracer_record_t* record);
void AddHccMemcpyActivityEvent(const roctracer_record_t* record);
RocmTracerOptions options_;
RocmTracer* tracer_ = nullptr;
RocmTraceCollector* collector_ = nullptr;
};
// The class use to enable cupti callback/activity API and forward the collected
// trace events to RocmTraceCollector. There should be only one RocmTracer
// per process.
class RocmTracer {
public:
// Returns a pointer to singleton RocmTracer.
static RocmTracer* GetRocmTracerSingleton();
// Only one profile session can be live in the same time.
bool IsAvailable() const;
void Enable(const RocmTracerOptions& options, RocmTraceCollector* collector);
void Disable();
void ApiCallbackHandler(uint32_t domain, uint32_t cbid, const void* cbdata);
void ActivityCallbackHandler(const char* begin, const char* end);
static uint64_t GetTimestamp();
static int NumGpus();
void AddToPendingActivityRecords(uint32_t correlation_id) {
pending_activity_records_.Add(correlation_id);
}
void RemoveFromPendingActivityRecords(uint32_t correlation_id) {
pending_activity_records_.Remove(correlation_id);
}
void ClearPendingActivityRecordsCount() { pending_activity_records_.Clear(); }
size_t GetPendingActivityRecordsCount() {
return pending_activity_records_.Count();
}
protected:
// protected constructor for injecting mock cupti interface for testing.
explicit RocmTracer() : num_gpus_(NumGpus()) {}
private:
Status EnableApiTracing();
Status DisableApiTracing();
Status EnableActivityTracing();
Status DisableActivityTracing();
int num_gpus_;
absl::optional<RocmTracerOptions> options_;
RocmTraceCollector* collector_ = nullptr;
bool api_tracing_enabled_ = false;
bool activity_tracing_enabled_ = false;
RocmApiCallbackImpl* api_cb_impl_;
RocmActivityCallbackImpl* activity_cb_impl_;
class PendingActivityRecords {
public:
// add a correlation id to the pending set
void Add(uint32_t correlation_id) {
absl::MutexLock lock(&mutex);
pending_set.insert(correlation_id);
}
// remove a correlation id from the pending set
void Remove(uint32_t correlation_id) {
absl::MutexLock lock(&mutex);
pending_set.erase(correlation_id);
}
// clear the pending set
void Clear() {
absl::MutexLock lock(&mutex);
pending_set.clear();
}
// count the number of correlation ids in the pending set
size_t Count() {
absl::MutexLock lock(&mutex);
return pending_set.size();
}
private:
// set of co-relation ids for which the hcc activity record is pending
absl::flat_hash_set<uint32_t> pending_set;
// the callback which processes the activity records (and consequently
// removes items from the pending set) is called in a separate thread
// from the one that adds item to the list.
absl::Mutex mutex;
};
PendingActivityRecords pending_activity_records_;
public:
// Disable copy and move.
RocmTracer(const RocmTracer&) = delete;
RocmTracer& operator=(const RocmTracer&) = delete;
};
} // namespace profiler
} // namespace tensorflow
#endif // TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_ROCM_TRACER_H_

View File

@ -31,6 +31,7 @@ const absl::string_view kGpuPlanePrefix = "/device:GPU:";
const absl::string_view kTpuPlanePrefix = "/device:TPU:";
const absl::string_view kTpuRuntimePlaneName = "/host:TPU-runtime";
const absl::string_view kCuptiDriverApiPlaneName = "/host:CUPTI";
const absl::string_view kRoctracerApiPlaneName = "/host:ROCTRACER";
const absl::string_view kMetadataPlaneName = "/host:metadata";
const absl::string_view kTFStreamzPlaneName = "/host:tfstreamz";
const absl::string_view kPythonTracerPlaneName = "/host:python-tracer";

View File

@ -37,6 +37,8 @@ TF_CONST_INIT extern const absl::string_view kTpuPlanePrefix;
TF_CONST_INIT extern const absl::string_view kTpuRuntimePlaneName;
// Name of XPlane that contains CUPTI driver API generated events.
TF_CONST_INIT extern const absl::string_view kCuptiDriverApiPlaneName;
// Name of XPlane that contains Roctracer API generated events.
TF_CONST_INIT extern const absl::string_view kRoctracerApiPlaneName;
// Name of XPlane that contains profile metadata such as XLA debug info.
TF_CONST_INIT extern const absl::string_view kMetadataPlaneName;
// Name of XPlane that contains kpi related metrics.

View File

@ -26,7 +26,9 @@ import re
# The timeline target is usually imported as part of BUILD target
# "platform_test", which includes also includes the "platform"
# dependency. This is why the logging import here is okay.
from tensorflow.python.platform import build_info
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.platform import build_info
class AllocationMaximum(collections.namedtuple(
@ -448,6 +450,8 @@ class Timeline(object):
else:
_, op, inputs = self._parse_op_label(nodestats.timeline_label)
args = {'name': node_name, 'op': op}
if build_info.build_info['is_rocm_build']:
args['kernel'] = nodestats.timeline_label.split('@@')[0]
for i, iname in enumerate(inputs):
args['input%d' % i] = iname
self._chrome_trace.emit_region(start, duration, pid, tid, 'Op', op, args)

View File

@ -104,10 +104,7 @@ class TimelineTest(test.TestCase):
step_stats = run_metadata.step_stats
devices = [d.device for d in step_stats.dev_stats]
self.assertTrue('/job:localhost/replica:0/task:0/device:GPU:0' in devices)
if not test.is_built_with_rocm():
# skip this check for the ROCm platform
# stream level tracing is not yet supported on the ROCm platform
self.assertTrue('/device:GPU:0/stream:all' in devices)
self.assertIn('/device:GPU:0/stream:all', devices)
tl = timeline.Timeline(step_stats)
ctf = tl.generate_chrome_trace_format()
self._validateTrace(ctf)

View File

@ -47,8 +47,7 @@ class ProfilerTest(test_util.TensorFlowTestCase):
profile_pb.ParseFromString(profile_result)
devices = frozenset(device.name for device in profile_pb.devices.values())
self.assertIn('/host:CPU', devices)
if not test_util.IsBuiltWithROCm() and config.list_physical_devices('GPU'):
# device tracing is not yet supported on the ROCm platform
if config.list_physical_devices('GPU'):
self.assertIn('/device:GPU:0', devices)
events = frozenset(event.name for event in profile_pb.trace_events)
self.assertIn('three_times_five', events)

View File

@ -61,7 +61,6 @@ cuda_py_test(
python_version = "PY3",
tags = [
"no_pip",
"no_rocm",
],
deps = [
":profiler_v2",

View File

@ -77,14 +77,13 @@ def _run_model():
opts['min_bytes'] = 0
opts['order_by'] = 'name'
opts['output'] = 'none'
_ = sess.run(y,
options=config_pb2.RunOptions(
trace_level=config_pb2.RunOptions.SOFTWARE_TRACE),
run_metadata=run_metadata)
_ = sess.run(
y,
options=config_pb2.RunOptions(
trace_level=config_pb2.RunOptions.SOFTWARE_TRACE),
run_metadata=run_metadata)
tfprof_node = model_analyzer.profile(
sess.graph,
run_meta=run_metadata,
options=opts)
sess.graph, run_meta=run_metadata, options=opts)
return tfprof_node, run_metadata
@ -99,17 +98,17 @@ def _run_loop_model():
sess.run(variables.global_variables_initializer())
run_meta = config_pb2.RunMetadata()
_ = sess.run(x,
options=config_pb2.RunOptions(
trace_level=config_pb2.RunOptions.SOFTWARE_TRACE),
run_metadata=run_meta)
_ = sess.run(
x,
options=config_pb2.RunOptions(
trace_level=config_pb2.RunOptions.SOFTWARE_TRACE),
run_metadata=run_meta)
opts = builder.time_and_memory()
opts['order_by'] = 'name'
opts['output'] = 'none'
tfprof_node = model_analyzer.profile(
sess.graph, run_meta, options=opts)
tfprof_node = model_analyzer.profile(sess.graph, run_meta, options=opts)
return tfprof_node, run_meta
@ -136,10 +135,6 @@ class RunMetadataTest(test.TestCase):
ret = _extract_node(run_meta, 'MatMul')
self.assertEqual(len(ret['gpu:0']), 1)
if not test.is_built_with_rocm():
# skip this check for the ROCm platform
# stream level tracing is not yet supported on the ROCm platform
self.assertEqual(len(ret['gpu:0/stream:all']), 1, '%s' % run_meta)
@test_util.run_deprecated_v1
def testAllocationHistory(self):
@ -163,8 +158,8 @@ class RunMetadataTest(test.TestCase):
# All memory deallocated.
self.assertEqual(mm_allocs[0].alloc_bytes + mm_allocs[1].alloc_bytes, 0)
rand = _extract_node(
run_meta, 'random_normal/RandomStandardNormal')['gpu:0'][0]
rand = _extract_node(run_meta,
'random_normal/RandomStandardNormal')['gpu:0'][0]
random_allocs = rand.memory[0].allocation_records
# random normal must allocated first since matmul depends on it.
self.assertLess(random_allocs[0].alloc_micros, mm.all_start_micros)
@ -191,17 +186,15 @@ class RunMetadataTest(test.TestCase):
with ops.device('/cpu:0'):
tfprof_node, run_meta = _run_loop_model()
# The while-loop caused a node to appear 4 times in scheduling.
ret = _extract_node(run_meta,
'rnn/while/basic_rnn_cell/MatMul')
ret = _extract_node(run_meta, 'rnn/while/basic_rnn_cell/MatMul')
self.assertEqual(len(ret['cpu:0']), 4)
total_cpu_execs = 0
for node in ret['cpu:0']:
total_cpu_execs += node.op_end_rel_micros
mm_node = lib.SearchTFProfNode(
tfprof_node,
'rnn/while/basic_rnn_cell/MatMul')
mm_node = lib.SearchTFProfNode(tfprof_node,
'rnn/while/basic_rnn_cell/MatMul')
self.assertEqual(mm_node.run_count, 4)
self.assertEqual(mm_node.cpu_exec_micros, total_cpu_execs)
@ -243,17 +236,13 @@ class RunMetadataTest(test.TestCase):
with ops.device('/device:GPU:0'):
_, run_meta = _run_loop_model()
# The while-loop caused a node to appear 4 times in scheduling.
ret = _extract_node(run_meta,
'rnn/while/basic_rnn_cell/MatMul')
ret = _extract_node(run_meta, 'rnn/while/basic_rnn_cell/MatMul')
self.assertEqual(len(ret['gpu:0']), 4, '%s' % run_meta)
total_cpu_execs = 0
for node in ret['gpu:0']:
total_cpu_execs += node.op_end_rel_micros
if not test.is_built_with_rocm():
# skip this check for the ROCm platform
# stream level tracing is not yet supported on the ROCm platform
self.assertGreaterEqual(
len(ret['gpu:0/stream:all']), 4, '%s' % run_meta)

View File

@ -40,8 +40,7 @@ class ProfilerContextTest(test.TestCase):
def testBasics(self):
ops.reset_default_graph()
outfile = os.path.join(test.get_temp_dir(), "dump")
opts = builder(builder.time_and_memory()
).with_file_output(outfile).build()
opts = builder(builder.time_and_memory()).with_file_output(outfile).build()
x = lib.BuildFullModel()
@ -65,17 +64,10 @@ class ProfilerContextTest(test.TestCase):
self.assertEqual(set([15, 50, 100]), set(pctx.get_profiles("op").keys()))
with lib.ProfilerFromFile(
os.path.join(test.get_temp_dir(), "profile_100")) as profiler:
with lib.ProfilerFromFile(os.path.join(test.get_temp_dir(),
"profile_100")) as profiler:
profiler.profile_operations(options=opts)
with gfile.Open(outfile, "r") as f:
if test.is_built_with_rocm():
# The profiler output for ROCm mode, includes an extra warning
# related to the lack of stream tracing in ROCm mode.
# Need to skip this warning when doing the diff
profile_str = "\n".join(profile_str.split("\n")[7:])
self.assertEqual(profile_str, f.read())
@test_util.run_deprecated_v1
@ -104,8 +96,8 @@ class ProfilerContextTest(test.TestCase):
def testDisabled(self):
ops.reset_default_graph()
x = lib.BuildFullModel()
with profile_context.ProfileContext(test.get_temp_dir(),
enabled=False) as pctx:
with profile_context.ProfileContext(
test.get_temp_dir(), enabled=False) as pctx:
with session.Session() as sess:
self.evaluate(variables.global_variables_initializer())
for _ in range(10):

View File

@ -141,6 +141,10 @@ port::StatusOr<void*> GetRocrandDsoHandle() {
return GetDsoHandle("rocrand", "");
}
port::StatusOr<void*> GetRoctracerDsoHandle() {
return GetDsoHandle("roctracer64", "");
}
port::StatusOr<void*> GetHipsparseDsoHandle() {
return GetDsoHandle("hipsparse", "");
}
@ -220,6 +224,11 @@ port::StatusOr<void*> GetRocrandDsoHandle() {
return *result;
}
port::StatusOr<void*> GetRoctracerDsoHandle() {
static auto result = new auto(DsoLoader::GetRoctracerDsoHandle());
return *result;
}
port::StatusOr<void*> GetHipsparseDsoHandle() {
static auto result = new auto(DsoLoader::GetHipsparseDsoHandle());
return *result;

View File

@ -51,6 +51,7 @@ port::StatusOr<void*> GetRocblasDsoHandle();
port::StatusOr<void*> GetMiopenDsoHandle();
port::StatusOr<void*> GetRocfftDsoHandle();
port::StatusOr<void*> GetRocrandDsoHandle();
port::StatusOr<void*> GetRoctracerDsoHandle();
port::StatusOr<void*> GetHipsparseDsoHandle();
port::StatusOr<void*> GetHipDsoHandle();
@ -85,6 +86,7 @@ port::StatusOr<void*> GetRocblasDsoHandle();
port::StatusOr<void*> GetMiopenDsoHandle();
port::StatusOr<void*> GetRocfftDsoHandle();
port::StatusOr<void*> GetRocrandDsoHandle();
port::StatusOr<void*> GetRoctracerDsoHandle();
port::StatusOr<void*> GetHipsparseDsoHandle();
port::StatusOr<void*> GetHipDsoHandle();
} // namespace CachedDsoLoader

View File

@ -327,6 +327,29 @@ cc_library(
alwayslink = True,
)
cc_library(
name = "roctracer_if_static",
deps = if_static([
"@local_config_rocm//rocm:roctracer",
]),
)
cc_library(
name = "roctracer_wrapper",
srcs = if_rocm_is_configured(["roctracer_wrapper.h"]),
hdrs = if_rocm_is_configured(["roctracer_wrapper.h"]),
deps = if_rocm_is_configured([
":rocm_gpu_executor",
":rocm_platform_id",
":roctracer_if_static",
"@local_config_rocm//rocm:rocm_headers",
"//tensorflow/stream_executor/lib",
"//tensorflow/stream_executor/platform",
"//tensorflow/stream_executor/platform:dso_loader",
]),
alwayslink = True,
)
cc_library(
name = "all_runtime",
copts = tf_copts(),

View File

@ -0,0 +1,88 @@
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// This file wraps roctracer API calls with dso loader so that we don't need to
// have explicit linking to libroctracer. All TF hipsarse API usage should route
// through this wrapper.
#ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCTRACER_WRAPPER_H_
#define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCTRACER_WRAPPER_H_
#include "rocm/include/roctracer/roctracer.h"
#include "rocm/include/roctracer/roctracer_hcc.h"
#include "rocm/include/roctracer/roctracer_hip.h"
#include "tensorflow/stream_executor/lib/env.h"
#include "tensorflow/stream_executor/platform/dso_loader.h"
#include "tensorflow/stream_executor/platform/port.h"
namespace tensorflow {
namespace wrap {
#ifdef PLATFORM_GOOGLE
#define ROCTRACER_API_WRAPPER(API_NAME) \
template <typename... Args> \
auto API_NAME()(Args... args)->decltype(::API_NAME(args...)) { \
return ::API_NAME(args...); \
}
#else
#define ROCTRACER_API_WRAPPER(API_NAME) \
template <typename... Args> \
auto API_NAME(Args... args)->decltype(::API_NAME(args...)) { \
using FuncPtrT = std::add_pointer<decltype(::API_NAME)>::type; \
static FuncPtrT loaded = []() -> FuncPtrT { \
static const char* kName = #API_NAME; \
void* f; \
auto s = Env::Default()->GetSymbolFromLibrary( \
stream_executor::internal::CachedDsoLoader::GetRoctracerDsoHandle() \
.ValueOrDie(), \
kName, &f); \
CHECK(s.ok()) << "could not find " << kName \
<< " in roctracer DSO; dlerror: " << s.error_message(); \
return reinterpret_cast<FuncPtrT>(f); \
}(); \
return loaded(args...); \
}
#endif // PLATFORM_GOOGLE
#define FOREACH_ROCTRACER_API(DO_FUNC) \
DO_FUNC(roctracer_default_pool_expl) \
DO_FUNC(roctracer_disable_domain_activity) \
DO_FUNC(roctracer_disable_domain_callback) \
DO_FUNC(roctracer_disable_op_activity) \
DO_FUNC(roctracer_disable_op_callback) \
DO_FUNC(roctracer_enable_domain_activity_expl) \
DO_FUNC(roctracer_enable_domain_callback) \
DO_FUNC(roctracer_enable_op_activity) \
DO_FUNC(roctracer_enable_op_callback) \
DO_FUNC(roctracer_error_string) \
DO_FUNC(roctracer_flush_activity_expl) \
DO_FUNC(roctracer_get_timestamp) \
DO_FUNC(roctracer_op_string) \
DO_FUNC(roctracer_open_pool_expl) \
DO_FUNC(roctracer_set_properties)
FOREACH_ROCTRACER_API(ROCTRACER_API_WRAPPER)
#undef FOREACH_ROCTRACER_API
#undef ROCTRACER_API_WRAPPER
} // namespace wrap
} // namespace tensorflow
#endif // TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCTRACER_WRAPPER_H_

View File

@ -21,6 +21,7 @@ cc_library(
".",
"rocm/include",
"rocm/include/rocrand",
"rocm/include/roctracer",
],
visibility = ["//visibility:public"],
)
@ -109,6 +110,7 @@ cc_library(
":hiprand",
":miopen",
":hipsparse",
":roctracer",
":rocsolver",
],
)
@ -144,6 +146,11 @@ cc_library(
data = ["rocm/lib/%{hipsparse_lib}"],
)
cc_library(
name = "roctracer",
data = ["rocm/lib/%{roctracer_lib}"],
)
cc_library(
name = "rocsolver",
srcs = ["rocm/lib/%{rocsolver_lib}"],

View File

@ -332,6 +332,7 @@ def _find_libs(repository_ctx, rocm_config, bash_bin):
("MIOpen", rocm_config.rocm_toolkit_path + "/miopen"),
("rccl", rocm_config.rocm_toolkit_path + "/rccl"),
("hipsparse", rocm_config.rocm_toolkit_path + "/hipsparse"),
("roctracer64", rocm_config.rocm_toolkit_path + "/roctracer"),
("rocsolver", rocm_config.rocm_toolkit_path + "/rocsolver"),
]
]
@ -458,6 +459,7 @@ def _create_dummy_repository(repository_ctx):
"%{rocfft_lib}": _lib_name("rocfft"),
"%{hiprand_lib}": _lib_name("hiprand"),
"%{hipsparse_lib}": _lib_name("hipsparse"),
"%{roctracer_lib}": _lib_name("roctracer64"),
"%{rocsolver_lib}": _lib_name("rocsolver"),
"%{copy_rules}": "",
"%{rocm_headers}": "",
@ -635,6 +637,7 @@ def _create_local_rocm_repository(repository_ctx):
"%{miopen_lib}": rocm_libs["MIOpen"].file_name,
"%{rccl_lib}": rocm_libs["rccl"].file_name,
"%{hipsparse_lib}": rocm_libs["hipsparse"].file_name,
"%{roctracer_lib}": rocm_libs["roctracer64"].file_name,
"%{rocsolver_lib}": rocm_libs["rocsolver"].file_name,
"%{copy_rules}": "\n".join(copy_rules),
"%{rocm_headers}": ('":rocm-include",\n' +