#include <torch/csrc/autograd/profiler.h>
#include <torch/csrc/autograd/function.h>
#include <torch/csrc/jit/frontend/code_template.h>

#include <torch/csrc/jit/frontend/tracer.h>
#include <torch/csrc/jit/runtime/operator.h>

#include <ATen/core/op_registration/op_registration.h>
#include <torch/library.h>

#include <fstream>
#include <list>
#include <mutex>
#include <sstream>
#include <string>
#include <vector>

#include <ATen/record_function.h>
#include <c10/core/Allocator.h>
#include <c10/util/ThreadLocalDebugInfo.h>

#include <iostream>

namespace torch { namespace autograd { namespace profiler {

std::vector<FileLineFunc> prepareCallstack(const std::vector<jit::StackEntry>& cs) {
  std::vector<FileLineFunc> entries;
  entries.reserve(cs.size());
  for (const auto& entry : cs) {
    auto& range = entry.range;
    if (range.source()) {
      auto& src = range.source();
      if (src && src->filename()) {
        auto line = src->starting_line_no() +
            src->lineno_for_offset(range.start());
        entries.emplace_back(FileLineFunc{*(src->filename()), line, entry.filename});
      }
    }
  }
  return entries;
}

std::vector<std::string> callstackStr(const std::vector<FileLineFunc>& cs) {
  std::vector<std::string> cs_str;
  cs_str.reserve(cs.size());
  for (const auto& entry : cs) {
    std::stringstream loc;
    loc << entry.filename << "(" << entry.line << "): " << entry.funcname;
    cs_str.push_back(loc.str());
  }
  return cs_str;
}
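
// For illustration (hypothetical frame): a stack entry pointing at line 10 of
// forward() in "model.py" is rendered by the helpers above as
//   "model.py(10): forward"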

// We decompose the profiler logic into the following components:
//
// ThreadLocalDebugInfo:
//
// ThreadLocalDebugInfo is a thread local mapping from slots into
// the debug information structs.
// ThreadLocalDebugInfo is automatically propagated across thread
// boundaries, including the cases of:
//  - launching async jobs with at::launch
//  - executing JIT continuations
//  - moving from the forward threads into autograd (backward) threads
//
// Entries in ThreadLocalDebugInfo are managed by DebugInfoGuard,
// which can be used to add or overwrite an entry in the thread local
// mapping. A corresponding entry is removed when the guard is destroyed,
// potentially revealing the previously set value for the same slot.
//
// For async tasks, slots previously set in the main thread before
// launching the async task are shared and visible in the async task.
//
// On the other hand, any adding or overwriting of the mapping by the
// async task is not visible to the main thread, and any modification
// (including removal of the entries) in the main thread is not visible
// to the async task if it happens after launching the task.
//
// We use ThreadLocalDebugInfo (slot PROFILER_STATE) to store the profiler
// config, as well as a list of events that happen during profiling.
// An instance of ThreadLocalDebugInfo is created each time we enter the
// profiler (i.e. enter the profiling context manager / call enableProfiler)
// and uniquely identifies a profiling run.
//
// We automatically propagate ThreadLocalDebugInfo into async tasks,
// as well as across JIT continuations and autograd threads, so all
// the operations that happen between profiling start and end
// (not necessarily within the same thread) are recorded, unless the
// profiling slot is overwritten, as in the case of nested profiling
// ranges (in this case events for the subrange are handled by the
// nested profiler).
//
// When we exit a profiling range (either by exiting the profiling context
// manager or by calling disableProfiler), we remove the previously set
// profiling entry for the given thread local mapping, and consolidate
// events in the profiling result.
//
//
// ThreadLocalState:
//
// ThreadLocalState takes a 'snapshot' of thread local variables
// using the provided getters. It is used together with ThreadLocalStateGuard
// to transfer the snapshot across the thread boundary and set the thread
// local values as in the parent task.
//
// The profiler uses ThreadLocalState to propagate the profiler's thread local
// state. ThreadLocalState also automatically propagates profiler callbacks.
//
//
// at::RecordFunction and observers
//
// The profiler uses the observers mechanism to add a pair of thread local
// callbacks that are executed on a number of predetermined ranges, including:
//  - c10/ATen ops
//  - TorchScript functions/methods
//  - user defined named ranges (see `record_function` python context manager)
//
// The profiler sets up a pair of callbacks that record profiling events and
// save them into the thread local profiler struct (ThreadLocalDebugInfo,
// PROFILER_STATE slot).
//
//
// Thus, the overall logic is:
//
// enableProfiler:
//  - checks that the profiler is not enabled (otherwise throws)
//  - pushes a new ThreadLocalDebugInfo (slot PROFILER_STATE) as the profiler
//    config for the current thread
//  - pushes profiling callbacks for the current thread
//
// disableProfiler:
//  - pops the PROFILER_STATE slot from the current ThreadLocalDebugInfo and
//    consolidates events
//  - removes profiling callbacks
//
// ThreadLocalState:
//  - propagates ThreadLocalDebugInfo across threads
//  - propagates profiler callbacks across threads
//
// Profiler callbacks:
//  - get the current profiling state (PROFILER_STATE slot in
//    ThreadLocalDebugInfo)
//  - save profiling events into the profiling state
//
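
// To make the flow above concrete, a minimal usage sketch (the same pattern
// RecordProfile at the bottom of this file wraps):
//
//   enableProfilerLegacy(ProfilerConfig(ProfilerState::CPU));
//   // ... run the workload; the pushed callbacks record events, including
//   // work that crosses threads via ThreadLocalState propagation
//   thread_event_lists lists = disableProfilerLegacy();
//   // `lists` holds one consolidated list of events per observed thread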

namespace {
const CUDAStubs default_stubs;
constexpr const CUDAStubs* default_stubs_addr = &default_stubs;
// Constant initialization, so it is guaranteed to be initialized before
// static initialization calls which may invoke registerCUDAMethods
inline const CUDAStubs*& cuda_stubs() {
  static const CUDAStubs* stubs_ = default_stubs_addr;
  return stubs_;
}
} // namespace
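
// For illustration only (a sketch; the actual registration lives in the CUDA
// build of the profiler): a translation unit that is linked in only when CUDA
// is available can install real stubs at static-initialization time, e.g.
//
//   static MyCUDAMethods cuda_methods;   // hypothetical CUDAStubs subclass
//   static struct Registrar {
//     Registrar() { registerCUDAMethods(&cuda_methods); }
//   } registrar;
//
// The constant-initialized default above guarantees cuda_stubs() already holds
// a valid pointer even if such a constructor runs before this file's statics.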

// Profiler state
const ProfilerConfig& ProfilerThreadLocalState::config() const {
  return config_;
}

thread_event_lists ProfilerThreadLocalState::consolidate() {
  std::lock_guard<std::mutex> g(state_mutex_);
  thread_event_lists result;
  for (auto& kv : event_lists_map_) {
    auto& list = kv.second;
    result.emplace_back(list->consolidate());
  }
  // Consolidate remote events if applicable as well.
  if (remoteProfiledEvents_) {
    result.insert(
        result.end(),
        std::make_move_iterator(remoteProfiledEvents_->begin()),
        std::make_move_iterator(remoteProfiledEvents_->end()));
  }
  return result;
}

void ProfilerThreadLocalState::mark(std::string name, bool include_cuda) {
  if (config_.state == ProfilerState::Disabled) {
    return;
  }
  if (config_.state == ProfilerState::NVTX) {
    cuda_stubs()->nvtxMarkA(name.c_str());
  } else {
    LegacyEvent evt(
        EventKind::Mark,
        at::StringView(std::move(name)),
        at::RecordFunction::currentThreadId(),
        include_cuda && config_.state == ProfilerState::CUDA);
    evt.setNodeId(at::RecordFunction::getDefaultNodeId());
    getEventList().record(std::move(evt));
  }
}

void ProfilerThreadLocalState::setOrAddRemoteProfiledEvents(
    std::vector<LegacyEvent>&& remoteProfiledEvents) {
  // Lock to serialize access from multiple callback threads.
  std::lock_guard<std::mutex> guard(state_mutex_);
  if (remoteProfiledEvents_) {
    (*remoteProfiledEvents_).emplace_back(std::move(remoteProfiledEvents));
  } else {
    remoteProfiledEvents_ = {std::move(remoteProfiledEvents)};
  }
}

void ProfilerThreadLocalState::pushRange(
    const at::RecordFunction& fn,
    const bool record_cuda,
    const char* msg,
    std::vector<std::vector<int64_t>>&& shapes) {
  if (config_.state == ProfilerState::Disabled) {
    return;
  }
  if (config_.state == ProfilerState::NVTX) {
    cuda_stubs()->nvtxRangePushA(getNvtxStr(
        fn.name(), msg, fn.seqNr(), shapes).c_str());
  } else {
    LegacyEvent evt(
        EventKind::PushRange,
        fn.name(),
        at::RecordFunction::currentThreadId(),
        record_cuda,
        fn.handle(),
        std::move(shapes),
        at::RecordFunction::getDefaultNodeId());
    evt.setSequenceNr(fn.seqNr());
    evt.setFwdThreadId(fn.forwardThreadId());
    evt.setScope((uint8_t)fn.scope());
#ifndef C10_MOBILE
    // A backward node's source range corresponds to the forward node.
    // TODO: consider using C++ stack trace
    if (config_.with_stack && fn.scope() != at::RecordScope::BACKWARD_FUNCTION) {
      auto cs = prepareCallstack(jit::currentCallstack());
      if (cs.empty()) {
        cs = prepareCallstack(jit::tracer::pythonCallstack());
      }
      evt.setStack(callstackStr(cs));
    }
#endif
    getEventList().record(std::move(evt));
  }
}

void ProfilerThreadLocalState::popRange(const at::RecordFunction& fn, const bool record_cuda) {
  if (config_.state == ProfilerState::Disabled) {
    return;
  }
  if (config_.state == ProfilerState::NVTX) {
    cuda_stubs()->nvtxRangePop();
  } else {
    // In some cases RecordFunction (and popRange) may be called on a different
    // thread than pushRange. As a convention, we record the async pop on the
    // original thread and save the current thread id in the pop event.
    LegacyEvent evt(
        EventKind::PopRange,
        at::StringView(""),
        at::RecordFunction::currentThreadId(),
        record_cuda,
        fn.handle());
    evt.setNodeId(at::RecordFunction::getDefaultNodeId());
    getEventList(fn.threadId()).record(std::move(evt));
  }
}

void ProfilerThreadLocalState::reportMemoryUsage(
    void* /* unused */,
    int64_t alloc_size,
    c10::Device device) {
  if (config_.profile_memory && config_.state != ProfilerState::Disabled) {
    uint64_t thread_id = at::RecordFunction::currentThreadId();
    LegacyEvent evt(
        EventKind::MemoryAlloc,
        at::StringView(""),
        thread_id,
        config_.state == ProfilerState::CUDA);
    evt.updateMemoryStats(alloc_size, device);
    getEventList(thread_id).record(std::move(evt));
  }
}

bool ProfilerThreadLocalState::memoryProfilingEnabled() const {
  return config_.profile_memory;
}
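
// For illustration (hypothetical values): for name "aten::mm", msg ", seq = ",
// sequence_nr 42 and shapes {{2, 3}, {3, 4}}, getNvtxStr below returns
//   "aten::mm, seq = 42, sizes = [[2, 3], [3, 4]]"
// and it falls back to just the name when there is no sequence number and no
// shape information.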

std::string ProfilerThreadLocalState::getNvtxStr(
    const at::StringView& name,
    const char* msg,
    int64_t sequence_nr,
    const std::vector<std::vector<int64_t>>& shapes) const {
  if (sequence_nr >= 0 || shapes.size() > 0) {
    std::stringstream s;
#ifdef __HIP_PLATFORM_HCC__
    s << name.str();
#endif
    if (sequence_nr >= 0) {
#ifdef __HIP_PLATFORM_HCC__
      s << msg << sequence_nr;
#else
      s << name.str() << msg << sequence_nr;
#endif
    }
    if (shapes.size() > 0) {
      s << ", sizes = [";
      for (size_t idx = 0; idx < shapes.size(); ++idx) {
        if (shapes[idx].size() > 0) {
          s << "[";
          for (size_t dim = 0; dim < shapes[idx].size(); ++dim) {
            s << shapes[idx][dim];
            if (dim < shapes[idx].size() - 1) {
              s << ", ";
            }
          }
          s << "]";
        } else {
          s << "[]";
        }
        if (idx < shapes.size() - 1) {
          s << ", ";
        }
      }
      s << "]";
    }
    return s.str();
  } else {
    return name.str();
  }
}

RangeEventList& ProfilerThreadLocalState::getEventList(int64_t thread_id) {
  if (thread_id < 0) {
    thread_id = at::RecordFunction::currentThreadId();
  }
  RangeEventList* list_ptr = nullptr;
  std::lock_guard<std::mutex> guard(state_mutex_);
  auto it = event_lists_map_.find(thread_id);
  if (it != event_lists_map_.end()) {
    list_ptr = it->second.get();
  } else {
    auto event_list = std::make_shared<RangeEventList>();
    event_lists_map_[thread_id] = event_list;
    list_ptr = event_list.get();
  }
  return *list_ptr;
}
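
// For illustration (hypothetical op): for a recorded call such as
// aten::add(Tensor a, Tensor b, Scalar alpha) with 2x3 tensor inputs,
// inputSizes below yields {{2, 3}, {2, 3}, {}}; non-tensor and undefined
// tensor inputs contribute an empty entry.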

std::vector<std::vector<int64_t>> inputSizes(const at::RecordFunction& fn) {
  std::vector<std::vector<int64_t>> sizes;
  sizes.reserve(fn.inputs().size());
  for (const c10::IValue& input : fn.inputs()) {
    if (!input.isTensor()) {
      sizes.emplace_back();
      continue;
    }
    const at::Tensor& tensor = input.toTensor();
    if (tensor.defined()) {
      sizes.push_back(input.toTensor().sizes().vec());
    } else {
      sizes.emplace_back();
    }
  }
  return sizes;
}

namespace {

enum EventIValueIdx {
  KIND = 0,
  NAME,
  THREAD_ID,
  HANDLE,
  NODE_ID,
  CPU_MEM_USAGE,
  CPU_NS,
  CUDA_RECORDED,
  CUDA_MEM_USAGE,
  CUDA_DEVICE,
  CUDA_US,
  SHAPES,
  NUM_EVENT_IVALUE_IDX // must be last in list
};

enum ProfilerIValueIdx {
  STATE = 0,
  REPORT_INPUT_SHAPES,
  PROFILE_MEMORY,
  NUM_PROFILER_CFG_IVALUE_IDX // must be last in list
};

const std::unordered_set<std::string> disable_cuda_profiling = {
  "aten::view",
  "aten::t",
  "aten::transpose",
  "aten::stride",
  "aten::empty",
  "aten::empty_like",
  "aten::empty_strided",
  "aten::as_strided",
  "aten::expand",
  "aten::resize_",
  "aten::squeeze",
  "aten::unsqueeze",
  "aten::slice",
  "aten::_unsafe_view",
  "aten::size"
};

ProfilerThreadLocalState* getProfilerTLSState() {
  return static_cast<ProfilerThreadLocalState*>(
      c10::ThreadLocalDebugInfo::get(c10::DebugInfoKind::PROFILER_STATE));
}

void pushProfilingCallbacksLegacy() {
  auto state_ptr = getProfilerTLSState();
  TORCH_INTERNAL_ASSERT(state_ptr, "Expected profiler state set");
  auto handle = at::addThreadLocalCallback(at::RecordFunctionCallback(
      [](const at::RecordFunction& fn) -> std::unique_ptr<at::ObserverContext> {
        auto state_ptr = getProfilerTLSState();
        if (!state_ptr || state_ptr->config().state == ProfilerState::Disabled) {
          return nullptr;
        }
        bool record_cuda =
            state_ptr->config().state == ProfilerState::CUDA;
        if (record_cuda && disable_cuda_profiling.find(fn.name().str()) != disable_cuda_profiling.end()) {
          record_cuda = false;
        }

        auto* msg = (fn.seqNr() >= 0) ? ", seq = " : "";
        if (state_ptr->config().report_input_shapes) {
          auto sizes = inputSizes(fn);
          state_ptr->pushRange(fn, record_cuda, msg, std::move(sizes));
        } else {
          state_ptr->pushRange(fn, record_cuda, msg);
        }

        return nullptr;
      },
      [](const at::RecordFunction& fn, at::ObserverContext*) {
        auto state_ptr = getProfilerTLSState();
        if (!state_ptr || state_ptr->config().state == ProfilerState::Disabled) {
          return;
        }
        bool record_cuda =
            state_ptr->config().state == ProfilerState::CUDA;
        if (record_cuda && disable_cuda_profiling.find(fn.name().str()) != disable_cuda_profiling.end()) {
          record_cuda = false;
        }
        state_ptr->popRange(fn, record_cuda);
      })
      .needsInputs(state_ptr->config().report_input_shapes)
      .needsIds(true));
  state_ptr->setCallbackHandle(handle);
}

const int kCUDAWarmupStart = 5;

} // namespace

void registerCUDAMethods(CUDAStubs* stubs) {
  cuda_stubs() = stubs;
}
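
// For illustration (a sketch, assuming the conventional enum numbering with
// ProfilerState::CPU == 1): ProfilerConfig(ProfilerState::CPU,
// /*report_input_shapes=*/true, /*profile_memory=*/false) round-trips through
// toIValue()/fromIValue() below as the generic list [1, true, false], indexed
// by ProfilerIValueIdx.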

at::IValue ProfilerConfig::toIValue() const {
  c10::impl::GenericList eventIValueList(at::AnyType::get());
  eventIValueList.reserve(NUM_PROFILER_CFG_IVALUE_IDX);
  eventIValueList.emplace_back(static_cast<int64_t>(state));
  eventIValueList.emplace_back(report_input_shapes);
  eventIValueList.emplace_back(profile_memory);
  return eventIValueList;
}

ProfilerConfig ProfilerConfig::fromIValue(
    const at::IValue& profilerConfigIValue) {
  TORCH_INTERNAL_ASSERT(
      profilerConfigIValue.isList(),
      "Expected IValue to contain type c10::impl::GenericList");
  auto ivalues = profilerConfigIValue.toList();
  TORCH_INTERNAL_ASSERT(
      ivalues.size() == NUM_PROFILER_CFG_IVALUE_IDX,
      c10::str(
          "Expected exactly ",
          NUM_PROFILER_CFG_IVALUE_IDX,
          " ivalues to reconstruct ProfilerConfig."));
  return ProfilerConfig(
      static_cast<ProfilerState>(ivalues.get(ProfilerIValueIdx::STATE).toInt()),
      ivalues.get(ProfilerIValueIdx::REPORT_INPUT_SHAPES).toBool(),
      ivalues.get(ProfilerIValueIdx::PROFILE_MEMORY).toBool());
}

ProfilerConfig getProfilerConfig() {
  auto state_ptr = getProfilerTLSState();
  TORCH_CHECK(
      state_ptr,
      "Tried to access profiler config, but profiler is not enabled!");
  return state_ptr->config();
}

bool profilerEnabled() {
  auto state_ptr = getProfilerTLSState();
  return state_ptr && state_ptr->config().state != ProfilerState::Disabled;
}

void enableProfilerLegacy(const ProfilerConfig& new_config) {
  TORCH_CHECK(new_config.state != ProfilerState::NVTX || cuda_stubs()->enabled(),
      "Can't use NVTX profiler - PyTorch was compiled without CUDA");

  TORCH_CHECK(new_config.state != ProfilerState::KINETO);

  auto state_ptr = getProfilerTLSState();
  TORCH_CHECK(!state_ptr, "Profiler is already enabled on this thread");
  auto state = std::make_shared<ProfilerThreadLocalState>(new_config);
  c10::ThreadLocalDebugInfo::_push(c10::DebugInfoKind::PROFILER_STATE, state);

  pushProfilingCallbacksLegacy();

  if (new_config.state == ProfilerState::CUDA) {
    // Event recording appears to have some startup overhead, so we need to
    // generate some dummy events first before recording synchronization events.
    for (int idx = 0; idx < kCUDAWarmupStart; ++idx) {
      cuda_stubs()->onEachDevice([state](int /* unused */) {
        state->mark("__cuda_startup");
        cuda_stubs()->synchronize();
      });
    }

    // CUDA events must be on the same device, so we need a start event recorded
    // for each GPU. We then use this event to synchronize time on the GPU
    // with the CPU clock.
    cuda_stubs()->onEachDevice([state](int d) {
      state->mark("__cuda_start_event");
    });
  }
  state->mark("__start_profile", false);
}

thread_event_lists disableProfilerLegacy(c10::optional<ProfilerDisableOptions> profilerDisableOptions) {
  auto cleanupTLSState = profilerDisableOptions ? profilerDisableOptions->cleanupTLSState : true;
  auto consolidate = profilerDisableOptions ? profilerDisableOptions->consolidate : true;
  // All the DebugInfoBase objects are scope-based and are supposed to use DebugInfoGuard.
  std::shared_ptr<c10::DebugInfoBase> state;
  if (cleanupTLSState) {
    state = c10::ThreadLocalDebugInfo::_pop(c10::DebugInfoKind::PROFILER_STATE);
  } else {
    state = c10::ThreadLocalDebugInfo::_peek(c10::DebugInfoKind::PROFILER_STATE);
  }

  auto state_ptr = static_cast<ProfilerThreadLocalState*>(state.get());
  TORCH_CHECK(state_ptr && state_ptr->config().state != ProfilerState::Disabled,
      "Can't disable profiler when it's not running");

  if (cleanupTLSState) {
    at::removeCallback(state_ptr->callbackHandle());
  }

  if (!consolidate || state_ptr->config().state == ProfilerState::NVTX) {
    return thread_event_lists();
  }

  state_ptr->mark("__stop_profile");
  // Note that this will erase the underlying events.
  return state_ptr->consolidate();
}

void addEventList(std::vector<LegacyEvent>&& profiledEvents) {
  auto state_ptr = getProfilerTLSState();
  TORCH_CHECK(state_ptr, "Profiler must be enabled.");
  state_ptr->setOrAddRemoteProfiledEvents(std::move(profiledEvents));
}

void LegacyEvent::record(bool record_cuda) {
  if (record_cuda) {
    cuda_stubs()->record(&device_, &cuda_event, &cpu_ns_);
    return;
  }
  cpu_ns_ = getTime();
}

/* static */ LegacyEvent LegacyEvent::fromIValue(const at::IValue& eventIValue) {
  TORCH_INTERNAL_ASSERT(
      eventIValue.isList(),
      "Expected IValue to contain type c10::impl::GenericList");
  auto ivalues = eventIValue.toList();
  TORCH_INTERNAL_ASSERT(
      ivalues.size() >= NUM_EVENT_IVALUE_IDX,
      "Expected at least ",
      NUM_EVENT_IVALUE_IDX,
      " elements to reconstruct LegacyEvent.");

  // Reconstruct input shapes from ivalues.
  auto shapeListIValue = ivalues.get(EventIValueIdx::SHAPES);
  TORCH_INTERNAL_ASSERT(
      shapeListIValue.isList(),
      "Expected profiler shapes IValue to contain type c10::impl::GenericList.");

  auto shapeList = shapeListIValue.toList();
  std::vector<std::vector<int64_t>> shapes;
  shapes.reserve(shapeList.size());
  for (size_t i = 0; i < shapeList.size(); ++i) {
    std::vector<int64_t> s;
    auto shapeIValue = shapeList.get(i);
    TORCH_INTERNAL_ASSERT(
        shapeIValue.isList(),
        "Expected each profiler shape element to contain shapes of type c10::impl::GenericList.");
    auto curShapesList = shapeIValue.toList();
    s.reserve(curShapesList.size());
    for (size_t j = 0; j < curShapesList.size(); ++j) {
      s.emplace_back(curShapesList.get(j).toInt());
    }
    shapes.emplace_back(s);
  }

  LegacyEvent evt(
      static_cast<EventKind>(
          ivalues.get(EventIValueIdx::KIND).toInt()), // EventKind
      at::StringView(ivalues.get(EventIValueIdx::NAME).toStringRef()), // name
      ivalues.get(EventIValueIdx::THREAD_ID).toInt(), // thread_id
      static_cast<at::RecordFunctionHandle>(
          ivalues.get(EventIValueIdx::HANDLE).toDouble()), // handle
      std::move(shapes), // input shapes
      ivalues.get(EventIValueIdx::NODE_ID).toInt(), // node id
      true, // is remote
      ivalues.get(EventIValueIdx::CPU_MEM_USAGE).toInt(), // cpu_mem_usage
      ivalues.get(EventIValueIdx::CPU_NS).toInt(), // cpu_ns
      ivalues.get(EventIValueIdx::CUDA_RECORDED).toBool(), // was cuda recorded
      ivalues.get(EventIValueIdx::CUDA_MEM_USAGE).toInt(), // cuda memory usage
      ivalues.get(EventIValueIdx::CUDA_DEVICE).toInt(), // device
      ivalues.get(EventIValueIdx::CUDA_US).toInt() // cuda_us
  );
  return evt;
}

at::IValue LegacyEvent::toIValue() const {
  c10::impl::GenericList eventIValueList(at::AnyType::get());
  eventIValueList.reserve(NUM_EVENT_IVALUE_IDX);
  eventIValueList.emplace_back(static_cast<int64_t>(kind_));
  eventIValueList.emplace_back(std::string(name_.str()));
  eventIValueList.emplace_back(static_cast<int64_t>(thread_id_));
  eventIValueList.emplace_back(static_cast<double>(handle_));
  eventIValueList.emplace_back(node_id_);
  eventIValueList.emplace_back(cpu_memory_usage_);
  eventIValueList.emplace_back(cpu_ns_);
  // CUDA event information
  bool cuda_profiling_enabled = hasCuda();
  eventIValueList.emplace_back(cuda_profiling_enabled);
  eventIValueList.emplace_back(static_cast<int64_t>(cuda_memory_usage_));
  eventIValueList.emplace_back(device_);
  eventIValueList.emplace_back(cuda_us_);
  // Shapes
  c10::impl::GenericList shapesList =
      c10::impl::GenericList(at::ListType::create(at::IntType::get()));
  shapesList.reserve(shapes_.size());
  for (const auto& shape : shapes_) {
    c10::impl::GenericList s = c10::impl::GenericList(at::IntType::get());
    s.reserve(shape.size());
    for (const auto& k : shape) {
      s.emplace_back(k);
    }
    shapesList.emplace_back(s);
  }
  eventIValueList.emplace_back(shapesList);
  return at::IValue(eventIValueList);
}

double LegacyEvent::cudaElapsedUs(const LegacyEvent& e) const {
  TORCH_CHECK(e.hasCuda() && hasCuda(), "Events were not recorded for CUDA");
  TORCH_CHECK(
      e.device() == device(),
      c10::str(
          "Events are not on the same device: ", e.device(), " vs ", device()));
  if (isRemote() && e.isRemote()) {
    // Validate that cuda_us_ has been set properly.
    TORCH_INTERNAL_ASSERT(cuda_us_ >= 0 && e.cuda_us_ >= 0);
    return static_cast<double>(e.cuda_us_ - cuda_us_);
  }
  return cuda_stubs()->elapsed(&cuda_event, &e.cuda_event);
}

CUDAStubs::~CUDAStubs() = default;

static const jit::CodeTemplate event_template(R"(
{
  "name": "${name}",
  "ph": "X",
  "ts": ${ts},
  "dur": ${dur},
  "tid": ${tid},
  "pid": "CPU Functions",
  "args": {}
})");
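
// For illustration (hypothetical values), each record emitted below renders as
//   { "name": "aten::mm", "ph": "X", "ts": 12, "dur": 3, "tid": 1,
//     "pid": "CPU Functions", "args": {} }
// i.e. a "complete" event that chrome://tracing and compatible viewers can load.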

void writeProfilerEventsToStream(std::ostream& out, const std::vector<LegacyEvent*>& events) {
  TORCH_CHECK(out, "Could not open file");
  LegacyEvent* profiler_start = nullptr;
  for (LegacyEvent* e : events) {
    if (0 == strcmp(e->name(), "__start_profile")) {
      profiler_start = e;
      break;
    }
  }
  TORCH_CHECK(profiler_start, "Could not find __start_profile mark");

  struct PairHash {
    size_t operator()(std::pair<at::RecordFunctionHandle, int> p) const
        noexcept {
      return std::hash<at::RecordFunctionHandle>()(p.first) ^ std::hash<int64_t>()(p.second);
    }
  };
  std::unordered_map<std::pair<at::RecordFunctionHandle, int64_t>, LegacyEvent*, PairHash> events_map;
  out << "[\n";
  bool first = true;
  for (LegacyEvent* evt : events) {
    if (evt->kindStr() == "push") {
      events_map[std::make_pair(evt->handle(), evt->nodeId())] = evt;
    } else if (evt->kindStr() == "pop") {
      if (!first) {
        out << ",\n";
      }
      first = false;
      auto it = events_map.find(std::make_pair(evt->handle(), evt->nodeId()));
      TORCH_CHECK(it != events_map.end(), "Unmatched pop event");
      LegacyEvent* evt_start = it->second;
      events_map.erase(it);

      jit::TemplateEnv env;
      env.s("name", evt_start->name());
      env.d("ts", profiler_start->cpuElapsedUs(*evt_start));
      env.d("dur", evt_start->cpuElapsedUs(*evt));
      env.d("tid", evt_start->threadId());
      out << event_template.format(env);
    }
  }
  out << "]\n";
}
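
// For illustration, typical use of RecordProfile (the file name and workload
// are hypothetical): profiling is enabled for the lifetime of the guard and a
// Chrome-trace JSON file is written when it goes out of scope.
//
//   {
//     torch::autograd::profiler::RecordProfile guard("trace.json");
//     model->forward(inputs);  // any work to be profiled
//   }  // disables the profiler and writes the events to trace.json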

RecordProfile::RecordProfile(std::ostream& out)
    : out_(out) {
  init();
}

RecordProfile::RecordProfile(const std::string& filename)
    : file_(new std::ofstream(filename)), out_(*file_) {
  init();
}

void RecordProfile::init() {
  enableProfilerLegacy(ProfilerConfig(ProfilerState::CPU));
}

RecordProfile::~RecordProfile() {
  try {
    thread_event_lists event_lists = disableProfilerLegacy();
    std::vector<LegacyEvent*> events;
    for (auto& l : event_lists) {
      for (auto& e : l) {
        events.push_back(&e);
      }
    }
    processEvents(events);
  } catch (const std::exception& e) {
    LOG(ERROR) << e.what() << std::endl;
  } catch (...) {
    LOG(ERROR) << "Unknown error" << std::endl;
  }
}

void RecordProfile::processEvents(const std::vector<LegacyEvent*>& events) {
  writeProfilerEventsToStream(out_, events);
}

}}} // namespace torch::autograd::profiler