pytorch/torch/csrc/autograd/profiler_kineto.cpp
Mwiza Kunda b5873292c6 Add overload names to profiler trace (#143114)
Currently, recorded profiler events for aten ops do not store overload names. It would be useful to know which overloads are actually called when analysing performance.
For example, consider the following dispatch trace which occurs if there is a fallthrough kernel registered for aten::add:
```
             [call] op=[aten::add.Tensor], key=[AutogradCPU]
               [redispatch] op=[aten::add.Tensor], key=[Undefined]
                 [call] op=[aten::empty.memory_format], key=[BackendSelect]
                   [redispatch] op=[aten::empty.memory_format], key=[CPU]
                 [call] op=[aten::add.out], key=[CPU]
```

In this case, aten::add.out is a child of aten::add.Tensor; however, the current profiler trace provides no way to distinguish between these aten op calls.

See the added unit test for a more detailed example.
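
As a rough illustration of how the recorded overload name can be consumed, here is a minimal sketch; `qualifiedOpName` is a hypothetical helper (not part of this change) and it assumes only the `KinetoEvent::name()` and `KinetoEvent::overload_name()` accessors defined later in this file:
```
#include <string>

// Hypothetical helper (illustration only): join the base op name with the
// recorded overload name, e.g. "aten::add" + "Tensor" -> "aten::add.Tensor",
// so aten::add.Tensor and aten::add.out can be told apart in a trace.
std::string qualifiedOpName(
    const std::string& name,
    const std::string& overload_name) {
  return overload_name.empty() ? name : name + "." + overload_name;
}
```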

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/143114
Approved by: https://github.com/sraikund16
2025-03-05 01:00:29 +00:00


#include <cstring>
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <torch/csrc/autograd/profiler_kineto.h>
#include <c10/macros/Export.h>
#include <c10/util/ApproximateClock.h>
#include <c10/util/Exception.h>
#include <c10/util/flat_hash_map.h>
#include <c10/util/irange.h>
#include <c10/util/overloaded.h>
#include <torch/csrc/profiler/api.h>
#include <torch/csrc/profiler/collection.h>
#include <torch/csrc/profiler/containers.h>
#include <torch/csrc/profiler/events.h>
#include <torch/csrc/profiler/kineto_shim.h>
#include <torch/csrc/profiler/orchestration/observer.h>
#include <torch/csrc/profiler/perf.h>
#include <torch/csrc/profiler/standalone/itt_observer.h>
#include <torch/csrc/profiler/standalone/nvtx_observer.h>
#include <torch/csrc/profiler/standalone/privateuse1_observer.h>
#include <torch/csrc/profiler/util.h>
#include <ATen/Context.h>
#include <stdexcept>
#include <utility>
#ifdef USE_KINETO
#include <ApproximateClock.h>
#include <libkineto.h>
#include <time_since_epoch.h>
#ifndef _MSC_VER
// TODO: To be removed once this properly works from libkineto.
// Literal copy-n-paste from third_party/kineto/libkineto/src/WeakSymbols.cpp
extern "C" {
// This function is needed to avoid a superfluous dependency on the GNU OpenMP
// library when cuPTI is linked statically. For more details see
// https://github.com/pytorch/pytorch/issues/51026
__attribute__((weak)) int acc_get_device_type();
__attribute__((weak)) int acc_get_device_type() {
throw std::runtime_error(
"Dummy implementation of acc_get_device_type is not supposed to be called!");
}
} // extern "C"
#endif // _MSC_VER
#endif // USE_KINETO
namespace torch {
namespace autograd::profiler {
namespace {
inline int64_t getTimeNs() {
#ifdef USE_KINETO
return libkineto::timeSinceEpoch(std::chrono::system_clock::now());
#else
return c10::getTime();
#endif // USE_KINETO
}
using torch::profiler::impl::ActiveProfilerType;
using torch::profiler::impl::EventType;
using torch::profiler::impl::ExtraFields;
using torch::profiler::impl::get_record_concrete_inputs_enabled;
using torch::profiler::impl::ivalueListToStr;
using torch::profiler::impl::ivalueToStr;
using torch::profiler::impl::op_input_t;
using torch::profiler::impl::ProfilerStateBase;
using torch::profiler::impl::PyExtraFieldsBase;
using torch::profiler::impl::Result;
using torch::profiler::impl::shape;
using torch::profiler::impl::shapesToStr;
using torch::profiler::impl::stacksToStr;
using torch::profiler::impl::strListToStr;
using torch::profiler::impl::TensorMetadata;
using torch::profiler::impl::variantShapesToStr;
struct OpArgData {
bool hasData;
std::vector<shape> shapes;
std::vector<std::string> dtypes;
std::vector<c10::IValue> concreteInputs;
std::vector<std::vector<int64_t>> shapesForKinetoEvent;
std::vector<shape> strides;
};
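// Flatten the recorded operator inputs into parallel per-argument vectors of
// shapes, strides, dtypes and (optionally) concrete IValues so they can be
// attached to the profiler event as metadata.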
auto parseArgData(
const std::vector<op_input_t>& input_shapes,
const std::vector<op_input_t>& concreteInputs) {
if (input_shapes.empty()) {
return OpArgData{false, {}, {}, {}, {}, {}};
}
std::vector<shape> shapes(input_shapes.size());
std::vector<shape> strides(input_shapes.size());
std::vector<std::vector<int64_t>> shapesForKinetoEvent(input_shapes.size());
std::vector<std::string> dtypes(input_shapes.size());
std::vector<c10::IValue> concrete_inputs_list;
for (const auto& i : c10::irange(input_shapes.size())) {
std::visit(
c10::overloaded(
[&](const TensorMetadata& t) {
shapes[i] = t.sizes_;
shapesForKinetoEvent[i] = t.sizes_;
dtypes[i] = std::string(scalarTypeToTypeMeta(t.dtype_).name());
strides[i] = t.strides_;
},
[&](const std::vector<TensorMetadata>& l) {
std::vector<std::vector<int64_t>> shape;
shape.reserve(l.size());
std::vector<std::vector<int64_t>> stride;
stride.reserve(l.size());
for (const auto& t : l) {
shape.emplace_back(t.sizes_);
stride.emplace_back(t.strides_);
}
shapes[i] = shape;
strides[i] = stride;
dtypes[i] = "TensorList";
},
[&](const c10::IValue&) { dtypes[i] = "Scalar"; },
[&](const auto&) {}),
input_shapes[i]);
}
// If we recorded concrete inputs, then parse them
if (input_shapes.size() == concreteInputs.size() && !concreteInputs.empty()) {
concrete_inputs_list.resize(input_shapes.size());
for (const auto& i : c10::irange(input_shapes.size())) {
std::visit(
c10::overloaded(
[&](const c10::IValue& val) { concrete_inputs_list[i] = val; },
[&](const auto&) {}),
input_shapes[i]);
std::visit(
c10::overloaded(
[&](const c10::IValue& val) {
concrete_inputs_list[i] = val;
dtypes[i] = "ScalarList";
},
[&](const auto&) {}),
concreteInputs[i]);
}
}
return OpArgData{
true,
shapes,
dtypes,
concrete_inputs_list,
shapesForKinetoEvent,
strides};
}
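// Base helper for attaching string key/value metadata to the Kineto activity
// that backs a Result, if such an activity exists and is safe to mutate.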
struct MetadataBase {
/* implicit */ MetadataBase(const std::shared_ptr<Result>& result)
: kinetoActivity_{result->kineto_activity_} {
if (std::holds_alternative<ExtraFields<EventType::Kineto>>(
result->extra_fields_)) {
// In order to add metadata we have to downcast from
// `libkineto::ITraceActivity` to `libkineto::GenericTraceActivity`. We
// know that all activities provided by PyTorch are of the correct type,
// however Kineto profilers can (and do) add events that inherit directly
// from ITraceActivity. As a result, any Result which was constructed from
// an event that Kineto provided is unsafe to cast.
if (!(SOFT_ASSERT(!hasKinetoActivity()))) {
result->kineto_activity_ = nullptr;
}
kinetoActivity_ = result->kineto_activity_;
}
}
void addMetadata(const std::string& key, const std::string& value) {
if (kinetoActivity_ && !value.empty() && value != "\"\"") {
torch::profiler::impl::kineto::addMetadata(
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
const_cast<torch::profiler::impl::kineto::activity_t*>(
kinetoActivity_),
key,
value);
}
}
bool hasKinetoActivity() const {
return kinetoActivity_ != nullptr;
}
private:
const torch::profiler::impl::kineto::activity_t* kinetoActivity_{nullptr};
};
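// Visitor that adds the fields consumed by TensorBoard (module hierarchy,
// call stack, Python ids) to the event's Kineto activity.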
struct AddTensorboardFields : public MetadataBase {
AddTensorboardFields(
const std::shared_ptr<Result>& result,
KinetoEvent& kineto_event)
: MetadataBase(result) {
result->visit(*this);
const auto module_hierarchy = kineto_event.moduleHierarchy();
addMetadata("Module Hierarchy", stacksToStr(module_hierarchy.vec(), "."));
addMetadata("Call stack", stacksToStr(kineto_event.stack().vec(), ";"));
result->visit_if_base<PyExtraFieldsBase>([&, this](const auto& i) -> void {
this->addMetadata("Python id", std::to_string(i.id_));
std::optional<std::string> parent_id;
std::shared_ptr<Result> parent = result->parent_.lock();
while (parent && !parent_id.has_value()) {
parent->visit_if_base<PyExtraFieldsBase>(
[&](const auto& j) { parent_id = std::to_string(j.id_); });
parent = parent->parent_.lock();
}
this->addMetadata("Python parent id", parent_id.value_or("null"));
});
}
void operator()(const ExtraFields<EventType::PyCall>& py_call) {
if (py_call.module_.has_value()) {
addMetadata("Python module id", std::to_string(py_call.module_->id_));
}
}
template <typename T>
void operator()(const T&) {}
};
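// Visitor that adds general-purpose metadata (input dims/strides/types,
// concrete inputs, kwargs, perf counters, sequence numbers, allocation info)
// to the event's Kineto activity.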
struct AddGenericMetadata : public MetadataBase {
AddGenericMetadata(
std::shared_ptr<Result>& result,
const torch::profiler::impl::ProfilerConfig* config)
: MetadataBase(result), config_(config) {
result->visit(*this);
if (config->experimental_config.verbose) {
result->visit_if_base<PyExtraFieldsBase>(
[&, this](const auto& i) -> void {
this->addMetadata("Python thread", std::to_string(i.python_tid_));
});
}
}
void operator()(ExtraFields<EventType::TorchOp>& op_event) {
const auto arg_data =
parseArgData(op_event.inputs_, op_event.concrete_inputs_);
if (arg_data.hasData) {
if (get_record_concrete_inputs_enabled()) {
addMetadata("Input Dims", variantShapesToStr(arg_data.shapes));
addMetadata("Input Strides", variantShapesToStr(arg_data.strides));
} else {
addMetadata("Input Dims", shapesToStr(arg_data.shapesForKinetoEvent));
}
addMetadata("Input type", strListToStr(arg_data.dtypes));
if (!arg_data.concreteInputs.empty()) {
addMetadata(
"Concrete Inputs", ivalueListToStr(arg_data.concreteInputs));
}
}
// Add metadata for kwinputs if they exist
for (const auto& [key, val] : op_event.kwinputs_) {
if (key == "stream" && !val.isInt()) {
LOG(WARNING) << "Inputted stream is not an int for op: "
<< op_event.name_ << " skipping";
continue;
}
// Until needed, let's limit the kwargs to only ints, doubles, strings and
// bools.
if (!val.isInt() && !val.isDouble() && !val.isString() && !val.isBool()) {
LOG(WARNING) << "Inputted kwarg: " << key
<< " is not an int, double, string, or bool for op: "
<< op_event.name_ << " skipping";
continue;
}
bool isString = val.isString();
addMetadata(key, ivalueToStr(val, isString));
}
// Add extra metadata if any
for (const auto& [key, val] : op_event.extra_meta_) {
addMetadata(key, val);
}
if (config_ && !config_->experimental_config.performance_events.empty()) {
auto& event_names = config_->experimental_config.performance_events;
for (const auto i : c10::irange(op_event.perf_event_counters_->size())) {
addMetadata(
event_names[i],
std::to_string((*op_event.perf_event_counters_)[i]));
}
}
// add information about an associated forward op, if a sequence number
// is available (e.g. during training)
if (op_event.sequence_number_ >= 0) {
addMetadata("Fwd thread id", std::to_string(op_event.forward_tid_));
addMetadata("Sequence number", std::to_string(op_event.sequence_number_));
}
addMetadata(
"Record function id", std::to_string(op_event.record_function_id_));
}
void operator()(ExtraFields<EventType::Backend>& backend_event) {
if (!backend_event.backend_.empty()) {
addMetadata("Backend", "\"" + backend_event.backend_ + "\"");
}
}
void operator()(const ExtraFields<EventType::Allocation>& alloc) {
addMetadata("Device Type", std::to_string((int8_t)alloc.device_type_));
addMetadata("Device Id", std::to_string(alloc.device_index_));
addMetadata("Addr", std::to_string(reinterpret_cast<intptr_t>(alloc.ptr_)));
addMetadata("Bytes", std::to_string(alloc.alloc_size_));
addMetadata("Total Allocated", std::to_string(alloc.total_allocated_));
addMetadata("Total Reserved", std::to_string(alloc.total_reserved_));
}
void operator()(const ExtraFields<EventType::OutOfMemory>& alloc) {
addMetadata("Device Type", std::to_string((int8_t)alloc.device_type_));
addMetadata("Device Id", std::to_string(alloc.device_index_));
addMetadata("Bytes", std::to_string(alloc.alloc_size_));
addMetadata("Total Allocated", std::to_string(alloc.total_allocated_));
addMetadata("Total Reserved", std::to_string(alloc.total_reserved_));
}
template <typename T>
void operator()(const T&) {}
private:
/* To get names of the performance events */
const torch::profiler::impl::ProfilerConfig* config_;
};
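// Per-profiling-session state: owns the RecordQueue that buffers events, the
// trace start time, and the KinetoEvents/event tree materialized when the
// trace is finalized.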
struct KinetoThreadLocalState : public ProfilerStateBase {
explicit KinetoThreadLocalState(
const ProfilerConfig& config,
std::set<torch::profiler::impl::ActivityType> activities)
: ProfilerStateBase(config),
startTime(getTimeNs()),
recordQueue(config, std::move(activities)) {}
~KinetoThreadLocalState() override = default;
static KinetoThreadLocalState* get(bool global) {
auto* state = ProfilerStateBase::get(/*global=*/global);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
state == nullptr ||
state->profilerType() == ActiveProfilerType::KINETO);
return static_cast<KinetoThreadLocalState*>(state);
}
ActiveProfilerType profilerType() override {
return ActiveProfilerType::KINETO;
}
void reportVulkanEventToProfiler(torch::profiler::impl::vulkan_id_t id) {
if (!config_.disabled()) {
recordQueue.getSubqueue()->emplace_vulkan_event(
c10::getApproximateTime(), id);
}
}
void reportMemoryUsage(
void* ptr,
int64_t alloc_size,
size_t total_allocated,
size_t total_reserved,
c10::Device device) override {
if (config_.profile_memory && !config_.disabled()) {
recordQueue.getSubqueue()->emplace_allocation_event(
c10::getApproximateTime(),
ptr,
alloc_size,
total_allocated,
total_reserved,
device.type(),
device.index());
}
}
void reportOutOfMemory(
int64_t alloc_size,
size_t total_allocated,
size_t total_reserved,
c10::Device device) override {
if (config_.profile_memory && !config_.disabled()) {
recordQueue.getSubqueue()->emplace_ooms_event(
c10::getApproximateTime(),
alloc_size,
total_allocated,
total_reserved,
device.type(),
device.index());
}
}
void setEventPostProcessingCallback(post_process_t&& cb) {
eventPostProcessCb = std::move(cb);
}
void pausePython() {
recordQueue.stop();
}
void resumePython() {
recordQueue.restart();
}
std::unique_ptr<torch::profiler::impl::kineto::ActivityTraceWrapper>
finalizeTrace() {
auto end_time = getTimeNs();
recordQueue.stop();
std::lock_guard<std::mutex> guard(state_mutex_);
auto converter = clockConverter.makeConverter();
#ifdef USE_KINETO
libkineto::get_time_converter() = converter;
#endif
auto records_and_trace =
recordQueue.getRecords(std::move(converter), startTime, end_time);
materializeOpEvents(records_and_trace.first);
// `kinetoEvents` does not include Python events. Instead it exposes them
// via the `stacks` property.
kinetoEvents.erase(
std::remove_if(
kinetoEvents.begin(),
kinetoEvents.end(),
[](const auto& i) { return i.isPythonFunction(); }),
kinetoEvents.end());
return std::move(records_and_trace.second);
}
template <typename T>
void invokeCallback(T& t) {
if (eventPostProcessCb) {
eventPostProcessCb(t.debug_handle_, t.jit_stack_, t.jit_modules_);
}
}
void materializeOpEvents(std::vector<std::shared_ptr<Result>>& events) {
for (auto& e : events) {
if (e->parent_.expired() && e->deviceType() == c10::DeviceType::CPU) {
eventTree.push_back(e);
}
if (e->finished_) {
e->visit(c10::overloaded(
[this](ExtraFields<EventType::TorchOp>& i) { invokeCallback(i); },
[this](ExtraFields<EventType::Backend>& i) { invokeCallback(i); },
[](auto&) {}));
kinetoEvents.emplace_back(e, config_.experimental_config.verbose);
AddTensorboardFields add_tb(e, kinetoEvents.back());
AddGenericMetadata add_generic(e, &config_);
// It is not safe to use the activity after post processing.
e->kineto_activity_ = nullptr;
}
}
}
uint64_t startTime;
c10::ApproximateClockToUnixTimeConverter clockConverter;
torch::profiler::impl::RecordQueue recordQueue;
std::vector<KinetoEvent> kinetoEvents;
std::vector<experimental_event_t> eventTree;
// Optional, if event post-processing is enabled.
post_process_t eventPostProcessCb;
};
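// RecordFunction callbacks registered by pushProfilingCallbacks().
// onFunctionEnter opens an op event in the thread-local subqueue;
// onFunctionExit records the end time and any per-op extras (perf counters,
// NCCL metadata, fallback device events, correlation ids).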
template <bool use_global_state_ptr = false>
std::unique_ptr<at::ObserverContext> onFunctionEnter(
const at::RecordFunction& fn) {
auto state_ptr = KinetoThreadLocalState::get(use_global_state_ptr);
if (!state_ptr) {
return nullptr;
}
return state_ptr->recordQueue.getSubqueue()->begin_op(fn);
}
// @lint-ignore CLANGTIDY clang-diagnostic-unused-parameter
template <bool use_global_state_ptr = false>
void onFunctionExit(
const at::RecordFunction& fn,
at::ObserverContext* ctx_ptr) {
auto state_ptr = KinetoThreadLocalState::get(use_global_state_ptr);
if (!state_ptr) {
return;
}
const auto& config = state_ptr->config();
auto* kineto_ctx_ptr =
static_cast<torch::profiler::impl::KinetoObserverContext*>(ctx_ptr);
TORCH_INTERNAL_ASSERT(kineto_ctx_ptr != nullptr);
kineto_ctx_ptr->event_->end_time_ = c10::getApproximateTime();
if (!config.experimental_config.performance_events.empty()) {
state_ptr->recordQueue.getSubqueue()->disable_perf_profiler(
*kineto_ctx_ptr->event_->counters_);
}
kineto_ctx_ptr->event_->basic_fields_.end_tid_ =
at::RecordFunction::currentThreadId();
if (fn.isNcclMeta()) {
auto& extra_meta = *(kineto_ctx_ptr->event_->extra_nccl_meta_);
// Record only the outputs in this exit callback of the record function
torch::profiler::impl::SaveNcclMetaConfig ncclMetaConfig{
true, false, false, true};
auto additional_nccl_meta =
torch::profiler::impl::saveNcclMeta(fn, ncclMetaConfig);
extra_meta.insert(additional_nccl_meta.begin(), additional_nccl_meta.end());
}
if (config.state == ProfilerState::KINETO_GPU_FALLBACK) {
try {
auto fallback = kineto_ctx_ptr->fallback_;
TORCH_INTERNAL_ASSERT(fallback != nullptr);
torch::profiler::impl::cudaStubs()->record(
nullptr, &fallback->device_event_end_, nullptr);
} catch (const std::exception& e) {
LOG(WARNING) << "Failed to record CUDA event. " << e.what();
}
} else if (config.state == ProfilerState::KINETO_PRIVATEUSE1_FALLBACK) {
auto fallback = kineto_ctx_ptr->fallback_;
TORCH_INTERNAL_ASSERT(fallback != nullptr);
torch::profiler::impl::privateuse1Stubs()->record(
nullptr, &fallback->device_event_end_, nullptr);
}
if (!config.experimental_config.disable_external_correlation) {
if (fn.scope() == at::RecordScope::USER_SCOPE) {
torch::profiler::impl::kineto::popUserCorrelationId();
} else {
torch::profiler::impl::kineto::popCorrelationId();
}
}
}
template <bool use_global_callback = false>
void pushProfilingCallbacks(const std::unordered_set<at::RecordScope>& scopes) {
auto registration_state_ptr =
KinetoThreadLocalState::get(use_global_callback);
TORCH_INTERNAL_ASSERT(registration_state_ptr, "Expected profiler state set");
auto recordFunctionCallback =
at::RecordFunctionCallback(
onFunctionEnter<use_global_callback>,
onFunctionExit<use_global_callback>)
.needsInputs(registration_state_ptr->config().report_input_shapes)
.scopes(scopes);
if constexpr (use_global_callback) {
registration_state_ptr->setCallbackHandle(
at::addGlobalCallback(recordFunctionCallback));
} else {
registration_state_ptr->setCallbackHandle(
at::addThreadLocalCallback(recordFunctionCallback));
}
}
struct ProfilerStateInfo {
std::shared_ptr<KinetoThreadLocalState> state_ptr;
std::unordered_set<at::RecordScope> scopes;
};
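// Shared between the thread that enables profiling and any child threads that
// attach to the same session; reset to nullptr in disableProfiler() to signal
// child threads to stop profiling.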
std::shared_ptr<ProfilerStateInfo> profiler_state_info_ptr{nullptr};
} // namespace
void reportBackendEventToActiveKinetoProfiler(
const int64_t start_time_us,
const int64_t end_time_us,
const int64_t debug_handle,
const at::RecordScope scope,
const std::string& event_name,
const std::string& backend_name) {
TORCH_INTERNAL_ASSERT(
KinetoThreadLocalState::get(/*global=*/true) == nullptr,
"On-demand profiling does not support post processing callback");
auto state_ptr = KinetoThreadLocalState::get(/*global=*/false);
if (!state_ptr) {
return;
}
state_ptr->recordQueue.getSubqueue()->emplace_backend_event(
start_time_us,
end_time_us,
debug_handle,
scope,
event_name,
backend_name);
/* no support for input shapes now?
if (config.report_input_shapes) {
ctx_ptr->shapes = inputSizes(fn);
ctx_ptr->dtypes = inputTypes(fn);
}
*/
}
void prepareProfiler(
const torch::profiler::impl::ProfilerConfig& config,
const std::set<torch::profiler::impl::ActivityType>& activities) {
if (config.state == ProfilerState::NVTX ||
config.state == ProfilerState::ITT) {
return;
}
TORCH_CHECK(
config.state == ProfilerState::KINETO ||
config.state == ProfilerState::KINETO_GPU_FALLBACK ||
config.state == ProfilerState::KINETO_PRIVATEUSE1_FALLBACK,
"Supported only in Kineto profiler");
torch::profiler::impl::kineto::prepareTrace(
/*cpuOnly=*/!(
at::hasCUDA() || at::hasXPU() || at::hasMTIA() ||
c10::get_privateuse1_backend() != "privateuseone"),
activities,
config.experimental_config,
config.trace_id);
if (!config.experimental_config.performance_events.empty()) {
/* For now only CPU activity is supported */
TORCH_CHECK(
activities.count(torch::autograd::profiler::ActivityType::CPU),
"Cannot run cpu hardware profiler without CPU activities, please only use CPU activity type");
/*
* Send a warning and pass the non-standard event to the backend.
* The backend can abort if the event is not supported.
* TODO: Should we gracefully drop the invalid event if we have at least one
* valid one?
*/
auto is_standard_event = [](const std::string& event) -> bool {
for (auto e : torch::profiler::ProfilerPerfEvents) {
if (!std::strcmp(event.c_str(), e)) {
return true;
}
}
return false;
};
for (const auto& e : config.experimental_config.performance_events) {
if (!is_standard_event(e)) {
TORCH_WARN("Forwarding a non-standard CPU performance event : ", e);
}
}
}
}
static void toggleTorchOpCollectionDynamic(bool enable) {
auto state_ptr = ProfilerStateBase::get();
if (state_ptr) {
const auto& config = state_ptr->config();
if (enable) {
auto scopes = profiler_state_info_ptr->scopes;
config.global() ? pushProfilingCallbacks</*global=*/true>(scopes)
: pushProfilingCallbacks</*global=*/false>(scopes);
} else {
state_ptr->removeCallback();
}
}
}
// Mark this function as unused: the profiler implementation needs more
// refactoring before dynamic toggling of Python op collection can be supported.
#ifdef _MSC_VER
#define UNUSED
#else
#define UNUSED __attribute__((unused))
#endif
static UNUSED void togglePythonCollectionDynamic(bool enable) {
auto state_ptr = ProfilerStateBase::get();
if (state_ptr) {
auto global = state_ptr->config().global();
KinetoThreadLocalState* kineto_thread_local_state_ptr =
KinetoThreadLocalState::get(global);
if (enable) {
kineto_thread_local_state_ptr->resumePython();
} else {
kineto_thread_local_state_ptr->pausePython();
}
}
}
static void toggleCPUCollectionDynamic(bool enable) {
toggleTorchOpCollectionDynamic(enable);
// For now we only support dynamic toggling of Torch op collection. Supporting
// Python ops as well would require string parsing to strip out the toggling
// events and other unfinished events, plus changes to the stack logic.
// togglePythonCollectionDynamic(enable);
}
void toggleCollectionDynamic(
const bool enable,
const std::set<torch::profiler::impl::ActivityType>& activities) {
if (activities.count(torch::autograd::profiler::ActivityType::CPU) > 0 &&
activities.count(torch::autograd::profiler::ActivityType::CUDA) == 0) {
LOG(WARNING)
<< "Toggling CPU activity with CUDA activity on may result in traces with CUDA events on artibrary tracks";
} else if (
activities.count(torch::autograd::profiler::ActivityType::CUDA) > 0 &&
activities.count(torch::autograd::profiler::ActivityType::CPU) == 0) {
LOG(WARNING)
<< "Toggling CUDA activity with CPU activity on may result in traces with incorrect correlation between CPU and CUDA events";
}
for (auto act : activities) {
if (act == torch::autograd::profiler::ActivityType::CUDA) {
torch::profiler::impl::kineto::toggleCollectionDynamic(enable);
} else if (act == torch::autograd::profiler::ActivityType::CPU) {
toggleCPUCollectionDynamic(enable);
} else {
LOG(WARNING)
<< "Dynamic toggle is only supported for CPU/GPU activity, skipping toggling of "
<< actToString(act);
continue;
}
}
}
void enableProfilerWithEventPostProcess(
const torch::profiler::impl::ProfilerConfig& config,
const std::set<torch::profiler::impl::ActivityType>& activities,
post_process_t&& cb,
const std::unordered_set<at::RecordScope>& scopes) {
TORCH_CHECK(
config.state != ProfilerState::NVTX,
"NVTX does not support post processing callback.");
TORCH_CHECK(
config.state != ProfilerState::ITT,
"ITT does not support post processing callback.");
TORCH_INTERNAL_ASSERT(
KinetoThreadLocalState::get(/*global=*/true) == nullptr,
"On-demand profiling does not support post processing callback");
enableProfiler(config, activities, scopes);
auto state_ptr = KinetoThreadLocalState::get(config.global());
state_ptr->setEventPostProcessingCallback(std::move(cb));
}
void enableProfiler(
const torch::profiler::impl::ProfilerConfig& config,
const std::set<torch::profiler::impl::ActivityType>& activities,
const std::unordered_set<at::RecordScope>& scopes) {
const auto has_cpu = activities.count(ActivityType::CPU);
TORCH_CHECK(
KinetoThreadLocalState::get(/*global=*/config.global()) == nullptr,
"Profiler is already enabled",
(config.global() ? "." : " on this thread."));
if (config.state == ProfilerState::NVTX) {
torch::profiler::impl::pushNVTXCallbacks(config, scopes);
return;
} else if (config.state == ProfilerState::ITT) {
torch::profiler::impl::pushITTCallbacks(config, scopes);
return;
} else if (config.state == ProfilerState::PRIVATEUSE1) {
torch::profiler::impl::pushPRIVATEUSE1CallbacksStub(config, scopes);
return;
}
TORCH_CHECK(
config.state == ProfilerState::KINETO ||
config.state == ProfilerState::KINETO_GPU_FALLBACK ||
config.state == ProfilerState::KINETO_PRIVATEUSE1_FALLBACK ||
config.global());
TORCH_CHECK(!activities.empty(), "No activities specified.");
TORCH_INTERNAL_ASSERT(
has_cpu || !config.global(),
"Ondemand profiling must enable CPU tracing");
auto state_ptr = std::make_shared<KinetoThreadLocalState>(config, activities);
KinetoThreadLocalState::push(state_ptr);
if (has_cpu) {
config.pushGlobalCallbacks()
? pushProfilingCallbacks</*global=*/true>(scopes)
: pushProfilingCallbacks</*global=*/false>(scopes);
}
if (!config.global()) {
torch::profiler::impl::kineto::startTrace();
}
if (has_cpu) {
auto state_info_ptr = std::make_shared<ProfilerStateInfo>();
state_info_ptr->state_ptr = state_ptr;
state_info_ptr->scopes = scopes;
profiler_state_info_ptr = state_info_ptr;
}
}
bool isProfilerEnabledInMainThread() {
return profiler_state_info_ptr != nullptr;
}
void enableProfilerInChildThread() {
auto state_info_ptr = profiler_state_info_ptr;
TORCH_CHECK(state_info_ptr, "Profiler is not enabled in main thread.");
TORCH_CHECK(
KinetoThreadLocalState::get(/*global=*/false) == nullptr,
"Profiler is already enabled in this thread.");
KinetoThreadLocalState::push(state_info_ptr->state_ptr);
pushProfilingCallbacks</*global=*/false>(state_info_ptr->scopes);
}
void disableProfilerInChildThread() {
auto state_ptr = ProfilerStateBase::pop();
TORCH_CHECK(
state_ptr,
"Can't disable Kineto profiler when it's not running in this thread");
state_ptr->removeCallback();
}
std::unique_ptr<ProfilerResult> disableProfiler() {
// releasing to inform child threads to stop profiling
profiler_state_info_ptr = nullptr;
auto state_ptr = ProfilerStateBase::pop();
TORCH_CHECK(
state_ptr, "Can't disable Kineto profiler when it's not running");
const auto& config = state_ptr->config();
TORCH_CHECK(
config.state == ProfilerState::KINETO ||
config.state == ProfilerState::KINETO_GPU_FALLBACK ||
config.state == ProfilerState::KINETO_PRIVATEUSE1_FALLBACK ||
config.state == ProfilerState::KINETO_ONDEMAND ||
config.state == ProfilerState::NVTX ||
config.state == ProfilerState::ITT ||
config.state == ProfilerState::PRIVATEUSE1,
"Can't disable Kineto profiler when it's not running");
state_ptr->removeCallback();
// Traces are converged via libkineto automatically for ondemand flow
if (state_ptr->config().global()) {
(void)std::static_pointer_cast<KinetoThreadLocalState>(state_ptr)
->finalizeTrace();
return std::make_unique<ProfilerResult>();
}
// Shared among NVTX, PRIVATEUSE1, KINETO, KINETO_GPU_FALLBACK,
// KINETO_PRIVATEUSE1_FALLBACK
std::unique_ptr<ProfilerResult> result;
if (state_ptr->config().state == ProfilerState::NVTX ||
state_ptr->config().state == ProfilerState::PRIVATEUSE1) {
result = std::make_unique<ProfilerResult>();
}
if (config.state == ProfilerState::KINETO ||
config.state == ProfilerState::KINETO_GPU_FALLBACK ||
config.state == ProfilerState::KINETO_PRIVATEUSE1_FALLBACK) {
auto kineto_state_ptr =
std::static_pointer_cast<KinetoThreadLocalState>(state_ptr);
auto trace = kineto_state_ptr->finalizeTrace();
result = std::make_unique<ProfilerResult>(
kineto_state_ptr->startTime,
std::move(kineto_state_ptr->kinetoEvents),
std::move(trace),
std::move(kineto_state_ptr->eventTree));
}
return result;
}
KinetoEvent::KinetoEvent(
const std::shared_ptr<const torch::profiler::impl::Result>& result,
const bool verbose)
: result_{result} {
TORCH_INTERNAL_ASSERT(result != nullptr);
if (verbose) {
// Populate Python stack
auto parent = result_->parent_.lock();
while (parent != nullptr) {
parent->visit_if_base<PyExtraFieldsBase>(
[&](const auto&) { python_stack_.push_back(parent->name()); });
parent = parent->parent_.lock();
}
}
result->visit_if_base<ExtraFields<EventType::TorchOp>>([&](const auto& op) {
auto arg_data = parseArgData(op.inputs_, op.concrete_inputs_);
shapes_ = std::move(arg_data.shapesForKinetoEvent);
dtypes_ = std::move(arg_data.dtypes);
concrete_inputs_ = std::move(arg_data.concreteInputs);
kwinputs_ = std::move(op.kwinputs_);
});
}
bool KinetoEvent::isPythonFunction() const {
bool out{false};
result_->visit_if_base<PyExtraFieldsBase>([&](const auto&) { out = true; });
return out;
}
bool KinetoEvent::hasShapes() const {
return !shapes_.empty();
}
const c10::ArrayRef<std::vector<int64_t>> KinetoEvent::shapes() const {
return shapes_;
}
bool KinetoEvent::hasTypes() const {
return !dtypes_.empty();
}
const c10::ArrayRef<std::string> KinetoEvent::dtypes() const {
return dtypes_;
}
bool KinetoEvent::hasConcreteInputs() const {
return !concrete_inputs_.empty();
}
const c10::ArrayRef<c10::IValue> KinetoEvent::concreteInputs() const {
return concrete_inputs_;
}
bool KinetoEvent::hasKwinputs() const {
return !kwinputs_.empty();
}
const std::unordered_map<std::string, c10::IValue> KinetoEvent::kwinputs()
const {
return kwinputs_;
}
const c10::ArrayRef<std::string> KinetoEvent::stack() const {
auto get = [&](const auto& i) -> auto& {
return !i.jit_stack_.empty() ? i.jit_stack_ : python_stack_;
};
auto const& extra_fields = result_->extra_fields_;
if (auto p = std::get_if<ExtraFields<EventType::TorchOp>>(&extra_fields)) {
return get(*p);
}
if (auto p = std::get_if<ExtraFields<EventType::Backend>>(&extra_fields)) {
return get(*p);
}
return python_stack_;
}
const c10::ArrayRef<std::string> KinetoEvent::moduleHierarchy() const {
auto const& extra_fields = result_->extra_fields_;
if (auto p = std::get_if<ExtraFields<EventType::TorchOp>>(&extra_fields)) {
return p->jit_modules_;
}
if (auto p = std::get_if<ExtraFields<EventType::Backend>>(&extra_fields)) {
return p->jit_modules_;
}
return {};
}
uint64_t KinetoEvent::endNs() const {
return result_->endTimeNS();
}
uint64_t KinetoEvent::durationNs() const {
return (result_->endTimeNS() - result_->start_time_ns_);
}
int64_t KinetoEvent::debugHandle() const {
return result_->visit(c10::overloaded(
[](const ExtraFields<EventType::TorchOp>& i) { return i.debug_handle_; },
[](const ExtraFields<EventType::Backend>& i) { return i.debug_handle_; },
[](const auto&) -> int64_t { return -1; }));
}
int KinetoEvent::deviceIndex() const {
return result_->visit(c10::overloaded(
[](const ExtraFields<EventType::Allocation>& i) {
return static_cast<int>(i.device_index_);
},
[](const ExtraFields<EventType::OutOfMemory>& i) {
return static_cast<int>(i.device_index_);
},
[&](const auto&) {
return static_cast<int>(result_->kineto_info_.device);
}));
}
bool KinetoEvent::hasStack() const {
return !stack().empty();
}
int64_t KinetoEvent::cudaElapsedUs() const {
auto cuda_event_start = fallbackStart();
auto cuda_event_end = fallbackEnd();
if (!cuda_event_start || !cuda_event_end) {
return -1;
}
try {
return (int64_t)torch::profiler::impl::cudaStubs()->elapsed(
&cuda_event_start, &cuda_event_end);
} catch (std::exception& e) {
LOG(WARNING) << "Failed to measure time between two CUDA events. "
<< e.what();
}
return -1;
}
int64_t KinetoEvent::privateuse1ElapsedUs() const {
auto privateuse1_event_start = fallbackStart();
auto privateuse1_event_end = fallbackEnd();
if (!privateuse1_event_start || !privateuse1_event_end) {
return -1;
}
return (int64_t)torch::profiler::impl::privateuse1Stubs()->elapsed(
&privateuse1_event_start, &privateuse1_event_end);
}
void KinetoEvent::getPerfEventCounters(std::vector<uint64_t>& in) const {
return result_->visit(c10::overloaded(
[&in](const ExtraFields<EventType::TorchOp>& e) -> void {
const size_t n = e.perf_event_counters_->size();
// should be rare
if (in.size() < n) {
in.resize(n, 0);
}
for (size_t i = 0; i < n; ++i) {
in[i] = (*e.perf_event_counters_)[i];
}
},
[](const auto&) -> void { return; }));
}
#define FORWARD_FROM_RESULT(method_name, result_expr) \
decltype(std::declval<KinetoEvent>().method_name()) \
KinetoEvent::method_name() const { \
return static_cast<decltype(std::declval<KinetoEvent>().method_name())>( \
result_->result_expr); \
}
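// For example, FORWARD_FROM_RESULT(overload_name, overload_name()) below
// defines KinetoEvent::overload_name() as a simple cast-and-forward of
// result_->overload_name().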
FORWARD_FROM_RESULT(startThreadId, start_tid_)
FORWARD_FROM_RESULT(endThreadId, endTID())
FORWARD_FROM_RESULT(activityType, kinetoType())
FORWARD_FROM_RESULT(name, name())
FORWARD_FROM_RESULT(overload_name, overload_name())
FORWARD_FROM_RESULT(deviceType, deviceType())
FORWARD_FROM_RESULT(startNs, start_time_ns_)
FORWARD_FROM_RESULT(correlationId, correlationID())
FORWARD_FROM_RESULT(deviceResourceId, kineto_info_.resource)
#undef FORWARD_FROM_RESULT
// Most of the fields in `KinetoEvent` only make sense for a single event type.
// (Generally TorchOp.) For all other types they simply return the default
// value. This macro provides a succinct way of expressing this behavior.
#define TYPED_ATTR_WITH_DEFAULT( \
event_type, method_name, expression, default_value) \
decltype(std::declval<KinetoEvent>().method_name()) \
KinetoEvent::method_name() const { \
using out_t = decltype(std::declval<KinetoEvent>().method_name()); \
return result_->visit(c10::overloaded( \
[](const ExtraFields<EventType::event_type>& e) -> out_t { \
return expression; \
}, \
[](const auto&) -> out_t { return default_value; })); \
}
#define TYPED_ATTR(event_type, method_name, expression) \
TYPED_ATTR_WITH_DEFAULT(event_type, method_name, expression, {})
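// For example, TYPED_ATTR_WITH_DEFAULT(TorchOp, sequenceNr, e.sequence_number_, -1)
// defines KinetoEvent::sequenceNr() to return the op's sequence number when the
// underlying event is a TorchOp and -1 for every other event type.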
TYPED_ATTR_WITH_DEFAULT(TorchOp, sequenceNr, e.sequence_number_, -1)
TYPED_ATTR(TorchOp, fwdThreadId, e.sequence_number_ >= 0 ? e.forward_tid_ : 0)
TYPED_ATTR(TorchOp, scope, static_cast<uint8_t>(e.scope_))
TYPED_ATTR(TorchOp, hasModuleHierarchy, !e.jit_modules_.empty())
TYPED_ATTR(TorchOp, isAsync, e.is_async_)
TYPED_ATTR(TorchOp, extraMeta, e.extra_meta_)
TYPED_ATTR(TorchOp, fallbackStart, e.device_fallback_.device_event_start_)
TYPED_ATTR(TorchOp, fallbackEnd, e.device_fallback_.device_event_end_)
TYPED_ATTR(
TorchOp,
flops,
!e.extra_args_.empty()
? torch::profiler::impl::computeFlops(e.name_, e.extra_args_)
: 0)
TYPED_ATTR(Backend, backend, e.backend_)
TYPED_ATTR(Allocation, nBytes, e.alloc_size_)
TYPED_ATTR(Kineto, linkedCorrelationId, [&]() {
const auto linked = e.linked_activity_.lock();
return linked ? linked->correlationID() : 0;
}())
#undef TYPED_ATTR
#undef TYPED_ATTR_WITH_DEFAULT
ProfilerResult::ProfilerResult(
uint64_t start_time,
std::vector<KinetoEvent> events,
std::unique_ptr<torch::profiler::impl::kineto::ActivityTraceWrapper>&&
trace,
std::vector<experimental_event_t>&& event_tree)
: trace_start_ns_(start_time),
events_(std::move(events)),
trace_(std::move(trace)),
event_tree_(std::move(event_tree)) {}
ProfilerResult::ProfilerResult() = default;
ProfilerResult::~ProfilerResult() = default;
void ProfilerResult::save(const std::string& path) {
trace_->save(path);
}
} // namespace autograd::profiler
namespace profiler::impl {
void _reportVulkanEventToProfiler(vulkan_id_t id) {
auto state_ptr = ::torch::autograd::profiler::KinetoThreadLocalState::get(
/*global=*/false);
if (state_ptr) {
state_ptr->reportVulkanEventToProfiler(id);
}
}
} // namespace profiler::impl
} // namespace torch