#include <torch/csrc/autograd/profiler_kineto.h>

#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/Context.h>

#include <c10/util/ApproximateClock.h>
#include <c10/util/Exception.h>
#include <c10/util/irange.h>
#include <c10/util/overloaded.h>

#include <torch/csrc/profiler/api.h>
#include <torch/csrc/profiler/collection.h>
#include <torch/csrc/profiler/events.h>
#include <torch/csrc/profiler/kineto_shim.h>
#include <torch/csrc/profiler/orchestration/observer.h>
#include <torch/csrc/profiler/orchestration/python_tracer.h>
#include <torch/csrc/profiler/perf.h>
#include <torch/csrc/profiler/standalone/itt_observer.h>
#include <torch/csrc/profiler/standalone/nvtx_observer.h>
#include <torch/csrc/profiler/standalone/privateuse1_observer.h>
#include <torch/csrc/profiler/util.h>

#include <stdexcept>
#include <utility>

#ifdef USE_KINETO
#include <libkineto.h>
#include <time_since_epoch.h>

#ifndef _MSC_VER
// TODO: To be removed, once this properly works from libkineto
// Literal copy-n-paste from third_party/kineto/libkineto/src/WeakSymbols.cpp
extern "C" {
// This function is needed to avoid a superfluous dependency on the GNU OpenMP
// library when cuPTI is linked statically. For more details see
// https://github.com/pytorch/pytorch/issues/51026
__attribute__((weak)) int acc_get_device_type();
__attribute__((weak)) int acc_get_device_type() {
  TORCH_CHECK(
      false,
      "Dummy implementation of acc_get_device_type is not supposed to be called!");
}
} // extern "C"
#endif // _MSC_VER
#endif // USE_KINETO

namespace torch {
namespace autograd::profiler {

namespace {
inline int64_t getTimeNs() {
#ifdef USE_KINETO
  return libkineto::timeSinceEpoch(std::chrono::system_clock::now());
#else
  return c10::getTime();
#endif // USE_KINETO
}

using torch::profiler::impl::ActiveProfilerType;
using torch::profiler::impl::EventType;
using torch::profiler::impl::ExtraFields;
using torch::profiler::impl::get_record_concrete_inputs_enabled;
using torch::profiler::impl::ivalueListToStr;
using torch::profiler::impl::ivalueToStr;
using torch::profiler::impl::op_input_t;
using torch::profiler::impl::ProfilerStateBase;
using torch::profiler::impl::PyExtraFieldsBase;
using torch::profiler::impl::Result;
using torch::profiler::impl::shape;
using torch::profiler::impl::shapesToStr;
using torch::profiler::impl::stacksToStr;
using torch::profiler::impl::strListToStr;
using torch::profiler::impl::TensorMetadata;
using torch::profiler::impl::variantShapesToStr;

struct OpArgData {
  bool hasData;
  std::vector<shape> shapes;
  std::vector<std::string> dtypes;
  std::vector<c10::IValue> concreteInputs;
  std::vector<std::vector<int64_t>> shapesForKinetoEvent;
  std::vector<shape> strides;
};

auto parseArgData(
    const std::vector<op_input_t>& input_shapes,
    const std::vector<op_input_t>& concreteInputs) {
  if (input_shapes.empty()) {
    return OpArgData{false, {}, {}, {}, {}, {}};
  }

  std::vector<shape> shapes(input_shapes.size());
  std::vector<shape> strides(input_shapes.size());
  std::vector<std::vector<int64_t>> shapesForKinetoEvent(input_shapes.size());
  std::vector<std::string> dtypes(input_shapes.size());
  std::vector<c10::IValue> concrete_inputs_list;

  for (const auto& i : c10::irange(input_shapes.size())) {
    std::visit(
        c10::overloaded(
            [&](const TensorMetadata& t) {
              shapes[i] = t.sizes_;
              shapesForKinetoEvent[i] = t.sizes_;
              dtypes[i] = std::string(scalarTypeToTypeMeta(t.dtype_).name());
              strides[i] = t.strides_;
            },
            [&](const std::vector<TensorMetadata>& l) {
              std::vector<std::vector<int64_t>> shape;
              shape.reserve(l.size());
              std::vector<std::vector<int64_t>> stride;
              stride.reserve(l.size());
              for (const auto& t : l) {
                shape.emplace_back(t.sizes_);
                stride.emplace_back(t.strides_);
              }
              shapes[i] = shape;
              strides[i] = stride;
              dtypes[i] = "TensorList";
            },
            [&](const c10::IValue&) { dtypes[i] = "Scalar"; },
            [&](const auto&) {}),
        input_shapes[i]);
  }

  // If we recorded concrete inputs, then parse them
  if (input_shapes.size() == concreteInputs.size() && !concreteInputs.empty()) {
    concrete_inputs_list.resize(input_shapes.size());

    for (const auto& i : c10::irange(input_shapes.size())) {
      std::visit(
          c10::overloaded(
              [&](const c10::IValue& val) { concrete_inputs_list[i] = val; },
              [&](const auto&) {}),
          input_shapes[i]);
      std::visit(
          c10::overloaded(
              [&](const c10::IValue& val) {
                concrete_inputs_list[i] = val;
                dtypes[i] = "ScalarList";
              },
              [&](const auto&) {}),
          concreteInputs[i]);
    }
  }

  return
OpArgData{ true, shapes, dtypes, concrete_inputs_list, shapesForKinetoEvent, strides}; } struct MetadataBase { /* implicit */ MetadataBase(const std::shared_ptr& result) : kinetoActivity_{result->kineto_activity_} { if (std::holds_alternative>( result->extra_fields_)) { // In order to add metadata we have to downcast from // `libkineto::ITraceActivity` to `libkineto::GenericTraceActivity`. We // know that all activities provided by PyTorch are of the correct type, // however Kineto profilers can (and do) add events that inherit directly // from ITraceActivity. As a result, any Result which was constructed from // an event that Kineto provided is unsafe to cast. if (!(SOFT_ASSERT(!hasKinetoActivity()))) { result->kineto_activity_ = nullptr; } kinetoActivity_ = result->kineto_activity_; } } void addMetadata(const std::string& key, const std::string& value) { if (kinetoActivity_ && !value.empty() && value != "\"\"") { torch::profiler::impl::kineto::addMetadata( // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) const_cast( kinetoActivity_), key, value); } } bool hasKinetoActivity() const { return kinetoActivity_ != nullptr; } private: const torch::profiler::impl::kineto::activity_t* kinetoActivity_{nullptr}; }; struct AddTensorboardFields : public MetadataBase { AddTensorboardFields( const std::shared_ptr& result, KinetoEvent& kineto_event) : MetadataBase(result) { result->visit(*this); const auto module_hierarchy = kineto_event.moduleHierarchy(); addMetadata("Module Hierarchy", stacksToStr(module_hierarchy.vec(), ".")); addMetadata("Call stack", stacksToStr(kineto_event.stack().vec(), ";")); result->visit_if_base([&, this](const auto& i) -> void { this->addMetadata("Python id", std::to_string(i.id_)); std::optional parent_id; std::shared_ptr parent = result->parent_.lock(); while (parent && !parent_id.has_value()) { parent->visit_if_base( [&](const auto& j) { parent_id = std::to_string(j.id_); }); parent = parent->parent_.lock(); } this->addMetadata("Python parent id", parent_id.value_or("null")); }); } void operator()(const ExtraFields& py_call) { if (py_call.module_.has_value()) { addMetadata("Python module id", std::to_string(py_call.module_->id_)); } } template void operator()(const T& /*unused*/) {} }; struct AddGenericMetadata : public MetadataBase { AddGenericMetadata( std::shared_ptr& result, const torch::profiler::impl::ProfilerConfig* config) : MetadataBase(result), config_(config) { result->visit(*this); if (config->experimental_config.verbose) { result->visit_if_base( [&, this](const auto& i) -> void { this->addMetadata("Python thread", std::to_string(i.python_tid_)); }); } } void operator()(ExtraFields& op_event) { const auto arg_data = parseArgData(op_event.inputs_, op_event.concrete_inputs_); if (arg_data.hasData) { if (get_record_concrete_inputs_enabled()) { addMetadata("Input Dims", variantShapesToStr(arg_data.shapes)); addMetadata("Input Strides", variantShapesToStr(arg_data.strides)); } else { addMetadata("Input Dims", shapesToStr(arg_data.shapesForKinetoEvent)); } addMetadata("Input type", strListToStr(arg_data.dtypes)); if (!arg_data.concreteInputs.empty()) { addMetadata( "Concrete Inputs", ivalueListToStr(arg_data.concreteInputs)); } } // Add metadata for kwinputs if exist for (const auto& [key, val] : op_event.kwinputs_) { if (key == "stream" && !val.isInt()) { LOG(WARNING) << "Inputted stream is not an int for op: " << op_event.name_ << " skipping"; continue; } // Until needed, lets limit the kwargs to only ints, doubles, strings, // bools, and list of strings 
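      // Summary of the filtering below: int, double, string, bool, and
      // list-of-string kwarg values are recorded as metadata; a "stream"
      // kwarg that is not an int, and values of any other type, are skipped
      // with a warning.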
bool isValidType = val.isInt() || val.isDouble() || val.isString() || val.isBool(); bool isStringList = false; if (!isValidType && val.isList()) { // Check if it's a list of strings auto list = val.toListRef(); isStringList = std::all_of(list.begin(), list.end(), [](const c10::IValue& item) { return item.isString(); }); } if (!isValidType && !isStringList) { LOG(WARNING) << "Inputted kwarg: " << key << " is not an int, double, string, bool, or list of strings for op: " << op_event.name_ << " skipping"; continue; } if (isStringList) { // For list of strings, use ivalueListToStr auto list = val.toListRef(); std::vector stringList(list.begin(), list.end()); addMetadata(key, ivalueListToStr(stringList)); } else { bool isString = val.isString(); addMetadata(key, ivalueToStr(val, isString)); } } // Add extra metadata if any for (const auto& [key, val] : op_event.extra_meta_) { addMetadata(key, val); } if (config_ && !config_->experimental_config.performance_events.empty()) { auto& event_names = config_->experimental_config.performance_events; for (const auto i : c10::irange(op_event.perf_event_counters_->size())) { addMetadata( event_names[i], std::to_string((*op_event.perf_event_counters_)[i])); } } // add information about an associated forward op, if a sequence number // is available (e.g. during training) if (op_event.sequence_number_ >= 0) { addMetadata("Fwd thread id", std::to_string(op_event.forward_tid_)); addMetadata("Sequence number", std::to_string(op_event.sequence_number_)); } addMetadata( "Record function id", std::to_string(op_event.record_function_id_)); } void operator()(ExtraFields& backend_event) { if (!backend_event.backend_.empty()) { addMetadata("Backend", "\"" + backend_event.backend_ + "\""); } } void operator()(const ExtraFields& alloc) { addMetadata("Device Type", std::to_string((int8_t)alloc.device_type_)); addMetadata("Device Id", std::to_string(alloc.device_index_)); addMetadata("Addr", std::to_string(reinterpret_cast(alloc.ptr_))); addMetadata("Bytes", std::to_string(alloc.alloc_size_)); addMetadata("Total Allocated", std::to_string(alloc.total_allocated_)); addMetadata("Total Reserved", std::to_string(alloc.total_reserved_)); } void operator()(const ExtraFields& alloc) { addMetadata("Device Type", std::to_string((int8_t)alloc.device_type_)); addMetadata("Device Id", std::to_string(alloc.device_index_)); addMetadata("Bytes", std::to_string(alloc.alloc_size_)); addMetadata("Total Allocated", std::to_string(alloc.total_allocated_)); addMetadata("Total Reserved", std::to_string(alloc.total_reserved_)); } template void operator()(const T& /*unused*/) {} private: /* To get names of the performance events */ const torch::profiler::impl::ProfilerConfig* config_; }; struct KinetoThreadLocalState : public ProfilerStateBase { explicit KinetoThreadLocalState( const ProfilerConfig& config, std::set activities) : ProfilerStateBase(config), startTime(getTimeNs()), recordQueue(config, std::move(activities)) {} ~KinetoThreadLocalState() override = default; static KinetoThreadLocalState* get(bool global) { auto* state = ProfilerStateBase::get(/*global=*/global); TORCH_INTERNAL_ASSERT_DEBUG_ONLY( state == nullptr || state->profilerType() == ActiveProfilerType::KINETO); return static_cast(state); } ActiveProfilerType profilerType() override { return ActiveProfilerType::KINETO; } void reportVulkanEventToProfiler(torch::profiler::impl::vulkan_id_t id) { if (!config_.disabled()) { recordQueue.getSubqueue()->emplace_vulkan_event( c10::getApproximateTime(), id); } } void reportMemoryUsage( 
void* ptr, int64_t alloc_size, size_t total_allocated, size_t total_reserved, c10::Device device) override { if (config_.profile_memory && !config_.disabled()) { recordQueue.getSubqueue()->emplace_allocation_event( c10::getApproximateTime(), ptr, alloc_size, total_allocated, total_reserved, device.type(), device.index()); } } void reportOutOfMemory( int64_t alloc_size, size_t total_allocated, size_t total_reserved, c10::Device device) override { if (config_.profile_memory && !config_.disabled()) { recordQueue.getSubqueue()->emplace_ooms_event( c10::getApproximateTime(), alloc_size, total_allocated, total_reserved, device.type(), device.index()); } } void setEventPostProcessingCallback(post_process_t&& cb) { eventPostProcessCb = std::move(cb); } void pausePython() { recordQueue.stop(); } void resumePython() { recordQueue.restart(); } std::unique_ptr finalizeTrace() { auto end_time = getTimeNs(); recordQueue.stop(); std::lock_guard guard(state_mutex_); auto converter = clockConverter.makeConverter(); #ifdef USE_KINETO libkineto::get_time_converter() = converter; #endif auto records_and_trace = recordQueue.getRecords(std::move(converter), startTime, end_time); materializeOpEvents(records_and_trace.first); // `kinetoEvents` does not include Python events. Instead it exposes them // via the `stacks` property. kinetoEvents.erase( std::remove_if( kinetoEvents.begin(), kinetoEvents.end(), [](const auto& i) { return i.isPythonFunction(); }), kinetoEvents.end()); return std::move(records_and_trace.second); } template void invokeCallback(T& t) { if (eventPostProcessCb) { eventPostProcessCb(t.debug_handle_, t.jit_stack_, t.jit_modules_); } } void materializeOpEvents(std::vector>& events) { for (auto& e : events) { if (e->parent_.expired() && e->deviceType() == c10::DeviceType::CPU) { eventTree.push_back(e); } if (e->finished_) { e->visit(c10::overloaded( [this](ExtraFields& i) { invokeCallback(i); }, [this](ExtraFields& i) { invokeCallback(i); }, [](auto&) {})); kinetoEvents.emplace_back(e, config_.experimental_config.verbose); AddTensorboardFields add_tb(e, kinetoEvents.back()); AddGenericMetadata add_generic(e, &config_); // It is not safe to use the activity after post processing. e->kineto_activity_ = nullptr; } } } uint64_t startTime; c10::ApproximateClockToUnixTimeConverter clockConverter; torch::profiler::impl::RecordQueue recordQueue; std::vector kinetoEvents; std::vector eventTree; // Optional, if event post-processing is enabled. 
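  // The callback is invoked from materializeOpEvents() via invokeCallback()
  // above, receiving each event's debug handle together with its JIT call
  // stack and JIT module hierarchy.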
post_process_t eventPostProcessCb; }; template std::unique_ptr onFunctionEnter( const at::RecordFunction& fn) { auto state_ptr = KinetoThreadLocalState::get(use_global_state_ptr); if (!state_ptr) { return nullptr; } return state_ptr->recordQueue.getSubqueue()->begin_op(fn); } // @lint-ignore CLANGTIDY clang-diagnostic-unused-parameter template void onFunctionExit( const at::RecordFunction& fn, at::ObserverContext* ctx_ptr) { auto state_ptr = KinetoThreadLocalState::get(use_global_state_ptr); if (!state_ptr) { return; } const auto& config = state_ptr->config(); auto* kineto_ctx_ptr = static_cast(ctx_ptr); TORCH_INTERNAL_ASSERT(kineto_ctx_ptr != nullptr); kineto_ctx_ptr->event_->end_time_ = c10::getApproximateTime(); if (!config.experimental_config.performance_events.empty()) { state_ptr->recordQueue.getSubqueue()->disable_perf_profiler( *kineto_ctx_ptr->event_->counters_); } kineto_ctx_ptr->event_->basic_fields_.end_tid_ = at::RecordFunction::currentThreadId(); if (fn.isNcclMeta()) { auto& extra_meta = *(kineto_ctx_ptr->event_->extra_nccl_meta_); // Record only the outputs in this exit callback of the record function torch::profiler::impl::SaveNcclMetaConfig ncclMetaConfig{ true, false, false, true}; auto additonal_nccl_meta = torch::profiler::impl::saveNcclMeta(fn, ncclMetaConfig); extra_meta.insert(additonal_nccl_meta.begin(), additonal_nccl_meta.end()); } if (config.state == ProfilerState::KINETO_GPU_FALLBACK) { try { auto fallback = kineto_ctx_ptr->fallback_; TORCH_INTERNAL_ASSERT(fallback != nullptr); torch::profiler::impl::cudaStubs()->record( nullptr, &fallback->device_event_end_, nullptr); } catch (const std::exception& e) { LOG(WARNING) << "Failed to record CUDA event. " << e.what(); } } else if (config.state == ProfilerState::KINETO_PRIVATEUSE1_FALLBACK) { auto fallback = kineto_ctx_ptr->fallback_; TORCH_INTERNAL_ASSERT(fallback != nullptr); torch::profiler::impl::privateuse1Stubs()->record( nullptr, &fallback->device_event_end_, nullptr); } if (!config.experimental_config.disable_external_correlation) { if (fn.scope() == at::RecordScope::USER_SCOPE) { torch::profiler::impl::kineto::popUserCorrelationId(); } else { torch::profiler::impl::kineto::popCorrelationId(); } } } template void pushProfilingCallbacks(const std::unordered_set& scopes) { auto registration_state_ptr = KinetoThreadLocalState::get(use_global_callback); TORCH_INTERNAL_ASSERT(registration_state_ptr, "Expected profiler state set"); auto recordFunctionCallback = at::RecordFunctionCallback( onFunctionEnter, onFunctionExit) .needsInputs(registration_state_ptr->config().report_input_shapes) .scopes(scopes); if constexpr (use_global_callback) { registration_state_ptr->setCallbackHandle( at::addGlobalCallback(recordFunctionCallback)); } else { registration_state_ptr->setCallbackHandle( at::addThreadLocalCallback(recordFunctionCallback)); } } struct ProfilerStateInfo { std::shared_ptr state_ptr; std::unordered_set scopes; }; std::shared_ptr profiler_state_info_ptr{nullptr}; } // namespace void reportBackendEventToActiveKinetoProfiler( const int64_t start_time_us, const int64_t end_time_us, const int64_t debug_handle, const at::RecordScope scope, const std::string& event_name, const std::string& backend_name) { TORCH_INTERNAL_ASSERT( KinetoThreadLocalState::get(/*global=*/true) == nullptr, "On-demand profiling does not support post processing callback"); auto state_ptr = KinetoThreadLocalState::get(/*global=*/false); if (!state_ptr) { return; } state_ptr->recordQueue.getSubqueue()->emplace_backend_event( start_time_us, 
end_time_us, debug_handle, scope, event_name, backend_name); /* no support for input shapes now? if (config.report_input_shapes) { ctx_ptr->shapes = inputSizes(fn); ctx_ptr->dtypes = inputTypes(fn); } */ } void prepareProfiler( const torch::profiler::impl::ProfilerConfig& config, const std::set& activities) { if (config.state == ProfilerState::NVTX || config.state == ProfilerState::ITT) { return; } TORCH_CHECK( config.state == ProfilerState::KINETO || config.state == ProfilerState::KINETO_GPU_FALLBACK || config.state == ProfilerState::KINETO_PRIVATEUSE1_FALLBACK, "Supported only in Kineto profiler"); torch::profiler::impl::kineto::prepareTrace( /*cpuOnly=*/!( at::hasCUDA() || at::hasXPU() || at::hasMTIA() || c10::get_privateuse1_backend() != "privateuseone"), activities, config.experimental_config, config.trace_id); if (!config.experimental_config.performance_events.empty()) { /* For now only CPU activity is supported */ TORCH_CHECK( activities.count(torch::autograd::profiler::ActivityType::CPU), "Cannot run cpu hardware profiler without CPU activities, please only use CPU activity type"); /* * Sending a warning and passing the non-standard event to the backend * Backend can abort if the event is not supported. * TODO Should we gracefully drop the invalid event if we have at least one * valid? */ auto is_standard_event = [](const std::string& event) -> bool { for (auto e : torch::profiler::ProfilerPerfEvents) { if (!std::strcmp(event.c_str(), e)) { return true; } } return false; }; for (const auto& e : config.experimental_config.performance_events) { if (!is_standard_event(e)) { TORCH_WARN("Forwarding a non-standard CPU performance event : ", e); } } } } static void toggleTorchOpCollectionDynamic(bool enable) { auto state_ptr = ProfilerStateBase::get(); if (state_ptr) { const auto& config = state_ptr->config(); if (enable) { auto scopes = profiler_state_info_ptr->scopes; config.global() ? 
pushProfilingCallbacks</*global=*/true>(scopes)
                      : pushProfilingCallbacks</*global=*/false>(scopes);
    } else {
      state_ptr->removeCallback();
    }
  }
}

// This function is marked unused because the profiler implementation needs
// more refactoring before Python op collection can be toggled dynamically.
#ifdef _MSC_VER
#define UNUSED
#else
#define UNUSED __attribute__((unused))
#endif
static UNUSED void togglePythonCollectionDynamic(bool enable) {
  auto state_ptr = ProfilerStateBase::get();
  if (state_ptr) {
    auto global = state_ptr->config().global();
    KinetoThreadLocalState* kineto_thread_local_state_ptr =
        KinetoThreadLocalState::get(global);
    if (enable) {
      kineto_thread_local_state_ptr->resumePython();
    } else {
      kineto_thread_local_state_ptr->pausePython();
    }
  }
}

static void toggleCPUCollectionDynamic(bool enable) {
  toggleTorchOpCollectionDynamic(enable);
  // For now only Torch op collection supports dynamic toggling. Supporting
  // Python ops would require string parsing to strip out the toggling events
  // (and other unfinished events), as well as changes to the stack logic.
  // togglePythonCollectionDynamic(enable);
}

void toggleCollectionDynamic(
    const bool enable,
    const std::set<torch::profiler::impl::ActivityType>& activities) {
  if (activities.count(torch::autograd::profiler::ActivityType::CPU) > 0 &&
      (activities.count(torch::autograd::profiler::ActivityType::CUDA) == 0 ||
       activities.count(torch::autograd::profiler::ActivityType::XPU) == 0)) {
    LOG(WARNING)
        << "Toggling CPU activity with GPU activity on may result in traces with GPU events on arbitrary tracks";
  } else if (
      (activities.count(torch::autograd::profiler::ActivityType::CUDA) > 0 ||
       activities.count(torch::autograd::profiler::ActivityType::XPU) > 0) &&
      activities.count(torch::autograd::profiler::ActivityType::CPU) == 0) {
    LOG(WARNING)
        << "Toggling GPU activity with CPU activity on may result in traces with incorrect correlation between CPU and GPU events";
  }
  for (auto act : activities) {
    if (act == torch::autograd::profiler::ActivityType::CUDA ||
        act == torch::autograd::profiler::ActivityType::XPU) {
      torch::profiler::impl::kineto::toggleCollectionDynamic(enable);
    } else if (act == torch::autograd::profiler::ActivityType::CPU) {
      toggleCPUCollectionDynamic(enable);
    } else {
      LOG(WARNING)
          << "Dynamic toggle is only supported for CPU/GPU activity, skipping toggling of "
          << actToString(act);
      continue;
    }
  }
}

void enableProfilerWithEventPostProcess(
    const torch::profiler::impl::ProfilerConfig& config,
    const std::set<torch::profiler::impl::ActivityType>& activities,
    post_process_t&& cb,
    const std::unordered_set<at::RecordScope>& scopes) {
  TORCH_CHECK(
      config.state != ProfilerState::NVTX,
      "NVTX does not support post processing callback.");
  TORCH_CHECK(
      config.state != ProfilerState::ITT,
      "ITT does not support post processing callback.");
  TORCH_INTERNAL_ASSERT(
      KinetoThreadLocalState::get(/*global=*/true) == nullptr,
      "On-demand profiling does not support post processing callback");

  enableProfiler(config, activities, scopes);
  auto state_ptr = KinetoThreadLocalState::get(config.global());
  state_ptr->setEventPostProcessingCallback(std::move(cb));
}

void enableProfiler(
    const torch::profiler::impl::ProfilerConfig& config,
    const std::set<torch::profiler::impl::ActivityType>& activities,
    const std::unordered_set<at::RecordScope>& scopes) {
  const auto has_cpu = activities.count(ActivityType::CPU);
  TORCH_CHECK(
      KinetoThreadLocalState::get(/*global=*/config.global()) == nullptr,
      "Profiler is already enabled",
      (config.global() ? "."
: " on this thread.")); if (config.state == ProfilerState::NVTX) { torch::profiler::impl::pushNVTXCallbacks(config, scopes); return; } else if (config.state == ProfilerState::ITT) { torch::profiler::impl::pushITTCallbacks(config, scopes); return; } else if (config.state == ProfilerState::PRIVATEUSE1) { torch::profiler::impl::pushPRIVATEUSE1CallbacksStub(config, scopes); return; } TORCH_CHECK( config.state == ProfilerState::KINETO || config.state == ProfilerState::KINETO_GPU_FALLBACK || config.state == ProfilerState::KINETO_PRIVATEUSE1_FALLBACK || config.global()); TORCH_CHECK(!activities.empty(), "No activities specified."); TORCH_INTERNAL_ASSERT( has_cpu || !config.global(), "Ondemand profiling must enable CPU tracing"); auto state_ptr = std::make_shared(config, activities); KinetoThreadLocalState::push(state_ptr); if (has_cpu) { config.pushGlobalCallbacks() ? pushProfilingCallbacks(scopes) : pushProfilingCallbacks(scopes); } if (!config.global()) { torch::profiler::impl::kineto::startTrace(); } if (has_cpu) { auto state_info_ptr = std::make_shared(); state_info_ptr->state_ptr = state_ptr; state_info_ptr->scopes = scopes; profiler_state_info_ptr = state_info_ptr; } } bool isProfilerEnabledInMainThread() { return profiler_state_info_ptr != nullptr; } void enableProfilerInChildThread() { auto state_info_ptr = profiler_state_info_ptr; TORCH_CHECK(state_info_ptr, "Profiler is not enabled in main thread."); TORCH_CHECK( KinetoThreadLocalState::get(/*global=*/false) == nullptr, "Profiler is already enabled in this thread."); KinetoThreadLocalState::push(state_info_ptr->state_ptr); pushProfilingCallbacks(state_info_ptr->scopes); } void disableProfilerInChildThread() { auto state_ptr = ProfilerStateBase::pop(); TORCH_CHECK( state_ptr, "Can't disable Kineto profiler when it's not running in this thread"); state_ptr->removeCallback(); } std::unique_ptr disableProfiler() { // releasing to inform child threads to stop profiling profiler_state_info_ptr = nullptr; auto state_ptr = ProfilerStateBase::pop(); const auto& config = state_ptr->config(); TORCH_CHECK( state_ptr && (config.state == ProfilerState::KINETO || config.state == ProfilerState::KINETO_GPU_FALLBACK || config.state == ProfilerState::KINETO_PRIVATEUSE1_FALLBACK || config.state == ProfilerState::KINETO_ONDEMAND || config.state == ProfilerState::NVTX || config.state == ProfilerState::ITT || config.state == ProfilerState::PRIVATEUSE1), "Can't disable Kineto profiler when it's not running"); state_ptr->removeCallback(); // Traces are converged via libkineto automatically for ondemand flow if (state_ptr->config().global()) { (void)std::static_pointer_cast(state_ptr) ->finalizeTrace(); return std::make_unique(); } // Shared among NVTX, PRIVATEUSE1, KINETO, KINETO_GPU_FALLBACK, // KINETO_PRIVATEUSE1_FALLBACK std::unique_ptr result; if (state_ptr->config().state == ProfilerState::NVTX || state_ptr->config().state == ProfilerState::PRIVATEUSE1) { result = std::make_unique(); } if (config.state == ProfilerState::KINETO || config.state == ProfilerState::KINETO_GPU_FALLBACK || config.state == ProfilerState::KINETO_PRIVATEUSE1_FALLBACK) { auto kineto_state_ptr = std::static_pointer_cast(state_ptr); auto trace = kineto_state_ptr->finalizeTrace(); result = std::make_unique( kineto_state_ptr->startTime, std::move(kineto_state_ptr->kinetoEvents), std::move(trace), std::move(kineto_state_ptr->eventTree)); } return result; } namespace tracer = torch::profiler::impl::python_tracer; static std::unique_ptr memory_tracer; void startMemoryProfile() { if 
(memory_tracer == nullptr) { memory_tracer = tracer::PythonMemoryTracerBase::make(); } memory_tracer->start(); } void stopMemoryProfile() { memory_tracer->stop(); } void exportMemoryProfile(const std::string& filename) { memory_tracer->export_memory_history(filename); } KinetoEvent::KinetoEvent( const std::shared_ptr& result, const bool verbose) : result_{result} { TORCH_INTERNAL_ASSERT(result != nullptr); if (verbose) { // Populate Python stack auto parent = result_->parent_.lock(); while (parent != nullptr) { parent->visit_if_base( [&](const auto&) { python_stack_.push_back(parent->name()); }); parent = parent->parent_.lock(); } } result->visit_if_base>([&](const auto& op) { auto arg_data = parseArgData(op.inputs_, op.concrete_inputs_); shapes_ = std::move(arg_data.shapesForKinetoEvent); dtypes_ = std::move(arg_data.dtypes); concrete_inputs_ = std::move(arg_data.concreteInputs); kwinputs_ = std::move(op.kwinputs_); }); } bool KinetoEvent::isPythonFunction() const { bool out{false}; result_->visit_if_base([&](const auto&) { out = true; }); return out; } bool KinetoEvent::hasShapes() const { return !shapes_.empty(); } const c10::ArrayRef> KinetoEvent::shapes() const { return shapes_; } bool KinetoEvent::hasTypes() const { return !dtypes_.empty(); } const c10::ArrayRef KinetoEvent::dtypes() const { return dtypes_; } bool KinetoEvent::hasConcreteInputs() const { return !concrete_inputs_.empty(); } const c10::ArrayRef KinetoEvent::concreteInputs() const { return concrete_inputs_; } bool KinetoEvent::hasKwinputs() const { return !kwinputs_.empty(); } bool KinetoEvent::isHiddenEvent() const { return result_ && result_->hidden_; } const std::unordered_map KinetoEvent::kwinputs() const { return kwinputs_; } const c10::ArrayRef KinetoEvent::stack() const { auto get = [&](const auto& i) -> auto& { return !i.jit_stack_.empty() ? i.jit_stack_ : python_stack_; }; auto const& extra_fields = result_->extra_fields_; if (auto p = std::get_if>(&extra_fields)) { return get(*p); } if (auto p = std::get_if>(&extra_fields)) { return get(*p); } return python_stack_; } const c10::ArrayRef KinetoEvent::moduleHierarchy() const { auto const& extra_fields = result_->extra_fields_; if (auto p = std::get_if>(&extra_fields)) { return p->jit_modules_; } if (auto p = std::get_if>(&extra_fields)) { return p->jit_modules_; } return {}; } uint64_t KinetoEvent::endNs() const { return result_->endTimeNS(); } uint64_t KinetoEvent::durationNs() const { return (result_->endTimeNS() - result_->start_time_ns_); } int64_t KinetoEvent::debugHandle() const { return result_->visit(c10::overloaded( [](const ExtraFields& i) { return i.debug_handle_; }, [](const ExtraFields& i) { return i.debug_handle_; }, [](const auto&) -> int64_t { return -1; })); } int KinetoEvent::deviceIndex() const { return result_->visit(c10::overloaded( [](const ExtraFields& i) { return static_cast(i.device_index_); }, [](const ExtraFields& i) { return static_cast(i.device_index_); }, [&](const auto&) { return static_cast(result_->kineto_info_.device); })); } bool KinetoEvent::hasStack() const { return !stack().empty(); } int64_t KinetoEvent::cudaElapsedUs() const { auto cuda_event_start = fallbackStart(); auto cuda_event_end = fallbackEnd(); if (!cuda_event_start || !cuda_event_end) { return -1; } try { return (int64_t)torch::profiler::impl::cudaStubs()->elapsed( &cuda_event_start, &cuda_event_end); } catch (std::exception& e) { LOG(WARNING) << "Failed to measure time between two CUDA events. 
" << e.what(); } return -1; } int64_t KinetoEvent::privateuse1ElapsedUs() const { auto privateuse1_event_start = fallbackStart(); auto privateuse1_event_end = fallbackEnd(); if (!privateuse1_event_start || !privateuse1_event_end) { return -1; } return (int64_t)torch::profiler::impl::privateuse1Stubs()->elapsed( &privateuse1_event_start, &privateuse1_event_end); return -1; } void KinetoEvent::getPerfEventCounters(std::vector& in) const { return result_->visit(c10::overloaded( [&in](const ExtraFields& e) -> void { const size_t n = e.perf_event_counters_->size(); // should be rare if (in.size() < n) { in.resize(n, 0); } for (size_t i = 0; i < n; ++i) { in[i] = (*e.perf_event_counters_)[i]; } }, [](const auto&) -> void { return; })); } std::string KinetoEvent::metadataJson() const { return result_->visit(c10::overloaded( [](const ExtraFields& op) -> std::string { return op.metadata_json_; }, [](const ExtraFields& op) -> std::string { return op.metadata_json_; }, [](const auto&) -> std::string { return std::string(""); })); } #define FORWARD_FROM_RESULT(method_name, result_expr) \ decltype(std::declval().method_name()) \ KinetoEvent::method_name() const { \ return static_cast().method_name())>( \ result_->result_expr); \ } FORWARD_FROM_RESULT(startThreadId, start_tid_) FORWARD_FROM_RESULT(endThreadId, endTID()) FORWARD_FROM_RESULT(activityType, kinetoType()) FORWARD_FROM_RESULT(name, name()) FORWARD_FROM_RESULT(overload_name, overload_name()) FORWARD_FROM_RESULT(deviceType, deviceType()) FORWARD_FROM_RESULT(startNs, start_time_ns_) FORWARD_FROM_RESULT(correlationId, correlationID()) FORWARD_FROM_RESULT(deviceResourceId, kineto_info_.resource) #undef FORWARD_FROM_RESULT // Most of the fields in `KinetoEvent` only make sense for a single event type. // (Generally TorchOp.) For all other types they simply return the default // value. This macro provides a succinct way of expressing this behavior. #define TYPED_ATTR_WITH_DEFAULT( \ event_type, method_name, expression, default_value) \ decltype(std::declval().method_name()) \ KinetoEvent::method_name() const { \ using out_t = decltype(std::declval().method_name()); \ return result_->visit(c10::overloaded( \ [](const ExtraFields& e) -> out_t { \ return expression; \ }, \ [](const auto&) -> out_t { return default_value; })); \ } #define TYPED_ATTR(event_type, method_name, expression) \ TYPED_ATTR_WITH_DEFAULT(event_type, method_name, expression, {}) TYPED_ATTR_WITH_DEFAULT(TorchOp, sequenceNr, e.sequence_number_, -1) TYPED_ATTR(TorchOp, fwdThreadId, e.sequence_number_ >= 0 ? e.forward_tid_ : 0) TYPED_ATTR(TorchOp, scope, static_cast(e.scope_)) TYPED_ATTR(TorchOp, hasModuleHierarchy, !e.jit_modules_.empty()) TYPED_ATTR(TorchOp, isAsync, e.is_async_) TYPED_ATTR(TorchOp, extraMeta, e.extra_meta_) TYPED_ATTR(TorchOp, fallbackStart, e.device_fallback_.device_event_start_) TYPED_ATTR(TorchOp, fallbackEnd, e.device_fallback_.device_event_end_) TYPED_ATTR( TorchOp, flops, !e.extra_args_.empty() ? torch::profiler::impl::computeFlops(e.name_, e.extra_args_) : 0) TYPED_ATTR(Backend, backend, e.backend_) TYPED_ATTR(Allocation, nBytes, e.alloc_size_) TYPED_ATTR(Kineto, linkedCorrelationId, [&]() { const auto linked = e.linked_activity_.lock(); return linked ? 
linked->correlationID() : 0;
}())
#undef TYPED_ATTR
#undef TYPED_ATTR_WITH_DEFAULT

ProfilerResult::ProfilerResult(
    uint64_t start_time,
    std::vector<KinetoEvent> events,
    std::unique_ptr<torch::profiler::impl::kineto::ActivityTraceWrapper>&&
        trace,
    std::vector<experimental_event_t>&& event_tree)
    : trace_start_ns_(start_time),
      events_(std::move(events)),
      trace_(std::move(trace)),
      event_tree_(std::move(event_tree)) {}
ProfilerResult::ProfilerResult() = default;
ProfilerResult::~ProfilerResult() = default;

void ProfilerResult::save(const std::string& path) {
  trace_->save(path);
}

} // namespace autograd::profiler

namespace profiler::impl {
void _reportVulkanEventToProfiler(vulkan_id_t id) {
  auto state_ptr = ::torch::autograd::profiler::KinetoThreadLocalState::get(
      /*global=*/false);
  if (state_ptr) {
    state_ptr->reportVulkanEventToProfiler(id);
  }
}
} // namespace profiler::impl

} // namespace torch
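// A minimal sketch of how a caller typically drives these entry points.
// The config values, the activity/scope sets, and the output path below are
// illustrative, not a prescribed configuration:
//
//   using namespace torch::autograd::profiler;
//   torch::profiler::impl::ProfilerConfig config{
//       torch::profiler::impl::ProfilerState::KINETO};
//   std::set<torch::profiler::impl::ActivityType> activities{
//       torch::profiler::impl::ActivityType::CPU};
//   std::unordered_set<at::RecordScope> scopes{
//       at::RecordScope::FUNCTION, at::RecordScope::USER_SCOPE};
//
//   prepareProfiler(config, activities);       // warm up Kineto
//   enableProfiler(config, activities, scopes); // install callbacks, start trace
//   /* ... run the workload to be profiled ... */
//   auto result = disableProfiler();            // stop and collect events
//   result->save("trace.json");                 // write the Chrome trace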