pytorch/torch/csrc/profiler/python/init.cpp
Zachary DeVito 4e1060c609 [memory profiling] add a facility to gather combined C++/Python/TorchScript stack traces. (#95541)
    This refactors the stack trace facility that was specific to memory
    profiling in Python + CUDA into a generic facility for generating
    combined stack traces.

    The generic facility (combined_traceback.h) does not require Python
    to be present in order to work, but will return Python stacks when
    it is.

    This facility is then used to add support for stack trace gathering in memory profiling that
    happens directly from C++.

    It is also used to expose a Python API for gathering and symbolizing
    combined stacks.
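
    As a rough sketch of that Python API (assuming the bindings surface
    under torch._C._profiler and that CapturedTraceback::gather takes
    three booleans selecting Python, TorchScript, and C++ frames -- both
    are assumptions, not guarantees of this diff):

        from torch._C._profiler import gather_traceback, symbolize_tracebacks

        tb = gather_traceback(True, True, True)  # capture raw combined stack
        (frames,) = symbolize_tracebacks([tb])   # resolve symbols in batch
        for frame in frames:
            print(frame)  # per-frame file/line/function information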

Pull Request resolved: https://github.com/pytorch/pytorch/pull/95541
Approved by: https://github.com/ezyang
2023-03-14 18:26:05 +00:00

#include <torch/csrc/profiler/python/init.h>
#include <ATen/record_function.h>
#include <c10/util/overloaded.h>
#include <torch/csrc/DynamicTypes.h>
#include <torch/csrc/autograd/utils/wrap_outputs.h>
#include <torch/csrc/jit/python/pybind_utils.h>
#include <torch/csrc/profiler/collection.h>
#include <torch/csrc/profiler/python/combined_traceback.h>
#include <torch/csrc/profiler/standalone/execution_graph_observer.h>
#include <torch/csrc/utils/pybind.h>
namespace torch {
namespace profiler {
void initPythonBindings(PyObject* module) {
  auto rootModule = py::handle(module).cast<py::module>();
  auto m = rootModule.def_submodule("_profiler");
  using namespace torch::profiler::impl;
  py::enum_<at::RecordScope>(m, "RecordScope")
      .value("FUNCTION", at::RecordScope::FUNCTION)
      .value("BACKWARD_FUNCTION", at::RecordScope::BACKWARD_FUNCTION)
      .value("TORCHSCRIPT_FUNCTION", at::RecordScope::TORCHSCRIPT_FUNCTION)
      .value("KERNEL_FUNCTION_DTYPE", at::RecordScope::KERNEL_FUNCTION_DTYPE)
      .value("CUSTOM_CLASS", at::RecordScope::CUSTOM_CLASS)
      .value("BUILD_FEATURE", at::RecordScope::BUILD_FEATURE)
      .value("LITE_INTERPRETER", at::RecordScope::LITE_INTERPRETER)
      .value("USER_SCOPE", at::RecordScope::USER_SCOPE)
      .value("STATIC_RUNTIME_OP", at::RecordScope::STATIC_RUNTIME_OP)
      .value("STATIC_RUNTIME_MODEL", at::RecordScope::STATIC_RUNTIME_MODEL);
  py::enum_<ProfilerState>(m, "ProfilerState")
      .value("Disabled", ProfilerState::Disabled)
      .value("CPU", ProfilerState::CPU)
      .value("CUDA", ProfilerState::CUDA)
      .value("NVTX", ProfilerState::NVTX)
      .value("ITT", ProfilerState::ITT)
      .value("KINETO", ProfilerState::KINETO)
      .value("KINETO_GPU_FALLBACK", ProfilerState::KINETO_GPU_FALLBACK);
  py::enum_<ActiveProfilerType>(m, "ActiveProfilerType")
      .value("NONE", ActiveProfilerType::NONE)
      .value("LEGACY", ActiveProfilerType::LEGACY)
      .value("KINETO", ActiveProfilerType::KINETO)
      .value("NVTX", ActiveProfilerType::NVTX)
      .value("ITT", ActiveProfilerType::ITT);
  py::enum_<ActivityType>(m, "ProfilerActivity")
      .value("CPU", ActivityType::CPU)
      .value("XPU", ActivityType::XPU)
      .value("CUDA", ActivityType::CUDA);
  py::class_<ExperimentalConfig>(m, "_ExperimentalConfig")
      .def(
          py::init<
              std::vector<std::string> /* profiler_metrics */,
              bool /* profiler_measure_per_kernel */,
              bool /* verbose */,
              std::vector<std::string> /* performance_events */
              >(),
          "An experimental config for Kineto features. Please note that "
          "backward compatibility is not guaranteed.\n"
          "    profiler_metrics : a list of CUPTI profiler metrics used\n"
          "       to measure GPU performance events.\n"
          "       If this list contains values, Kineto runs in CUPTI profiler mode.\n"
          "    profiler_measure_per_kernel (bool) : whether to profile metrics per kernel\n"
          "       or for the entire measurement duration.\n"
          "    verbose (bool) : whether the trace file includes a `Call stack` field.\n"
          "    performance_events : a list of profiler events to be used for measurement.",
          py::arg("profiler_metrics") = std::vector<std::string>(),
          py::arg("profiler_measure_per_kernel") = false,
          py::arg("verbose") = false,
          py::arg("performance_events") = std::vector<std::string>())
      .def(py::pickle(
          [](const ExperimentalConfig& p) { // __getstate__
            py::list py_metrics;
            for (const auto& metric : p.profiler_metrics) {
              py::bytes mbytes(metric);
              py_metrics.append(mbytes);
            }
            py::list py_perf_events;
            for (const auto& event : p.performance_events) {
              py::bytes mbytes(event);
              py_perf_events.append(mbytes);
            }
            /* Return a tuple that fully encodes the state of the config */
            return py::make_tuple(
                py_metrics,
                p.profiler_measure_per_kernel,
                p.verbose,
                py_perf_events);
          },
          [](py::tuple t) { // __setstate__
            if (t.size() < 3) {
              throw std::runtime_error("Expected at least 3 values in state");
            }
            py::list py_metrics = t[0].cast<py::list>();
            std::vector<std::string> metrics;
            metrics.reserve(py_metrics.size());
            for (const auto& py_metric : py_metrics) {
              metrics.push_back(py::str(py_metric));
            }
            std::vector<std::string> performance_events;
            if (t.size() == 4) {
              py::list py_perf_events = t[3].cast<py::list>();
              performance_events.reserve(py_perf_events.size());
              for (const auto& py_perf_event : py_perf_events) {
                performance_events.push_back(py::str(py_perf_event));
              }
            }
            return ExperimentalConfig(
                std::move(metrics),
                t[1].cast<bool>(),
                t[2].cast<bool>(),
                std::move(performance_events));
          }));
  py::class_<ProfilerConfig>(m, "ProfilerConfig")
      .def(py::init<
           ProfilerState,
           bool, /* record_input_shapes */
           bool, /* profile_memory */
           bool, /* with_stack */
           bool, /* with_flops */
           bool, /* with_modules */
           ExperimentalConfig /* experimental_config */
           >());
  py::enum_<EventType>(m, "_EventType")
      .value("TorchOp", EventType::TorchOp)
      .value("Backend", EventType::Backend)
      .value("Vulkan", EventType::Vulkan)
      .value("Allocation", EventType::Allocation)
      .value("OutOfMemory", EventType::OutOfMemory)
      .value("PyCall", EventType::PyCall)
      .value("PyCCall", EventType::PyCCall)
      .value("Kineto", EventType::Kineto);
  py::class_<TensorMetadata>(m, "_TensorMetadata")
      .def_property_readonly("impl_ptr", &TensorMetadata::impl)
      .def_readonly("storage_data_ptr", &TensorMetadata::data_)
      .def_readonly("id", &TensorMetadata::id_)
      .def_readonly("allocation_id", &TensorMetadata::allocation_id_)
      .def_property_readonly(
          "layout",
          [](const TensorMetadata& metadata) {
            PyObject* layout_obj =
                torch::autograd::utils::wrap(metadata.layout_);
            return py::reinterpret_borrow<py::object>(layout_obj);
          })
      .def_readonly("device", &TensorMetadata::device_)
      .def_property_readonly(
          "dtype",
          [](const TensorMetadata& metadata) {
            return py::reinterpret_borrow<py::object>(
                torch::autograd::utils::wrap(
                    torch::getTHPDtype(metadata.dtype_)));
          })
      .def_readonly("dim", &TensorMetadata::dim_)
      .def_readonly("sizes", &TensorMetadata::sizes_)
      .def_readonly("strides", &TensorMetadata::strides_);
  using torch_op_t = ExtraFields<EventType::TorchOp>;
  py::class_<torch_op_t>(m, "_ExtraFields_TorchOp")
      .def_readonly("name", &torch_op_t::name_)
      .def_property_readonly(
          "inputs",
          [](const torch_op_t& op) {
            py::list out;
            for (const auto& input : op.inputs_) {
              c10::visit(
                  c10::overloaded(
                      [&](const c10::IValue& v) {
                        out.append(torch::jit::toPyObject(v));
                      },
                      [&](const c10::nullopt_t&) { out.append(py::none()); },
                      [&](const auto& v) { out.append(py::cast(v)); }),
                  input);
            }
            return out;
          })
      .def_readonly("scope", &torch_op_t::scope_)
      .def_readonly("sequence_number", &torch_op_t::sequence_number_)
      .def_readonly("allow_tf32_cublas", &torch_op_t::allow_tf32_cublas_);
  py::class_<ExtraFields<EventType::Backend>>(m, "_ExtraFields_Backend");
  py::class_<ExtraFields<EventType::Vulkan>>(m, "_ExtraFields_Vulkan");
  using allocation_t = ExtraFields<EventType::Allocation>;
  py::class_<allocation_t>(m, "_ExtraFields_Allocation")
      .def_property_readonly(
          "ptr",
          [](const allocation_t& a) {
            return reinterpret_cast<intptr_t>(a.ptr_);
          })
      .def_readonly("id", &allocation_t::id_)
      .def_readonly("allocation_id", &allocation_t::allocation_id_)
      .def_readonly("alloc_size", &allocation_t::alloc_size_)
      .def_readonly("total_allocated", &allocation_t::total_allocated_)
      .def_readonly("total_reserved", &allocation_t::total_reserved_)
      .def_property_readonly("device", &allocation_t::device);
  py::class_<PyFrameState>(m, "_PyFrameState")
      .def_readonly("line_number", &PyFrameState::line_no_)
      .def_property_readonly(
          "file_name", [](const PyFrameState& s) { return s.filename_.str(); })
      .def_property_readonly("function_name", [](const PyFrameState& s) {
        return s.funcname_.str();
      });
  py::class_<NNModuleInfo>(m, "_NNModuleInfo")
      .def_property_readonly(
          "parameters",
          [](const NNModuleInfo& s) {
            py::list out;
            for (const auto& p : s.parameters_) {
              out.append(
                  py::make_tuple(p.name_, p.metadata_, p.grad_metadata_));
            }
            return out;
          })
      .def_property_readonly(
          "cls_name", [](const NNModuleInfo& s) { return s.cls_name_.str(); })
      .def_readonly("self_ptr", &NNModuleInfo::self_)
      .def_readonly("cls_ptr", &NNModuleInfo::cls_);
  py::class_<OptimizerInfo>(m, "_OptimizerInfo")
      .def_readonly("self_ptr", &OptimizerInfo::self_)
      .def_property_readonly("parameters", [](const OptimizerInfo& s) {
        py::list out;
        for (const auto& p : s.parameters_) {
          out.append(py::make_tuple(p.metadata_, p.grad_metadata_, p.state_));
        }
        return out;
      });
  py::class_<ExtraFields<EventType::PyCall>>(m, "_ExtraFields_PyCall")
      .def_readonly("callsite", &ExtraFields<EventType::PyCall>::callsite_)
      .def_readonly("caller", &ExtraFields<EventType::PyCall>::caller_)
      .def_readonly("module", &ExtraFields<EventType::PyCall>::module_)
      .def_readonly("optimizer", &ExtraFields<EventType::PyCall>::optimizer_);
  py::class_<ExtraFields<EventType::PyCCall>>(m, "_ExtraFields_PyCCall")
      .def_readonly("caller", &ExtraFields<EventType::PyCCall>::caller_);
  py::class_<ExtraFields<EventType::OutOfMemory>>(
      m, "_ExtraFields_OutOfMemory");
  py::class_<ExtraFields<EventType::Kineto>>(m, "_ExtraFields_Kineto");
  py::class_<Result, std::shared_ptr<Result>>(m, "_ProfilerEvent")
      .def_property_readonly("name", &Result::name)
      .def_property_readonly("tag", &Result::tag)
      .def_readonly("extra_fields", &Result::extra_fields_)
      .def_property_readonly(
          "typed",
          [](const Result& r) {
            return py::make_tuple(
                r.tag(),
                py::cast(r.extra_fields_, py::return_value_policy::reference));
          })
      .def_property_readonly(
          "id",
          [](const Result& r) {
            return reinterpret_cast<intptr_t>(r.shared_from_this().get());
          })
      .def_property_readonly(
          "parent", [](const Result& r) { return r.parent_.lock(); })
      .def_readonly("children", &Result::children_)
      .def_readonly("start_time_ns", &Result::start_time_ns_)
      .def_readonly("start_tid", &Result::start_tid_)
      .def_property_readonly("correlation_id", &Result::correlationID)
      .def_property_readonly("end_time_ns", &Result::endTimeNS)
      .def_property_readonly("duration_time_ns", [](const Result& r) {
        return r.endTimeNS() - r.start_time_ns_;
      });
  // PyTorch profiler execution graph internal interface.
  m.def(
      "_add_execution_graph_observer",
      &torch::profiler::impl::addExecutionGraphObserver,
      py::arg("output_file_name"));
  m.def(
      "_remove_execution_graph_observer",
      &torch::profiler::impl::removeExecutionGraphObserver);
  m.def(
      "_enable_execution_graph_observer",
      &torch::profiler::impl::enableExecutionGraphObserver);
  m.def(
      "_disable_execution_graph_observer",
      &torch::profiler::impl::disableExecutionGraphObserver);
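  // Combined Python/TorchScript/C++ traceback support (#95541):
  // gather_traceback captures a raw (unsymbolized) combined stack, and
  // symbolize_tracebacks resolves symbols for a batch of captures.
  // installCapturedTracebackPython hooks in the Python stack collector when
  // an interpreter is present; the facility also works without Python.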
  py::class_<CapturedTraceback, std::shared_ptr<CapturedTraceback>>(
      m, "CapturedTraceback");
  m.def("gather_traceback", CapturedTraceback::gather);
  m.def("symbolize_tracebacks", py_symbolize);
  installCapturedTracebackPython();
}
} // namespace profiler
} // namespace torch