From ebc66bfeea0b6961ec0e9669d2554c5504836555 Mon Sep 17 00:00:00 2001
From: Taylor Robie
Date: Thu, 16 Dec 2021 10:32:13 -0800
Subject: [PATCH] [Profiler] Pull helper methods into dedicated file. (And start `torch/csrc/profiler` folder.) (#69255)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/69255

One thing that I've found as I optimize the profiler is that there's a lot of intermingled code, where the kineto profiler relies on the legacy (autograd) profiler for generic operations. This made optimization hard because I had to manage too many complex dependencies. (Exacerbated by the USE_KINETO #ifdef's sprinkled around.) This PR is the first of several to restructure the profiler(s) so that the later optimizations go in more easily.

Test Plan: Unit tests

Reviewed By: aaronenyeshi

Differential Revision: D32671972

fbshipit-source-id: efa83b40dde4216f368f2a5fa707360031a85707
---
 setup.py                                |   1 +
 tools/build_variables.bzl               |   2 +-
 torch/csrc/autograd/init.cpp            |  19 +-
 torch/csrc/autograd/profiler_kineto.cpp | 116 +----
 torch/csrc/autograd/profiler_kineto.h   |  15 -
 torch/csrc/autograd/profiler_legacy.cpp | 108 +----
 torch/csrc/autograd/profiler_legacy.h   |  55 +--
 torch/csrc/autograd/profiler_utils.h    |  16 -
 torch/csrc/profiler/util.cpp            | 563 ++++++++++++++++++++++++
 torch/csrc/profiler/util.h              | 119 +++++
 10 files changed, 709 insertions(+), 305 deletions(-)
 delete mode 100644 torch/csrc/autograd/profiler_utils.h
 create mode 100644 torch/csrc/profiler/util.cpp
 create mode 100644 torch/csrc/profiler/util.h

diff --git a/setup.py b/setup.py
index 4fdc9c6b08f..b4bf03dc172 100644
--- a/setup.py
+++ b/setup.py
@@ -1029,6 +1029,7 @@ if __name__ == '__main__':
 'include/torch/csrc/jit/tensorexpr/*.h',
 'include/torch/csrc/jit/tensorexpr/operators/*.h',
 'include/torch/csrc/onnx/*.h',
+'include/torch/csrc/profiler/*.h',
 'include/torch/csrc/utils/*.h',
 'include/torch/csrc/tensor/*.h',
 'include/torch/csrc/lazy/core/*.h',
diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl
index 16d547e7387..61f95c5f7d0 100644
--- a/tools/build_variables.bzl
+++ b/tools/build_variables.bzl
@@ -105,7 +105,6 @@ jit_core_sources = [
 # list for the shared files.
core_sources_common = [ - "torch/csrc/autograd/profiler_utils.cpp", "torch/csrc/autograd/autograd_meta.cpp", "torch/csrc/autograd/forward_grad.cpp", "torch/csrc/jit/frontend/edit_distance.cpp", @@ -122,6 +121,7 @@ core_sources_common = [ "torch/csrc/jit/mobile/promoted_prim_ops.cpp", "torch/csrc/jit/mobile/prim_ops_registery.cpp", "torch/csrc/jit/operator_upgraders/upgraders.cpp", + "torch/csrc/profiler/util.cpp", ] torch_unpickler_common = [ diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index 391d41933ab..29bd54c5f1d 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -237,23 +237,8 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { py::arg("scopes") = std::unordered_set()); m.def("_disable_profiler", disableProfiler); m.def("_prepare_profiler", prepareProfiler); - - m.def("_add_metadata_json", [](const std::string& key, const std::string& value) { -#ifdef USE_KINETO - addMetadataJson(key, value); -#else - LOG(WARNING) << "Adding profiling metadata requires using " - << "torch.profiler with Kineto support (USE_KINETO=1)"; -#endif // USE_KINETO - }); - - m.def("kineto_available", []() { -#ifdef USE_KINETO - return true; -#else - return false; -#endif - }); + m.def("_add_metadata_json", torch::profiler::impl::addMetadataJson); // Only if `USE_KINETO` is set + m.def("kineto_available", []() { return torch::profiler::kKinetoAvailable; }); // NOTICE: These record functions are not torch operators and may not show up // in TorchScript tracing, FX transforms, or operator serialization. For these diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp index 2f9482d25e3..3ea1e2d64db 100644 --- a/torch/csrc/autograd/profiler_kineto.cpp +++ b/torch/csrc/autograd/profiler_kineto.cpp @@ -1,8 +1,9 @@ +#include + #include #include #include #include -#include #include #include @@ -104,11 +105,6 @@ void _push_reverse_order(PyTraceEvent* e, std::vector& names) { namespace { -std::string shapesToStr(const std::vector>& shapes); -std::string stacksToStr(const std::vector& stacks, const char* delim); -std::string dtypesToStr(const std::vector& types); -std::vector inputTypes(const at::RecordFunction& fn); - // Assumption: Total threads number will not exceed 2^16-1, and total ops will not exceed 2^48 -1. static inline uint64_t getForwardThreadKey(uint64_t tid, uint64_t seqNr) { return (((tid) << 48) | ((seqNr) & (((uint64_t)1 << 48) - 1))); @@ -180,7 +176,7 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalState { kineto_events_.back().moduleHierarchy(*ctx->module_hierarchy); } if (ctx->extraArgs && !ctx->extraArgs->empty()) { - kineto_events_.back().flops(computeFlops(std::string(evt_name), *ctx->extraArgs)); + kineto_events_.back().flops(torch::profiler::impl::computeFlops(std::string(evt_name), *ctx->extraArgs)); } kineto_events_.back().cuda_event_start_ = ctx->cuda_event_start_; kineto_events_.back().cuda_event_end_ = ctx->cuda_event_end_; @@ -325,18 +321,18 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalState { auto& activity = cpu_trace->activities[idx]; if (kineto_event.hasShapes()) { - activity.addMetadata("Input Dims", shapesToStr(kineto_event.shapes())); + activity.addMetadata("Input Dims", torch::profiler::impl::shapesToStr(kineto_event.shapes())); } if (kineto_event.hasStack()) { // NB: This is only for the JIT stack. The python stack (if applicable) // is constructed later. 
- activity.addMetadata("Call stack", stacksToStr(kineto_event.stack(), ";")); + activity.addMetadata("Call stack", torch::profiler::impl::stacksToStr(kineto_event.stack(), ";")); } if (kineto_event.hasModuleHierarchy()) { - activity.addMetadata("Module Hierarchy", stacksToStr(kineto_event.moduleHierarchy(), ".")); + activity.addMetadata("Module Hierarchy", torch::profiler::impl::stacksToStr(kineto_event.moduleHierarchy(), ".")); } if (kineto_event.hasTypes()) { - activity.addMetadata("Input type", dtypesToStr(kineto_event.dtypes())); + activity.addMetadata("Input type", torch::profiler::impl::dtypesToStr(kineto_event.dtypes())); } if (!kineto_event.backend().empty()) { activity.addMetadata("Backend", "\"" + kineto_event.backend() + "\""); @@ -472,7 +468,7 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalState { std::vector py_names; _push_reverse_order(python_caller, py_names); kineto_events_[idx].stack(py_names); - activity.addMetadata("Call stack", stacksToStr(py_names, ";")); + activity.addMetadata("Call stack", torch::profiler::impl::stacksToStr(py_names, ";")); } cpu_trace->activities.push_back(activity); @@ -532,27 +528,6 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalState { std::function&)> event_post_process_cb_; }; -std::vector inputTypes(const at::RecordFunction& fn) { - std::vector types; - types.reserve(fn.inputs().size()); - for (const c10::IValue& input : fn.inputs()) { - if (input.isTensor()) { - const at::Tensor& tensor = input.toTensor(); - if (tensor.defined()) { - types.push_back( - static_cast(input.toTensor().dtype().name())); - } else { - types.emplace_back(); - } - } else if (input.isScalar() || input.isList()) { - types.push_back(input.tagKind()); - } else { - types.emplace_back(); - } - } - return types; -} - KinetoThreadLocalState* getProfilerTLSState() { const auto& state = c10::ThreadLocalDebugInfo::get( c10::DebugInfoKind::PROFILER_STATE); @@ -582,12 +557,12 @@ void pushProfilingCallbacks(const std::unordered_set& scopes) { ctx_ptr->debug_handle = fn.debugHandle(); if (config.report_input_shapes) { - ctx_ptr->shapes = inputSizes(fn); - ctx_ptr->dtypes = inputTypes(fn); + ctx_ptr->shapes = torch::profiler::impl::inputSizes(fn); + ctx_ptr->dtypes = torch::profiler::impl::inputTypes(fn); } if (config.with_flops) { - ctx_ptr->extraArgs = saveExtraArgs(fn); + ctx_ptr->extraArgs = torch::profiler::impl::saveExtraArgs(fn); } ctx_ptr->sequenceNr = fn.seqNr(); @@ -599,7 +574,7 @@ void pushProfilingCallbacks(const std::unordered_set& scopes) { // TODO: consider using C++ stack trace if (config.with_stack && fn.scope() != at::RecordScope::BACKWARD_FUNCTION) { - auto cs = prepareCallstack(jit::currentCallstack()); + auto cs = torch::profiler::impl::prepareCallstack(jit::currentCallstack()); ctx_ptr->stack = callstackStr(cs); } if (config.with_modules && @@ -619,9 +594,9 @@ void pushProfilingCallbacks(const std::unordered_set& scopes) { } else if (config.state == ProfilerState::NVTX) { std::vector> shapes; if (config.report_input_shapes) { - shapes = inputSizes(fn); + shapes = torch::profiler::impl::inputSizes(fn); } - cudaStubs()->nvtxRangePushA(getNvtxStr( + cudaStubs()->nvtxRangePushA(torch::profiler::impl::getNvtxStr( fn.name(), fn.seqNr(), shapes).c_str()); } return nullptr; @@ -662,59 +637,6 @@ void pushProfilingCallbacks(const std::unordered_set& scopes) { state_ptr->setCallbackHandle(handle); } -std::string shapesToStr(const std::vector>& shapes) { - std::ostringstream oss; - oss << "["; - for (const auto t_idx : 
c10::irange(shapes.size())) { - if (t_idx > 0) { - oss << ", "; - } - oss << "["; - for (size_t s_idx = 0; s_idx < shapes[t_idx].size(); ++s_idx) { - if (s_idx > 0) { - oss << ", "; - } - oss << shapes[t_idx][s_idx]; - } - oss << "]"; - } - oss << "]"; - return oss.str(); -} - -std::string dtypesToStr(const std::vector& types) { - if (types.empty()) { - return "[]"; - } else { - std::ostringstream oss; - std::transform( - types.begin(), - types.end(), - std::ostream_iterator(oss, ", "), - [](std::string s) -> std::string { return "\"" + s + "\""; }); - auto rc = oss.str(); - rc.erase(rc.length() - 2); // remove last ", " - return "[" + rc + "]"; - } -} - -std::string stacksToStr(const std::vector& stacks, const char* delim) { - std::ostringstream oss; - std::transform( - stacks.begin(), - stacks.end(), - std::ostream_iterator(oss, delim), - [](std::string s) -> std::string { -#ifdef _WIN32 - // replace the windows backslash with forward slash - std::replace(s.begin(), s.end(), '\\', '/'); -#endif - return s; - }); - auto rc = oss.str(); - return "\"" + rc + "\""; -} - } // namespace void reportBackendEventToActiveKinetoProfiler( @@ -883,16 +805,6 @@ std::unique_ptr disableProfiler() { #endif // USE_KINETO } -void addMetadataJson(const std::string& key, const std::string& value) { -#ifdef USE_KINETO - if (libkineto::api().isProfilerInitialized()) { - libkineto::api().activityProfiler().addMetadata(key, value); - } else { - LOG(WARNING) << "Profiler is not initialized: skipping profiling metadata"; - } -#endif // USE_KINETO -} - int64_t KinetoEvent::cudaElapsedUs() const { if (!cuda_event_start_ || !cuda_event_end_) { return -1; diff --git a/torch/csrc/autograd/profiler_kineto.h b/torch/csrc/autograd/profiler_kineto.h index 09677b272d2..81d8164e6f9 100644 --- a/torch/csrc/autograd/profiler_kineto.h +++ b/torch/csrc/autograd/profiler_kineto.h @@ -4,17 +4,6 @@ #include #include -#ifdef USE_KINETO -// skip Kineto dependency on mobile -// unless explicitly asked for. -// When is it explicitly asked for? -// KinetoEdgeCPUProfiler uses KinetoProfiler for cpu -// event profiling. 
This has dependency on cpu only libkineto -#if defined(C10_MOBILE) && !defined(EDGE_PROFILER_USE_KINETO) -#undef USE_KINETO -#endif -#endif - #ifdef USE_KINETO namespace libkineto { struct TraceActivity; @@ -407,10 +396,6 @@ TORCH_API void prepareProfiler( const ProfilerConfig& config, const std::set& activities); -TORCH_API void addMetadataJson( - const std::string& key, const std::string& value); - - namespace python_tracer { /* diff --git a/torch/csrc/autograd/profiler_legacy.cpp b/torch/csrc/autograd/profiler_legacy.cpp index fbce610e518..5b8aa69deb0 100644 --- a/torch/csrc/autograd/profiler_legacy.cpp +++ b/torch/csrc/autograd/profiler_legacy.cpp @@ -1,4 +1,5 @@ -#include +#include + #include #include @@ -24,34 +25,6 @@ namespace torch { namespace autograd { namespace profiler { -std::vector prepareCallstack(const std::vector& cs) { - std::vector entries; - entries.reserve(cs.size()); - for (const auto& entry : cs) { - auto& range = entry.range; - if (range.source()) { - auto& src = range.source(); - if (src && src->filename()) { - auto line = src->starting_line_no() + - src->lineno_for_offset(range.start()); - entries.emplace_back(FileLineFunc{*(src->filename()), line, entry.filename}); - } - } - } - return entries; -} - -std::vector callstackStr(const std::vector& cs) { - std::vector cs_str; - cs_str.reserve(cs.size()); - for (const auto& entry : cs) { - std::stringstream loc; - loc << entry.filename << "(" << entry.line << "): " << entry.funcname; - cs_str.push_back(loc.str()); - } - return cs_str; -} - // We decompose the profiler logic into the following components: // // ThreadLocalDebugInfo: @@ -216,7 +189,7 @@ void ProfilerThreadLocalState::pushRange( return; } if (config_.state == ProfilerState::NVTX) { - cuda_stubs()->nvtxRangePushA(getNvtxStr( + cuda_stubs()->nvtxRangePushA(torch::profiler::impl::getNvtxStr( fn.name(), fn.seqNr(), shapes).c_str()); } else { LegacyEvent evt( @@ -232,8 +205,8 @@ void ProfilerThreadLocalState::pushRange( evt.setFwdThreadId(fn.forwardThreadId()); evt.setScope((uint8_t)fn.scope()); if (config_.with_flops) { - evt.setExtraArgs(saveExtraArgs(fn)); - evt.setFlops(computeFlops(std::string(fn.name()), evt.extraArgs())); + evt.setExtraArgs(torch::profiler::impl::saveExtraArgs(fn)); + evt.setFlops(torch::profiler::impl::computeFlops(std::string(fn.name()), evt.extraArgs())); } // TODO: will unify the two macros BUILD_LITE_INTERPRETER and C10_MOBILE soon. 
@@ -241,9 +214,9 @@ void ProfilerThreadLocalState::pushRange( // backward nodes source range corresponds to the forward node // TODO: consider using C++ stack trace if (config_.with_stack && fn.scope() != at::RecordScope::BACKWARD_FUNCTION) { - auto cs = prepareCallstack(jit::currentCallstack()); + auto cs = torch::profiler::impl::prepareCallstack(jit::currentCallstack()); if (cs.empty()) { - cs = prepareCallstack(jit::tracer::pythonCallstack()); + cs = torch::profiler::impl::prepareCallstack(jit::tracer::pythonCallstack()); } evt.setStack(callstackStr(cs)); } @@ -296,53 +269,6 @@ bool ProfilerThreadLocalState::memoryProfilingEnabled() const { return config_.profile_memory; } -std::string getNvtxStr( - const char* name, - int64_t sequence_nr, - const std::vector>& shapes) { - if (sequence_nr >= -1 || shapes.size() > 0) { - std::stringstream s; -#if defined(USE_ROCM) - s << name; -#endif - if (sequence_nr >= 0) { -#if defined(USE_ROCM) - s << ", seq = " << sequence_nr; -#else - s << name << ", seq = " << sequence_nr; -#endif - } else if (sequence_nr == -1) { -#if !defined(USE_ROCM) - s << name; -#endif - } - if (shapes.size() > 0) { - s << ", sizes = ["; - for (const auto idx : c10::irange(shapes.size())) { - if (shapes[idx].size() > 0) { - s << "["; - for (size_t dim = 0; dim < shapes[idx].size(); ++dim) { - s << shapes[idx][dim]; - if (dim < shapes[idx].size() - 1) { - s << ", "; - } - } - s << "]"; - } else { - s << "[]"; - } - if (idx < shapes.size() - 1) { - s << ", "; - } - } - s << "]"; - } - return s.str(); - } else { - return name; - } -} - RangeEventList& ProfilerThreadLocalState::getEventList(int64_t thread_id) { if (thread_id < 0) { thread_id = at::RecordFunction::currentThreadId(); @@ -360,24 +286,6 @@ RangeEventList& ProfilerThreadLocalState::getEventList(int64_t thread_id) { return *list_ptr; } -std::vector> inputSizes(const at::RecordFunction& fn) { - std::vector> sizes; - sizes.reserve(fn.inputs().size()); - for (const c10::IValue& input : fn.inputs()) { - if (!input.isTensor()) { - sizes.emplace_back(); - continue; - } - const at::Tensor& tensor = input.toTensor(); - if (tensor.defined()) { - sizes.push_back(input.toTensor().sizes().vec()); - } else { - sizes.emplace_back(); - } - } - return sizes; -} - namespace { enum EventIValueIdx { @@ -442,7 +350,7 @@ void pushProfilingCallbacksLegacy() { } if (state_ptr->config().report_input_shapes) { - auto sizes = inputSizes(fn); + auto sizes = torch::profiler::impl::inputSizes(fn); state_ptr->pushRange(fn, record_cuda, std::move(sizes)); } else { state_ptr->pushRange(fn, record_cuda); diff --git a/torch/csrc/autograd/profiler_legacy.h b/torch/csrc/autograd/profiler_legacy.h index f06401a2282..4628e9e00c3 100644 --- a/torch/csrc/autograd/profiler_legacy.h +++ b/torch/csrc/autograd/profiler_legacy.h @@ -10,18 +10,8 @@ #include #include #include +#include #include -#include -#ifndef _WIN32 -#include -#endif -#if defined(C10_IOS) && defined(C10_MOBILE) -#include // for gettimeofday() -#endif - -#include - -#include struct CUevent_st; typedef std::shared_ptr CUDAEventStub; @@ -69,33 +59,6 @@ private: TORCH_API void registerCUDAMethods(CUDAStubs* stubs); TORCH_API const CUDAStubs* cudaStubs(); -constexpr inline size_t ceilToMultiple(size_t a, size_t b) { - return ((a + b - 1) / b) * b; -} - -inline int64_t getTime(bool allow_monotonic = false) { -#if defined(C10_IOS) && defined(C10_MOBILE) -// clock_gettime is only available on iOS 10.0 or newer. 
Unlike OS X, iOS can't rely on -// CLOCK_REALTIME, as it is defined no matter if clock_gettime is implemented or not - struct timeval now; - gettimeofday(&now, NULL); - return static_cast(now.tv_sec) * 1000000000 + static_cast(now.tv_usec) * 1000; -#elif defined(_WIN32) || defined(__MACH__) - using namespace std::chrono; - using clock = std::conditional::type; - return duration_cast(clock::now().time_since_epoch()).count(); -#else - // clock_gettime is *much* faster than std::chrono implementation on Linux - struct timespec t{}; - auto mode = CLOCK_REALTIME; - if (allow_monotonic) { - mode = CLOCK_MONOTONIC; - } - clock_gettime(mode, &t); - return static_cast(t.tv_sec) * 1000000000 + static_cast(t.tv_nsec); -#endif -} - enum class C10_API_ENUM EventKind : uint16_t { Mark, PushRange, @@ -394,11 +357,6 @@ struct RangeEventList { static const size_t kReservedCapacity = 1024; }; -std::string getNvtxStr( - const char* name, - int64_t sequence_nr, - const std::vector>& shapes); - enum class C10_API_ENUM ProfilerState { Disabled = 0, CPU, // CPU-only profiling @@ -526,15 +484,6 @@ struct TORCH_API TLSLegacyProfilerGuard { const c10::optional profilerDisableOptions_; }; -struct TORCH_API FileLineFunc { - std::string filename; - size_t line; - std::string funcname; -}; -TORCH_API std::vector prepareCallstack(const std::vector& cs); -TORCH_API std::vector callstackStr(const std::vector& cs); -TORCH_API std::vector> inputSizes(const at::RecordFunction& fn); - struct TORCH_API ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) explicit ProfilerThreadLocalState(const ProfilerConfig& config) @@ -603,8 +552,6 @@ struct TORCH_API ProfilerThreadLocalState : public c10::MemoryReportingInfoBase namespace torch { namespace profiler { namespace impl { -using torch::autograd::profiler::computeFlops; -using torch::autograd::profiler::getTime; using torch::autograd::profiler::ProfilerConfig; using torch::autograd::profiler::ProfilerState; } // impl diff --git a/torch/csrc/autograd/profiler_utils.h b/torch/csrc/autograd/profiler_utils.h deleted file mode 100644 index 959821983e8..00000000000 --- a/torch/csrc/autograd/profiler_utils.h +++ /dev/null @@ -1,16 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -namespace torch { namespace autograd { -namespace profiler { - -std::unordered_map TORCH_API saveExtraArgs(const at::RecordFunction& fn); - -uint64_t TORCH_API computeFlops(const std::string &op_name, - const std::unordered_map &extra_args); - -}}} diff --git a/torch/csrc/profiler/util.cpp b/torch/csrc/profiler/util.cpp new file mode 100644 index 00000000000..157dea63035 --- /dev/null +++ b/torch/csrc/profiler/util.cpp @@ -0,0 +1,563 @@ +#include + +#include + +#ifdef USE_KINETO +#include +#endif + +namespace torch { +namespace profiler { +namespace impl { + +void addMetadataJson(const std::string& key, const std::string& value) { +#ifdef USE_KINETO + if (libkineto::api().isProfilerInitialized()) { + libkineto::api().activityProfiler().addMetadata(key, value); + } else { + LOG(WARNING) << "Profiler is not initialized: skipping profiling metadata"; + } +#else + LOG(WARNING) << "Adding profiling metadata requires using " + << "torch.profiler with Kineto support (USE_KINETO=1)"; +#endif // USE_KINETO +} + +// ---------------------------------------------------------------------------- +// -- NVTX -------------------------------------------------------------------- +// 
---------------------------------------------------------------------------- +std::string getNvtxStr( + const char* name, + int64_t sequence_nr, + const std::vector>& shapes) { + if (sequence_nr >= -1 || shapes.size() > 0) { + std::stringstream s; +#if defined(USE_ROCM) + s << name; +#endif + if (sequence_nr >= 0) { +#if defined(USE_ROCM) + s << ", seq = " << sequence_nr; +#else + s << name << ", seq = " << sequence_nr; +#endif + } else if (sequence_nr == -1) { +#if !defined(USE_ROCM) + s << name; +#endif + } + if (shapes.size() > 0) { + s << ", sizes = ["; + for (const auto idx : c10::irange(shapes.size())) { + if (shapes[idx].size() > 0) { + s << "["; + for (size_t dim = 0; dim < shapes[idx].size(); ++dim) { + s << shapes[idx][dim]; + if (dim < shapes[idx].size() - 1) { + s << ", "; + } + } + s << "]"; + } else { + s << "[]"; + } + if (idx < shapes.size() - 1) { + s << ", "; + } + } + s << "]"; + } + return s.str(); + } else { + return name; + } +} + +// ---------------------------------------------------------------------------- +// -- Op context (shapes, call stack) ----------------------------------------- +// ---------------------------------------------------------------------------- +std::vector prepareCallstack( + const std::vector& cs) { + std::vector entries; + entries.reserve(cs.size()); + for (const auto& entry : cs) { + auto& range = entry.range; + if (range.source()) { + auto& src = range.source(); + if (src && src->filename()) { + auto line = + src->starting_line_no() + src->lineno_for_offset(range.start()); + entries.emplace_back( + FileLineFunc{*(src->filename()), line, entry.filename}); + } + } + } + return entries; +} + +std::vector callstackStr(const std::vector& cs) { + std::vector cs_str; + cs_str.reserve(cs.size()); + for (const auto& entry : cs) { + std::stringstream loc; + loc << entry.filename << "(" << entry.line << "): " << entry.funcname; + cs_str.push_back(loc.str()); + } + return cs_str; +} + +std::string stacksToStr( + const std::vector& stacks, + const char* delim) { + std::ostringstream oss; + std::transform( + stacks.begin(), + stacks.end(), + std::ostream_iterator(oss, delim), + [](std::string s) -> std::string { +#ifdef _WIN32 + // replace the windows backslash with forward slash + std::replace(s.begin(), s.end(), '\\', '/'); +#endif + return s; + }); + auto rc = oss.str(); + return "\"" + rc + "\""; +} + +std::vector> inputSizes(const at::RecordFunction& fn) { + std::vector> sizes; + sizes.reserve(fn.inputs().size()); + for (const c10::IValue& input : fn.inputs()) { + if (!input.isTensor()) { + sizes.emplace_back(); + continue; + } + const at::Tensor& tensor = input.toTensor(); + if (tensor.defined()) { + sizes.push_back(input.toTensor().sizes().vec()); + } else { + sizes.emplace_back(); + } + } + return sizes; +} + +std::string shapesToStr(const std::vector>& shapes) { + std::ostringstream oss; + oss << "["; + for (const auto t_idx : c10::irange(shapes.size())) { + if (t_idx > 0) { + oss << ", "; + } + oss << "["; + for (size_t s_idx = 0; s_idx < shapes[t_idx].size(); ++s_idx) { + if (s_idx > 0) { + oss << ", "; + } + oss << shapes[t_idx][s_idx]; + } + oss << "]"; + } + oss << "]"; + return oss.str(); +} + +std::string dtypesToStr(const std::vector& types) { + if (types.empty()) { + return "[]"; + } else { + std::ostringstream oss; + std::transform( + types.begin(), + types.end(), + std::ostream_iterator(oss, ", "), + [](std::string s) -> std::string { return "\"" + s + "\""; }); + auto rc = oss.str(); + rc.erase(rc.length() - 2); // remove last ", " 
+ return "[" + rc + "]"; + } +} + +std::vector inputTypes(const at::RecordFunction& fn) { + std::vector types; + types.reserve(fn.inputs().size()); + for (const c10::IValue& input : fn.inputs()) { + if (input.isTensor()) { + const at::Tensor& tensor = input.toTensor(); + if (tensor.defined()) { + types.push_back( + static_cast(input.toTensor().dtype().name())); + } else { + types.emplace_back(); + } + } else if (input.isScalar() || input.isList()) { + types.push_back(input.tagKind()); + } else { + types.emplace_back(); + } + } + return types; +} + +// ---------------------------------------------------------------------------- +// -- FLOPS ------------------------------------------------------------------- +// ---------------------------------------------------------------------------- +static constexpr auto kConv2dStride = 3; +static constexpr auto kConv2dPadding = 4; +static constexpr auto kConv2dDilation = 5; +static constexpr auto kConv2dGroups = 6; + +// List of supported operators +static constexpr auto kConv2dOp = "aten::conv2d"; +static constexpr auto kMMOp = "aten::mm"; +static constexpr auto kAddMMOp = "aten::addmm"; +static constexpr auto kMulOp = "aten::mul"; +static constexpr auto kAddOp = "aten::add"; +static constexpr auto kBMMOp = "aten::bmm"; +static constexpr auto kBAddBMMOp = "aten::baddbmm"; + +static constexpr auto kInputSize = "input_size"; +static constexpr auto kWeightSize = "weight_size"; +static constexpr auto kGroups = "groups"; +static constexpr auto kPadding = "padding"; +static constexpr auto kStride = "stride"; +static constexpr auto kDilation = "dilation"; +static constexpr auto kMatSize = "mat_size"; +static constexpr auto kMat1Size = "mat1_size"; +static constexpr auto kMat2Size = "mat2_size"; + +static bool validateInput( + const std::string& op_name, + size_t min_size, + const std::vector& inputs, + const c10::ArrayRef& should_be_tensor) { + std::stringstream ss; + if (inputs.size() < min_size) { + ss << "Failed to save extra arguments for flops compuation of op " + << op_name << ", min size: " << min_size + << ", actual size: " << inputs.size(); + TORCH_WARN(ss.str()); + return false; + } + for (auto index : should_be_tensor) { + if (!inputs[index].isTensor()) { + ss << "Failed to save extra arguments for flops compuation of op " + << op_name << ", input[" << index << "] must be a tensor."; + TORCH_WARN(ss.str()); + return false; + } + } + return true; +} + +std::unordered_map saveExtraArgs( + const at::RecordFunction& fn) { + // for specific types of fn, return the saved extra args for computing flops + std::unordered_map map; + std::vector inputs = fn.inputs(); + std::string fname(fn.name()); + + if (inputs.empty()) { + // Input shape is unavailable, return empty map + return map; + } + + if (fname == kConv2dOp) { + bool check = validateInput(fname, kConv2dGroups + 1, inputs, {0, 1}); + if (!check) { + return map; + } + + at::Tensor input = inputs[0].toTensor(); + at::Tensor weight = inputs[1].toTensor(); + if (weight.sizes().size() != 4) { + TORCH_WARN( + "Failed to compute flops for op aten::conv2d because it requires a 4D kernel tensor."); + return map; + } + map[kInputSize] = at::IValue(input.sizes()); + map[kWeightSize] = at::IValue(weight.sizes()); + map[kStride] = inputs[kConv2dStride]; + map[kPadding] = inputs[kConv2dPadding]; + map[kDilation] = inputs[kConv2dDilation]; + map[kGroups] = inputs[kConv2dGroups]; + } else if (fname == kMMOp) { + bool check = validateInput(fname, 2, inputs, {0, 1}); + if (!check) { + return map; + } + + at::Tensor 
left = inputs[0].toTensor(); + at::Tensor right = inputs[1].toTensor(); + map[kMat1Size] = at::IValue(left.sizes()); + map[kMat2Size] = at::IValue(right.sizes()); + } else if (fname == kAddMMOp) { + bool check = validateInput(fname, 3, inputs, {0, 1, 2}); + if (!check) { + return map; + } + + // Exact FLOP count depends on scaling factors alpha and beta but + // just assume these are +=1. + // (similar to http://www.netlib.org/lapack/lawnspdf/lawn41.pdf, + // "Operations Count for the BLAS and LAPACK", Table 3, SGEMM) + at::Tensor left = inputs[1].toTensor(); + at::Tensor right = inputs[2].toTensor(); + map[kMat1Size] = at::IValue(left.sizes()); + map[kMat2Size] = at::IValue(right.sizes()); + } else if (fname == kMulOp) { + bool check = validateInput(fname, 1, inputs, {0}); + if (!check) { + return map; + } + + at::Tensor mat = inputs[0].toTensor(); + map[kMatSize] = at::IValue(mat.sizes()); + } else if (fname == kAddOp) { + bool check = validateInput(fname, 1, inputs, {0}); + if (!check) { + return map; + } + + at::Tensor mat = inputs[0].toTensor(); + map[kMatSize] = at::IValue(mat.sizes()); + } else if (fname == kBMMOp) { + bool check = validateInput(fname, 2, inputs, {0, 1}); + if (!check) { + return map; + } + + at::Tensor left = inputs[0].toTensor(); + at::Tensor right = inputs[1].toTensor(); + map[kMat1Size] = at::IValue(left.sizes()); + map[kMat2Size] = at::IValue(right.sizes()); + } else if (fname == kBAddBMMOp) { + bool check = validateInput(fname, 3, inputs, {0, 1, 2}); + if (!check) { + return map; + } + + // Exact FLOP count depends on scaling factors alpha and beta but + // just assume these are +=1. + // (similar to http://www.netlib.org/lapack/lawnspdf/lawn41.pdf, + // "Operations Count for the BLAS and LAPACK", Table 3, SGEMM) + at::Tensor left = inputs[1].toTensor(); + at::Tensor right = inputs[2].toTensor(); + map[kMat1Size] = at::IValue(left.sizes()); + map[kMat2Size] = at::IValue(right.sizes()); + } + + return map; +} + +uint64_t computeFlops( + const std::string& op_name, + const std::unordered_map& extra_args) { + if (op_name == kConv2dOp) { + if (extra_args.find(kInputSize) == extra_args.end() || + extra_args.find(kWeightSize) == extra_args.end() || + extra_args.find(kGroups) == extra_args.end() || + extra_args.find(kPadding) == extra_args.end() || + extra_args.find(kStride) == extra_args.end() || + extra_args.find(kDilation) == extra_args.end()) { + TORCH_WARN( + "Calculating flops for aten::conv2d requires groups, padding, stride, dilation, input_size, and weight_size in saved arguments."); + return 0; + } + auto input_sizes_ref = extra_args.at(kInputSize); + auto kernel_sizes_ref = extra_args.at(kWeightSize); + auto groups_ref = extra_args.at(kGroups); + auto padding_ref = extra_args.at(kPadding); + auto stride_ref = extra_args.at(kStride); + auto dilation_ref = extra_args.at(kDilation); + if (!input_sizes_ref.isIntList() || !kernel_sizes_ref.isIntList()) { + TORCH_WARN( + "Failed to compute flops for op aten::conv2d because it requires input and weight tensor sizes."); + return 0; + } + if (!padding_ref.isIntList() || !stride_ref.isIntList() || + !dilation_ref.isIntList()) { + TORCH_WARN( + "Failed to compute flops for op aten::conv2d because it requires padding, stride, and dilation values."); + return 0; + } + + const std::vector input_sizes = input_sizes_ref.toIntVector(); + const std::vector kernel_sizes = kernel_sizes_ref.toIntVector(); + const uint64_t groups = groups_ref.toInt(); + const std::vector padding = padding_ref.toIntVector(); + const std::vector 
stride = stride_ref.toIntVector(); + const std::vector dilation = dilation_ref.toIntVector(); + if (input_sizes.size() != 4 || kernel_sizes.size() != 4) { + TORCH_WARN( + "Failed to compute flops for op aten::conv2d because both input and weight must be size 4."); + return 0; + } + if (!groups) { + TORCH_WARN( + "Failed to compute flops for op aten::conv2d because group size must not be 0."); + return 0; + } + if (padding.size() != 2 || dilation.size() != 2) { + TORCH_WARN( + "Failed to compute flops for op aten::conv2d because both padding and dilation must be size 2."); + return 0; + } + if (stride.size() != 2 || (stride[0] * stride[1] == 0)) { + TORCH_WARN( + "Failed to compute flops for op aten::conv2d because stride must be size 2 and cannot be 0."); + return 0; + } + // format of the input is defined in torch.nn.quantized.functional.conv2d() + uint64_t minibatch = 0, in_channels = 0, input_h = 0, input_w = 0; + uint64_t out_channels = 0, kernel_h = 0, kernel_w = 0; + const uint64_t conv2d_multiply_factor = 2; + std::tie(minibatch, in_channels, input_h, input_w) = std::make_tuple( + input_sizes[0], input_sizes[1], input_sizes[2], input_sizes[3]); + std::tie(out_channels, std::ignore, kernel_h, kernel_w) = std::make_tuple( + kernel_sizes[0], kernel_sizes[1], kernel_sizes[2], kernel_sizes[3]); + uint64_t output_h = + (input_h + 2 * padding[0] - dilation[0] * (kernel_h - 1) - 1) / + stride[0] + + 1; + uint64_t output_w = + (input_w + 2 * padding[1] - dilation[1] * (kernel_w - 1) - 1) / + stride[1] + + 1; + + return conv2d_multiply_factor * minibatch * output_h * output_w * kernel_h * + kernel_w * in_channels * out_channels / groups; + } else if (op_name == kMMOp || op_name == kAddMMOp) { + if (extra_args.find(kMat1Size) == extra_args.end() || + extra_args.find(kMat2Size) == extra_args.end()) { + TORCH_WARN( + "Calculating flops for ", + op_name, + " requires mat1_size and mat2_size in saved arguments."); + return 0; + } + auto mat1_sizes_ref = extra_args.at(kMat1Size); + auto mat2_sizes_ref = extra_args.at(kMat2Size); + if (!mat1_sizes_ref.isIntList() || !mat2_sizes_ref.isIntList()) { + TORCH_WARN( + "Failed to compute flops for op ", + op_name, + " because it requires mat1_size and mat2_size to be IntList."); + return 0; + } + + std::vector mat1_size = mat1_sizes_ref.toIntVector(); + std::vector mat2_size = mat2_sizes_ref.toIntVector(); + if (mat1_size.size() == 0) { + return 0; + } + + int64_t overlap_dim = mat1_size.back(); + if (overlap_dim == 0) { + return 0; + } + + const uint64_t gemm_multiply_factor = 2; + uint64_t flops = 1; + for (int64_t dim : mat1_size) { + flops *= dim; + } + flops /= overlap_dim; + for (int64_t dim : mat2_size) { + flops *= dim; + } + flops *= gemm_multiply_factor; + return flops; + } else if (op_name == kBMMOp || op_name == kBAddBMMOp) { + if (extra_args.find(kMat1Size) == extra_args.end() || + extra_args.find(kMat2Size) == extra_args.end()) { + TORCH_WARN( + "Calculating flops for ", + op_name, + " requires mat1_size and mat2_size in saved arguments."); + return 0; + } + auto mat1_sizes_ref = extra_args.at(kMat1Size); + auto mat2_sizes_ref = extra_args.at(kMat2Size); + if (!mat1_sizes_ref.isIntList() || !mat2_sizes_ref.isIntList()) { + TORCH_WARN( + "Failed to compute flops for op ", + op_name, + " because it requires mat1_size and mat2_size to be IntList."); + return 0; + } + + std::vector mat1_size = mat1_sizes_ref.toIntVector(); + std::vector mat2_size = mat2_sizes_ref.toIntVector(); + if (mat1_size.size() == 0) { + return 0; + } + + int64_t batch_size 
= mat1_size.front(); + if (batch_size == 0) { + return 0; + } + + int64_t overlap_dim = mat1_size.back(); + if (overlap_dim == 0) { + return 0; + } + + const uint64_t gemm_multiply_factor = 2; + uint64_t flops = 1; + for (int64_t dim : mat1_size) { + flops *= dim; + } + flops /= overlap_dim; + flops /= batch_size; + for (int64_t dim : mat2_size) { + flops *= dim; + } + flops *= gemm_multiply_factor; + return flops; + } else if (op_name == kMulOp) { + if (extra_args.find(kMatSize) == extra_args.end()) { + TORCH_WARN( + "Calculating flops for aten::mul.Tensor requires mat_size in saved arguments."); + return 0; + } + auto mat_sizes = extra_args.at(kMatSize); + if (!mat_sizes.isIntList()) { + TORCH_WARN( + "Failed to compute flops for op aten::mul because it requires mat_size to be IntList."); + return 0; + } + + std::vector mat_size = mat_sizes.toIntVector(); + uint64_t flops = 1; + for (int64_t dim : mat_size) { + flops *= dim; + } + return flops; + } else if (op_name == kAddOp) { + if (extra_args.find(kMatSize) == extra_args.end()) { + TORCH_WARN( + "Calculating flops for aten::add.Tensor requires mat_size in saved arguments."); + return 0; + } + auto mat_sizes = extra_args.at(kMatSize); + if (!mat_sizes.isIntList()) { + TORCH_WARN( + "Failed to compute flops for op aten::add because it requires mat_size to be IntList."); + return 0; + } + + std::vector mat_size = mat_sizes.toIntVector(); + uint64_t flops = 1; + for (int64_t dim : mat_size) { + flops *= dim; + } + return flops; + } + return 0; +} + +} // namespace impl +} // namespace profiler +} // namespace torch diff --git a/torch/csrc/profiler/util.h b/torch/csrc/profiler/util.h new file mode 100644 index 00000000000..cb7782e6b42 --- /dev/null +++ b/torch/csrc/profiler/util.h @@ -0,0 +1,119 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifndef _WIN32 +#include +#endif +#if defined(C10_IOS) && defined(C10_MOBILE) +#include // for gettimeofday() +#endif + +// skip Kineto dependency on mobile unless explicitly asked for. +// When is it explicitly asked for? +// KinetoEdgeCPUProfiler uses KinetoProfiler for cpu +// event profiling. This has dependency on cpu only libkineto +#if defined(USE_KINETO) && defined(C10_MOBILE) && \ + !defined(EDGE_PROFILER_USE_KINETO) +#undef USE_KINETO +#endif + +namespace torch { +namespace profiler { + +#ifdef USE_KINETO +constexpr bool kKinetoAvailable {true}; +#else +constexpr bool kKinetoAvailable {false}; +#endif + +namespace impl { + +inline int64_t getTime(bool allow_monotonic = false) { +#if defined(C10_IOS) && defined(C10_MOBILE) + // clock_gettime is only available on iOS 10.0 or newer. 
Unlike OS X, iOS + // can't rely on CLOCK_REALTIME, as it is defined no matter if clock_gettime + // is implemented or not + struct timeval now; + gettimeofday(&now, NULL); + return static_cast(now.tv_sec) * 1000000000 + + static_cast(now.tv_usec) * 1000; +#elif defined(_WIN32) || defined(__MACH__) + using namespace std::chrono; + using clock = std::conditional< + high_resolution_clock::is_steady, + high_resolution_clock, + steady_clock>::type; + return duration_cast(clock::now().time_since_epoch()).count(); +#else + // clock_gettime is *much* faster than std::chrono implementation on Linux + struct timespec t {}; + auto mode = CLOCK_REALTIME; + if (allow_monotonic) { + mode = CLOCK_MONOTONIC; + } + clock_gettime(mode, &t); + return static_cast(t.tv_sec) * 1000000000 + + static_cast(t.tv_nsec); +#endif +} + +// NB: This only works if USE_KINETO is set. (Otherwise it just logs a warning) +TORCH_API void addMetadataJson( + const std::string& key, + const std::string& value); + +std::string getNvtxStr( + const char* name, + int64_t sequence_nr, + const std::vector>& shapes); + +struct TORCH_API FileLineFunc { + std::string filename; + size_t line; + std::string funcname; +}; + +TORCH_API std::vector prepareCallstack( + const std::vector& cs); +TORCH_API std::vector callstackStr( + const std::vector& cs); +TORCH_API std::string stacksToStr( + const std::vector& stacks, + const char* delim); +TORCH_API std::vector> inputSizes( + const at::RecordFunction& fn); +TORCH_API std::string shapesToStr( + const std::vector>& shapes); +TORCH_API std::string dtypesToStr(const std::vector& types); +TORCH_API std::vector inputTypes(const at::RecordFunction& fn); + +std::unordered_map TORCH_API +saveExtraArgs(const at::RecordFunction& fn); + +uint64_t TORCH_API computeFlops( + const std::string& op_name, + const std::unordered_map& extra_args); + +} // namespace impl +} // namespace profiler +} // namespace torch + +namespace torch { +namespace autograd { +namespace profiler { +using torch::profiler::impl::getTime; +using torch::profiler::impl::computeFlops; +} // namespace profiler +} // namespace autograd +} // namespace torch
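
As a usage sketch (not part of the diff above): the helpers consolidated into `torch/csrc/profiler/util.h` can be exercised directly, much like the Kineto and legacy profilers do in the hunks above. The callback name below is hypothetical and the wiring is assumed; only the `torch::profiler::impl` functions are defined by this patch, and the sketch assumes a build that links against torch.

#include <string>
#include <ATen/record_function.h>
#include <c10/util/Logging.h>
#include <torch/csrc/profiler/util.h>

// Hypothetical helper: summarize one op observed via at::RecordFunction.
void logOpStats(const at::RecordFunction& fn) {
  namespace prof = torch::profiler::impl;
  // Stringified input shapes/dtypes, in the same format the Kineto profiler
  // attaches to trace activities ("Input Dims" / "Input type").
  const std::string shapes = prof::shapesToStr(prof::inputSizes(fn));
  const std::string dtypes = prof::dtypesToStr(prof::inputTypes(fn));
  // Estimated FLOP count for the ops computeFlops() understands
  // (aten::conv2d, aten::mm, aten::addmm, aten::bmm, ...); 0 otherwise.
  const auto extra_args = prof::saveExtraArgs(fn);
  const uint64_t flops = prof::computeFlops(std::string(fn.name()), extra_args);
  LOG(INFO) << std::string(fn.name()) << ": shapes=" << shapes
            << " dtypes=" << dtypes << " flops=" << flops;
}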
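
For reference, the `aten::conv2d` branch of `computeFlops` evaluates 2 * N * H_out * W_out * K_h * K_w * C_in * C_out / groups, with H_out = (H_in + 2*padding - dilation*(K_h - 1) - 1) / stride + 1 (and similarly for W_out). As an illustrative calculation (not taken from the patch): a 7x7 convolution over a 1x3x224x224 input with 64 output channels, stride 2, padding 3, dilation 1, groups 1 gives H_out = W_out = 112, so roughly 2 * 1 * 112 * 112 * 7 * 7 * 3 * 64 = ~2.36e8 FLOPs.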