From 0d394cb252e6eac78626b467e0bb497d6d6ae86c Mon Sep 17 00:00:00 2001
From: Taylor Robie
Date: Sun, 20 Feb 2022 14:32:29 -0800
Subject: [PATCH] [Profiler] Optimize `reportMemoryUsage` (#71538)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/71538

`reportMemoryUsage` is kind of awful. It does a bunch of string writes and
such that makes it VERY expensive. Just moving that work off the hot path
reduces the overhead for `profile_memory` from ~6.5 us to ~1.2 us. (85%
reduction in the kineto contribution to profiling overhead.)

Test Plan: Ran ubenchmark with `--op empty --stressTestKineto --kinetoProfileMemory`

Reviewed By: swolchok

Differential Revision: D32730167

fbshipit-source-id: fe18e8fa3881967cad8fa1c26c71c805e9b034e5
---
 torch/csrc/autograd/profiler_kineto.cpp | 69 ++++++++++++++++---------
 1 file changed, 46 insertions(+), 23 deletions(-)

diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp
index 206bd52f6d7..cc42e6d67ac 100644
--- a/torch/csrc/autograd/profiler_kineto.cpp
+++ b/torch/csrc/autograd/profiler_kineto.cpp
@@ -155,6 +155,19 @@ struct OpEventData {
   torch::profiler::impl::CUDAEventStub cuda_event_end_ = nullptr;
 };
 
+struct MemoryEventData {
+  int64_t start_time;
+  void* ptr;
+  int64_t alloc_size;
+  int64_t total_allocated;
+  int64_t total_reserved;
+  uint64_t threadID;
+  torch::profiler::impl::kineto::DeviceAndResource kineto_info;
+  c10::DeviceType device_type;
+  c10::DeviceIndex device_index;
+};
+static_assert(std::is_pod<MemoryEventData>::value, "Non-POD member of MemoryEventData.");
+
 // Assumption: Total threads number will not exceed 2^16-1, and total ops will
 // not exceed 2^48 -1.
 static inline uint64_t getForwardThreadKey(uint64_t tid, uint64_t seqNr) {
@@ -204,29 +217,16 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase {
       int64_t total_reserved,
       c10::Device device) override {
     if (config_.profile_memory && config_.state != ProfilerState::Disabled) {
-      std::lock_guard<std::mutex> guard(state_mutex_);
-      auto start_time = getTimeUs();
-      if (cpu_trace_) {
-        torch::profiler::impl::kineto::recordThreadInfo();
-        cpu_trace_.addMemoryUsageActivity(
-            kMemoryEventName,
-            torch::profiler::impl::kineto::kineto_ids(),
-            start_time,
-            device,
-            ptr,
-            alloc_size,
-            total_allocated,
-            total_reserved);
-      }
-
-      kineto_events_.emplace_back();
-      auto& evt = kineto_events_.back();
-      evt.name(kMemoryEventName)
-          .startUs(start_time)
-          .deviceIndex(device.index())
-          .deviceType(device.type())
-          .nBytes(alloc_size)
-          .startThreadId(at::RecordFunction::currentThreadId());
+      memory_events_.push_back(
+          {getTimeUs(),
+           ptr,
+           alloc_size,
+           total_allocated,
+           total_reserved,
+           at::RecordFunction::currentThreadId(),
+           torch::profiler::impl::kineto::kineto_ids(),
+           device.type(),
+           device.index()});
     }
   }
 
@@ -264,6 +264,28 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase {
 
   void materializeOpEvents() {
     std::lock_guard<std::mutex> guard(state_mutex_);
+
+    for (const auto& e : memory_events_) {
+      cpu_trace_.addMemoryUsageActivity(
+          kMemoryEventName,
+          e.kineto_info,
+          e.start_time,
+          c10::Device(e.device_type, e.device_index),
+          e.ptr,
+          e.alloc_size,
+          e.total_allocated,
+          e.total_reserved);
+
+      kineto_events_.emplace_back();
+      auto& evt = kineto_events_.back();
+      evt.name(kMemoryEventName)
+          .startUs(e.start_time)
+          .deviceIndex(e.device_index)
+          .deviceType(e.device_type)
+          .nBytes(e.alloc_size)
+          .startThreadId(e.threadID);
+    }
+
     for (const auto& e : op_events_) {
       if (e.end_us_ < e.start_us_) {
         // We initialize end_us_ to the smallest int64_t, so this means that
@@ -585,6 +607,7 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase {
   uint64_t start_time_;
   std::set<torch::profiler::impl::ActivityType> activities_;
   std::deque<OpEventData> op_events_;
+  std::deque<MemoryEventData> memory_events_;
   torch::profiler::impl::kineto::TraceWrapper cpu_trace_;
   std::vector<KinetoEvent> kineto_events_;
   // Optional, if event post-processing is enabled.
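
Editorial note (not part of the upstream patch): the change is an instance of a
deferred-materialization pattern: on the allocation hot path, only a trivially
copyable struct is recorded; the string-heavy profiler events are built later,
when the trace is finalized. The sketch below is a minimal, self-contained
illustration of that pattern under stated assumptions; RawMemoryEvent and
ToyProfiler are hypothetical names, not PyTorch APIs.

#include <cstdint>
#include <deque>
#include <mutex>
#include <string>
#include <type_traits>
#include <vector>

// Trivially copyable record captured on the hot path: cheap to copy, no
// string formatting or heap-allocated members.
struct RawMemoryEvent {
  int64_t start_time_us;
  void* ptr;
  int64_t alloc_size;
};
static_assert(std::is_trivially_copyable<RawMemoryEvent>::value,
              "RawMemoryEvent must stay trivially copyable.");

class ToyProfiler {
 public:
  // Hot path: just store the raw fields.
  void reportMemoryUsage(int64_t time_us, void* ptr, int64_t alloc_size) {
    std::lock_guard<std::mutex> guard(mutex_);
    raw_events_.push_back({time_us, ptr, alloc_size});
  }

  // Cold path: build the expensive, human-readable events after profiling.
  std::vector<std::string> materialize() {
    std::lock_guard<std::mutex> guard(mutex_);
    std::vector<std::string> out;
    out.reserve(raw_events_.size());
    for (const auto& e : raw_events_) {
      out.push_back("[memory] t=" + std::to_string(e.start_time_us) +
                    "us size=" + std::to_string(e.alloc_size));
    }
    return out;
  }

 private:
  std::mutex mutex_;
  std::deque<RawMemoryEvent> raw_events_;
};

A std::deque is used here, mirroring the patch; appending to a deque never
relocates previously stored elements, so references to earlier events stay
valid while recording continues.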