[Profiler] Optimize reportMemoryUsage (#71538)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/71538

`reportMemoryUsage` is kind of awful. It does a bunch of string writes and the like that make it VERY expensive. Just moving that work off the hot path reduces the overhead for `profile_memory` from ~6.5 us to ~1.2 us (an 85% reduction in the Kineto contribution to profiling overhead).

Test Plan: Ran ubenchmark with `--op empty --stressTestKineto --kinetoProfileMemory`

Reviewed By: swolchok

Differential Revision: D32730167

fbshipit-source-id: fe18e8fa3881967cad8fa1c26c71c805e9b034e5
This commit is contained in:
parent a1d18e3e6a
commit 0d394cb252
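Before the diff, a minimal sketch of the idea described in the summary. The names AllocRecord, g_alloc_records, and reportAlloc are illustrative stand-ins rather than the PR's actual types: the allocator-side hot path only appends a small POD record, and every string write or Kineto call is deferred until the trace is materialized.

// Sketch only (hypothetical names; assumes taking a lock on this path is acceptable).
#include <cstdint>
#include <deque>
#include <mutex>

struct AllocRecord {                     // stand-in for MemoryEventData: scalars and raw pointers only
  int64_t start_time_us;
  void* ptr;
  int64_t alloc_size;
  int64_t total_allocated;
  int64_t total_reserved;
};

std::deque<AllocRecord> g_alloc_records; // stand-in for memory_events_
std::mutex g_records_mutex;

// Hot path: called for every allocation while profile_memory is enabled.
// No string formatting, no Kineto activity construction, just a cheap append.
void reportAlloc(void* ptr, int64_t alloc_size, int64_t total_allocated,
                 int64_t total_reserved, int64_t now_us) {
  std::lock_guard<std::mutex> guard(g_records_mutex);
  g_alloc_records.push_back(
      {now_us, ptr, alloc_size, total_allocated, total_reserved});
}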
@@ -155,6 +155,19 @@ struct OpEventData {
   torch::profiler::impl::CUDAEventStub cuda_event_end_ = nullptr;
 };
 
+struct MemoryEventData {
+  int64_t start_time;
+  void* ptr;
+  int64_t alloc_size;
+  int64_t total_allocated;
+  int64_t total_reserved;
+  uint64_t threadID;
+  torch::profiler::impl::kineto::DeviceAndResource kineto_info;
+  c10::DeviceType device_type;
+  c10::DeviceIndex device_index;
+};
+static_assert(std::is_pod<MemoryEventData>::value, "Non-POD member of MemoryEventData.");
+
 // Assumption: Total threads number will not exceed 2^16-1, and total ops will
 // not exceed 2^48 -1.
 static inline uint64_t getForwardThreadKey(uint64_t tid, uint64_t seqNr) {
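An aside on the static_assert above: keeping MemoryEventData a POD means recording an event is a handful of scalar copies with no hidden allocations, and the assert turns an accidental regression (say, later adding a std::string member) into a compile error. A small illustration with hypothetical types:

#include <cstdint>
#include <string>
#include <type_traits>

struct PodEvent {       // hypothetical: trivially copyable, allocation-free
  int64_t start_time;
  void* ptr;
  int64_t alloc_size;
};

struct NonPodEvent {    // hypothetical: a std::string member breaks POD-ness
  int64_t start_time;
  std::string name;
};

static_assert(std::is_pod<PodEvent>::value, "PodEvent must remain POD.");
static_assert(!std::is_pod<NonPodEvent>::value, "A std::string member disqualifies the type.");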
@@ -204,29 +217,16 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase {
       int64_t total_reserved,
       c10::Device device) override {
     if (config_.profile_memory && config_.state != ProfilerState::Disabled) {
-      std::lock_guard<std::mutex> guard(state_mutex_);
-      auto start_time = getTimeUs();
-      if (cpu_trace_) {
-        torch::profiler::impl::kineto::recordThreadInfo();
-        cpu_trace_.addMemoryUsageActivity(
-            kMemoryEventName,
-            torch::profiler::impl::kineto::kineto_ids(),
-            start_time,
-            device,
-            ptr,
-            alloc_size,
-            total_allocated,
-            total_reserved);
-      }
-
-      kineto_events_.emplace_back();
-      auto& evt = kineto_events_.back();
-      evt.name(kMemoryEventName)
-          .startUs(start_time)
-          .deviceIndex(device.index())
-          .deviceType(device.type())
-          .nBytes(alloc_size)
-          .startThreadId(at::RecordFunction::currentThreadId());
+      memory_events_.push_back(
+          {getTimeUs(),
+           ptr,
+           alloc_size,
+           total_allocated,
+           total_reserved,
+           at::RecordFunction::currentThreadId(),
+           torch::profiler::impl::kineto::kineto_ids(),
+           device.type(),
+           device.index()});
     }
   }
 
@@ -264,6 +264,28 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase {
 
   void materializeOpEvents() {
     std::lock_guard<std::mutex> guard(state_mutex_);
 
+    for (const auto& e : memory_events_) {
+      cpu_trace_.addMemoryUsageActivity(
+          kMemoryEventName,
+          e.kineto_info,
+          e.start_time,
+          c10::Device(e.device_type, e.device_index),
+          e.ptr,
+          e.alloc_size,
+          e.total_allocated,
+          e.total_reserved);
+
+      kineto_events_.emplace_back();
+      auto& evt = kineto_events_.back();
+      evt.name(kMemoryEventName)
+          .startUs(e.start_time)
+          .deviceIndex(e.device_index)
+          .deviceType(e.device_type)
+          .nBytes(e.alloc_size)
+          .startThreadId(e.threadID);
+    }
+
     for (const auto& e : op_events_) {
       if (e.end_us_ < e.start_us_) {
         // We initialize end_us_ to the smallest int64_t, so this means that
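For completeness, a sketch of the consumer side of this pattern (hypothetical names, same AllocRecord shape as the earlier sketch, not the PR's API): at finalization each recorded POD is expanded once into its expensive, string-bearing representation, so that cost is paid off the allocator hot path and under a single lock acquisition.

#include <cstdint>
#include <deque>
#include <sstream>
#include <string>
#include <vector>

struct AllocRecord {            // the cheap POD captured on the hot path
  int64_t start_time_us;
  void* ptr;
  int64_t alloc_size;
};

// Deferred, string-producing step: runs once per recorded event when the
// trace is materialized, not on every allocation.
std::vector<std::string> materializeAllocEvents(const std::deque<AllocRecord>& records) {
  std::vector<std::string> out;
  out.reserve(records.size());
  for (const auto& r : records) {
    std::ostringstream ss;
    ss << "[memory] t=" << r.start_time_us << "us ptr=" << r.ptr
       << " bytes=" << r.alloc_size;
    out.push_back(ss.str());
  }
  return out;
}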
@@ -585,6 +607,7 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase {
   uint64_t start_time_;
   std::set<torch::profiler::impl::ActivityType> activities_;
   std::deque<OpEventData> op_events_;
+  std::deque<MemoryEventData> memory_events_;
   torch::profiler::impl::kineto::TraceWrapper cpu_trace_;
   std::vector<KinetoEvent> kineto_events_;
   // Optional, if event post-processing is enabled.