[Profiler][Minor] Group and consolidate stub APIs (#85510)

There is a concept in the profiler of a stub that wraps a profiling API. It was introduced for CUDA profiling before Kineto, and ITT has adopted it to call into VTune APIs. However, for the most part we don't really interact with them when developing the PyTorch profiler.

Thus it makes sense to unify the fallback registration mechanism and create a subfolder to free up real estate in the top level `torch/csrc/profiler` directory.

Differential Revision: [D39108647](https://our.internmc.facebook.com/intern/diff/D39108647/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/85510
Approved by: https://github.com/aaronenyeshi
This commit is contained in:
Taylor Robie 2022-10-13 07:48:58 -07:00 committed by PyTorch MergeBot
parent bc4ca4c2c4
commit b8f14b7877
15 changed files with 144 additions and 180 deletions

View File

@ -133,7 +133,6 @@ libtorch_sources_common = sorted(core_sources_common + torch_unpickler_common)
libtorch_profiler_sources = [
"torch/csrc/autograd/profiler_legacy.cpp",
"torch/csrc/autograd/profiler_kineto.cpp",
"torch/csrc/profiler/api.cpp",
"torch/csrc/profiler/collection.cpp",
"torch/csrc/profiler/execution_graph_observer.cpp",
"torch/csrc/profiler/kineto_shim.cpp",
@ -142,6 +141,7 @@ libtorch_profiler_sources = [
"torch/csrc/profiler/itt_observer.cpp",
"torch/csrc/profiler/orchestration/observer.cpp",
"torch/csrc/profiler/orchestration/python_tracer.cpp",
"torch/csrc/profiler/stubs/base.cpp",
"torch/csrc/monitor/counters.cpp",
"torch/csrc/monitor/events.cpp",
]
@ -661,7 +661,7 @@ libtorch_cuda_core_sources = [
"torch/csrc/cuda/comm.cpp",
"torch/csrc/cuda/memory_snapshot.cpp",
"torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp",
"torch/csrc/profiler/cuda.cpp",
"torch/csrc/profiler/stubs/cuda.cpp",
"torch/csrc/autograd/functions/comm.cpp",
"torch/csrc/jit/codegen/cuda/arith.cpp",
"torch/csrc/jit/codegen/cuda/compute_at.cpp",

View File

@ -585,7 +585,7 @@ endif()
if(${USE_ITT})
list(APPEND TORCH_SRCS
${TORCH_SRC_DIR}/csrc/itt_wrapper.cpp
${TORCH_SRC_DIR}/csrc/profiler/itt.cpp
${TORCH_SRC_DIR}/csrc/profiler/stubs/itt.cpp
)
endif()

View File

@ -1115,6 +1115,7 @@ def main():
'include/torch/csrc/onnx/*.h',
'include/torch/csrc/profiler/*.h',
'include/torch/csrc/profiler/orchestration/*.h',
'include/torch/csrc/profiler/stubs/*.h',
'include/torch/csrc/utils/*.h',
'include/torch/csrc/tensor/*.h',
'include/torch/csrc/lazy/backend/*.h',

View File

@ -4,6 +4,7 @@
#include <vector>
#include <torch/csrc/profiler/api.h>
#include <torch/csrc/profiler/stubs/base.h>
#include <torch/csrc/profiler/util.h>
namespace torch {

View File

@ -12,6 +12,7 @@
#include <torch/csrc/Export.h>
#include <torch/csrc/profiler/api.h>
#include <torch/csrc/profiler/stubs/base.h>
#include <torch/csrc/profiler/util.h>
namespace torch {

View File

@ -1,6 +1,6 @@
#include <c10/macros/Export.h>
#include <ittnotify.h>
#include <torch/csrc/profiler/api.h>
#include <torch/csrc/profiler/stubs/base.h>
namespace torch {
namespace profiler {

View File

@ -1,129 +0,0 @@
#include <torch/csrc/profiler/api.h>
#include <torch/csrc/profiler/util.h>
namespace torch {
namespace profiler {
namespace impl {
ProfilerStubs::~ProfilerStubs() = default;
namespace {
// Fallback used when the CUDA profiling backend is not compiled in: every
// profiling entry point reports an error, and enabled() reports false.
struct DefaultCUDAStubs : public ProfilerStubs {
  ~DefaultCUDAStubs() override = default;

  bool enabled() const override {
    return false;
  }
  void record(int*, ProfilerEventStub*, int64_t*) const override {
    fail();
  }
  float elapsed(const ProfilerEventStub*, const ProfilerEventStub*)
      const override {
    fail();
    return 0.f; // not reached if fail() raises, value only satisfies signature
  }
  void mark(const char*) const override {
    fail();
  }
  void rangePush(const char*) const override {
    fail();
  }
  void rangePop() const override {
    fail();
  }
  void onEachDevice(std::function<void(int)>) const override {
    fail();
  }
  void synchronize() const override {
    fail();
  }

 private:
  // Single shared error path for all unsupported operations.
  void fail() const {
    AT_ERROR("CUDA used in profiler but not enabled.");
  }
};
// File-scope fallback instance; lives for the duration of the program.
const DefaultCUDAStubs default_cuda_stubs;
// Storing the address in a constexpr pointer forces constant initialization.
constexpr const DefaultCUDAStubs* default_cuda_stubs_addr = &default_cuda_stubs;
// Constant initialization, so it is guaranteed to be initialized before
// static initialization calls which may invoke registerCUDAMethods
inline const ProfilerStubs*& cuda_stubs() {
  // Mutable slot holding the active stub; starts as the default fallback.
  static const ProfilerStubs* stubs_ =
      static_cast<const ProfilerStubs*>(default_cuda_stubs_addr);
  return stubs_;
}
// Fallback used when the ITT (VTune) profiling backend is not compiled in:
// every profiling entry point reports an error, and enabled() reports false.
struct DefaultITTStubs : public ProfilerStubs {
  ~DefaultITTStubs() override = default;

  bool enabled() const override {
    return false;
  }
  void record(int*, ProfilerEventStub*, int64_t*) const override {
    fail();
  }
  float elapsed(const ProfilerEventStub*, const ProfilerEventStub*)
      const override {
    fail();
    return 0.f; // not reached if fail() raises, value only satisfies signature
  }
  void mark(const char*) const override {
    fail();
  }
  void rangePush(const char*) const override {
    fail();
  }
  void rangePop() const override {
    fail();
  }
  void onEachDevice(std::function<void(int)>) const override {
    fail();
  }
  void synchronize() const override {
    fail();
  }

 private:
  // Single shared error path for all unsupported operations.
  void fail() const {
    AT_ERROR("ITT used in profiler but not enabled.");
  }
};
// File-scope fallback instance; lives for the duration of the program.
const DefaultITTStubs default_itt_stubs;
// Storing the address in a constexpr pointer forces constant initialization.
constexpr const DefaultITTStubs* default_itt_stubs_addr = &default_itt_stubs;
// Constant initialization, so it is guaranteed to be initialized before
// static initialization calls which may invoke registerITTMethods
inline const ProfilerStubs*& itt_stubs() {
  // Mutable slot holding the active stub; starts as the default fallback.
  static const ProfilerStubs* stubs_ =
      static_cast<const ProfilerStubs*>(default_itt_stubs_addr);
  return stubs_;
}
} // namespace
// Returns the active CUDA profiler stub (the error-reporting default unless
// a real implementation has been registered).
const ProfilerStubs* cudaStubs() {
  return cuda_stubs();
}
// Installs a real CUDA implementation; see the constant-initialization note
// above cuda_stubs() for why this is safe during static initialization.
void registerCUDAMethods(ProfilerStubs* stubs) {
  cuda_stubs() = stubs;
}
// Returns the active ITT profiler stub (the error-reporting default unless
// a real implementation has been registered).
const ProfilerStubs* ittStubs() {
  return itt_stubs();
}
// Installs a real ITT implementation; see the constant-initialization note
// above itt_stubs() for why this is safe during static initialization.
void registerITTMethods(ProfilerStubs* stubs) {
  itt_stubs() = stubs;
}
} // namespace impl
} // namespace profiler
} // namespace torch

View File

@ -1,46 +1,7 @@
#pragma once
#include <ATen/record_function.h>
#include <torch/csrc/Export.h>
#include <torch/csrc/profiler/orchestration/observer.h>
struct CUevent_st;
namespace torch {
namespace profiler {
namespace impl {
// ----------------------------------------------------------------------------
// -- Annotation --------------------------------------------------------------
// ----------------------------------------------------------------------------
// Shared-ownership handle to a CUDA event. CUevent_st is only forward
// declared (above), so this header does not require the CUDA toolkit.
using ProfilerEventStub = std::shared_ptr<CUevent_st>;
// Abstract interface wrapping a vendor profiling API (CUDA, ITT). Backends
// install concrete implementations via the register*Methods hooks below.
struct TORCH_API ProfilerStubs {
  // Records an event; outputs (device, event, cpu_ns) are backend-defined.
  virtual void record(int* device, ProfilerEventStub* event, int64_t* cpu_ns)
      const = 0;
  // Elapsed time between two recorded events. NOTE(review): presumably
  // milliseconds to mirror cudaEventElapsedTime — confirm per backend.
  virtual float elapsed(
      const ProfilerEventStub* event,
      const ProfilerEventStub* event2) const = 0;
  // Emits an instantaneous named marker.
  virtual void mark(const char* name) const = 0;
  // Opens / closes a named profiling range (ranges may nest).
  virtual void rangePush(const char* name) const = 0;
  virtual void rangePop() const = 0;
  // Whether a real backend is active; defaults to false for stubs.
  virtual bool enabled() const {
    return false;
  }
  // Runs `op` once per device id visible to the backend.
  virtual void onEachDevice(std::function<void(int)> op) const = 0;
  // Blocks until outstanding backend work completes.
  virtual void synchronize() const = 0;
  virtual ~ProfilerStubs();
};
// Registration hooks: a backend calls register*Methods (typically at static
// initialization) to replace the default error-reporting stub.
TORCH_API void registerCUDAMethods(ProfilerStubs* stubs);
TORCH_API const ProfilerStubs* cudaStubs();
TORCH_API void registerITTMethods(ProfilerStubs* stubs);
TORCH_API const ProfilerStubs* ittStubs();
} // namespace impl
} // namespace profiler
} // namespace torch
// There are some components which use these symbols. Until we migrate them
// we have to mirror them in the old autograd namespace.
namespace torch {

View File

@ -16,6 +16,7 @@
#include <torch/csrc/profiler/containers.h>
#include <torch/csrc/profiler/kineto_shim.h>
#include <torch/csrc/profiler/orchestration/python_tracer.h>
#include <torch/csrc/profiler/stubs/base.h>
#include <torch/csrc/profiler/util.h>
#include <torch/csrc/utils/python_stub.h>

View File

@ -1,5 +1,6 @@
#include <torch/csrc/profiler/itt_observer.h>
#include <torch/csrc/profiler/stubs/base.h>
#include <torch/csrc/profiler/util.h>
namespace torch {

View File

@ -1,5 +1,6 @@
#include <torch/csrc/profiler/nvtx_observer.h>
#include <torch/csrc/profiler/stubs/base.h>
#include <torch/csrc/profiler/util.h>
namespace torch {

View File

@ -0,0 +1,81 @@
#include <torch/csrc/profiler/stubs/base.h>
#include <c10/util/Exception.h>
namespace torch {
namespace profiler {
namespace impl {
ProfilerStubs::~ProfilerStubs() = default;
namespace {
// Shared fallback for all profiling backends (CUDA, ITT): every profiling
// entry point reports an error naming the missing backend, and enabled()
// reports false. One instance per backend is stamped out by
// REGISTER_DEFAULT below.
struct DefaultStubs final : public ProfilerStubs {
  // `name` must outlive the stub; REGISTER_DEFAULT passes string literals.
  // explicit: a bare const char* should never silently convert to a stub.
  explicit DefaultStubs(const char* name) : name_{name} {}

  void record(int*, ProfilerEventStub*, int64_t*) const override {
    fail();
  }
  float elapsed(const ProfilerEventStub*, const ProfilerEventStub*)
      const override {
    fail();
    return 0.f; // not reached if fail() raises, value only satisfies signature
  }
  void mark(const char*) const override {
    fail();
  }
  void rangePush(const char*) const override {
    fail();
  }
  void rangePop() const override {
    fail();
  }
  bool enabled() const override {
    return false;
  }
  void onEachDevice(std::function<void(int)>) const override {
    fail();
  }
  void synchronize() const override {
    fail();
  }
  ~DefaultStubs() override = default;

 private:
  // Single error path: report which backend was requested but unavailable.
  void fail() const {
    AT_ERROR(name_, " used in profiler but not enabled.");
  }

  const char* const name_;
};
} // namespace
// Stamps out, for one backend, the default stub instance plus the public
// accessor (`<name>Stubs`) and registration hook (`register<upper_name>Methods`)
// declared in stubs/base.h. `name` is the lower-case identifier stem,
// `upper_name` the display/API name (also used in the error message).
#define REGISTER_DEFAULT(name, upper_name)                                   \
  namespace {                                                                \
  const DefaultStubs default_##name##_stubs{#upper_name};                    \
  constexpr const DefaultStubs* default_##name##_stubs_addr =                \
      &default_##name##_stubs;                                               \
                                                                             \
  /* Constant initialization, so it is guaranteed to be initialized before*/ \
  /* static initialization calls which may invoke register<name>Methods*/    \
  inline const ProfilerStubs*& name##_stubs() {                              \
    static const ProfilerStubs* stubs_ =                                     \
        static_cast<const ProfilerStubs*>(default_##name##_stubs_addr);      \
    return stubs_;                                                           \
  }                                                                          \
  } /*namespace*/                                                            \
                                                                             \
  const ProfilerStubs* name##Stubs() {                                       \
    return name##_stubs();                                                   \
  }                                                                          \
                                                                             \
  void register##upper_name##Methods(ProfilerStubs* stubs) {                 \
    name##_stubs() = stubs;                                                  \
  }
// Instantiate the fallbacks for the two known backends, then retire the
// macro so it cannot leak into other translation units via headers.
REGISTER_DEFAULT(cuda, CUDA)
REGISTER_DEFAULT(itt, ITT)
#undef REGISTER_DEFAULT
} // namespace impl
} // namespace profiler
} // namespace torch

View File

@ -0,0 +1,43 @@
#pragma once
#include <functional>
#include <memory>
#include <torch/csrc/Export.h>
struct CUevent_st;
namespace torch {
namespace profiler {
namespace impl {
// ----------------------------------------------------------------------------
// -- Annotation --------------------------------------------------------------
// ----------------------------------------------------------------------------
// Shared-ownership handle to a CUDA event. CUevent_st is only forward
// declared (above), so this header does not require the CUDA toolkit.
using ProfilerEventStub = std::shared_ptr<CUevent_st>;
// Abstract interface wrapping a vendor profiling API (CUDA, ITT). Backends
// install concrete implementations via the register*Methods hooks below; a
// default stub that errors on use is installed otherwise (see stubs/base.cpp).
struct TORCH_API ProfilerStubs {
  // Records an event; outputs (device, event, cpu_ns) are backend-defined.
  virtual void record(int* device, ProfilerEventStub* event, int64_t* cpu_ns)
      const = 0;
  // Elapsed time between two recorded events. NOTE(review): presumably
  // milliseconds to mirror cudaEventElapsedTime — confirm per backend.
  virtual float elapsed(
      const ProfilerEventStub* event,
      const ProfilerEventStub* event2) const = 0;
  // Emits an instantaneous named marker.
  virtual void mark(const char* name) const = 0;
  // Opens / closes a named profiling range (ranges may nest).
  virtual void rangePush(const char* name) const = 0;
  virtual void rangePop() const = 0;
  // Whether a real backend is active; defaults to false for stubs.
  virtual bool enabled() const {
    return false;
  }
  // Runs `op` once per device id visible to the backend.
  virtual void onEachDevice(std::function<void(int)> op) const = 0;
  // Blocks until outstanding backend work completes.
  virtual void synchronize() const = 0;
  virtual ~ProfilerStubs();
};
// Registration hooks: a backend calls register*Methods (typically at static
// initialization) to replace the default error-reporting stub.
TORCH_API void registerCUDAMethods(ProfilerStubs* stubs);
TORCH_API const ProfilerStubs* cudaStubs();
TORCH_API void registerITTMethods(ProfilerStubs* stubs);
TORCH_API const ProfilerStubs* ittStubs();
} // namespace impl
} // namespace profiler
} // namespace torch

View File

@ -1,9 +1,11 @@
#include <sstream>
#include <nvToolsExt.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/util/irange.h>
#include <nvToolsExt.h>
#include <torch/csrc/autograd/profiler.h>
#include <sstream>
#include <torch/csrc/profiler/stubs/base.h>
#include <torch/csrc/profiler/util.h>
namespace torch {
namespace profiler {

View File

@ -1,9 +1,9 @@
#include <c10/util/irange.h>
#include <torch/csrc/autograd/profiler.h>
#include <torch/csrc/itt_wrapper.h>
#include <sstream>
#include <c10/util/irange.h>
#include <torch/csrc/itt_wrapper.h>
#include <torch/csrc/profiler/stubs/base.h>
namespace torch {
namespace profiler {
namespace impl {