From c934ed65673428c35f70cbce6ebd730ad22f058d Mon Sep 17 00:00:00 2001
From: "augusto.yjh"
Date: Thu, 31 Oct 2024 13:24:20 +0000
Subject: [PATCH] init kineto after torch module initialized (#131448)

Fixes #131020

As discussed in the issue thread, the environment variable
`KINETO_DAEMON_INIT_DELAY_S` can be used to delay the initialization of
`kineto` when `kineto` would otherwise be initialized before
`libtorch_cuda.so`. However, there is no obvious way to pick a proper value
for `KINETO_DAEMON_INIT_DELAY_S`, so this PR uses a different trick: it
defers the initialization of `kineto` until after the `torch` module has
been initialized.

I'm not sure whether this is an acceptable trick; please take a look at
this PR. Thanks.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/131448
Approved by: https://github.com/sraikund16, https://github.com/briancoutinho
---
 test/profiler/test_kineto.py                  | 51 +++++++++++++++++++
 torch/csrc/Module.cpp                         |  5 ++
 .../csrc/profiler/kineto_client_interface.cpp | 45 +++++-----------
 torch/csrc/profiler/kineto_client_interface.h | 11 ++++
 4 files changed, 79 insertions(+), 33 deletions(-)
 create mode 100644 test/profiler/test_kineto.py
 create mode 100644 torch/csrc/profiler/kineto_client_interface.h

diff --git a/test/profiler/test_kineto.py b/test/profiler/test_kineto.py
new file mode 100644
index 00000000000..a122170e5ac
--- /dev/null
+++ b/test/profiler/test_kineto.py
@@ -0,0 +1,51 @@
+# Owner(s): ["oncall: profiler"]
+import os
+import subprocess
+import sys
+from unittest.mock import patch
+
+import torch
+from torch.testing._internal.common_utils import run_tests, TestCase
+
+
+class SimpleKinetoInitializationTest(TestCase):
+    @patch.dict(os.environ, {"KINETO_USE_DAEMON": "1"})
+    def test_kineto_profiler_with_environment_variable(self):
+        """
+        Check that kineto works with torch in daemon mode; see issues #112389 and #131020.
+        Also check that kineto is not initialized when the user loads the shared library
+        directly.
+        """
+        script = """
+import torch
+if torch.cuda.is_available():
+    torch.cuda.init()
+"""
+        try:
+            subprocess.check_output(
+                [sys.executable, "-W", "always", "-c", script],
+                cwd=os.path.dirname(os.path.realpath(__file__)),
+            )
+        except subprocess.CalledProcessError as e:
+            if e.returncode != 0:
+                self.fail(
+                    "Kineto is not working properly with the Dynolog "
+                    "environment variable"
+                )
+        # import the shared library directly - it triggers static init but doesn't call kineto_init
+        env = os.environ.copy()
+        env["KINETO_USE_DAEMON"] = "1"
+        if "KINETO_DAEMON_INIT_DELAY_S" in env:
+            env.pop("KINETO_DAEMON_INIT_DELAY_S")
+        _, stderr = TestCase.run_process_no_exception(
+            f"from ctypes import CDLL; CDLL('{torch._C.__file__}')", env=env
+        )
+        self.assertNotRegex(
+            stderr.decode("ascii"),
+            "Registering daemon config loader",
+            "kineto should not be initialized when the shared library is imported directly",
+        )
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp
index 383603bd3aa..03f30797f18 100644
--- a/torch/csrc/Module.cpp
+++ b/torch/csrc/Module.cpp
@@ -102,6 +102,7 @@
 #include
 #include
+#include <torch/csrc/profiler/kineto_client_interface.h>
 #include
 
 #ifdef USE_CUDA
@@ -2443,6 +2444,10 @@ Call this whenever a new thread is created in order to propagate values from
   torch::set_disabled_torch_dispatch_impl(
       PyObject_GetAttrString(module, "_disabled_torch_dispatch_impl"));
   ASSERT_TRUE(torch::disabled_torch_dispatch_impl() != nullptr);
+  // initialize kineto only after the torch module itself is initialized (#131020)
+#ifdef USE_KINETO
+  torch::global_kineto_init();
+#endif
   return module;
   END_HANDLE_TH_ERRORS
 }
diff --git a/torch/csrc/profiler/kineto_client_interface.cpp b/torch/csrc/profiler/kineto_client_interface.cpp
index f8929b74c75..fd145f4c4fa 100644
--- a/torch/csrc/profiler/kineto_client_interface.cpp
+++ b/torch/csrc/profiler/kineto_client_interface.cpp
@@ -2,6 +2,7 @@
 #include
 #include
 #include
+#include <torch/csrc/profiler/kineto_client_interface.h>
 #include
 #include
 
@@ -71,46 +72,24 @@ class LibKinetoClient : public libkineto::ClientInterface {
 
 } // namespace profiler::impl
 
+void global_kineto_init() {
+#if ENABLE_GLOBAL_OBSERVER
+  if (c10::utils::get_env("KINETO_USE_DAEMON").has_value()) {
+    libkineto_init(
+        /*cpuOnly=*/!(at::hasCUDA() || at::hasXPU() || at::hasMTIA()),
+        /*logOnError=*/true);
+    libkineto::api().suppressLogMessages();
+  }
+#endif
+}
+
 #if ENABLE_GLOBAL_OBSERVER
 namespace {
 
-int get_init_delay() {
-  const char* delay_c = std::getenv("KINETO_DAEMON_INIT_DELAY_S");
-  if (!delay_c) {
-    return -1;
-  }
-  std::string delay_s{delay_c};
-  try {
-    return std::stoi(delay_s);
-  } catch (const std::invalid_argument& _) {
-    return -1;
-  }
-}
-
 struct RegisterLibKinetoClient {
   RegisterLibKinetoClient() {
     static profiler::impl::LibKinetoClient client;
     libkineto::api().registerClient(&client);
-
-    auto kineto_init = []() {
-      libkineto_init(
-          /*cpuOnly=*/!(at::hasCUDA() || at::hasXPU() || at::hasMTIA()),
-          /*logOnError=*/true);
-      libkineto::api().suppressLogMessages();
-    };
-
-    if (std::getenv("KINETO_USE_DAEMON") != nullptr) {
-      int init_delay_s = get_init_delay();
-      if (init_delay_s > 0) {
-        std::thread t([init_delay_s, kineto_init]() {
-          std::this_thread::sleep_for(std::chrono::seconds(init_delay_s));
-          kineto_init();
-        });
-        t.detach();
-      } else {
-        kineto_init();
-      }
-    }
   }
 } register_libkineto_client;
diff --git a/torch/csrc/profiler/kineto_client_interface.h b/torch/csrc/profiler/kineto_client_interface.h
new file mode 100644
index 00000000000..6d328256085
--- /dev/null
+++ b/torch/csrc/profiler/kineto_client_interface.h
@@ -0,0 +1,11 @@
+#pragma once
+
+#include <libkineto.h>
+#include <torch/csrc/Export.h> // for TORCH_API
+
+namespace torch {
+
+// declare global_kineto_init for libtorch_cpu.so to call
+TORCH_API void global_kineto_init(void);
+
+} // namespace torch
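
---

Reviewer note (not part of the patch): a quick way to sanity-check the new
init ordering outside the test suite is the minimal sketch below. It assumes
a torch build with USE_KINETO and a libkineto built with daemon support, and
it reuses the "Registering daemon config loader" stderr marker that the test
above greps for; the exact log line may vary across libkineto versions.

import os
import subprocess
import sys

# With KINETO_USE_DAEMON=1 and no init delay, importing torch should now
# initialize kineto at the end of torch module init (global_kineto_init),
# so the daemon config loader registration should appear on stderr.
env = os.environ.copy()
env["KINETO_USE_DAEMON"] = "1"
env.pop("KINETO_DAEMON_INIT_DELAY_S", None)  # delay workaround no longer needed

result = subprocess.run(
    [sys.executable, "-c", "import torch"],
    env=env,
    capture_output=True,
    text=True,
)
print(
    "daemon config loader registered:",
    "Registering daemon config loader" in result.stderr,
)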