Replace c10::call_once with static initialization (#166381)

This PR replaces c10::call_once calls with function-local static initialization where possible. C++11 guarantees that initialization of block-scope static variables is thread-safe and happens exactly once ("magic statics"), and it typically has lower overhead than c10::call_once.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166381
Approved by: https://github.com/malfet
This commit is contained in:
Yuanyuan Chen 2025-11-01 07:09:40 +00:00 committed by PyTorch MergeBot
parent 4316df857c
commit f0745ddb11
6 changed files with 30 additions and 35 deletions

View File

@ -2,8 +2,6 @@
#include <ATen/Tensor.h>
#include <ATen/cuda/Exceptions.h>
#include <mutex>
namespace at {
namespace cuda {
namespace detail {
@ -12,39 +10,36 @@ __device__ __constant__ float cublas_one_device;
__device__ __constant__ float cublas_zero_device;
// Returns a device pointer to a constant 1.0f stored in CUDA __constant__
// memory (cublas_one_device), suitable for use as a cuBLAS alpha/beta scalar
// with device pointer mode.
// The symbol is written and its address resolved exactly once: C++11
// guarantees thread-safe one-time evaluation of the static initializer
// below, so no c10::call_once is needed.
float *get_cublas_device_one() {
  static float *ptr = nullptr;
  static bool init_flag [[maybe_unused]] = []() {
    const float one = 1.f;
    // Copy 1.0f into the device constant, then cache its device address.
    AT_CUDA_CHECK(cudaMemcpyToSymbol(cublas_one_device, &one, sizeof(float)));
    AT_CUDA_CHECK(cudaGetSymbolAddress(reinterpret_cast<void**>(&ptr), cublas_one_device));
    return true;
  }();
  return ptr;
}
// Returns a device pointer to a constant 0.0f stored in CUDA __constant__
// memory (cublas_zero_device), suitable for use as a cuBLAS alpha/beta scalar
// with device pointer mode.
// One-time initialization relies on C++11 thread-safe static initialization
// instead of c10::call_once.
float *get_cublas_device_zero() {
  static float *ptr = nullptr;
  static bool init_flag [[maybe_unused]] = []() {
    const float zero = 0.f;
    // Copy 0.0f into the device constant, then cache its device address.
    AT_CUDA_CHECK(cudaMemcpyToSymbol(cublas_zero_device, &zero, sizeof(float)));
    AT_CUDA_CHECK(cudaGetSymbolAddress(reinterpret_cast<void**>(&ptr), cublas_zero_device));
    return true;
  }();
  return ptr;
}
// Returns a device pointer to a single cudaMalloc'd float used to hold a
// user-supplied alpha scalar. The buffer is allocated exactly once via
// C++11 thread-safe static initialization (replaces c10::call_once) and is
// intentionally never freed (lives for the process lifetime).
float *get_user_alpha_ptr() {
  static float *alpha_ptr = nullptr;
  static bool init_flag [[maybe_unused]] = []() {
    // alpha_ptr has static storage duration, so no lambda capture is needed.
    AT_CUDA_CHECK(cudaMalloc(&alpha_ptr, sizeof(float)));
    return true;
  }();
  return alpha_ptr;
}

View File

@ -1,7 +1,5 @@
// Copyright © 2022 Apple Inc.
#include <c10/util/CallOnce.h>
#include <ATen/mps/IndexKernels.h>
#include <ATen/mps/MPSAllocatorInterface.h>
#include <ATen/mps/MPSDevice.h>
@ -10,9 +8,6 @@
namespace at::mps {
static std::unique_ptr<MPSDevice> mps_device;
static c10::once_flag mpsdev_init;
static inline MTLLanguageVersion getMetalLanguageVersion(const id<MTLDevice>& device) {
// MPS Advanced Indexing needs at least Metal 2.0 (support for Argument Buffers and function constants)
// host_name attribute needs at least Metal 2.2 and ulong needs Metal 2.3 (supported on MacOS 11+
@ -21,8 +16,8 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id<MTLDevice>& de
}
// Returns the process-wide MPSDevice singleton.
// Meyers singleton: C++11 guarantees the local static is constructed
// exactly once in a thread-safe manner, replacing the previous
// c10::call_once + std::unique_ptr scheme.
MPSDevice* MPSDevice::getInstance() {
  static MPSDevice mps_device;
  return &mps_device;
}
MPSDevice::~MPSDevice() {

View File

@ -15,7 +15,6 @@ namespace c10::cuda {
namespace {
// Global stream state and constants
c10::once_flag init_flag;
DeviceIndex num_gpus = -1;
constexpr int kStreamsPerPoolBits = 5;
constexpr int kStreamsPerPool = 1 << kStreamsPerPoolBits;
@ -226,7 +225,10 @@ void initDeviceStreamState(DeviceIndex device_index) {
// Init front-end to ensure initialization only occurs once
void initCUDAStreamsOnce() {
// Inits default streams (once, globally)
c10::call_once(init_flag, initGlobalStreamState);
auto static init_flag [[maybe_unused]] = [] {
initGlobalStreamState();
return true;
}();
if (current_streams) {
return;

View File

@ -1,4 +1,3 @@
#include <c10/util/CallOnce.h>
#include <c10/util/Exception.h>
#include <c10/xpu/XPUFunctions.h>
@ -33,7 +32,6 @@ namespace {
* one iGPU and enumerate all iGPUs on that platform.
* 3. If neither dGPUs nor iGPUs are found, conclude that no GPUs are available.
*/
c10::once_flag init_flag;
thread_local DeviceIndex curDeviceIndex = 0;
struct DevicePool {
@ -149,7 +147,10 @@ inline void initGlobalDevicePoolState() {
}
// Runs initGlobalDevicePoolState() exactly once for the process.
// C++11 thread-safe static initialization replaces the previous
// c10::call_once; the lambda result is only a sentinel, hence
// [[maybe_unused]].
inline void initDevicePoolCallOnce() {
  static auto init_flag [[maybe_unused]] = [] {
    initGlobalDevicePoolState();
    return true;
  }();
}
void initDeviceProperties(DeviceProp* device_prop, DeviceIndex device) {

View File

@ -12,7 +12,6 @@ namespace c10::xpu {
namespace {
// Global stream state and constants
c10::once_flag init_flag;
DeviceIndex num_gpus = -1;
constexpr int kStreamsPerPoolBits = 5;
constexpr int kStreamsPerPool = 1 << kStreamsPerPoolBits;
@ -163,7 +162,10 @@ void initDeviceStreamState(DeviceIndex device) {
}
void initXPUStreamsOnce() {
c10::call_once(init_flag, initGlobalStreamState);
auto static init_flag [[maybe_unused]] = [] {
initGlobalStreamState();
return true;
}();
if (current_streams) {
return;

View File

@ -349,8 +349,7 @@ static void cacheAllocatorDeregisterHook(
}
static void attachAllocatorHooks() {
static c10::once_flag flag;
c10::call_once(flag, [] {
static auto flag [[maybe_unused]] = [] {
// Attaching hooks fails if CUDACachingAllocator is not initialized, so
// Init for CUDA is called (and is a no-op if CUDA is already
// initialized).
@ -359,7 +358,8 @@ static void attachAllocatorHooks() {
&cacheAllocatorRegisterHook);
c10::cuda::CUDACachingAllocator::attachAllocatorTraceTracker(
&cacheAllocatorDeregisterHook);
});
return true;
}();
}
static std::