Replace c10::call_once with static initialization (#166381)

This PR replaces c10::call_once calls with function-local static initialization where possible. C++11 guarantees that initialization of block-scope static variables is thread-safe and happens exactly once ("magic statics"), and it typically has lower overhead than c10::call_once.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166381
Approved by: https://github.com/malfet
This commit is contained in:
Yuanyuan Chen 2025-11-01 07:09:40 +00:00 committed by PyTorch MergeBot
parent 4316df857c
commit f0745ddb11
6 changed files with 30 additions and 35 deletions

View File

@ -2,8 +2,6 @@
#include <ATen/Tensor.h>
#include <ATen/cuda/Exceptions.h>
#include <mutex>
namespace at {
namespace cuda {
namespace detail {
@ -12,39 +10,36 @@ __device__ __constant__ float cublas_one_device;
__device__ __constant__ float cublas_zero_device;
// Returns a device pointer to a constant 1.0f stored in CUDA __constant__
// memory (cublas_one_device), suitable for use as a cuBLAS alpha/beta scalar
// with device pointer mode.
// The symbol is written and its address resolved exactly once: C++11
// guarantees thread-safe one-time evaluation of the static initializer
// below, so no c10::call_once is needed.
float *get_cublas_device_one() {
  static float *ptr = nullptr;
  static bool init_flag [[maybe_unused]] = []() {
    const float one = 1.f;
    // Copy 1.0f into the device constant, then cache its device address.
    AT_CUDA_CHECK(cudaMemcpyToSymbol(cublas_one_device, &one, sizeof(float)));
    AT_CUDA_CHECK(cudaGetSymbolAddress(reinterpret_cast<void**>(&ptr), cublas_one_device));
    return true;
  }();
  return ptr;
}
// Returns a device pointer to a constant 0.0f stored in CUDA __constant__
// memory (cublas_zero_device), suitable for use as a cuBLAS alpha/beta scalar
// with device pointer mode.
// One-time initialization relies on C++11 thread-safe static initialization
// instead of c10::call_once.
float *get_cublas_device_zero() {
  static float *ptr = nullptr;
  static bool init_flag [[maybe_unused]] = []() {
    const float zero = 0.f;
    // Copy 0.0f into the device constant, then cache its device address.
    AT_CUDA_CHECK(cudaMemcpyToSymbol(cublas_zero_device, &zero, sizeof(float)));
    AT_CUDA_CHECK(cudaGetSymbolAddress(reinterpret_cast<void**>(&ptr), cublas_zero_device));
    return true;
  }();
  return ptr;
}
// Returns a device pointer to a single cudaMalloc'd float used to hold a
// user-supplied alpha scalar. The buffer is allocated exactly once via
// C++11 thread-safe static initialization (replaces c10::call_once) and is
// intentionally never freed (lives for the process lifetime).
float *get_user_alpha_ptr() {
  static float *alpha_ptr = nullptr;
  static bool init_flag [[maybe_unused]] = []() {
    // alpha_ptr has static storage duration, so no lambda capture is needed.
    AT_CUDA_CHECK(cudaMalloc(&alpha_ptr, sizeof(float)));
    return true;
  }();
  return alpha_ptr;
}

View File

@ -1,7 +1,5 @@
// Copyright © 2022 Apple Inc.
#include <c10/util/CallOnce.h>
#include <ATen/mps/IndexKernels.h>
#include <ATen/mps/MPSAllocatorInterface.h>
#include <ATen/mps/MPSDevice.h>
@ -10,9 +8,6 @@
namespace at::mps {
static std::unique_ptr<MPSDevice> mps_device;
static c10::once_flag mpsdev_init;
static inline MTLLanguageVersion getMetalLanguageVersion(const id<MTLDevice>& device) {
// MPS Advanced Indexing needs at least Metal 2.0 (support for Argument Buffers and function constants)
// host_name attribute needs at least Metal 2.2 and ulong needs Metal 2.3 (supported on MacOS 11+
@ -21,8 +16,8 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id<MTLDevice>& de
}
// Returns the process-wide MPSDevice singleton.
// Meyers singleton: C++11 guarantees the local static is constructed
// exactly once in a thread-safe manner, replacing the previous
// c10::call_once + std::unique_ptr scheme.
MPSDevice* MPSDevice::getInstance() {
  static MPSDevice mps_device;
  return &mps_device;
}
MPSDevice::~MPSDevice() {

View File

@ -15,7 +15,6 @@ namespace c10::cuda {
namespace {
// Global stream state and constants
c10::once_flag init_flag;
DeviceIndex num_gpus = -1;
constexpr int kStreamsPerPoolBits = 5;
constexpr int kStreamsPerPool = 1 << kStreamsPerPoolBits;
@ -226,7 +225,10 @@ void initDeviceStreamState(DeviceIndex device_index) {
// Init front-end to ensure initialization only occurs once
void initCUDAStreamsOnce() {
// Inits default streams (once, globally)
c10::call_once(init_flag, initGlobalStreamState);
auto static init_flag [[maybe_unused]] = [] {
initGlobalStreamState();
return true;
}();
if (current_streams) {
return;

View File

@ -1,4 +1,3 @@
#include <c10/util/CallOnce.h>
#include <c10/util/Exception.h>
#include <c10/xpu/XPUFunctions.h>
@ -33,7 +32,6 @@ namespace {
* one iGPU and enumerate all iGPUs on that platform.
* 3. If neither dGPUs nor iGPUs are found, conclude that no GPUs are available.
*/
c10::once_flag init_flag;
thread_local DeviceIndex curDeviceIndex = 0;
struct DevicePool {
@ -149,7 +147,10 @@ inline void initGlobalDevicePoolState() {
}
// Runs initGlobalDevicePoolState() exactly once for the process.
// C++11 thread-safe static initialization replaces the previous
// c10::call_once; the lambda result is only a sentinel, hence
// [[maybe_unused]].
inline void initDevicePoolCallOnce() {
  static auto init_flag [[maybe_unused]] = [] {
    initGlobalDevicePoolState();
    return true;
  }();
}
void initDeviceProperties(DeviceProp* device_prop, DeviceIndex device) {

View File

@ -12,7 +12,6 @@ namespace c10::xpu {
namespace {
// Global stream state and constants
c10::once_flag init_flag;
DeviceIndex num_gpus = -1;
constexpr int kStreamsPerPoolBits = 5;
constexpr int kStreamsPerPool = 1 << kStreamsPerPoolBits;
@ -163,7 +162,10 @@ void initDeviceStreamState(DeviceIndex device) {
}
void initXPUStreamsOnce() {
c10::call_once(init_flag, initGlobalStreamState);
auto static init_flag [[maybe_unused]] = [] {
initGlobalStreamState();
return true;
}();
if (current_streams) {
return;

View File

@ -349,8 +349,7 @@ static void cacheAllocatorDeregisterHook(
}
static void attachAllocatorHooks() {
static c10::once_flag flag;
c10::call_once(flag, [] {
static auto flag [[maybe_unused]] = [] {
// Attaching hooks fails if CUDACachingAllocator is not initialized, so
// Init for CUDA is called (and is a no-op if CUDA is already
// initialized).
@ -359,7 +358,8 @@ static void attachAllocatorHooks() {
&cacheAllocatorRegisterHook);
c10::cuda::CUDACachingAllocator::attachAllocatorTraceTracker(
&cacheAllocatorDeregisterHook);
});
return true;
}();
}
static std::