mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-06 00:20:18 +01:00
Replace c10::call_once with static initialization (#166381)
This PR replaces c10::call_once calls with function-local static initialization where possible. Since C++11, the language guarantees that the initialization of a function-local static variable is thread-safe and performed exactly once. Static initialization also has a lower cost than using c10::call_once. Pull Request resolved: https://github.com/pytorch/pytorch/pull/166381 Approved by: https://github.com/malfet
This commit is contained in:
parent
4316df857c
commit
f0745ddb11
|
|
@ -2,8 +2,6 @@
|
|||
#include <ATen/Tensor.h>
|
||||
#include <ATen/cuda/Exceptions.h>
|
||||
|
||||
#include <mutex>
|
||||
|
||||
namespace at {
|
||||
namespace cuda {
|
||||
namespace detail {
|
||||
|
|
@ -12,39 +10,36 @@ __device__ __constant__ float cublas_one_device;
|
|||
__device__ __constant__ float cublas_zero_device;
|
||||
|
||||
float *get_cublas_device_one() {
|
||||
static c10::once_flag init_flag;
|
||||
|
||||
c10::call_once(init_flag, []() {
|
||||
static float *ptr = nullptr;
|
||||
static auto init_flag = [&]() {
|
||||
const float one = 1.f;
|
||||
AT_CUDA_CHECK(cudaMemcpyToSymbol(cublas_one_device, &one, sizeof(float)));
|
||||
});
|
||||
|
||||
float *ptr;
|
||||
AT_CUDA_CHECK(cudaGetSymbolAddress(reinterpret_cast<void**>(&ptr), cublas_one_device));
|
||||
return true;
|
||||
}();
|
||||
|
||||
return ptr;
|
||||
}
|
||||
|
||||
float *get_cublas_device_zero() {
|
||||
static c10::once_flag init_flag;
|
||||
|
||||
c10::call_once(init_flag, []() {
|
||||
static float *ptr = nullptr;
|
||||
static auto init_flag = [&]() {
|
||||
const float zero = 0.f;
|
||||
AT_CUDA_CHECK(cudaMemcpyToSymbol(cublas_zero_device, &zero, sizeof(float)));
|
||||
});
|
||||
|
||||
float *ptr;
|
||||
AT_CUDA_CHECK(cudaGetSymbolAddress(reinterpret_cast<void**>(&ptr), cublas_zero_device));
|
||||
return true;
|
||||
}();
|
||||
|
||||
return ptr;
|
||||
}
|
||||
|
||||
float *get_user_alpha_ptr() {
|
||||
static float *alpha_ptr;
|
||||
|
||||
static c10::once_flag init_flag;
|
||||
|
||||
c10::call_once(init_flag, []() {
|
||||
static bool init_flag [[maybe_unused]] = []() {
|
||||
AT_CUDA_CHECK(cudaMalloc(&alpha_ptr, sizeof(float)));
|
||||
});
|
||||
return true;
|
||||
}();
|
||||
|
||||
return alpha_ptr;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,7 +1,5 @@
|
|||
// Copyright © 2022 Apple Inc.
|
||||
|
||||
#include <c10/util/CallOnce.h>
|
||||
|
||||
#include <ATen/mps/IndexKernels.h>
|
||||
#include <ATen/mps/MPSAllocatorInterface.h>
|
||||
#include <ATen/mps/MPSDevice.h>
|
||||
|
|
@ -10,9 +8,6 @@
|
|||
|
||||
namespace at::mps {
|
||||
|
||||
static std::unique_ptr<MPSDevice> mps_device;
|
||||
static c10::once_flag mpsdev_init;
|
||||
|
||||
static inline MTLLanguageVersion getMetalLanguageVersion(const id<MTLDevice>& device) {
|
||||
// MPS Advanced Indexing needs at least Metal 2.0 (support for Argument Buffers and function constants)
|
||||
// host_name attribute needs at least Metal 2.2 and ulong needs Metal 2.3 (supported on MacOS 11+
|
||||
|
|
@ -21,8 +16,8 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id<MTLDevice>& de
|
|||
}
|
||||
|
||||
MPSDevice* MPSDevice::getInstance() {
|
||||
c10::call_once(mpsdev_init, [] { mps_device = std::unique_ptr<MPSDevice>(new MPSDevice()); });
|
||||
return mps_device.get();
|
||||
static MPSDevice mps_device;
|
||||
return &mps_device;
|
||||
}
|
||||
|
||||
MPSDevice::~MPSDevice() {
|
||||
|
|
|
|||
|
|
@ -15,7 +15,6 @@ namespace c10::cuda {
|
|||
namespace {
|
||||
|
||||
// Global stream state and constants
|
||||
c10::once_flag init_flag;
|
||||
DeviceIndex num_gpus = -1;
|
||||
constexpr int kStreamsPerPoolBits = 5;
|
||||
constexpr int kStreamsPerPool = 1 << kStreamsPerPoolBits;
|
||||
|
|
@ -226,7 +225,10 @@ void initDeviceStreamState(DeviceIndex device_index) {
|
|||
// Init front-end to ensure initialization only occurs once
|
||||
void initCUDAStreamsOnce() {
|
||||
// Inits default streams (once, globally)
|
||||
c10::call_once(init_flag, initGlobalStreamState);
|
||||
auto static init_flag [[maybe_unused]] = [] {
|
||||
initGlobalStreamState();
|
||||
return true;
|
||||
}();
|
||||
|
||||
if (current_streams) {
|
||||
return;
|
||||
|
|
|
|||
|
|
@ -1,4 +1,3 @@
|
|||
#include <c10/util/CallOnce.h>
|
||||
#include <c10/util/Exception.h>
|
||||
#include <c10/xpu/XPUFunctions.h>
|
||||
|
||||
|
|
@ -33,7 +32,6 @@ namespace {
|
|||
* one iGPU and enumerate all iGPUs on that platform.
|
||||
* 3. If neither dGPUs nor iGPUs are found, conclude that no GPUs are available.
|
||||
*/
|
||||
c10::once_flag init_flag;
|
||||
thread_local DeviceIndex curDeviceIndex = 0;
|
||||
|
||||
struct DevicePool {
|
||||
|
|
@ -149,7 +147,10 @@ inline void initGlobalDevicePoolState() {
|
|||
}
|
||||
|
||||
inline void initDevicePoolCallOnce() {
|
||||
c10::call_once(init_flag, initGlobalDevicePoolState);
|
||||
auto static init_flag [[maybe_unused]] = [] {
|
||||
initGlobalDevicePoolState();
|
||||
return true;
|
||||
}();
|
||||
}
|
||||
|
||||
void initDeviceProperties(DeviceProp* device_prop, DeviceIndex device) {
|
||||
|
|
|
|||
|
|
@ -12,7 +12,6 @@ namespace c10::xpu {
|
|||
namespace {
|
||||
|
||||
// Global stream state and constants
|
||||
c10::once_flag init_flag;
|
||||
DeviceIndex num_gpus = -1;
|
||||
constexpr int kStreamsPerPoolBits = 5;
|
||||
constexpr int kStreamsPerPool = 1 << kStreamsPerPoolBits;
|
||||
|
|
@ -163,7 +162,10 @@ void initDeviceStreamState(DeviceIndex device) {
|
|||
}
|
||||
|
||||
void initXPUStreamsOnce() {
|
||||
c10::call_once(init_flag, initGlobalStreamState);
|
||||
auto static init_flag [[maybe_unused]] = [] {
|
||||
initGlobalStreamState();
|
||||
return true;
|
||||
}();
|
||||
|
||||
if (current_streams) {
|
||||
return;
|
||||
|
|
|
|||
|
|
@ -349,8 +349,7 @@ static void cacheAllocatorDeregisterHook(
|
|||
}
|
||||
|
||||
static void attachAllocatorHooks() {
|
||||
static c10::once_flag flag;
|
||||
c10::call_once(flag, [] {
|
||||
static auto flag [[maybe_unused]] = [] {
|
||||
// Attaching hooks fails if CUDACachingAllocator is not initialized, so
|
||||
// Init for CUDA is called (and is a no-op if CUDA is already
|
||||
// initialized).
|
||||
|
|
@ -359,7 +358,8 @@ static void attachAllocatorHooks() {
|
|||
&cacheAllocatorRegisterHook);
|
||||
c10::cuda::CUDACachingAllocator::attachAllocatorTraceTracker(
|
||||
&cacheAllocatorDeregisterHook);
|
||||
});
|
||||
return true;
|
||||
}();
|
||||
}
|
||||
|
||||
static std::
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user