diff --git a/c10/cuda/CUDAAllocatorConfig.cpp b/c10/cuda/CUDAAllocatorConfig.cpp
index 240f7ea5b05..1b6adb1dabe 100644
--- a/c10/cuda/CUDAAllocatorConfig.cpp
+++ b/c10/cuda/CUDAAllocatorConfig.cpp
@@ -297,7 +297,7 @@ size_t CUDAAllocatorConfig::parseAllocatorConfig(
 #endif // USE_ROCM
 }
 
-void CUDAAllocatorConfig::parseArgs(const std::optional<std::string>& env) {
+void CUDAAllocatorConfig::parseArgs(const std::string& env) {
   // If empty, set the default values
   m_max_split_size = std::numeric_limits<size_t>::max();
   m_roundup_power2_divisions.assign(kRoundUpPowerOfTwoIntervals, 0);
@@ -305,16 +305,13 @@ void CUDAAllocatorConfig::parseArgs(const std::optional<std::string>& env) {
   bool used_cudaMallocAsync = false;
   bool used_native_specific_option = false;
 
-  if (!env.has_value()) {
-    return;
-  }
   {
     std::lock_guard<std::mutex> lock(m_last_allocator_settings_mutex);
-    m_last_allocator_settings = env.value();
+    m_last_allocator_settings = env;
   }
 
   std::vector<std::string> config;
-  lexArgs(env.value(), config);
+  lexArgs(env, config);
 
   for (size_t i = 0; i < config.size(); i++) {
     std::string_view config_item_view(config[i]);
@@ -487,9 +484,6 @@ size_t CUDAAllocatorConfig::parsePinnedUseBackgroundThreads(
   return i;
 }
 
-// General caching allocator utilities
-void setAllocatorSettings(const std::string& env) {
-  CUDACachingAllocator::CUDAAllocatorConfig::instance().parseArgs(env.c_str());
-}
+REGISTER_ALLOCATOR_CONFIG_PARSE_HOOK(CUDAAllocatorConfig)
 
 } // namespace c10::cuda::CUDACachingAllocator
diff --git a/c10/cuda/CUDAAllocatorConfig.h b/c10/cuda/CUDAAllocatorConfig.h
index cd05db89de4..f598ba011ed 100644
--- a/c10/cuda/CUDAAllocatorConfig.h
+++ b/c10/cuda/CUDAAllocatorConfig.h
@@ -1,16 +1,10 @@
 #pragma once
 
+#include <c10/core/AllocatorConfig.h>
 #include <c10/cuda/CUDAMacros.h>
 #include <c10/util/Exception.h>
 #include <c10/util/env.h>
-#include <atomic>
-#include <cstddef>
-#include <cstdlib>
-#include <mutex>
-#include <string>
-#include <vector>
-
 namespace c10::cuda::CUDACachingAllocator {
 
 enum class Expandable_Segments_Handle_Type : int {
@@ -111,13 +105,40 @@ class C10_CUDA_API CUDAAllocatorConfig {
         env = c10::utils::get_env("PYTORCH_HIP_ALLOC_CONF");
       }
 #endif
-      inst->parseArgs(env);
+      // Note: keep the parsing order and logic stable to avoid potential
+      // performance regressions in internal tests.
+      if (!env.has_value()) {
+        env = c10::utils::get_env("PYTORCH_ALLOC_CONF");
+      }
+      if (env.has_value()) {
+        inst->parseArgs(env.value());
+      }
       return inst;
     })();
     return *s_instance;
   }
 
-  void parseArgs(const std::optional<std::string>& env);
+  // Use `Construct On First Use Idiom` to avoid `Static Initialization Order`
+  // issue.
+  static const std::unordered_set<std::string>& getKeys() {
+    static std::unordered_set<std::string> keys{
+        "backend",
+        // keep BC for Rocm: `cuda` -> `cud` `a`, to avoid hipify issues
+        // NOLINTBEGIN(bugprone-suspicious-missing-comma,-warnings-as-errors)
+        "release_lock_on_cud"
+        "amalloc",
+        "pinned_use_cud"
+        "a_host_register",
+        // NOLINTEND(bugprone-suspicious-missing-comma,-warnings-as-errors)
+        "release_lock_on_hipmalloc",
+        "pinned_use_hip_host_register",
+        "graph_capture_record_stream_reuse",
+        "pinned_reserve_segment_size_mb",
+        "pinned_num_register_threads"};
+    return keys;
+  }
+
+  void parseArgs(const std::string& env);
 
  private:
   CUDAAllocatorConfig();
@@ -174,7 +195,7 @@ class C10_CUDA_API CUDAAllocatorConfig {
   std::mutex m_last_allocator_settings_mutex;
 };
 
-// General caching allocator utilities
-C10_CUDA_API void setAllocatorSettings(const std::string& env);
+// Keep this for backwards compatibility
+using c10::CachingAllocator::setAllocatorSettings;
 
 } // namespace c10::cuda::CUDACachingAllocator
diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp
index 88a40f8c051..48413e7a6f3 100644
--- a/c10/cuda/CUDACachingAllocator.cpp
+++ b/c10/cuda/CUDACachingAllocator.cpp
@@ -64,10 +64,6 @@ namespace cuda::CUDACachingAllocator {
 using namespace c10::CachingAllocator;
 using namespace c10::CachingDeviceAllocator;
 
-// Included here as this is externally used in CUDAAllocatorConfig
-const size_t kLargeBuffer =
-    20971520; // "large" allocations may be packed in 20 MiB blocks
-
 namespace Native {
 
 //
diff --git a/c10/cuda/CUDACachingAllocator.h b/c10/cuda/CUDACachingAllocator.h
index 509c542668f..89274c9f994 100644
--- a/c10/cuda/CUDACachingAllocator.h
+++ b/c10/cuda/CUDACachingAllocator.h
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <c10/core/AllocatorConfig.h>
 #include <c10/core/CachingDeviceAllocator.h>
 #include <c10/cuda/CUDAGraphsC10Utils.h>
 #include <c10/cuda/CUDAMacros.h>
@@ -49,10 +50,9 @@ namespace c10::cuda::CUDACachingAllocator {
 
 // Preserved only for BC reasons
 // NOLINTNEXTLINE(misc-unused-using-decls)
+using c10::CachingAllocator::kLargeBuffer;
 using c10::CachingDeviceAllocator::DeviceStats;
 
-extern const size_t kLargeBuffer;
-
 typedef std::shared_ptr<GatheredContext> (*CreateContextFn)();
 
 // Struct containing info of an allocation block (i.e. a fractional part of a