diff --git a/c10/cuda/CUDAAllocatorConfig.cpp b/c10/cuda/CUDAAllocatorConfig.cpp
index 240f7ea5b05..1b6adb1dabe 100644
--- a/c10/cuda/CUDAAllocatorConfig.cpp
+++ b/c10/cuda/CUDAAllocatorConfig.cpp
@@ -297,7 +297,7 @@ size_t CUDAAllocatorConfig::parseAllocatorConfig(
 #endif // USE_ROCM
 }
 
-void CUDAAllocatorConfig::parseArgs(const std::optional<std::string>& env) {
+void CUDAAllocatorConfig::parseArgs(const std::string& env) {
   // If empty, set the default values
   m_max_split_size = std::numeric_limits<size_t>::max();
   m_roundup_power2_divisions.assign(kRoundUpPowerOfTwoIntervals, 0);
@@ -305,16 +305,13 @@ void CUDAAllocatorConfig::parseArgs(const std::optional<std::string>& env) {
   bool used_cudaMallocAsync = false;
   bool used_native_specific_option = false;
 
-  if (!env.has_value()) {
-    return;
-  }
   {
     std::lock_guard<std::mutex> lock(m_last_allocator_settings_mutex);
-    m_last_allocator_settings = env.value();
+    m_last_allocator_settings = env;
   }
 
   std::vector<std::string> config;
-  lexArgs(env.value(), config);
+  lexArgs(env, config);
 
   for (size_t i = 0; i < config.size(); i++) {
     std::string_view config_item_view(config[i]);
@@ -487,9 +484,6 @@ size_t CUDAAllocatorConfig::parsePinnedUseBackgroundThreads(
   return i;
 }
 
-// General caching allocator utilities
-void setAllocatorSettings(const std::string& env) {
-  CUDACachingAllocator::CUDAAllocatorConfig::instance().parseArgs(env.c_str());
-}
+REGISTER_ALLOCATOR_CONFIG_PARSE_HOOK(CUDAAllocatorConfig)
 
 } // namespace c10::cuda::CUDACachingAllocator
diff --git a/c10/cuda/CUDAAllocatorConfig.h b/c10/cuda/CUDAAllocatorConfig.h
index cd05db89de4..f598ba011ed 100644
--- a/c10/cuda/CUDAAllocatorConfig.h
+++ b/c10/cuda/CUDAAllocatorConfig.h
@@ -1,16 +1,10 @@
 #pragma once
 
+#include <c10/core/AllocatorConfig.h>
 #include <c10/cuda/CUDAMacros.h>
 #include <c10/util/Exception.h>
 #include <c10/util/env.h>
-#include <atomic>
-#include <cstddef>
-#include <cstdlib>
-#include <mutex>
-#include <string>
-#include <vector>
-
 namespace c10::cuda::CUDACachingAllocator {
 
 enum class Expandable_Segments_Handle_Type : int {
@@ -111,13 +105,40 @@ class C10_CUDA_API CUDAAllocatorConfig {
         env = c10::utils::get_env("PYTORCH_HIP_ALLOC_CONF");
       }
 #endif
-      inst->parseArgs(env);
+      // Note: keep the parsing order and logic stable to avoid potential
+      // performance regressions in internal tests.
+      if (!env.has_value()) {
+        env = c10::utils::get_env("PYTORCH_ALLOC_CONF");
+      }
+      if (env.has_value()) {
+        inst->parseArgs(env.value());
+      }
       return inst;
     })();
     return *s_instance;
   }
 
-  void parseArgs(const std::optional<std::string>& env);
+  // Use `Construct On First Use Idiom` to avoid `Static Initialization Order`
+  // issue.
+  static const std::unordered_set<std::string>& getKeys() {
+    static std::unordered_set<std::string> keys{
+        "backend",
+        // keep BC for Rocm: `cuda` -> `cud` `a`, to avoid hipify issues
+        // NOLINTBEGIN(bugprone-suspicious-missing-comma,-warnings-as-errors)
+        "release_lock_on_cud"
+        "amalloc",
+        "pinned_use_cud"
+        "a_host_register",
+        // NOLINTEND(bugprone-suspicious-missing-comma,-warnings-as-errors)
+        "release_lock_on_hipmalloc",
+        "pinned_use_hip_host_register",
+        "graph_capture_record_stream_reuse",
+        "pinned_reserve_segment_size_mb",
+        "pinned_num_register_threads"};
+    return keys;
+  }
+
+  void parseArgs(const std::string& env);
 
  private:
   CUDAAllocatorConfig();
@@ -174,7 +195,7 @@ class C10_CUDA_API CUDAAllocatorConfig {
   std::mutex m_last_allocator_settings_mutex;
 };
 
-// General caching allocator utilities
-C10_CUDA_API void setAllocatorSettings(const std::string& env);
+// Keep this for backwards compatibility
+using c10::CachingAllocator::setAllocatorSettings;
 
 } // namespace c10::cuda::CUDACachingAllocator
diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp
index 88a40f8c051..48413e7a6f3 100644
--- a/c10/cuda/CUDACachingAllocator.cpp
+++ b/c10/cuda/CUDACachingAllocator.cpp
@@ -64,10 +64,6 @@ namespace cuda::CUDACachingAllocator {
 using namespace c10::CachingAllocator;
 using namespace c10::CachingDeviceAllocator;
 
-// Included here as this is externally used in CUDAAllocatorConfig
-const size_t kLargeBuffer =
-    20971520; // "large" allocations may be packed in 20 MiB blocks
-
 namespace Native {
 
 //
diff --git a/c10/cuda/CUDACachingAllocator.h b/c10/cuda/CUDACachingAllocator.h
index 509c542668f..89274c9f994 100644
--- a/c10/cuda/CUDACachingAllocator.h
+++ b/c10/cuda/CUDACachingAllocator.h
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <c10/core/AllocatorConfig.h>
 #include <c10/core/CachingDeviceAllocator.h>
 #include <c10/cuda/CUDAGraphsC10Utils.h>
 #include <c10/cuda/CUDAMacros.h>
@@ -49,10 +50,9 @@ namespace c10::cuda::CUDACachingAllocator {
 
 // Preserved only for BC reasons
 // NOLINTNEXTLINE(misc-unused-using-decls)
+using c10::CachingAllocator::kLargeBuffer;
 using c10::CachingDeviceAllocator::DeviceStats;
 
-extern const size_t kLargeBuffer;
-
 typedef std::shared_ptr<GatheredContext> (*CreateContextFn)();
 
 // Struct containing info of an allocation block (i.e. a fractional part of a