Move hardware_destructive_interference_size to c10/core/alignment.h (#160067)

# Motivation Move `hardware_destructive_interference_size` to `c10/core/alignment.h`, which gives a chance to reuse it across different accelerators. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160067 Approved by: https://github.com/Skylion007, https://github.com/EikanWang
2025-12-06 12:20:52 +01:00 · 2025-10-21 14:05:23 +00:00 · 2025-10-21 14:05:23 +00:00 · 0bff65503c
commit 0bff65503c
parent 21131a2444
5 changed files with 18 additions and 13 deletions
--- a/aten/src/ATen/core/CachingHostAllocator.h
+++ b/aten/src/ATen/core/CachingHostAllocator.h
@ -39,7 +39,7 @@ struct HostBlock {
 };

 template <typename B>
-struct alignas(64) FreeBlockList {
+struct alignas(hardware_destructive_interference_size) FreeBlockList {
  std::mutex mutex_;
  std::deque<B*> list_;
 };
@ -122,7 +122,7 @@ struct TORCH_API HostStats {
 // Struct containing memory allocator summary statistics for host, as they
 // are staged for reporting. This is a temporary struct that is used to
 // avoid locking the allocator while collecting stats.
-struct alignas(64) HostStatsStaged {
+struct alignas(hardware_destructive_interference_size) HostStatsStaged {
  std::mutex timing_mutex_;
  // COUNT: total allocations (active + free)
  // LOCK: access to this stat is protected by the allocator's blocks_mutex_
@ -669,7 +669,7 @@ struct CachingHostAllocatorImpl {
    TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for query_event");
  }

-  alignas(64) std::mutex blocks_mutex_;
+  alignas(hardware_destructive_interference_size) std::mutex blocks_mutex_;
  ska::flat_hash_set<B*> blocks_; // block list
  ska::flat_hash_map<void*, B*> ptr_to_block_;

@ -677,17 +677,17 @@ struct CachingHostAllocatorImpl {
  // size. This allows us to quickly find a free block of the right size.
  // We use deque to store per size free list and guard the list with its own
  // mutex.
-  alignas(64) std::vector<FreeBlockList<B>> free_list_ =
+  alignas(hardware_destructive_interference_size) std::vector<FreeBlockList<B>> free_list_ =
      std::vector<FreeBlockList<B>>(MAX_SIZE_INDEX);

-  alignas(64) std::mutex events_mutex_;
+  alignas(hardware_destructive_interference_size) std::mutex events_mutex_;
  std::deque<std::pair<E, B*>> events_; // event queue paired with block

  // Indicates whether the object is active.
  // Set to false in the destructor to signal background threads to stop.
  std::atomic<bool> active_{true};
 protected:
-  alignas(64) HostStatsStaged stats_;
+  alignas(hardware_destructive_interference_size) HostStatsStaged stats_;
 };

 struct TORCH_API HostAllocator : public at::Allocator {
--- a/c10/core/Allocator.h
+++ b/c10/core/Allocator.h
@ -9,6 +9,7 @@

 #include <c10/core/Device.h>
 #include <c10/core/DeviceType.h>
+#include <c10/core/alignment.h>
 #include <c10/macros/Export.h>
 #include <c10/macros/Macros.h>
 #include <c10/util/Exception.h>
--- a/c10/core/alignment.h
+++ b/c10/core/alignment.h
@ -1,6 +1,7 @@
 #pragma once

 #include <cstddef>
+#include <new>

 namespace c10 {

@ -18,4 +19,12 @@ constexpr size_t gPagesize = 4096;
 // since the default thp pagesize is 2MB, enable thp only
 // for buffers of size 2MB or larger to avoid memory bloating
 constexpr size_t gAlloc_threshold_thp = static_cast<size_t>(2) * 1024 * 1024;
+
+// Cache line size used to avoid false sharing between threads. Falls back to 64
+// bytes if C++17 feature is unavailable.
+#ifdef __cpp_lib_hardware_interference_size
+using std::hardware_destructive_interference_size;
+#else
+constexpr std::size_t hardware_destructive_interference_size = 64;
+#endif
 } // namespace c10
--- a/c10/cuda/CUDACachingAllocator.cpp
+++ b/c10/cuda/CUDACachingAllocator.cpp
@ -941,7 +941,7 @@ class EventPool {

 private:
  struct PerDevicePool {
-    alignas(64) std::mutex mutex_;
+    alignas(hardware_destructive_interference_size) std::mutex mutex_;
    std::vector<std::unique_ptr<cudaEvent_t>> event_pool_;
  };
  std::vector<PerDevicePool> pools_;
@ -3758,11 +3758,6 @@ static void uncached_delete(void* ptr) {
 static void local_raw_delete(void* ptr);
 thread_local std::stack<std::string> DeviceCachingAllocator::compile_context;
 thread_local std::string DeviceCachingAllocator::user_metadata;
-#ifdef __cpp_lib_hardware_interference_size
-using std::hardware_destructive_interference_size;
-#else
-static constexpr std::size_t hardware_destructive_interference_size = 64;
-#endif

 class NativeCachingAllocator : public CUDAAllocator {
 private:
--- a/c10/xpu/XPUCachingAllocator.cpp
+++ b/c10/xpu/XPUCachingAllocator.cpp
@ -554,7 +554,7 @@ static void local_raw_delete(void* ptr);

 class XPUAllocator : public DeviceAllocator {
 private:
-  std::mutex mutex;
+  alignas(hardware_destructive_interference_size) std::mutex mutex;
  ska::flat_hash_map<void*, Block*> allocated_blocks;

  void add_allocated_block(Block* block) {