mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-06 12:20:52 +01:00
Move hardware_destructive_interference_size to c10/core/alignment.h (#160067)
# Motivation
Move `hardware_destructive_interference_size` to `c10/core/alignment.h` so that it can be reused across different accelerators.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160067
Approved by: https://github.com/Skylion007, https://github.com/EikanWang
This commit is contained in:
parent
21131a2444
commit
0bff65503c
|
|
@ -39,7 +39,7 @@ struct HostBlock {
|
|||
};
|
||||
|
||||
template <typename B>
|
||||
struct alignas(64) FreeBlockList {
|
||||
struct alignas(hardware_destructive_interference_size) FreeBlockList {
|
||||
std::mutex mutex_;
|
||||
std::deque<B*> list_;
|
||||
};
|
||||
|
|
@ -122,7 +122,7 @@ struct TORCH_API HostStats {
|
|||
// Struct containing memory allocator summary statistics for host, as they
|
||||
// are staged for reporting. This is a temporary struct that is used to
|
||||
// avoid locking the allocator while collecting stats.
|
||||
struct alignas(64) HostStatsStaged {
|
||||
struct alignas(hardware_destructive_interference_size) HostStatsStaged {
|
||||
std::mutex timing_mutex_;
|
||||
// COUNT: total allocations (active + free)
|
||||
// LOCK: access to this stat is protected by the allocator's blocks_mutex_
|
||||
|
|
@ -669,7 +669,7 @@ struct CachingHostAllocatorImpl {
|
|||
TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for query_event");
|
||||
}
|
||||
|
||||
alignas(64) std::mutex blocks_mutex_;
|
||||
alignas(hardware_destructive_interference_size) std::mutex blocks_mutex_;
|
||||
ska::flat_hash_set<B*> blocks_; // block list
|
||||
ska::flat_hash_map<void*, B*> ptr_to_block_;
|
||||
|
||||
|
|
@ -677,17 +677,17 @@ struct CachingHostAllocatorImpl {
|
|||
// size. This allows us to quickly find a free block of the right size.
|
||||
// We use deque to store per size free list and guard the list with its own
|
||||
// mutex.
|
||||
alignas(64) std::vector<FreeBlockList<B>> free_list_ =
|
||||
alignas(hardware_destructive_interference_size) std::vector<FreeBlockList<B>> free_list_ =
|
||||
std::vector<FreeBlockList<B>>(MAX_SIZE_INDEX);
|
||||
|
||||
alignas(64) std::mutex events_mutex_;
|
||||
alignas(hardware_destructive_interference_size) std::mutex events_mutex_;
|
||||
std::deque<std::pair<E, B*>> events_; // event queue paired with block
|
||||
|
||||
// Indicates whether the object is active.
|
||||
// Set to false in the destructor to signal background threads to stop.
|
||||
std::atomic<bool> active_{true};
|
||||
protected:
|
||||
alignas(64) HostStatsStaged stats_;
|
||||
alignas(hardware_destructive_interference_size) HostStatsStaged stats_;
|
||||
};
|
||||
|
||||
struct TORCH_API HostAllocator : public at::Allocator {
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@
|
|||
|
||||
#include <c10/core/Device.h>
|
||||
#include <c10/core/DeviceType.h>
|
||||
#include <c10/core/alignment.h>
|
||||
#include <c10/macros/Export.h>
|
||||
#include <c10/macros/Macros.h>
|
||||
#include <c10/util/Exception.h>
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
#pragma once
|
||||
|
||||
#include <cstddef>
|
||||
#include <new>
|
||||
|
||||
namespace c10 {
|
||||
|
||||
|
|
@ -18,4 +19,12 @@ constexpr size_t gPagesize = 4096;
|
|||
// since the default thp pagesize is 2MB, enable thp only
|
||||
// for buffers of size 2MB or larger to avoid memory bloating
|
||||
constexpr size_t gAlloc_threshold_thp = static_cast<size_t>(2) * 1024 * 1024;
|
||||
|
||||
// Cache line size used to avoid false sharing between threads. Falls back to 64
|
||||
// bytes if the C++17 library feature (__cpp_lib_hardware_interference_size) is unavailable.
|
||||
#ifdef __cpp_lib_hardware_interference_size
|
||||
using std::hardware_destructive_interference_size;
|
||||
#else
|
||||
constexpr std::size_t hardware_destructive_interference_size = 64;
|
||||
#endif
|
||||
} // namespace c10
|
||||
|
|
|
|||
|
|
@ -941,7 +941,7 @@ class EventPool {
|
|||
|
||||
private:
|
||||
struct PerDevicePool {
|
||||
alignas(64) std::mutex mutex_;
|
||||
alignas(hardware_destructive_interference_size) std::mutex mutex_;
|
||||
std::vector<std::unique_ptr<cudaEvent_t>> event_pool_;
|
||||
};
|
||||
std::vector<PerDevicePool> pools_;
|
||||
|
|
@ -3758,11 +3758,6 @@ static void uncached_delete(void* ptr) {
|
|||
static void local_raw_delete(void* ptr);
|
||||
thread_local std::stack<std::string> DeviceCachingAllocator::compile_context;
|
||||
thread_local std::string DeviceCachingAllocator::user_metadata;
|
||||
#ifdef __cpp_lib_hardware_interference_size
|
||||
using std::hardware_destructive_interference_size;
|
||||
#else
|
||||
static constexpr std::size_t hardware_destructive_interference_size = 64;
|
||||
#endif
|
||||
|
||||
class NativeCachingAllocator : public CUDAAllocator {
|
||||
private:
|
||||
|
|
|
|||
|
|
@ -554,7 +554,7 @@ static void local_raw_delete(void* ptr);
|
|||
|
||||
class XPUAllocator : public DeviceAllocator {
|
||||
private:
|
||||
std::mutex mutex;
|
||||
alignas(hardware_destructive_interference_size) std::mutex mutex;
|
||||
ska::flat_hash_map<void*, Block*> allocated_blocks;
|
||||
|
||||
void add_allocated_block(Block* block) {
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user