[PyTorch Pinned Allocator] Add support of reserved pinned memory segment to avoid slow paths (#164501)

Summary:
This diff adds the feature of allocating a large pinned memory segment upfront based on the provided config. This large segment is then used to serve all the small pinned memory requests to avoid expensive device level APIs (slow paths).

Example:

PYTORCH_CUDA_ALLOC_CONF=pinned_reserve_segment_size_mb:2048

This reserves a 2GB pinned memory segment for the process and then all incoming small requests are just served from this segment and no cudaHostAlloc/cudaHostRegister apis are being called.

Differential Revision: D83779074

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164501
Approved by: https://github.com/yangw-dev
This commit is contained in:
Banit Agrawal 2025-10-03 18:11:22 +00:00 committed by PyTorch MergeBot
parent 3d9d41c801
commit f39789cdab
5 changed files with 125 additions and 13 deletions

View File

@ -50,6 +50,46 @@ namespace {
constexpr size_t MAX_SIZE_INDEX = 64;
}
// A large reserved pinned memory segment that is created in advance which is used
// to allocate small pinned memory requests to avoid calling into expensive APIs.
// We never free this memory and move up the pointer as we allocate new blocks
// and when blocks are freed, they are cached in the free lists.
struct PinnedReserveSegment {
PinnedReserveSegment(void *start, size_t size) : start_(start), size_(size),
current_ptr_(start_), initialized_(true) {}
PinnedReserveSegment() : start_(nullptr), size_(0), current_ptr_(nullptr), initialized_(false) {}
bool initialized() {
return initialized_;
}
void* allocate(size_t bytes) {
std::lock_guard<std::mutex> guard(mutex_);
// Round up the requested size to 4KB boundary for all including the small ones.
size_t rounded_bytes = (bytes + 4096 - 1) & ~(4096 - 1);
if (((uint8_t*)current_ptr_ + rounded_bytes) > ((uint8_t*)start_ + size_)) {
return nullptr;
}
void* ptr = current_ptr_;
current_ptr_ = (uint8_t*)current_ptr_ + rounded_bytes;
return ptr;
}
bool owns(void* ptr) {
return ptr >= start_ && ptr < (uint8_t*)start_ + size_;
}
std::mutex mutex_;
void* start_;
size_t size_;
void* current_ptr_;
bool initialized_;
};
// Struct containing memory allocator summary statistics for host.
struct TORCH_API HostStats {
// COUNT: total allocations (active)
@ -203,17 +243,6 @@ struct CachingHostAllocatorImpl {
// background.
if (!pinned_use_background_threads()) {
process_events();
} else {
// Launch the background thread and process events in a loop.
static bool background_thread_flag [[maybe_unused]] = [this] {
getBackgroundThreadPool()->run([&]() {
while (active_) {
process_events();
std::this_thread::sleep_for(std::chrono::microseconds(100));
}
});
return true;
}();
}
// Round up the allocation to the nearest power of two to improve reuse.
@ -226,6 +255,21 @@ struct CachingHostAllocatorImpl {
return {block->ptr_, reinterpret_cast<void*>(block)};
}
// Check in the recently freed blocks with pending events to see if we
// can reuse them. Call get_free_block again after processing events
if (pinned_use_background_threads()) {
// Launch the background thread and process events in a loop.
static bool background_thread_flag [[maybe_unused]] = [this] {
getBackgroundThreadPool()->run([&]() {
while (active_) {
process_events();
std::this_thread::sleep_for(std::chrono::microseconds(100));
}
});
return true;
}();
}
// Slow path: if we can't allocate from the cached free list, we need
// to create a new block.
void* ptr = nullptr;

View File

@ -9,7 +9,6 @@
#include <cuda_runtime_api.h>
#include <future>
#include <unordered_map>
namespace at::cuda {
namespace {
@ -72,9 +71,20 @@ using Block = HostBlock<CUDAStream>;
struct CUDACachingHostAllocatorImpl
: public CachingHostAllocatorImpl<CUDAStream, EventPool::Event> {
private:
std::unordered_map<void*, bool> use_host_register;
ska::flat_hash_map<void*, bool> use_host_register;
void allocate_host_memory(size_t size, void** ptr) override {
// try allocating from reserve segment first before calling into expensive APIs
if (get_reserve_segment().initialized()) {
*ptr = get_reserve_segment().allocate(size);
if (*ptr != nullptr) {
return;
}
}
allocate_host_memory_slowpath(size, ptr);
}
void allocate_host_memory_slowpath(size_t size, void** ptr) {
// Pinned memory pointers allocated by any device can be directly used by
// any other device, regardless of the current device at the time of
// allocation, since we assume unified addressing. So we grab any existing
@ -113,6 +123,18 @@ struct CUDACachingHostAllocatorImpl
}
void free_block(Block* block) override {
// We never free blocks from the reserve segment
if (get_reserve_segment().initialized()) {
// Check if the block is from the reserve segment
if (get_reserve_segment().owns(block->ptr_)) {
return;
}
}
free_block_slowpath(block);
}
void free_block_slowpath(Block* block) {
auto start = std::chrono::steady_clock::now();
// Users may change the allocator config at will. torch unit tests do this.
// However, allocations using cudaHostRegister should use corresonding
@ -172,6 +194,20 @@ struct CUDACachingHostAllocatorImpl
return event_pool->get(idx);
}
PinnedReserveSegment& get_reserve_segment() {
static auto reserve_segment = [&]() {
if (c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::pinned_reserve_segment_size_mb() > 0) {
void *ptr;
size_t sz = c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::pinned_reserve_segment_size_mb() * 1024 * 1024;
allocate_host_memory_slowpath(sz, &ptr);
return PinnedReserveSegment(ptr, sz);
} else {
return PinnedReserveSegment();
}
} ();
return reserve_segment;
}
TaskThreadPool* getThreadPool() {
static TaskThreadPool* pool = new TaskThreadPool(
static_cast<int>(c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::

View File

@ -15,6 +15,7 @@ CUDAAllocatorConfig::CUDAAllocatorConfig()
m_max_non_split_rounding_size(kLargeBuffer),
m_garbage_collection_threshold(0),
m_pinned_num_register_threads(1),
m_pinned_reserve_segment_size_mb(0),
m_expandable_segments(false),
#if CUDA_VERSION >= 12030
m_expandable_segments_handle_type(
@ -371,6 +372,9 @@ void CUDAAllocatorConfig::parseArgs(const std::optional<std::string>& env) {
} else if (config_item_view == "pinned_num_register_threads") {
i = parsePinnedNumRegisterThreads(config, i);
used_native_specific_option = true;
} else if (config_item_view == "pinned_reserve_segment_size_mb") {
i = parsePinnedReserveSegmentSize(config, i);
used_native_specific_option = true;
} else if (config_item_view == "pinned_use_background_threads") {
i = parsePinnedUseBackgroundThreads(config, i);
used_native_specific_option = true;
@ -451,6 +455,22 @@ size_t CUDAAllocatorConfig::parsePinnedNumRegisterThreads(
return i;
}
size_t CUDAAllocatorConfig::parsePinnedReserveSegmentSize(
const std::vector<std::string>& config,
size_t i) {
consumeToken(config, ++i, ':');
if (++i < config.size()) {
size_t val2 = stoi(config[i]);
TORCH_CHECK(
val2 > 0, "Pinned reserve segment size has to be greater than 0 ", "");
m_pinned_reserve_segment_size_mb = val2;
} else {
TORCH_CHECK(
false, "Error, expecting pinned_reserve_segment_size_mb value", "");
}
return i;
}
size_t CUDAAllocatorConfig::parsePinnedUseBackgroundThreads(
const std::vector<std::string>& config,
size_t i) {

View File

@ -70,6 +70,10 @@ class C10_CUDA_API CUDAAllocatorConfig {
return instance().m_pinned_use_background_threads;
}
static size_t pinned_reserve_segment_size_mb() {
return instance().m_pinned_reserve_segment_size_mb;
}
static size_t pinned_max_register_threads() {
// Based on the benchmark results, we see better allocation performance
// with 8 threads. However on future systems, we may need more threads
@ -143,6 +147,9 @@ class C10_CUDA_API CUDAAllocatorConfig {
size_t parsePinnedNumRegisterThreads(
const std::vector<std::string>& config,
size_t i);
size_t parsePinnedReserveSegmentSize(
const std::vector<std::string>& config,
size_t i);
size_t parsePinnedUseBackgroundThreads(
const std::vector<std::string>& config,
size_t i);
@ -155,6 +162,7 @@ class C10_CUDA_API CUDAAllocatorConfig {
std::vector<size_t> m_roundup_power2_divisions;
std::atomic<double> m_garbage_collection_threshold;
std::atomic<size_t> m_pinned_num_register_threads;
std::atomic<size_t> m_pinned_reserve_segment_size_mb;
std::atomic<bool> m_expandable_segments;
std::atomic<Expandable_Segments_Handle_Type>
m_expandable_segments_handle_type;

View File

@ -608,6 +608,10 @@ Available options:
for processing events. This avoids any slow path associated with querying/processing of
events in the fast allocation path. This feature is disabled by default.
* `pinned_reserve_segment_size_mb` option is a size in MB to reserve for pinned memory
segment. This allocates a large segment of pinned memory upfront and then uses to allocate
small size requests. This helps reduce the number of expensive device library calls.
* ``graph_capture_record_stream_reuse`` (experimental, default: `False`)
If set to `True`, the CUDA caching allocator will attempt to reclaim device memory during
CUDA Graph capture by using the graph topology (instead of CUDA events) to determine