mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-06 12:20:52 +01:00
Pull Request resolved: https://github.com/pytorch/pytorch/pull/116602 Approved by: https://github.com/ezyang
107 lines
4.1 KiB
C++
107 lines
4.1 KiB
C++
#pragma once
|
|
|
|
#include <cstddef>
|
|
#include <mutex>
|
|
|
|
#include <c10/macros/Export.h>
|
|
#include <c10/util/SmallVector.h>
|
|
#include <c10/util/flat_hash_map.h>
|
|
|
|
/*
|
|
* CPUCachingAllocator:
|
|
* DISCLAIMER:
|
|
* This is subject to change (beta) and only supported on mobile builds.
|
|
* If a code snippet such as the one in 'Usage pattern' is used outside of a mobile
|
|
* build you will not observe the intended behavior.
|
|
* See below for more information.
|
|
* Why?
|
|
* It has been observed that some mobile platforms, such as pixel 3, return
|
|
* memory aggressively to the system. This results in page faults in some
|
|
* cases and ends up hurting performance. This caching allocator aims to address
|
|
* that. Furthermore it also allows users to specify their own allocator by
|
|
* implementing allocate/free virtual interfaces. What are the cons? There are
|
|
* some cons that were observed where use of caching allocator led to worse
|
|
* performance on some platforms. Reason being that the caching mechanism used
|
|
* by this allocator left us worse off compared to the corresponding platform's
|
|
* tuned memory allocator. In that case it seemed better to not use this
|
|
* allocator. Note there are some ideas to fix this in the works.
|
|
*
|
|
* Usage:
|
|
* Usage pattern:
|
|
* Instantiate and own the caching allocator.
|
|
* std::unique_ptr<c10::CPUCachingAllocator> caching_allocator =
|
|
* std::make_unique<c10::CPUCachingAllocator>();
|
|
* Use caching allocator with a scoped guard at inference time.
|
|
* {
|
|
* c10::WithCPUCachingAllocatorGuard guard(caching_allocator.get());
|
|
* ... model.forward(...);
|
|
* }
|
|
*/
|
|
|
|
namespace c10 {
|
|
|
|
class C10_API CPUCachingAllocator {
|
|
/*
|
|
* What it does:
|
|
* Caches all the allocations carried out by this allocator.
|
|
* Cache key is the size of the allocation.
|
|
* If requested size is found in the cache returns the cached pointer.
|
|
* What it does not do:
|
|
* No speculative allocation for any future allocations.
|
|
*/
|
|
private:
|
|
inline void* allocate_and_cache(const size_t bytes);
|
|
void free_cached();
|
|
|
|
protected:
|
|
// Invariants.
|
|
// 1. If memory is ever allocated via this allocator then
|
|
// the pointer will exist in allocation_map_, unless the allocator
|
|
// returned the memory to OS via free_cached.
|
|
// 1.1. Therefore even when the said memory is "freed" via this
|
|
// allocator (and thus cached), it will continue to stay
|
|
// in allocation_map_. Furthermore it will also exist in
|
|
// available_map_. Thus an allocated memory pointer can be in both
|
|
// allocation_map_ and available_map_ simultaneously.
|
|
// 2. Memory pointer maybe removed from allocation_map_, when it
|
|
// is freed outside of the scope of this allocator, but was allocated
|
|
// by this allocator.
|
|
// 3. Available map only contains that memory which was allocated
|
|
// by this allocator and subsequently freed by this allocator.
|
|
// As a result of above invariants, allocated memory ptr cannot be in
|
|
// available_map_ unless it is in allocation_map_ as well.
|
|
ska::flat_hash_map<size_t, c10::SmallVector<void*, 16>> available_map_;
|
|
static ska::flat_hash_map<void*, size_t> allocation_map_;
|
|
// Since allocation_map, which is a global instance, is mutated/read via
|
|
// all public APIs we need a global mutex.
|
|
static std::mutex mutex_;
|
|
|
|
public:
|
|
static void record_free(void* ptr);
|
|
virtual ~CPUCachingAllocator();
|
|
// Checks the cache to see if allocation of size bytes can be found.
|
|
// If so return cached memory, else
|
|
// allocates memory, records it for caching and returns.
|
|
virtual void* allocate(const size_t bytes);
|
|
// Checks if the memory being freed is was marked for allocation by
|
|
// an earlier call to allocate. If so cache the allocation.
|
|
// Otherwise free.
|
|
virtual void free(void* ptr);
|
|
};
|
|
|
|
// Returns a pointer to a CPUCachingAllocator.
// NOTE(review): by its name this is a shared default instance; the definition
// is not in this header, so ownership and lifetime of the returned pointer
// should be confirmed there.
CPUCachingAllocator* GetDefaultCPUCachingAllocator();
|
|
|
|
// Reports whether a thread-local caching allocator is enabled.
// NOTE(review): presumably true while a WithCPUCachingAllocatorGuard is active
// on the calling thread; the definition is not in this header -- confirm.
bool ThreadLocalCachingAllocatorEnabled();
|
|
// Returns a pointer to the thread-local CPUCachingAllocator.
// NOTE(review): presumably the allocator installed by the innermost active
// WithCPUCachingAllocatorGuard, and possibly nullptr when none is installed;
// the definition is not in this header -- confirm.
CPUCachingAllocator* GetThreadLocalCachingAllocator();
|
|
|
|
class C10_API WithCPUCachingAllocatorGuard {
|
|
public:
|
|
WithCPUCachingAllocatorGuard(CPUCachingAllocator* allocator);
|
|
~WithCPUCachingAllocatorGuard();
|
|
|
|
private:
|
|
CPUCachingAllocator* prev_caching_allocator_ptr_{nullptr};
|
|
};
|
|
|
|
} // namespace c10
|