diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt
index fbf664b5bc7..e97b3d368d1 100644
--- a/aten/src/ATen/CMakeLists.txt
+++ b/aten/src/ATen/CMakeLists.txt
@@ -420,7 +420,7 @@ set(ATen_CORE_TEST_SRCS ${ATen_CORE_TEST_SRCS} PARENT_SCOPE)
 set(ATen_HIP_TEST_SRCS ${ATen_HIP_TEST_SRCS} PARENT_SCOPE)
 set(ATen_VULKAN_TEST_SRCS ${ATen_VULKAN_TEST_SRCS} PARENT_SCOPE)
 set(ATen_MOBILE_BENCHMARK_SRCS ${ATen_MOBILE_BENCHMARK_SRCS} PARENT_SCOPE)
-set(ATen_MOBILE_TEST_SRCS ${ATen_VEC256_TEST_SRCS} ${ATen_VULKAN_TEST_SRCS} PARENT_SCOPE)
+set(ATen_MOBILE_TEST_SRCS ${ATen_MOBILE_TEST_SRCS} ${ATen_VULKAN_TEST_SRCS} PARENT_SCOPE)
 set(ATen_QUANTIZED_TEST_SRCS ${ATen_QUANTIZED_TEST_SRCS} PARENT_SCOPE)
 set(ATen_CPU_INCLUDE ${ATen_CPU_INCLUDE} PARENT_SCOPE)
 set(ATen_THIRD_PARTY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE} PARENT_SCOPE)
diff --git a/aten/src/ATen/test/CMakeLists.txt b/aten/src/ATen/test/CMakeLists.txt
index 4f089aad8df..0781eaad1ef 100644
--- a/aten/src/ATen/test/CMakeLists.txt
+++ b/aten/src/ATen/test/CMakeLists.txt
@@ -77,12 +77,13 @@ list(APPEND ATen_HIP_TEST_SRCS
 list(APPEND ATen_VULKAN_TEST_SRCS
   ${CMAKE_CURRENT_SOURCE_DIR}/vulkan_test.cpp)
 
-list(APPEND ATen_VEC256_TEST_SRCS
-  ${CMAKE_CURRENT_SOURCE_DIR}/vec256_test.cpp)
+list(APPEND ATen_MOBILE_TEST_SRCS
+  ${CMAKE_CURRENT_SOURCE_DIR}/vec256_test.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/cpu_caching_allocator_test.cpp)
 
 # ---[ Send the lists to the parent scope.
 set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE)
 set(ATen_CUDA_TEST_SRCS ${ATen_CUDA_TEST_SRCS} PARENT_SCOPE)
 set(ATen_HIP_TEST_SRCS ${ATen_HIP_TEST_SRCS} PARENT_SCOPE)
 set(ATen_VULKAN_TEST_SRCS ${ATen_VULKAN_TEST_SRCS} PARENT_SCOPE)
-set(ATen_VEC256_TEST_SRCS ${ATen_VEC256_TEST_SRCS} PARENT_SCOPE)
+set(ATen_MOBILE_TEST_SRCS ${ATen_MOBILE_TEST_SRCS} PARENT_SCOPE)
diff --git a/aten/src/ATen/test/cpu_caching_allocator_test.cpp b/aten/src/ATen/test/cpu_caching_allocator_test.cpp
new file mode 100644
index 00000000000..28a9b047652
--- /dev/null
+++ b/aten/src/ATen/test/cpu_caching_allocator_test.cpp
@@ -0,0 +1,50 @@
+#include <gtest/gtest.h>
+
+#include <ATen/ATen.h>
+#include <ATen/Utils.h>
+
+#include <c10/core/CPUCachingAllocator.h>
+
+TEST(CPUCachingAllocatorTest, check_alloc_free) {
+  c10::CPUCachingAllocator caching_allocator;
+  c10::WithCPUCachingAllocatorGuard caching_allocator_guard(
+      &caching_allocator);
+  at::Tensor a = at::rand({23, 23});
+  float* data_ptr = a.data_ptr<float>();
+  a.reset();
+  a = at::rand({23, 23});
+  ASSERT_TRUE(data_ptr == a.data_ptr<float>());
+}
+
+// This should just free the pointer correctly.
+TEST(CPUCachingAllocatorTest, check_alloc_outside_free_inside) {
+  c10::CPUCachingAllocator caching_allocator;
+  at::Tensor a = at::rand({23, 23});
+  {
+    c10::WithCPUCachingAllocatorGuard caching_allocator_guard(
+        &caching_allocator);
+    float* data_ptr = a.data_ptr<float>();
+    a.reset();
+    a = at::rand({23, 23});
+  }
+}
+
+TEST(CPUCachingAllocatorTest, check_alloc_inside_free_outside) {
+  c10::CPUCachingAllocator caching_allocator;
+  at::Tensor a;
+  {
+    c10::WithCPUCachingAllocatorGuard caching_allocator_guard(
+        &caching_allocator);
+    a = at::rand({23, 23});
+  }
+  a.reset();
+}
+
+int main(int argc, char* argv[]) {
+  // At the moment the caching allocator is only exposed to the mobile CPU allocator.
+#ifdef C10_MOBILE
+  ::testing::InitGoogleTest(&argc, argv);
+  at::manual_seed(42);
+  return RUN_ALL_TESTS();
+#endif /* C10_MOBILE */
+}
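Aside (not part of the patch): the tests above install the guard for the whole
test body, but the cache is consulted only by the thread that installed it,
since the active allocator lives in a thread_local pointer (see
CPUCachingAllocator.cpp below). A minimal sketch of that behavior, assuming a
C10_MOBILE build where DefaultMobileCPUAllocator is the active CPU allocator:

  #include <thread>
  #include <ATen/ATen.h>
  #include <c10/core/CPUCachingAllocator.h>

  void caching_is_per_thread() {
    c10::CPUCachingAllocator caching_allocator;
    c10::WithCPUCachingAllocatorGuard guard(&caching_allocator);
    at::Tensor a = at::rand({23, 23}); // goes through the caching allocator
    std::thread worker([] {
      // No guard was installed on this thread, so this allocation takes the
      // plain c10::alloc_cpu()/free_cpu() path.
      at::Tensor b = at::rand({23, 23});
    });
    worker.join();
  }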
diff --git a/binaries/speed_benchmark_torch.cc b/binaries/speed_benchmark_torch.cc
index f6464785e40..db78467cfb4 100644
--- a/binaries/speed_benchmark_torch.cc
+++ b/binaries/speed_benchmark_torch.cc
@@ -24,6 +24,8 @@
 #include "torch/csrc/jit/serialization/import.h"
 #include "torch/script.h"
 
+#include "c10/core/CPUCachingAllocator.h"
+
 #include <chrono>
 using namespace std::chrono;
 
@@ -45,6 +47,10 @@ C10_DEFINE_bool(
     no_inputs,
     false,
     "Whether the model has any input. Will ignore other input arugments if true");
+C10_DEFINE_bool(
+    use_caching_allocator,
+    false,
+    "Whether to cache allocations between inference iterations");
 C10_DEFINE_int(
     use_bundled_input,
     -1,
@@ -198,6 +204,11 @@ int main(int argc, char** argv) {
     std::cout << module.forward(inputs) << std::endl;
   }
 
+  c10::CPUCachingAllocator caching_allocator;
+  c10::optional<c10::WithCPUCachingAllocatorGuard> caching_allocator_guard;
+  if (FLAGS_use_caching_allocator) {
+    caching_allocator_guard.emplace(&caching_allocator);
+  }
   std::cout << "Starting benchmark." << std::endl;
   std::cout << "Running warmup runs." << std::endl;
   CAFFE_ENFORCE(
diff --git a/c10/core/CPUAllocator.cpp b/c10/core/CPUAllocator.cpp
index 79342404624..e830aa4832d 100644
--- a/c10/core/CPUAllocator.cpp
+++ b/c10/core/CPUAllocator.cpp
@@ -1,4 +1,5 @@
 #include <c10/core/CPUAllocator.h>
+#include <c10/core/CPUCachingAllocator.h>
 #include <c10/core/DeviceType.h>
 
 // TODO: rename flags to C10
@@ -154,7 +155,15 @@ class DefaultMobileCPUAllocator final : public at::Allocator {
     }
     // TODO: enable with better TLS support on mobile
     // profiledCPUMemoryReporter().Delete(pointer);
-    c10::free_cpu(pointer);
+    auto allocator_ptr = GetThreadLocalCachingAllocator();
+    if (allocator_ptr != nullptr) {
+      allocator_ptr->free(pointer);
+    } else {
+      c10::free_cpu(pointer);
+      // This adds extra cost to freeing memory in the default case, when the
+      // caching allocator is not enabled.
+      CPUCachingAllocator::record_free(pointer);
+    }
   }
 
   virtual DataPtr allocate(const size_t nbytes) const override {
@@ -168,7 +177,13 @@ class DefaultMobileCPUAllocator final : public at::Allocator {
     }
 
     auto alloc_size = PreGuardBytes + nbytes + PostGuardBytes;
-    void* const data = c10::alloc_cpu(alloc_size);
+    void* data;
+    auto allocator_ptr = GetThreadLocalCachingAllocator();
+    if (allocator_ptr != nullptr) {
+      data = allocator_ptr->allocate(alloc_size);
+    } else {
+      data = c10::alloc_cpu(alloc_size);
+    }
     // profiledCPUMemoryReporter().New(data, alloc_size);
     return {
         reinterpret_cast<uint8_t*>(data) + PreGuardBytes,
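Aside (not part of the patch): the CPUCachingAllocator::record_free() call in
the non-caching branch of deallocate() handles memory that was allocated while
a guard was active but freed after the guard went away. Freeing through
c10::free_cpu() alone would leave a stale entry in the shared allocation_map_,
and if the OS later reused that address, a subsequent free under a guard could
cache the block with the wrong size. A minimal scenario, assuming a C10_MOBILE
build (this is essentially the check_alloc_inside_free_outside test above):

  c10::CPUCachingAllocator caching_allocator;
  at::Tensor t;
  {
    c10::WithCPUCachingAllocatorGuard guard(&caching_allocator);
    t = at::rand({23, 23});  // pointer is recorded in allocation_map_
  }
  // The guard is gone, so this free goes through c10::free_cpu();
  // record_free() then drops the now-dangling allocation_map_ entry.
  t.reset();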
diff --git a/c10/core/CPUCachingAllocator.cpp b/c10/core/CPUCachingAllocator.cpp
new file mode 100644
index 00000000000..2ef63cdb25e
--- /dev/null
+++ b/c10/core/CPUCachingAllocator.cpp
@@ -0,0 +1,105 @@
+#include <c10/core/CPUCachingAllocator.h>
+
+namespace c10 {
+
+namespace {
+thread_local CPUCachingAllocator* caching_allocator_ptr{nullptr};
+} // namespace
+
+std::mutex CPUCachingAllocator::mutex_;
+ska::flat_hash_map<void*, size_t> CPUCachingAllocator::allocation_map_;
+
+inline void* CPUCachingAllocator::allocate_and_cache(const size_t bytes) {
+  void* ptr;
+  try {
+    ptr = c10::alloc_cpu(bytes);
+  } catch (c10::Error& e) {
+    // If allocation fails, try freeing cached available blocks.
+    // For now free all available cached blocks.
+    free_cached();
+    // Further thought: if we ever get here because we ran out of memory,
+    // it may be best to disable caching, since this is likely to happen
+    // again.
+    // Try again.
+    ptr = c10::alloc_cpu(bytes);
+  }
+  allocation_map_[ptr] = bytes;
+  return ptr;
+}
+
+void* CPUCachingAllocator::allocate(const size_t bytes) {
+  std::lock_guard<std::mutex> guard(mutex_);
+  const auto& it = available_map_.find(bytes);
+  if (it == available_map_.end() || it->second.empty()) {
+    return allocate_and_cache(bytes);
+  }
+  return it->second.pop_back_val();
+}
+
+void CPUCachingAllocator::free(void* ptr) {
+  // NB: since we are not really freeing the memory, use cases such as
+  // quantization code freeing original weights on mobile will not quite
+  // work, as we will likely hold on to that memory.
+  // NB: We could also enforce a maximum amount of cached memory for better
+  // memory management, such that free actually frees the memory if we are
+  // nearing or above the watermark.
+  std::lock_guard<std::mutex> guard(mutex_);
+  // If this allocation was done before the caching allocator was enabled,
+  // then free it normally.
+  const auto& it = allocation_map_.find(ptr);
+  if (it == allocation_map_.end()) {
+    c10::free_cpu(ptr);
+    return;
+  }
+  const size_t alloc_size = it->second;
+  available_map_[alloc_size].push_back(ptr);
+}
+
+void CPUCachingAllocator::record_free(void* ptr) {
+  // This function captures the case when the allocated memory
+  // is being freed outside the scope of this allocator.
+  // At the moment the only way to capture this is to have the allocator
+  // that uses this CachingAllocator as the backing allocator
+  // call this function explicitly upon freeing memory while
+  // outside the scope of the caching allocator.
+  // If the memory is freed in some other way, then we will likely
+  // have undefined behavior or a page fault. But this can be
+  // the case without the caching allocator as well.
+  std::lock_guard<std::mutex> guard(mutex_);
+  const auto& it = allocation_map_.find(ptr);
+  if (it != allocation_map_.end()) {
+    allocation_map_.erase(it);
+  }
+}
+
+void CPUCachingAllocator::free_cached() {
+  for (const auto& it : available_map_) {
+    for (const auto ptr : it.second) {
+      c10::free_cpu(ptr);
+      // When cached memory is returned to the OS, it must also be removed
+      // from allocation_map_.
+      allocation_map_.erase(ptr);
+    }
+  }
+  available_map_.clear();
+}
+
+CPUCachingAllocator::~CPUCachingAllocator() {
+  free_cached();
+}
+
+CPUCachingAllocator* GetThreadLocalCachingAllocator() {
+  return caching_allocator_ptr;
+}
+
+WithCPUCachingAllocatorGuard::WithCPUCachingAllocatorGuard(
+    CPUCachingAllocator* allocator) {
+  prev_caching_allocator_ptr_ = GetThreadLocalCachingAllocator();
+  // Install the given allocator as this thread's caching allocator.
+  caching_allocator_ptr = allocator;
+}
+
+WithCPUCachingAllocatorGuard::~WithCPUCachingAllocatorGuard() {
+  caching_allocator_ptr = prev_caching_allocator_ptr_;
+}
+
+} // namespace c10
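Aside (not part of the patch): as the comment in free() notes, "freed" blocks
are only parked in available_map_; nothing is returned to the OS until
free_cached() runs, either because a later allocation failed or because the
CPUCachingAllocator itself is destroyed. A minimal sketch of that lifetime,
assuming a C10_MOBILE build:

  {
    c10::CPUCachingAllocator caching_allocator;
    {
      c10::WithCPUCachingAllocatorGuard guard(&caching_allocator);
      at::Tensor a = at::rand({1024, 1024}); // ~4 MB allocated and recorded
      a.reset();                             // block parked in available_map_, not freed
      at::Tensor b = at::rand({1024, 1024}); // same-size request reuses the cached block
    }
  } // ~CPUCachingAllocator() calls free_cached(), returning the block via c10::free_cpu()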
diff --git a/c10/core/CPUCachingAllocator.h b/c10/core/CPUCachingAllocator.h
new file mode 100644
index 00000000000..c12a9bf4966
--- /dev/null
+++ b/c10/core/CPUCachingAllocator.h
@@ -0,0 +1,85 @@
+#pragma once
+
+#include <algorithm>
+#include <deque>
+#include <memory>
+#include <mutex>
+
+#include <c10/core/CPUAllocator.h>
+#include <c10/util/Exception.h>
+#include <c10/util/SmallVector.h>
+#include <c10/util/flat_hash_map.h>
+
+namespace c10 {
+
+class C10_API CPUCachingAllocator {
+  /*
+   * What it does:
+   * Caches all the allocations carried out by this allocator.
+   * Cache key is the size of the allocation.
+   * If the requested size is found in the cache, returns the cached pointer.
+   * What it does not do:
+   * No speculative allocation for any future allocations.
+   */
+ private:
+  // Invariants.
+  // 1. If memory is ever allocated via this allocator, then
+  //    the pointer will exist in allocation_map_, unless the allocator
+  //    returned the memory to the OS via free_cached.
+  //    1.1. Therefore even when the said memory is "freed" via this
+  //         allocator (and thus cached), it will continue to stay
+  //         in allocation_map_. Furthermore it will also exist in
+  //         available_map_. Thus an allocated memory pointer can be in both
+  //         allocation_map_ and available_map_ simultaneously.
+  // 2. A memory pointer may be removed from allocation_map_ when it
+  //    is freed outside of the scope of this allocator, but was allocated
+  //    by this allocator.
+  // 3. available_map_ only contains memory which was allocated
+  //    by this allocator and subsequently freed by this allocator.
+  // As a result of the above invariants, an allocated memory pointer cannot
+  // be in available_map_ unless it is in allocation_map_ as well.
+  ska::flat_hash_map<size_t, c10::SmallVector<void*, 16>> available_map_;
+  static ska::flat_hash_map<void*, size_t> allocation_map_;
+  // Since allocation_map_, which is a global instance, is mutated/read via
+  // all public APIs, we need a global mutex.
+  static std::mutex mutex_;
+  inline void* allocate_and_cache(const size_t bytes);
+  void free_cached();
+
+ public:
+  static void record_free(void* ptr);
+  // Checks the cache to see if an allocation of size bytes can be found.
+  // If so, returns the cached memory; otherwise allocates new memory,
+  // records it for caching, and returns it.
+  void* allocate(const size_t bytes);
+  // Checks if the memory being freed was marked for allocation by
+  // an earlier call to allocate. If so, caches the allocation.
+  // Otherwise frees it.
+  void free(void* ptr);
+  // Mainly for testing.
+  ~CPUCachingAllocator();
+};
+
+CPUCachingAllocator* GetDefaultCPUCachingAllocator();
+
+bool ThreadLocalCachingAllocatorEnabled();
+CPUCachingAllocator* GetThreadLocalCachingAllocator();
+
+/*
+ * Usage pattern:
+ * std::unique_ptr<c10::CPUCachingAllocator> caching_allocator =
+ *   std::make_unique<c10::CPUCachingAllocator>();
+ * {
+ *   WithCPUCachingAllocatorGuard guard(caching_allocator.get());
+ *   ...
+ * }
+ */
+class C10_API WithCPUCachingAllocatorGuard {
+ public:
+  WithCPUCachingAllocatorGuard(CPUCachingAllocator* allocator);
+  ~WithCPUCachingAllocatorGuard();
+
+ private:
+  CPUCachingAllocator* prev_caching_allocator_ptr_{nullptr};
+};
+
+} // namespace c10
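Aside (not part of the patch): because the guard saves the previous thread-local
pointer in its constructor and restores it in its destructor, guards can nest,
and leaving the outermost scope returns the thread to the plain
c10::alloc_cpu()/free_cpu() path. A small sketch, assuming a C10_MOBILE build:

  c10::CPUCachingAllocator outer_cache;
  c10::CPUCachingAllocator inner_cache;
  {
    c10::WithCPUCachingAllocatorGuard outer(&outer_cache);
    // Allocations on this thread are now routed to outer_cache.
    {
      c10::WithCPUCachingAllocatorGuard inner(&inner_cache);
      // Allocations here are routed to inner_cache.
    }
    // inner's destructor restored outer_cache as the active allocator.
  }
  // outer's destructor restored the default (non-caching) behavior.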
diff --git a/torch/csrc/jit/mobile/observer.h b/torch/csrc/jit/mobile/observer.h
index 8d06edf0e82..dc80e30db1c 100644
--- a/torch/csrc/jit/mobile/observer.h
+++ b/torch/csrc/jit/mobile/observer.h
@@ -36,6 +36,30 @@ class MobileDebugInfo : public c10::DebugInfoBase {
 private:
  std::string model_name_;
  std::string method_name_;
+ // TODO: Kimish
+ // If we launch a thread, such as for at::launch or interpreter continuation,
+ // and the caching allocator is enabled in the base thread, then in order to
+ // propagate this information (that the caching allocator is enabled) across
+ // thread boundaries we can use the mechanism provided by ThreadLocalDebugInfo.
+ // Once the thread local MobileDebugInfo is accessible in the launched
+ // thread, it can be accessed in that thread, and that thread can set
+ // its own thread local CachingAllocatorInfo.
+ // However, we cannot expect every launched thread to extract and set
+ // its own thread local copy of CachingAllocatorInfo.
+ // But this can be done in the lite interpreter, where the run method can do
+ // info =
+ //   c10::ThreadLocalDebugInfo::get(c10::DebugInfoKind::MOBILE_RUNTIME_INFO)
+ //     .get_caching_allocator_info();
+ // GetThreadLocalCachingAllocatorInfo() = info;
+ // Another option is to have MobileDebugInfo itself be the place where the
+ // thread local copy of CachingAllocatorInfo is stored. Then
+ // DefaultMobileCPUAllocator inspects it to decide whether to use the
+ // CachingAllocator. However, the current lite interpreter does not support
+ // FORK, so the run method of the lite interpreter is not really going to
+ // launch another instance of the lite interpreter in a different thread.
+ // So for now we are not bothering with passing CachingAllocatorInfo across
+ // thread boundaries.
+ c10::CachingAllocatorInfo caching_allocator_info;
  size_t op_idx_ = 0;
 };
 
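Aside (not part of the patch): a sketch of the propagation flow the TODO above
describes. get_caching_allocator_info() and GetThreadLocalCachingAllocatorInfo()
do not exist in this patch, and the ThreadLocalDebugInfo call is written as in
the comment, so treat the whole block as pseudo-code for the proposed mechanism
rather than working API usage:

  // Inside the lite interpreter's run method, on a freshly launched thread:
  void propagate_caching_allocator_info() {
    // Fetch the MobileDebugInfo attached by the parent thread (hypothetical cast).
    auto* info = static_cast<MobileDebugInfo*>(
        c10::ThreadLocalDebugInfo::get(c10::DebugInfoKind::MOBILE_RUNTIME_INFO));
    if (info != nullptr) {
      // Copy the parent's CachingAllocatorInfo into this thread's own slot so
      // DefaultMobileCPUAllocator can consult it (hypothetical accessors).
      GetThreadLocalCachingAllocatorInfo() = info->get_caching_allocator_info();
    }
  }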