Uses context pointer for deleter to enable multiple CUDAPluggableAllocator usage (#130472)
We should be able to create multiple CUDAPluggableAllocators in the same PyTorch program (see https://github.com/pytorch/pytorch/issues/124807 and https://github.com/pytorch/pytorch/pull/125722 for context). When mixing CUDAPluggableAllocators in one program, the deleter passed in through each CUDAPluggableAllocator must be "attached" to the data_ptr and persist until it is called to free the memory, which may be as late as program exit.

Currently, CUDAPluggableAllocator maintains a global `current_custom_allocator`. When creating the `DataPtr`, `raw_deleter` attaches `custom_raw_deleter` to the DataPtr, which calls `current_custom_allocator->raw_delete(...)`. This is fine with a single allocator, but with multiple allocators every DataPtr ends up using the deleter of whatever `current_custom_allocator` happens to point to at deletion time. For example, if allocation 1 was done with `cudaMalloc` and allocation 2 with `ncclMemAlloc`, and `current_custom_allocator` currently points to the CUDAPluggableAllocator wrapping `ncclMemAlloc`, then cleaning up allocation 1 would call `ncclMemFree` instead of `cudaFree`.

This PR solves the problem by remembering the `free_fn_` in a deleter context, so the deleter no longer has to go through an allocator object to find its free function.

CC: @zdevito @ptrblck @eqy
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130472
Approved by: https://github.com/eqy, https://github.com/ezyang
This commit is contained in:
parent 28a74b9fa4
commit 38b7d89aa4
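The crux of the change: each allocation's free function is captured in a per-allocation context at alloc time, instead of being looked up through a global allocator pointer at free time. A minimal standalone sketch of the failure mode and the fix (not PyTorch code; all names are invented):

#include <cstdio>

using FreeFn = void(void*);

// Old scheme: one global "current free function", consulted at free time.
FreeFn* g_current_free = nullptr;

void free_a(void* p) { std::printf("free_a(%p)\n", p); }
void free_b(void* p) { std::printf("free_b(%p)\n", p); }

// Fixed scheme: each allocation carries a context that remembers the
// free function it was allocated with.
struct DeleterContext {
  FreeFn* free_fn;
  void* data;
  void call() {
    free_fn(data);
    delete this;  // fire-once: the context frees itself after running
  }
};

int main() {
  int x = 0, y = 0;

  // Old scheme: both deletions go through whatever g_current_free points
  // to *now*, so the first allocation is freed with the wrong function.
  g_current_free = free_a;
  void* p1 = &x;       // imagine: allocated while free_a was current
  g_current_free = free_b;
  void* p2 = &y;       // allocated while free_b was current
  g_current_free(p1);  // BUG: calls free_b, not free_a
  g_current_free(p2);  // correct only by coincidence

  // Fixed scheme: the context pins the right deleter per allocation.
  auto* c1 = new DeleterContext{free_a, &x};
  auto* c2 = new DeleterContext{free_b, &y};
  c1->call();  // always free_a
  c2->call();  // always free_b
}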
aten/src/ATen/test/cuda_allocator_test.cpp
@@ -5,6 +5,51 @@
 #include <ATen/test/allocator_clone_test.h>
+#include <torch/csrc/cuda/CUDAPluggableAllocator.h>
 
 TEST(AllocatorTestCUDA, test_clone) {
   test_allocator_clone(c10::cuda::CUDACachingAllocator::get());
 }
+
+static int called_dummy_free_0 = 0;
+static int called_dummy_free_1 = 0;
+
+void* dummy_alloc_0(size_t size, int device, void* stream) {return nullptr;}
+void dummy_free_0(void* data, size_t size, int device, void* stream) {
+  called_dummy_free_0++;
+}
+void dummy_free_1(void* data, size_t size, int device, void* stream) {
+  called_dummy_free_1++;
+}
+
+// Tests that data_ptrs have their respective deleters
+// when mixing allocators
+TEST(AllocatorTestCUDA, test_pluggable_allocator_deleters) {
+  // Create a tensor with dummy_allocator_0, where dummy_free_0 is the deleter
+  auto dummy_allocator_0 = torch::cuda::CUDAPluggableAllocator::createCustomAllocator(dummy_alloc_0, dummy_free_0);
+  c10::cuda::CUDACachingAllocator::allocator.store(dummy_allocator_0.get());
+  at::Tensor a = at::empty({0}, at::TensorOptions().device(at::kCUDA));
+
+  // Create a tensor with dummy_allocator_1, where dummy_free_1 is the deleter
+  auto dummy_allocator_1 = torch::cuda::CUDAPluggableAllocator::createCustomAllocator(dummy_alloc_0, dummy_free_1);
+  c10::cuda::CUDACachingAllocator::allocator.store(dummy_allocator_1.get());
+  at::Tensor b = at::empty({0}, at::TensorOptions().device(at::kCUDA));
+
+  // Manually use a's deleter
+  auto* ctx = a.storage().data_ptr().get_context();
+  a.storage().data_ptr().get_deleter()(ctx);
+  a.storage().mutable_data_ptr().release_context();
+
+  // a's deleter is dummy_free_0
+  // dummy_free_0 should be called above, so called_dummy_free_0 should be 1
+  ASSERT_TRUE(called_dummy_free_0 == 1);
+
+  // Manually use b's deleter
+  ctx = b.storage().data_ptr().get_context();
+  b.storage().data_ptr().get_deleter()(ctx);
+  b.storage().mutable_data_ptr().release_context();
+
+  // b's deleter is dummy_free_1
+  // dummy_free_1 should be called above, so called_dummy_free_1 should be 1
+  ASSERT_TRUE(called_dummy_free_1 == 1);
+}
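Note the manual-deletion dance in the test: `get_deleter()(ctx)` runs the deleter by hand, and `release_context()` then detaches the context so the storage destructor does not free a second time. A rough mock of that dance, assuming a simplified stand-in for the (data, ctx, deleter) triple inside `c10::DataPtr`:

#include <cassert>

struct MockDataPtr {
  void* data;
  void* ctx;               // per-allocation deleter context
  void (*deleter)(void*);  // called with ctx, not with data

  void manual_delete() {
    deleter(ctx);       // run the deleter by hand, like get_deleter()(ctx)
    ctx = nullptr;      // drop the context, like release_context(),
    deleter = nullptr;  // so destruction cannot free a second time
  }
};

static int freed = 0;
static void count_free(void* /*ctx*/) { ++freed; }

int main() {
  MockDataPtr p{nullptr, &freed, count_free};
  p.manual_delete();
  assert(freed == 1);  // mirrors ASSERT_TRUE(called_dummy_free_0 == 1)
}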
build_variables.bzl
@@ -661,6 +661,7 @@ libtorch_cuda_core_sources = [
     "torch/csrc/CudaIPCTypes.cpp",
     "torch/csrc/cuda/comm.cpp",
     "torch/csrc/cuda/memory_snapshot.cpp",
+    "torch/csrc/cuda/CUDAPluggableAllocator.cpp",
     "torch/csrc/inductor/aoti_runner/model_container_runner_cuda.cpp",
     "torch/csrc/inductor/aoti_torch/shim_cuda.cpp",
     "torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp",
@@ -772,7 +773,6 @@ libtorch_python_cuda_core_sources = [
     "torch/csrc/cuda/shared/cudart.cpp",
     "torch/csrc/cuda/shared/nvtx.cpp",
     "torch/csrc/cuda/utils.cpp",
-    "torch/csrc/cuda/CUDAPluggableAllocator.cpp",
 ]
 
 libtorch_python_cuda_sources = libtorch_python_cuda_core_sources + [
torch/csrc/cuda/CUDAPluggableAllocator.cpp
@@ -8,6 +8,23 @@
 
 namespace torch::cuda::CUDAPluggableAllocator {
 
+CUDAPluggableAllocatorDeleterContext::CUDAPluggableAllocatorDeleterContext(
+    std::function<FreeFuncType> free_fn,
+    void* data,
+    size_t size,
+    int device,
+    cudaStream_t stream)
+    : free_fn_(free_fn),
+      data_(data),
+      size_(size),
+      device_(device),
+      stream_(stream) {}
+
+void CUDAPluggableAllocatorDeleterContext::free() {
+  free_fn_(data_, size_, device_, stream_);
+  delete this;
+}
+
 int device_count = 0;
 
 void custom_raw_deleter(void* ptr);
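The `free()` above is a fire-once, self-deleting operation: it runs the captured free function and then `delete this`. That is safe only because every context is heap-allocated (in `allocate()`, below) and `free()` is called exactly once. A compact sketch of the pattern, with invented names:

#include <cstdio>
#include <functional>

struct SelfDeletingContext {
  std::function<void()> on_free;
  void free() {
    on_free();
    delete this;  // ok: heap-allocated, and never touched again afterwards
  }
};

int main() {
  auto* ctx = new SelfDeletingContext{[] { std::puts("released"); }};
  ctx->free();  // runs the callback, then destroys the context itself
  // ctx is now dangling; it must not be used again.
}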
@@ -26,8 +43,8 @@ _AllocationMetadata::_AllocationMetadata(
 // This avoids having to link against libtorch for C++ based custom allocators
 // And also use this from python
 CUDAPluggableAllocator::CUDAPluggableAllocator(
-    std::function<void*(size_t, int, cudaStream_t)> alloc_fn,
-    std::function<void(void*, size_t, int, cudaStream_t)> free_fn)
+    std::function<MallocFuncType> alloc_fn,
+    std::function<FreeFuncType> free_fn)
     : alloc_fn_(std::move(alloc_fn)), free_fn_(std::move(free_fn)) {}
 
 CUDAPluggableAllocator::CUDAPluggableAllocator(CUDAPluggableAllocator& other)
@@ -99,8 +116,10 @@ c10::DataPtr CUDAPluggableAllocator::allocate(size_t size) {
   C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
   cudaStream_t stream = c10::cuda::getCurrentCUDAStream(device);
   void* r = this->malloc(size, device, stream);
+  auto* ctx = new CUDAPluggableAllocatorDeleterContext(
+      free_fn_, r, size, device, stream);
   c10::DataPtr data_ptr = {
-      r, r, raw_deleter(), c10::Device(c10::DeviceType::CUDA, device)};
+      r, ctx, raw_deleter(), c10::Device(c10::DeviceType::CUDA, device)};
   return data_ptr;
 }
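This hunk is the heart of the PR: `allocate()` packs `free_fn_` plus the allocation parameters into a fresh context and hands it to the DataPtr as `ctx`; the shared raw deleter later unpacks it. A self-contained sketch of that round trip (illustrative names, with `void*` standing in for `cudaStream_t`):

#include <cstddef>
#include <cstdlib>
#include <functional>

using FreeFuncType = void(void*, std::size_t, int, void*);

struct Ctx {
  std::function<FreeFuncType> free_fn;
  void* data;
  std::size_t size;
  int device;
  void* stream;
  void free() { free_fn(data, size, device, stream); delete this; }
};

// The one raw deleter shared by every allocation: it only forwards to
// whatever free function the ctx captured at allocation time.
void raw_deleter(void* ctx) { static_cast<Ctx*>(ctx)->free(); }

int main() {
  auto my_free = [](void* d, std::size_t, int, void*) { std::free(d); };
  void* r = std::malloc(64);
  auto* ctx = new Ctx{my_free, r, 64, 0, nullptr};
  // ...a DataPtr would store {r, ctx, raw_deleter}; on destruction it runs:
  raw_deleter(ctx);
}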
@@ -348,8 +367,8 @@ getCurrentAllocator() {
 // TODO: add more functions in the argument
 std::shared_ptr<c10::cuda::CUDACachingAllocator::CUDAAllocator>
 createCustomAllocator(
-    std::function<void*(size_t, int, cudaStream_t)> alloc_fn,
-    std::function<void(void*, size_t, int, cudaStream_t)> free_fn) {
+    std::function<MallocFuncType> alloc_fn,
+    std::function<FreeFuncType> free_fn) {
   std::shared_ptr<CUDAPluggableAllocator> allocator(
       new CUDAPluggableAllocator(std::move(alloc_fn), std::move(free_fn)));
   allocator->init(device_count);
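For reference, a sketch of how a C++ caller might use this API to install a custom allocator (the `my_alloc`/`my_free` wrappers are assumptions for illustration, not code from this PR):

#include <cuda_runtime.h>
#include <torch/csrc/cuda/CUDAPluggableAllocator.h>

void* my_alloc(size_t size, int device, cudaStream_t stream) {
  void* ptr = nullptr;
  cudaMalloc(&ptr, size);  // any backend could go here: ncclMemAlloc, etc.
  return ptr;
}

void my_free(void* ptr, size_t size, int device, cudaStream_t stream) {
  cudaFree(ptr);
}

void install_custom_allocator() {
  namespace cpa = torch::cuda::CUDAPluggableAllocator;
  auto allocator = cpa::createCustomAllocator(my_alloc, my_free);
  cpa::changeCurrentAllocator(allocator);
  // After this PR, tensors allocated here keep freeing through my_free
  // even if changeCurrentAllocator() later installs a different allocator.
}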
@@ -366,8 +385,8 @@ void changeCurrentAllocator(
   current_custom_allocator = allocator;
 }
 
-void custom_raw_deleter(void* ptr) {
-  current_custom_allocator->raw_delete(ptr);
+void custom_raw_deleter(void* ctx) {
+  reinterpret_cast<CUDAPluggableAllocatorDeleterContext*>(ctx)->free();
 }
 
 } // namespace torch::cuda::CUDAPluggableAllocator
torch/csrc/cuda/CUDAPluggableAllocator.h
@@ -11,19 +11,47 @@
 
 namespace torch::cuda::CUDAPluggableAllocator {
 
+using MallocFuncType = void*(size_t, int, cudaStream_t);
+using FreeFuncType = void(void*, size_t, int, cudaStream_t);
+
+// A CUDAPluggableAllocatorDeleterContext object is used as the `ctx`
+// argument for DataPtr. We need context because a user can use
+// multiple allocators in the same PyTorch program, and
+// the allocators can have different free functions, such as:
+// free, cudaFree, cudaFreeAsync, ncclMemFree etc.
+struct TORCH_CUDA_CPP_API CUDAPluggableAllocatorDeleterContext {
+  explicit CUDAPluggableAllocatorDeleterContext(
+      std::function<FreeFuncType> free_fn,
+      void* data,
+      size_t size,
+      int device,
+      cudaStream_t stream);
+
+  void free();
+
+ private:
+  std::function<FreeFuncType> free_fn_;
+  void* data_;
+  size_t size_;
+  int device_;
+  cudaStream_t stream_;
+};
+
 #if defined(TORCH_HIP_VERSION)
 using streamType = c10::hip::HIPStream;
 #else
 using streamType = c10::cuda::CUDAStream;
 #endif
 
-std::shared_ptr<c10::cuda::CUDACachingAllocator::CUDAAllocator>
+TORCH_CUDA_CPP_API std::shared_ptr<
+    c10::cuda::CUDACachingAllocator::CUDAAllocator>
 getCurrentAllocator();
-std::shared_ptr<c10::cuda::CUDACachingAllocator::CUDAAllocator>
+TORCH_CUDA_CPP_API std::shared_ptr<
+    c10::cuda::CUDACachingAllocator::CUDAAllocator>
 createCustomAllocator(
-    std::function<void*(size_t, int, cudaStream_t)> alloc_fn,
-    std::function<void(void*, size_t, int, cudaStream_t)> free_fn);
-void changeCurrentAllocator(
+    std::function<MallocFuncType> alloc_fn,
+    std::function<FreeFuncType> free_fn);
+TORCH_CUDA_CPP_API void changeCurrentAllocator(
     const std::shared_ptr<c10::cuda::CUDACachingAllocator::CUDAAllocator>&
         allocator);
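`MallocFuncType` and `FreeFuncType` name plain function types rather than function-pointer types, so a single alias serves both as a `std::function` signature and, with a `*`, as a raw function pointer. A tiny sketch (with `void*` substituted for `cudaStream_t` to stay CUDA-free):

#include <cstddef>
#include <functional>

// Same shape as the aliases above, minus the CUDA dependency.
using MallocFuncType = void*(std::size_t, int, void*);

void* my_malloc(std::size_t, int, void*) { return nullptr; }

int main() {
  std::function<MallocFuncType> f = my_malloc;  // as a std::function signature
  MallocFuncType* p = my_malloc;                // as a raw function pointer
  f(16, 0, nullptr);
  p(16, 0, nullptr);
}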
@@ -38,11 +66,11 @@ struct _AllocationMetadata {
   cudaStream_t stream;
 };
 
-struct CUDAPluggableAllocator
+struct TORCH_CUDA_CPP_API CUDAPluggableAllocator
     : public c10::cuda::CUDACachingAllocator::CUDAAllocator {
   CUDAPluggableAllocator(
-      std::function<void*(size_t, int, cudaStream_t)> alloc_fn,
-      std::function<void(void*, size_t, int, cudaStream_t)> free_fn);
+      std::function<MallocFuncType> alloc_fn,
+      std::function<FreeFuncType> free_fn);
 
   CUDAPluggableAllocator(CUDAPluggableAllocator& other);
@@ -131,8 +159,8 @@ struct CUDAPluggableAllocator
   void copy_data(void* dest, const void* src, std::size_t count) const final;
 
  protected:
-  std::function<void*(size_t, int, cudaStream_t)> alloc_fn_;
-  std::function<void(void*, size_t, int, cudaStream_t)> free_fn_;
+  std::function<MallocFuncType> alloc_fn_;
+  std::function<FreeFuncType> free_fn_;
   std::function<void(int)> init_fn_;
   std::function<void()> reset_fn_;
   std::function<void(double, int)> memory_fraction_fn_;
torch/csrc/cuda/Module.cpp
@@ -1175,16 +1175,14 @@ static void registerCudaPluggableAllocator(PyObject* module) {
         self.set_release_pool(func);
       });
   m.def("_cuda_customAllocator", [](uint64_t malloc_ptr, uint64_t free_ptr) {
-    using MallocFuncType = void*(size_t, int, cudaStream_t);
-    using FreeFuncType = void(void*, size_t, int, cudaStream_t);
+    using namespace torch::cuda::CUDAPluggableAllocator;
     std::function<MallocFuncType> malloc_fn =
         // NOLINTNEXTLINE(performance-no-int-to-ptr)
         reinterpret_cast<MallocFuncType*>(malloc_ptr);
     std::function<FreeFuncType> free_fn =
         // NOLINTNEXTLINE(performance-no-int-to-ptr)
         reinterpret_cast<FreeFuncType*>(free_ptr);
-    return torch::cuda::CUDAPluggableAllocator::createCustomAllocator(
-        malloc_fn, free_fn);
+    return createCustomAllocator(malloc_fn, free_fn);
   });
 
   // NOLINTNEXTLINE(bugprone-unused-raii)
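The binding above receives raw function addresses from Python as integers and casts them back to typed function pointers. A minimal sketch of that integer-handle pattern (invented names; not the pybind11 plumbing itself):

#include <cstddef>
#include <cstdint>
#include <functional>

using FreeFuncType = void(void*, std::size_t, int, void*);

void my_free(void*, std::size_t, int, void*) {}

int main() {
  // What Python hands the binding: a bare function address as an integer.
  auto handle = reinterpret_cast<std::uintptr_t>(&my_free);

  // What the binding does with it: cast back to a typed function pointer,
  // then wrap it in std::function (the NOLINT in the real code marks this
  // deliberate int-to-pointer cast).
  auto* fn = reinterpret_cast<FreeFuncType*>(handle);
  std::function<FreeFuncType> free_fn = fn;
  free_fn(nullptr, 0, 0, nullptr);
}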