Uses context pointer for deleter to enable multiple CUDAPluggableAllocator usage (#130472)
We should be able to create multiple CUDAPluggableAllocators in the same PyTorch program (see https://github.com/pytorch/pytorch/issues/124807 and https://github.com/pytorch/pytorch/pull/125722 for context). When mixing CUDAPluggableAllocators in one program, the deleter passed in through each CUDAPluggableAllocator must be "attached" to the data_ptr and persist until it is called to free the memory, which may be as late as program exit.

Currently, CUDAPluggableAllocator maintains a global `current_custom_allocator`. When creating the `DataPtr`, `raw_deleter` attaches `custom_raw_deleter` to the DataPtr, which calls `current_custom_allocator->raw_delete(...)`. This is fine with a single allocator, but with multiple allocators every DataPtr ends up using the deleter of whatever `current_custom_allocator` happens to point to at deletion time. For example, if allocation 1 was done with `cudaMalloc` and allocation 2 with `ncclMemAlloc`, and `current_custom_allocator` currently points to the CUDAPluggableAllocator wrapping `ncclMemAlloc`, then cleaning up allocation 1 would call `ncclMemFree` instead of `cudaFree`.

This PR solves the problem by remembering the `free_fn_` in a deleter context, so the deleter no longer has to go through an allocator object to find its free function.

CC: @zdevito @ptrblck @eqy
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130472
Approved by: https://github.com/eqy, https://github.com/ezyang
This commit is contained in:
parent 28a74b9fa4
commit 38b7d89aa4
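The crux of the change: each allocation's free function is captured in a per-allocation context at alloc time, instead of being looked up through a global allocator pointer at free time. A minimal standalone sketch of the failure mode and the fix (not PyTorch code; all names are invented):

#include <cstdio>

using FreeFn = void(void*);

// Old scheme: one global "current free function", consulted at free time.
FreeFn* g_current_free = nullptr;

void free_a(void* p) { std::printf("free_a(%p)\n", p); }
void free_b(void* p) { std::printf("free_b(%p)\n", p); }

// Fixed scheme: each allocation carries a context that remembers the
// free function it was allocated with.
struct DeleterContext {
  FreeFn* free_fn;
  void* data;
  void call() {
    free_fn(data);
    delete this;  // fire-once: the context frees itself after running
  }
};

int main() {
  int x = 0, y = 0;

  // Old scheme: both deletions go through whatever g_current_free points
  // to *now*, so the first allocation is freed with the wrong function.
  g_current_free = free_a;
  void* p1 = &x;       // imagine: allocated while free_a was current
  g_current_free = free_b;
  void* p2 = &y;       // allocated while free_b was current
  g_current_free(p1);  // BUG: calls free_b, not free_a
  g_current_free(p2);  // correct only by coincidence

  // Fixed scheme: the context pins the right deleter per allocation.
  auto* c1 = new DeleterContext{free_a, &x};
  auto* c2 = new DeleterContext{free_b, &y};
  c1->call();  // always free_a
  c2->call();  // always free_b
}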
aten/src/ATen/test/cuda_allocator_test.cpp
@@ -5,6 +5,51 @@
 #include <ATen/test/allocator_clone_test.h>
+#include <torch/csrc/cuda/CUDAPluggableAllocator.h>
 
 TEST(AllocatorTestCUDA, test_clone) {
   test_allocator_clone(c10::cuda::CUDACachingAllocator::get());
 }
+
+static int called_dummy_free_0 = 0;
+static int called_dummy_free_1 = 0;
+
+void* dummy_alloc_0(size_t size, int device, void* stream) {return nullptr;}
+void dummy_free_0(void* data, size_t size, int device, void* stream) {
+  called_dummy_free_0++;
+}
+void dummy_free_1(void* data, size_t size, int device, void* stream) {
+  called_dummy_free_1++;
+}
+
+// Tests that data_ptrs have their respective deleters
+// when mixing allocators
+TEST(AllocatorTestCUDA, test_pluggable_allocator_deleters) {
+  // Create a tensor with dummy_allocator_0, where dummy_free_0 is the deleter
+  auto dummy_allocator_0 = torch::cuda::CUDAPluggableAllocator::createCustomAllocator(dummy_alloc_0, dummy_free_0);
+  c10::cuda::CUDACachingAllocator::allocator.store(dummy_allocator_0.get());
+  at::Tensor a = at::empty({0}, at::TensorOptions().device(at::kCUDA));
+
+  // Create a tensor with dummy_allocator_1, where dummy_free_1 is the deleter
+  auto dummy_allocator_1 = torch::cuda::CUDAPluggableAllocator::createCustomAllocator(dummy_alloc_0, dummy_free_1);
+  c10::cuda::CUDACachingAllocator::allocator.store(dummy_allocator_1.get());
+  at::Tensor b = at::empty({0}, at::TensorOptions().device(at::kCUDA));
+
+  // Manually use a's deleter
+  auto* ctx = a.storage().data_ptr().get_context();
+  a.storage().data_ptr().get_deleter()(ctx);
+  a.storage().mutable_data_ptr().release_context();
+
+  // a's deleter is dummy_free_0
+  // dummy_free_0 should be called above, so called_dummy_free_0 should be 1
+  ASSERT_TRUE(called_dummy_free_0 == 1);
+
+  // Manually use b's deleter
+  ctx = b.storage().data_ptr().get_context();
+  b.storage().data_ptr().get_deleter()(ctx);
+  b.storage().mutable_data_ptr().release_context();
+
+  // b's deleter is dummy_free_1
+  // dummy_free_1 should be called above, so called_dummy_free_1 should be 1
+  ASSERT_TRUE(called_dummy_free_1 == 1);
+}
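Note the manual-deletion dance in the test: `get_deleter()(ctx)` runs the deleter by hand, and `release_context()` then detaches the context so the storage destructor does not free a second time. A rough mock of that dance, assuming a simplified stand-in for the (data, ctx, deleter) triple inside `c10::DataPtr`:

#include <cassert>

struct MockDataPtr {
  void* data;
  void* ctx;               // per-allocation deleter context
  void (*deleter)(void*);  // called with ctx, not with data

  void manual_delete() {
    deleter(ctx);       // run the deleter by hand, like get_deleter()(ctx)
    ctx = nullptr;      // drop the context, like release_context(),
    deleter = nullptr;  // so destruction cannot free a second time
  }
};

static int freed = 0;
static void count_free(void* /*ctx*/) { ++freed; }

int main() {
  MockDataPtr p{nullptr, &freed, count_free};
  p.manual_delete();
  assert(freed == 1);  // mirrors ASSERT_TRUE(called_dummy_free_0 == 1)
}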
build_variables.bzl
@@ -661,6 +661,7 @@ libtorch_cuda_core_sources = [
     "torch/csrc/CudaIPCTypes.cpp",
     "torch/csrc/cuda/comm.cpp",
     "torch/csrc/cuda/memory_snapshot.cpp",
+    "torch/csrc/cuda/CUDAPluggableAllocator.cpp",
     "torch/csrc/inductor/aoti_runner/model_container_runner_cuda.cpp",
     "torch/csrc/inductor/aoti_torch/shim_cuda.cpp",
     "torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp",
@@ -772,7 +773,6 @@ libtorch_python_cuda_core_sources = [
     "torch/csrc/cuda/shared/cudart.cpp",
     "torch/csrc/cuda/shared/nvtx.cpp",
     "torch/csrc/cuda/utils.cpp",
-    "torch/csrc/cuda/CUDAPluggableAllocator.cpp",
 ]
 
 libtorch_python_cuda_sources = libtorch_python_cuda_core_sources + [
torch/csrc/cuda/CUDAPluggableAllocator.cpp
@@ -8,6 +8,23 @@
 
 namespace torch::cuda::CUDAPluggableAllocator {
 
+CUDAPluggableAllocatorDeleterContext::CUDAPluggableAllocatorDeleterContext(
+    std::function<FreeFuncType> free_fn,
+    void* data,
+    size_t size,
+    int device,
+    cudaStream_t stream)
+    : free_fn_(free_fn),
+      data_(data),
+      size_(size),
+      device_(device),
+      stream_(stream) {}
+
+void CUDAPluggableAllocatorDeleterContext::free() {
+  free_fn_(data_, size_, device_, stream_);
+  delete this;
+}
+
 int device_count = 0;
 
 void custom_raw_deleter(void* ptr);
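The `free()` above is a fire-once, self-deleting operation: it runs the captured free function and then `delete this`. That is safe only because every context is heap-allocated (in `allocate()`, below) and `free()` is called exactly once. A compact sketch of the pattern, with invented names:

#include <cstdio>
#include <functional>

struct SelfDeletingContext {
  std::function<void()> on_free;
  void free() {
    on_free();
    delete this;  // ok: heap-allocated, and never touched again afterwards
  }
};

int main() {
  auto* ctx = new SelfDeletingContext{[] { std::puts("released"); }};
  ctx->free();  // runs the callback, then destroys the context itself
  // ctx is now dangling; it must not be used again.
}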
@@ -26,8 +43,8 @@ _AllocationMetadata::_AllocationMetadata(
 // This avoids having to link against libtorch for C++ based custom allocators
 // And also use this from python
 CUDAPluggableAllocator::CUDAPluggableAllocator(
-    std::function<void*(size_t, int, cudaStream_t)> alloc_fn,
-    std::function<void(void*, size_t, int, cudaStream_t)> free_fn)
+    std::function<MallocFuncType> alloc_fn,
+    std::function<FreeFuncType> free_fn)
     : alloc_fn_(std::move(alloc_fn)), free_fn_(std::move(free_fn)) {}
 
 CUDAPluggableAllocator::CUDAPluggableAllocator(CUDAPluggableAllocator& other)
@@ -99,8 +116,10 @@ c10::DataPtr CUDAPluggableAllocator::allocate(size_t size) {
   C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
   cudaStream_t stream = c10::cuda::getCurrentCUDAStream(device);
   void* r = this->malloc(size, device, stream);
+  auto* ctx = new CUDAPluggableAllocatorDeleterContext(
+      free_fn_, r, size, device, stream);
   c10::DataPtr data_ptr = {
-      r, r, raw_deleter(), c10::Device(c10::DeviceType::CUDA, device)};
+      r, ctx, raw_deleter(), c10::Device(c10::DeviceType::CUDA, device)};
   return data_ptr;
 }
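This hunk is the heart of the PR: `allocate()` packs `free_fn_` plus the allocation parameters into a fresh context and hands it to the DataPtr as `ctx`; the shared raw deleter later unpacks it. A self-contained sketch of that round trip (illustrative names, with `void*` standing in for `cudaStream_t`):

#include <cstddef>
#include <cstdlib>
#include <functional>

using FreeFuncType = void(void*, std::size_t, int, void*);

struct Ctx {
  std::function<FreeFuncType> free_fn;
  void* data;
  std::size_t size;
  int device;
  void* stream;
  void free() { free_fn(data, size, device, stream); delete this; }
};

// The one raw deleter shared by every allocation: it only forwards to
// whatever free function the ctx captured at allocation time.
void raw_deleter(void* ctx) { static_cast<Ctx*>(ctx)->free(); }

int main() {
  auto my_free = [](void* d, std::size_t, int, void*) { std::free(d); };
  void* r = std::malloc(64);
  auto* ctx = new Ctx{my_free, r, 64, 0, nullptr};
  // ...a DataPtr would store {r, ctx, raw_deleter}; on destruction it runs:
  raw_deleter(ctx);
}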
@@ -348,8 +367,8 @@ getCurrentAllocator() {
 // TODO: add more functions in the argument
 std::shared_ptr<c10::cuda::CUDACachingAllocator::CUDAAllocator>
 createCustomAllocator(
-    std::function<void*(size_t, int, cudaStream_t)> alloc_fn,
-    std::function<void(void*, size_t, int, cudaStream_t)> free_fn) {
+    std::function<MallocFuncType> alloc_fn,
+    std::function<FreeFuncType> free_fn) {
   std::shared_ptr<CUDAPluggableAllocator> allocator(
       new CUDAPluggableAllocator(std::move(alloc_fn), std::move(free_fn)));
   allocator->init(device_count);
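For reference, a sketch of how a C++ caller might use this API to install a custom allocator (the `my_alloc`/`my_free` wrappers are assumptions for illustration, not code from this PR):

#include <cuda_runtime.h>
#include <torch/csrc/cuda/CUDAPluggableAllocator.h>

void* my_alloc(size_t size, int device, cudaStream_t stream) {
  void* ptr = nullptr;
  cudaMalloc(&ptr, size);  // any backend could go here: ncclMemAlloc, etc.
  return ptr;
}

void my_free(void* ptr, size_t size, int device, cudaStream_t stream) {
  cudaFree(ptr);
}

void install_custom_allocator() {
  namespace cpa = torch::cuda::CUDAPluggableAllocator;
  auto allocator = cpa::createCustomAllocator(my_alloc, my_free);
  cpa::changeCurrentAllocator(allocator);
  // After this PR, tensors allocated here keep freeing through my_free
  // even if changeCurrentAllocator() later installs a different allocator.
}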
@@ -366,8 +385,8 @@ void changeCurrentAllocator(
   current_custom_allocator = allocator;
 }
 
-void custom_raw_deleter(void* ptr) {
-  current_custom_allocator->raw_delete(ptr);
+void custom_raw_deleter(void* ctx) {
+  reinterpret_cast<CUDAPluggableAllocatorDeleterContext*>(ctx)->free();
 }
 
 } // namespace torch::cuda::CUDAPluggableAllocator
torch/csrc/cuda/CUDAPluggableAllocator.h
@@ -11,19 +11,47 @@
 
 namespace torch::cuda::CUDAPluggableAllocator {
 
+using MallocFuncType = void*(size_t, int, cudaStream_t);
+using FreeFuncType = void(void*, size_t, int, cudaStream_t);
+
+// A CUDAPluggableAllocatorDeleterContext object is used as the `ctx`
+// argument for DataPtr. We need context because a user can use
+// multiple allocators in the same PyTorch program, and
+// the allocators can have different free functions, such as:
+// free, cudaFree, cudaFreeAsync, ncclMemFree etc.
+struct TORCH_CUDA_CPP_API CUDAPluggableAllocatorDeleterContext {
+  explicit CUDAPluggableAllocatorDeleterContext(
+      std::function<FreeFuncType> free_fn,
+      void* data,
+      size_t size,
+      int device,
+      cudaStream_t stream);
+
+  void free();
+
+ private:
+  std::function<FreeFuncType> free_fn_;
+  void* data_;
+  size_t size_;
+  int device_;
+  cudaStream_t stream_;
+};
+
 #if defined(TORCH_HIP_VERSION)
 using streamType = c10::hip::HIPStream;
 #else
 using streamType = c10::cuda::CUDAStream;
 #endif
 
-std::shared_ptr<c10::cuda::CUDACachingAllocator::CUDAAllocator>
+TORCH_CUDA_CPP_API std::shared_ptr<
+    c10::cuda::CUDACachingAllocator::CUDAAllocator>
 getCurrentAllocator();
-std::shared_ptr<c10::cuda::CUDACachingAllocator::CUDAAllocator>
+TORCH_CUDA_CPP_API std::shared_ptr<
+    c10::cuda::CUDACachingAllocator::CUDAAllocator>
 createCustomAllocator(
-    std::function<void*(size_t, int, cudaStream_t)> alloc_fn,
-    std::function<void(void*, size_t, int, cudaStream_t)> free_fn);
-void changeCurrentAllocator(
+    std::function<MallocFuncType> alloc_fn,
+    std::function<FreeFuncType> free_fn);
+TORCH_CUDA_CPP_API void changeCurrentAllocator(
     const std::shared_ptr<c10::cuda::CUDACachingAllocator::CUDAAllocator>&
         allocator);
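`MallocFuncType` and `FreeFuncType` name plain function types rather than function-pointer types, so a single alias serves both as a `std::function` signature and, with a `*`, as a raw function pointer. A tiny sketch (with `void*` substituted for `cudaStream_t` to stay CUDA-free):

#include <cstddef>
#include <functional>

// Same shape as the aliases above, minus the CUDA dependency.
using MallocFuncType = void*(std::size_t, int, void*);

void* my_malloc(std::size_t, int, void*) { return nullptr; }

int main() {
  std::function<MallocFuncType> f = my_malloc;  // as a std::function signature
  MallocFuncType* p = my_malloc;                // as a raw function pointer
  f(16, 0, nullptr);
  p(16, 0, nullptr);
}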
@@ -38,11 +66,11 @@ struct _AllocationMetadata {
   cudaStream_t stream;
 };
 
-struct CUDAPluggableAllocator
+struct TORCH_CUDA_CPP_API CUDAPluggableAllocator
     : public c10::cuda::CUDACachingAllocator::CUDAAllocator {
   CUDAPluggableAllocator(
-      std::function<void*(size_t, int, cudaStream_t)> alloc_fn,
-      std::function<void(void*, size_t, int, cudaStream_t)> free_fn);
+      std::function<MallocFuncType> alloc_fn,
+      std::function<FreeFuncType> free_fn);
 
   CUDAPluggableAllocator(CUDAPluggableAllocator& other);
@@ -131,8 +159,8 @@ struct CUDAPluggableAllocator
   void copy_data(void* dest, const void* src, std::size_t count) const final;
 
  protected:
-  std::function<void*(size_t, int, cudaStream_t)> alloc_fn_;
-  std::function<void(void*, size_t, int, cudaStream_t)> free_fn_;
+  std::function<MallocFuncType> alloc_fn_;
+  std::function<FreeFuncType> free_fn_;
   std::function<void(int)> init_fn_;
   std::function<void()> reset_fn_;
   std::function<void(double, int)> memory_fraction_fn_;
torch/csrc/cuda/Module.cpp
@@ -1175,16 +1175,14 @@ static void registerCudaPluggableAllocator(PyObject* module) {
         self.set_release_pool(func);
       });
   m.def("_cuda_customAllocator", [](uint64_t malloc_ptr, uint64_t free_ptr) {
-    using MallocFuncType = void*(size_t, int, cudaStream_t);
-    using FreeFuncType = void(void*, size_t, int, cudaStream_t);
+    using namespace torch::cuda::CUDAPluggableAllocator;
     std::function<MallocFuncType> malloc_fn =
         // NOLINTNEXTLINE(performance-no-int-to-ptr)
         reinterpret_cast<MallocFuncType*>(malloc_ptr);
     std::function<FreeFuncType> free_fn =
         // NOLINTNEXTLINE(performance-no-int-to-ptr)
         reinterpret_cast<FreeFuncType*>(free_ptr);
-    return torch::cuda::CUDAPluggableAllocator::createCustomAllocator(
-        malloc_fn, free_fn);
+    return createCustomAllocator(malloc_fn, free_fn);
   });
 
   // NOLINTNEXTLINE(bugprone-unused-raii)
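The binding above receives raw function addresses from Python as integers and casts them back to typed function pointers. A minimal sketch of that integer-handle pattern (invented names; not the pybind11 plumbing itself):

#include <cstddef>
#include <cstdint>
#include <functional>

using FreeFuncType = void(void*, std::size_t, int, void*);

void my_free(void*, std::size_t, int, void*) {}

int main() {
  // What Python hands the binding: a bare function address as an integer.
  auto handle = reinterpret_cast<std::uintptr_t>(&my_free);

  // What the binding does with it: cast back to a typed function pointer,
  // then wrap it in std::function (the NOLINT in the real code marks this
  // deliberate int-to-pointer cast).
  auto* fn = reinterpret_cast<FreeFuncType*>(handle);
  std::function<FreeFuncType> free_fn = fn;
  free_fn(nullptr, 0, 0, nullptr);
}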