Generalize support of background thread in pinned allocator (#160505)

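# Motivation https://github.com/pytorch/pytorch/pull/135524 introduced background-thread support in the pinned-memory (host) allocator for CUDA only. This PR moves the `pinned_use_background_threads()` configuration lookup from the CUDA-specific override into the generic `CachingHostAllocatorImpl`, so that other backends, such as XPU, support it as well. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160505 Approved by: https://github.com/albanD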
2025-12-06 12:20:52 +01:00 · 2025-08-13 14:18:53 +00:00 · 2025-08-13 14:18:53 +00:00 · 8cfaf51d4e
commit 8cfaf51d4e
parent af3cabc55d
3 changed files with 14 additions and 6 deletions
--- a/aten/src/ATen/core/CachingHostAllocator.h
+++ b/aten/src/ATen/core/CachingHostAllocator.h
@ -1,6 +1,7 @@
 #pragma once

 #include <c10/core/Allocator.h>
+#include <c10/core/AllocatorConfig.h>
 #include <c10/core/Stream.h>
 #include <c10/core/thread_pool.h>
 #include <c10/util/flat_hash_map.h>
@ -351,7 +352,8 @@ struct CachingHostAllocatorImpl {
  }

  virtual bool pinned_use_background_threads() {
-    return false;
+    return c10::CachingAllocator::AcceleratorAllocatorConfig::
+        pinned_use_background_threads();
  }

  virtual void copy_data(void* dest [[maybe_unused]], const void* src [[maybe_unused]], std::size_t count [[maybe_unused]]) const {
--- a/aten/src/ATen/cuda/CachingHostAllocator.cpp
+++ b/aten/src/ATen/cuda/CachingHostAllocator.cpp
@ -161,11 +161,6 @@ struct CUDACachingHostAllocatorImpl
    return true;
  }

-  bool pinned_use_background_threads() override {
-    return c10::CachingAllocator::AcceleratorAllocatorConfig::
-        pinned_use_background_threads();
-  }
-
  EventPool::Event create_event_internal(DeviceIndex idx) {
    // Leak the event pool to avoid shutdown issue.
    static auto* event_pool = new EventPool();
--- a/test/test_xpu.py
+++ b/test/test_xpu.py
@ -607,6 +607,17 @@ if __name__ == "__main__":
            z[0] = z[0] + 1.0
            self.assertEqual(z, x)

+    def test_background_thread_for_pin_memory(self):
+        # Just ensure no crash
+        torch._C._accelerator_setAllocatorSettings("pinned_use_background_threads:True")
+        cpu_tensor = torch.randn(100)
+        pin_tensor = cpu_tensor.pin_memory()
+        xpu_tensor = pin_tensor.to(device="xpu", non_blocking=True)
+        torch.xpu.synchronize()
+        del pin_tensor
+        gc.collect()
+        self.assertEqual(xpu_tensor.cpu(), cpu_tensor)
+

 instantiate_device_type_tests(TestXpu, globals(), only_for="xpu", allow_xpu=True)