From d0c32971b41ba9b9e9b8953beb8c29dd275ebdd3 Mon Sep 17 00:00:00 2001
From: "Yu, Guangye"
Date: Wed, 15 Oct 2025 23:38:02 +0000
Subject: [PATCH] Refine XPU allocator message when OOM (#165509)

# Motivation
Provide more information and align with other backends to enhance the user experience.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165509
Approved by: https://github.com/EikanWang
ghstack dependencies: #165508
---
 c10/xpu/XPUCachingAllocator.cpp | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/c10/xpu/XPUCachingAllocator.cpp b/c10/xpu/XPUCachingAllocator.cpp
index c837ee3d422..0c00eddf0e4 100644
--- a/c10/xpu/XPUCachingAllocator.cpp
+++ b/c10/xpu/XPUCachingAllocator.cpp
@@ -433,6 +433,18 @@ class DeviceCachingAllocator {
       c10::xpu::DeviceProp device_prop;
       c10::xpu::get_device_properties(&device_prop, device);
       auto device_total = device_prop.global_mem_size;
+      // Estimate the available device memory when the SYCL runtime does not
+      // support the corresponding aspect (ext_intel_free_memory).
+      size_t device_free = device_prop.global_mem_size -
+          stats.reserved_bytes[static_cast<size_t>(StatType::AGGREGATE)]
+              .current;
+      auto& raw_device = c10::xpu::get_raw_device(device);
+      // TODO: Remove the aspect check once the SYCL runtime bug is fixed on
+      // affected devices.
+      if (raw_device.has(sycl::aspect::ext_intel_free_memory)) {
+        device_free =
+            raw_device.get_info<sycl::ext::intel::info::device::free_memory>();
+      }
       auto allocated_bytes =
           stats.allocated_bytes[static_cast<size_t>(StatType::AGGREGATE)]
               .current;
@@ -455,7 +467,9 @@ class DeviceCachingAllocator {
           static_cast<int>(device),
           " has a total capacity of ",
           format_size(device_total),
-          ". Of the allocated memory ",
+          " of which ",
+          format_size(device_free),
+          " is free. Of the allocated memory ",
           format_size(allocated_bytes),
           " is allocated by PyTorch, and ",
           format_size(reserved_bytes - allocated_bytes),
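
Editor's note (not part of the patch): after this change the OOM report reads along the lines of "GPU 0 has a total capacity of <total> of which <free> is free. Of the allocated memory ...". The snippet below is a minimal standalone sketch of the same fallback pattern the first hunk adds: start from an estimated free-memory value and overwrite it with the exact value when the device exposes the `ext_intel_free_memory` aspect. It assumes an Intel GPU and a SYCL 2020 runtime; the variable names (`free_bytes`, `total`) and the `gpu_selector_v` device choice are illustrative only and do not appear in the PyTorch sources.

```cpp
// Illustrative sketch only -- not part of the patch or the PyTorch sources.
// Demonstrates the fallback used above: keep an estimate of free memory,
// then replace it with the exact value when the SYCL runtime supports the
// ext_intel_free_memory aspect.
#include <sycl/sycl.hpp>

#include <cstdint>
#include <iostream>

int main() {
  sycl::device dev{sycl::gpu_selector_v};
  const uint64_t total = dev.get_info<sycl::info::device::global_mem_size>();

  // Fallback estimate; the allocator subtracts its reserved bytes here,
  // but this standalone sketch has no allocator stats to subtract.
  uint64_t free_bytes = total;

  // Exact value when the Intel extension is available.
  if (dev.has(sycl::aspect::ext_intel_free_memory)) {
    free_bytes = dev.get_info<sycl::ext::intel::info::device::free_memory>();
  }

  std::cout << "total: " << total << " bytes, free: " << free_bytes
            << " bytes\n";
  return 0;
}
```

As in the patch, the aspect check is what keeps the code working on runtimes or devices where the extension is unavailable, which is why the estimate is computed unconditionally first.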