Filter out iGPU if dGPU is found on XPU (#144378)
# Motivation
For https://github.com/pytorch/pytorch/issues/143914. On Windows, the iGPU and the dGPU sit on two separate SYCL platforms. To simplify the logic, we exclude iGPUs when a dGPU is present, ensuring that all XPU devices enumerated by PyTorch share the same SYCL context. The generalized enumeration logic is:
1. Find the first Level Zero (L0) platform containing at least one dGPU and enumerate all dGPUs on that platform.
2. If no dGPU is found, find the first L0 platform containing at least one iGPU and enumerate all iGPUs on that platform.
3. If no GPU is found (neither iGPU nor dGPU), no XPU device is enumerated.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/144378
Approved by: https://github.com/EikanWang, https://github.com/gujinghui
parent 5e5da9bd9a
commit 3b3aac0cde
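The commit message distinguishes iGPUs from dGPUs but does not spell out how the two are told apart; the diff below treats a GPU as integrated when it reports host-unified memory. A minimal standalone sketch of that heuristic, assuming a oneAPI/DPC++ toolchain (the helper name looks_like_igpu is hypothetical and not part of the change):

#include <sycl/sycl.hpp>

#include <iostream>

// Hypothetical helper mirroring the heuristic in the diff below: integrated
// GPUs generally share a unified memory subsystem with the host.
bool looks_like_igpu(const sycl::device& dev) {
  // Deprecated in SYCL 2020 but still provided by DPC++.
  return dev.get_info<sycl::info::device::host_unified_memory>();
}

int main() {
  // Print how each visible GPU would be classified.
  for (const auto& dev :
       sycl::device::get_devices(sycl::info::device_type::gpu)) {
    std::cout << dev.get_info<sycl::info::device::name>() << " -> "
              << (looks_like_igpu(dev) ? "iGPU" : "dGPU") << '\n';
  }
  return 0;
}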
@@ -18,6 +18,20 @@ namespace {
  * determined at runtime. There's currently a SYCL device pool that is lazily
  * created and only initialized once, ensuring thread-local safety. Each device
  * within the device pool shares the same default context.
+ *
+ * In certain scenarios, GPU devices may reside on separate SYCL platforms. For
+ * instance, on Windows, an integrated GPU (iGPU) and a discrete GPU (dGPU) may
+ * exist on different platforms. Since sycl::context cannot span across multiple
+ * platforms, creating a single default context that includes both becomes
+ * infeasible.
+ *
+ * To address this limitation, we prioritize the enumeration of dGPU. The device
+ * enumeration logic is as follows:
+ * 1. Identify the first Level Zero (L0) platform that contains at least one
+ *    dGPU and enumerate all dGPUs on that platform.
+ * 2. If no dGPU is found, identify the first L0 platform containing at least
+ *    one iGPU and enumerate all iGPUs on that platform.
+ * 3. If neither dGPUs nor iGPUs are found, conclude that no GPUs are available.
  */
 c10::once_flag init_flag;
 thread_local DeviceIndex curDeviceIndex = 0;
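The note above hinges on the fact that a single sycl::context cannot span platforms, which is why the device pool must not mix the iGPU and dGPU platforms. A minimal sketch of that constraint outside PyTorch, assuming a oneAPI/DPC++ toolchain (not part of the change):

#include <sycl/sycl.hpp>

#include <iostream>
#include <vector>

int main() {
  for (const auto& platform : sycl::platform::get_platforms()) {
    std::vector<sycl::device> gpus =
        platform.get_devices(sycl::info::device_type::gpu);
    if (gpus.empty()) {
      continue;
    }
    // All devices come from one platform, so a single shared context is valid.
    sycl::context shared_ctx(gpus);
    std::cout << gpus.size() << " GPU(s) share one context on "
              << platform.get_info<sycl::info::platform::name>() << '\n';
    // Mixing devices from different platforms in one sycl::context is
    // invalid per the SYCL specification, hence the filtering described above.
    break;
  }
  return 0;
}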
@@ -28,19 +42,60 @@ struct DevicePool {
 } gDevicePool;

 void enumDevices(std::vector<std::unique_ptr<sycl::device>>& devices) {
+  // See Note [Device Management] for more details.
   auto platform_list = sycl::platform::get_platforms();
-  // Enumerated GPU devices from the specific platform.
-  for (const auto& platform : platform_list) {
+  auto is_igpu = [](const sycl::device& device) {
+    // Generally, iGPUs share a unified memory subsystem with the host.
+    return device.get_info<sycl::info::device::host_unified_memory>();
+  };
+
+  // Check if a platform contains at least one GPU (either iGPU or dGPU).
+  auto has_gpu = [&is_igpu](const sycl::platform& platform, bool check_igpu) {
+    // Only consider platforms using the Level Zero backend.
     if (platform.get_backend() != sycl::backend::ext_oneapi_level_zero) {
-      continue;
+      return false;
     }
-    auto device_list = platform.get_devices();
-    for (const auto& device : device_list) {
-      if (device.is_gpu()) {
-        devices.push_back(std::make_unique<sycl::device>(device));
+    // Check if the platform contains at least one GPU.
+    for (const auto& device : platform.get_devices()) {
+      if (device.is_gpu() &&
+          (check_igpu ? is_igpu(device) : !is_igpu(device))) {
+        return true;
       }
     }
+    // No GPU found on the platform.
+    return false;
+  };
+
+  // Case 1: Platform with dGPU found. Most platforms with dGPU only have dGPU
+  // or a combination of dGPU and iGPU.
+  for (const auto& platform : platform_list) {
+    // Find the first platform that contains at least one dGPU.
+    if (has_gpu(platform, /*check_igpu=*/false)) {
+      for (const auto& device : platform.get_devices()) {
+        // Only add all dGPUs to the device list.
+        if (device.is_gpu() && !is_igpu(device)) {
+          devices.push_back(std::make_unique<sycl::device>(device));
+        }
+      }
+      return; // Exit early since we already found a platform with dGPU.
+    }
   }
+
+  // Case 2: No dGPU found, but a platform with iGPU is available.
+  for (const auto& platform : platform_list) {
+    // Find the first platform that contains at least one iGPU.
+    if (has_gpu(platform, /*check_igpu=*/true)) {
+      for (const auto& device : platform.get_devices()) {
+        // Add all iGPUs to the device list.
+        if (device.is_gpu()) { // If the device is a GPU, it must be a iGPU.
+          devices.push_back(std::make_unique<sycl::device>(device));
+        }
+      }
+      return; // Exit early since we already found a platform with iGPU.
+    }
+  }
+
+  // Case 3: No GPUs found (neither dGPU nor iGPU) - Do nothing.
 }

 inline void initGlobalDevicePoolState() {
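To see which devices this policy would select on a particular machine, the diff's three cases can be re-expressed as a standalone probe. This is a sketch, not PyTorch code; the helpers is_igpu and pick_gpus are hypothetical, and it assumes a oneAPI/DPC++ toolchain (compile with, e.g., icpx -fsycl probe.cpp):

#include <sycl/sycl.hpp>

#include <iostream>
#include <vector>

namespace {

// Same heuristic as the diff above: iGPUs report host-unified memory.
bool is_igpu(const sycl::device& dev) {
  return dev.get_info<sycl::info::device::host_unified_memory>();
}

// Collect all GPUs of one kind (integrated or discrete) from the first
// Level Zero platform that contains at least one GPU of that kind.
std::vector<sycl::device> pick_gpus(bool want_igpu) {
  for (const auto& platform : sycl::platform::get_platforms()) {
    if (platform.get_backend() != sycl::backend::ext_oneapi_level_zero) {
      continue;
    }
    std::vector<sycl::device> selected;
    for (const auto& dev :
         platform.get_devices(sycl::info::device_type::gpu)) {
      if (is_igpu(dev) == want_igpu) {
        selected.push_back(dev);
      }
    }
    if (!selected.empty()) {
      return selected; // First matching platform wins, as in the PR.
    }
  }
  return {};
}

} // namespace

int main() {
  // Case 1: prefer discrete GPUs; Case 2: fall back to integrated GPUs.
  auto devices = pick_gpus(/*want_igpu=*/false);
  if (devices.empty()) {
    devices = pick_gpus(/*want_igpu=*/true);
  }
  // Case 3: nothing found.
  if (devices.empty()) {
    std::cout << "No Level Zero GPU found.\n";
    return 0;
  }
  for (const auto& dev : devices) {
    std::cout << (is_igpu(dev) ? "iGPU: " : "dGPU: ")
              << dev.get_info<sycl::info::device::name>() << '\n';
  }
  return 0;
}

If this prints only dGPUs on a machine that also has an iGPU, the Case 1 path of the change is in effect.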