Add Intel GPU info collection to the collect env script (#157351)

https://github.com/pytorch/pytorch/pull/137846 was mistakenly closed; this PR reopens that change so it can land. Pull Request resolved: https://github.com/pytorch/pytorch/pull/157351 Approved by: https://github.com/guangyey, https://github.com/malfet
2025-12-06 12:20:52 +01:00 · 2025-07-09 03:01:41 +00:00 · 2025-07-09 03:01:41 +00:00 · c515385b0a
commit c515385b0a
parent d6237721c0
1 changed files with 185 additions and 4 deletions
--- a/torch/utils/collect_env.py
+++ b/torch/utils/collect_env.py
@ -10,6 +10,7 @@ import re
 import subprocess
 import sys
 import os
+from typing import cast as _cast
 from collections import namedtuple


@ -37,6 +38,7 @@ SystemEnv = namedtuple('SystemEnv', [
    'nvidia_driver_version',
    'nvidia_gpu_models',
    'cudnn_version',
+    'is_xpu_available',
    'pip_version',  # 'pip' or 'pip3'
    'pip_packages',
    'conda_packages',
@ -73,6 +75,30 @@ NVIDIA_PATTERNS = [
    "nvtx",
 ]

+# Package-name patterns for Intel oneAPI related packages. This list is
+# appended to the default pattern lists built by get_conda_packages() and
+# get_pip_packages() when the caller does not pass explicit patterns.
+# NOTE(review): several entries are short ("mkl", "tbb", "umf"); how patterns
+# are matched against package names is not visible here - confirm they do not
+# pull in unrelated packages.
+ONEAPI_PATTERNS = [
+    "dpcpp-cpp-rt",
+    "intel-cmplr-lib-rt",
+    "intel-cmplr-lib-ur",
+    "intel-cmplr-lic-rt",
+    "intel-opencl-rt",
+    "intel-sycl-rt",
+    "mkl",
+    "onemkl-sycl-blas",
+    "onemkl-sycl-dft",
+    "onemkl-sycl-lapack",
+    "onemkl-sycl-rng",
+    "onemkl-sycl-sparse",
+    "intel-openmp",
+    "tbb",
+    "impi-rt",
+    "impi-devel",
+    "oneccl",
+    "oneccl-devel",
+    "intel-pti",
+    "umf",
+    "tcmlib",
+]
+
 CONDA_PATTERNS = [
    "cudatoolkit",
    "soumith",
@ -131,7 +157,7 @@ def run_and_return_first_line(run_lambda, command):

 def get_conda_packages(run_lambda, patterns=None):
    if patterns is None:
-        patterns = CONDA_PATTERNS + COMMON_PATTERNS + NVIDIA_PATTERNS
+        patterns = CONDA_PATTERNS + COMMON_PATTERNS + NVIDIA_PATTERNS + ONEAPI_PATTERNS
    conda = os.environ.get('CONDA_EXE', 'conda')
    out = run_and_read_all(run_lambda, "{} list".format(conda))
    if out is None:
@ -243,6 +269,152 @@ def get_nvidia_smi():
    return smi


+def _detect_linux_pkg_manager():
+    """Return the name of the first supported Linux package manager on PATH.
+
+    Candidates are probed in the order ``dpkg``, ``dnf``, ``yum``, ``zypper``.
+
+    Returns:
+        The executable name of the first candidate found, or ``"N/A"`` when
+        the platform is not Linux or none of the candidates is available.
+    """
+    # Function-local import so this helper does not depend on the module's
+    # top-level import block.
+    import shutil
+
+    if get_platform() != "linux":
+        return "N/A"
+    # shutil.which does the PATH lookup in-process: no shell is spawned per
+    # candidate, and detection no longer fails on minimal images that ship a
+    # package manager but no `which` binary.
+    for mgr_name in ("dpkg", "dnf", "yum", "zypper"):
+        if shutil.which(mgr_name) is not None:
+            return mgr_name
+    return "N/A"
+
+
+def get_linux_pkg_version(run_lambda, pkg_name):
+    """Look up the version of ``pkg_name`` through the system package manager.
+
+    Args:
+        run_lambda: command runner forwarded to ``run_and_read_all``.
+        pkg_name: package name substituted into the manager's query command.
+
+    Returns:
+        The version column of the first line printed by the query, or
+        ``"N/A"`` when no package manager is detected, the query prints
+        nothing, or that line has too few columns.
+    """
+    pkg_mgr = _detect_linux_pkg_manager()
+    if pkg_mgr == "N/A":
+        return "N/A"
+
+    # Per manager: (0-based column that holds the version, query command).
+    grep_version = {
+        "dpkg": (2, "dpkg -l | grep {}"),
+        "dnf": (1, "dnf list | grep {}"),
+        "yum": (1, "yum list | grep {}"),
+        "zypper": (2, "zypper info {} | grep Version"),
+    }
+
+    field_index, command = grep_version[pkg_mgr]
+    ret = run_and_read_all(run_lambda, command.format(pkg_name))
+    if not ret:
+        return "N/A"
+    # `grep` can match several packages. Parse only the first matching line so
+    # columns from different lines are never glued together, and split on any
+    # whitespace so tab-separated columns are handled too.
+    fields = ret.splitlines()[0].split()
+    if len(fields) <= field_index:
+        return "N/A"
+    return fields[field_index]
+
+
+def get_intel_gpu_driver_version(run_lambda):
+    """Collect Intel GPU driver version information as a multi-line string.
+
+    On Linux, the versions of the driver packages associated with the
+    detected package manager are queried via ``get_linux_pkg_version`` and
+    packages that resolve to ``"N/A"`` are omitted. On Windows (``win32`` /
+    ``cygwin``), a PowerShell query of ``Win32_PnpSignedDriver`` is run and
+    its JSON output is formatted.
+
+    Args:
+        run_lambda: command runner forwarded to the helper functions.
+
+    Returns:
+        One ``"* ..."`` line per entry, joined with newlines. On Windows the
+        result is ``"N/A"`` when the query prints nothing; when its output
+        cannot be interpreted, the raw output and the error text are returned
+        instead of raising.
+    """
+    lst = []
+    platform = get_platform()
+    if platform == "linux":
+        # Tuples (not sets) so the report lists packages in a stable order.
+        pkgs = {
+            "dpkg": ("intel-opencl-icd", "libze1", "level-zero"),
+            "dnf": ("intel-opencl", "level-zero"),
+            "yum": ("intel-opencl", "level-zero"),
+            "zypper": ("intel-opencl", "level-zero"),
+        }.get(_detect_linux_pkg_manager(), ())
+        for pkg in pkgs:
+            ver = get_linux_pkg_version(run_lambda, pkg)
+            if ver != "N/A":
+                lst.append(f"* {pkg}:\t{ver}")
+    if platform in ["win32", "cygwin"]:
+        txt = run_and_read_all(
+            run_lambda,
+            'powershell.exe "gwmi -Class Win32_PnpSignedDriver | where{$_.DeviceClass -eq \\"DISPLAY\\"\
+            -and $_.Manufacturer -match \\"Intel\\"} | Select-Object -Property DeviceName,DriverVersion,DriverDate\
+            | ConvertTo-Json"',
+        )
+        # Guard against a failed/empty query: json.loads(None) raises
+        # TypeError, which would abort the whole environment report.
+        if txt:
+            try:
+                obj = json.loads(txt)
+                if isinstance(obj, list):
+                    for o in obj:
+                        lst.append(
+                            f'* {o["DeviceName"]}: {o["DriverVersion"]} ({o["DriverDate"]})'
+                        )
+                else:
+                    lst.append(f'* {obj["DriverVersion"]} ({obj["DriverDate"]})')
+            except (ValueError, KeyError, TypeError) as e:
+                # Unparseable or unexpectedly shaped output: surface the raw
+                # text and the error rather than crashing.
+                lst.append(txt)
+                lst.append(str(e))
+        else:
+            lst.append("N/A")
+    return "\n".join(lst)
+
+
+def get_intel_gpu_onboard(run_lambda):
+    """List the Intel GPU models reported by the operating system tools.
+
+    On Linux the JSON output of ``xpu-smi discovery -j`` is read and the
+    ``device_name`` of every entry in its ``device_list`` is reported. On
+    Windows (``win32`` / ``cygwin``) the ``DeviceName`` values returned by a
+    PowerShell query of ``Win32_PnpSignedDriver`` are reported.
+
+    Args:
+        run_lambda: command runner forwarded to ``run_and_read_all``.
+
+    Returns:
+        One ``"* <name>"`` line per device, joined with newlines; ``"N/A"``
+        when the tool prints nothing or reports no devices. When the output
+        cannot be interpreted, the raw output and the error text are returned
+        instead of raising.
+    """
+    lst: list[str] = []
+    platform = get_platform()
+    if platform == "linux":
+        txt = run_and_read_all(run_lambda, "xpu-smi discovery -j")
+        if txt:
+            try:
+                obj = json.loads(txt)
+                device_list = obj.get("device_list", [])
+                if isinstance(device_list, list) and device_list:
+                    # Build the full list first so a malformed entry cannot
+                    # leave partial output behind before the error is reported.
+                    lst.extend([f'* {device["device_name"]}' for device in device_list])
+                else:
+                    lst.append("N/A")
+            except (ValueError, TypeError, KeyError, AttributeError) as e:
+                # KeyError: entry without "device_name"; AttributeError:
+                # top-level JSON value is not an object.
+                lst.append(txt)
+                lst.append(str(e))
+        else:
+            lst.append("N/A")
+    if platform in ["win32", "cygwin"]:
+        txt = run_and_read_all(
+            run_lambda,
+            'powershell.exe "gwmi -Class Win32_PnpSignedDriver | where{$_.DeviceClass -eq \\"DISPLAY\\"\
+            -and $_.Manufacturer -match \\"Intel\\"} | Select-Object -Property DeviceName | ConvertTo-Json"',
+        )
+        if txt:
+            try:
+                obj = json.loads(txt)
+                if isinstance(obj, list):
+                    if obj:
+                        lst.extend([f'* {device["DeviceName"]}' for device in obj])
+                    else:
+                        # An empty JSON array means no matching device; it
+                        # must not fall through to the dict-only `.get` path.
+                        lst.append("N/A")
+                else:
+                    lst.append(f'* {obj.get("DeviceName", "N/A")}')
+            except (ValueError, TypeError, KeyError, AttributeError) as e:
+                lst.append(txt)
+                lst.append(str(e))
+        else:
+            lst.append("N/A")
+    return "\n".join(lst)
+
+
+def get_intel_gpu_detected(run_lambda):
+    """Describe the XPU devices visible to the installed torch build.
+
+    Args:
+        run_lambda: accepted for signature parity with the other collectors;
+            not used here.
+
+    Returns:
+        One ``"* [<index>] <device properties>"`` line per device, joined
+        with newlines, or ``"N/A"`` when torch is unavailable, has no ``xpu``
+        attribute, or reports zero devices.
+    """
+    if not (TORCH_AVAILABLE and hasattr(torch, "xpu")):
+        return "N/A"
+
+    num_devices = torch.xpu.device_count()
+    if num_devices == 0:
+        return "N/A"
+
+    lines = []
+    for idx in range(num_devices):
+        props = torch.xpu.get_device_properties(idx)
+        lines.append(f"* [{idx}] {props}")
+    return "\n".join(lines)
+
+
 # example outputs of CPU infos
 #  * linux
 #    Architecture:            x86_64
@ -396,7 +568,7 @@ def get_os(run_lambda):
    from platform import machine
    platform = get_platform()

-    if platform == 'win32' or platform == 'cygwin':
+    if platform in ["win32", "cygwin"]:
        return get_windows_version(run_lambda)

    if platform == 'darwin':
@ -437,7 +609,7 @@ def get_libc_version():
 def get_pip_packages(run_lambda, patterns=None):
    """Return `pip list` output. Note: will also find conda-installed pytorch and numpy packages."""
    if patterns is None:
-        patterns = PIP_PATTERNS + COMMON_PATTERNS + NVIDIA_PATTERNS
+        patterns = PIP_PATTERNS + COMMON_PATTERNS + NVIDIA_PATTERNS + ONEAPI_PATTERNS

    pip_version = 'pip3' if sys.version_info.major == 3 else 'pip'

@ -504,6 +676,13 @@ def get_env_info():
        debug_mode_str = str(torch.version.debug)
        cuda_available_str = str(torch.cuda.is_available())
        cuda_version_str = torch.version.cuda
+        xpu_available_str = str(torch.xpu.is_available())
+        if torch.xpu.is_available():
+            xpu_available_str = f'{xpu_available_str}\n' + \
+                                f'XPU used to build PyTorch: {torch.version.xpu}\n' + \
+                                f'Intel GPU driver version:\n{get_intel_gpu_driver_version(run_lambda)}\n' + \
+                                f'Intel GPU models onboard:\n{get_intel_gpu_onboard(run_lambda)}\n' + \
+                                f'Intel GPU models detected:\n{get_intel_gpu_detected(run_lambda)}'
        if not hasattr(torch.version, 'hip') or torch.version.hip is None:  # cuda version
            hip_compiled_version = hip_runtime_version = miopen_runtime_version = 'N/A'
        else:  # HIP version
@ -517,7 +696,7 @@ def get_env_info():
            cuda_version_str = 'N/A'
            hip_compiled_version = torch.version.hip
    else:
-        version_str = debug_mode_str = cuda_available_str = cuda_version_str = 'N/A'
+        version_str = debug_mode_str = cuda_available_str = cuda_version_str = xpu_available_str = 'N/A'
        hip_compiled_version = hip_runtime_version = miopen_runtime_version = 'N/A'

    sys_version = sys.version.replace("\n", " ")
@ -536,6 +715,7 @@ def get_env_info():
        nvidia_gpu_models=get_gpu_info(run_lambda),
        nvidia_driver_version=get_nvidia_driver_version(run_lambda),
        cudnn_version=get_cudnn_version(run_lambda),
+        is_xpu_available=xpu_available_str,
        hip_compiled_version=hip_compiled_version,
        hip_runtime_version=hip_runtime_version,
        miopen_runtime_version=miopen_runtime_version,
@ -572,6 +752,7 @@ CUDA_MODULE_LOADING set to: {cuda_module_loading}
 GPU models and configuration: {nvidia_gpu_models}
 Nvidia driver version: {nvidia_driver_version}
 cuDNN version: {cudnn_version}
+Is XPU available: {is_xpu_available}
 HIP runtime version: {hip_runtime_version}
 MIOpen runtime version: {miopen_runtime_version}
 Is XNNPACK available: {is_xnnpack_available}