Add Intel GPU info collection to the collect env script (#157351)
https://github.com/pytorch/pytorch/pull/137846 was mistakenly closed; this PR reopens it so the change can land.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157351
Approved by: https://github.com/guangyey, https://github.com/malfet
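For reference, the environment report this change extends is generated by running the script as a module:

    python -m torch.utils.collect_env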
This commit is contained in:
parent d6237721c0
commit c515385b0a
torch/utils/collect_env.py
@@ -10,6 +10,7 @@ import re
 import subprocess
 import sys
 import os
+from typing import cast as _cast
 from collections import namedtuple
 
 
@@ -37,6 +38,7 @@ SystemEnv = namedtuple('SystemEnv', [
     'nvidia_driver_version',
     'nvidia_gpu_models',
     'cudnn_version',
+    'is_xpu_available',
     'pip_version',  # 'pip' or 'pip3'
     'pip_packages',
     'conda_packages',
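For context on how the new 'is_xpu_available' field reaches the report: each SystemEnv field is substituted by name into the output template later in the file. A minimal sketch of that mechanism (a two-field stand-in, not the file's exact code):

    from collections import namedtuple

    # Simplified stand-in for the real SystemEnv namedtuple.
    Env = namedtuple('Env', ['cudnn_version', 'is_xpu_available'])
    env = Env(cudnn_version='N/A', is_xpu_available='True')

    template = "cuDNN version: {cudnn_version}\nIs XPU available: {is_xpu_available}"
    print(template.format(**env._asdict()))
    # cuDNN version: N/A
    # Is XPU available: True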
@@ -73,6 +75,30 @@ NVIDIA_PATTERNS = [
     "nvtx",
 ]
 
+ONEAPI_PATTERNS = [
+    "dpcpp-cpp-rt",
+    "intel-cmplr-lib-rt",
+    "intel-cmplr-lib-ur",
+    "intel-cmplr-lic-rt",
+    "intel-opencl-rt",
+    "intel-sycl-rt",
+    "mkl",
+    "onemkl-sycl-blas",
+    "onemkl-sycl-dft",
+    "onemkl-sycl-lapack",
+    "onemkl-sycl-rng",
+    "onemkl-sycl-sparse",
+    "intel-openmp",
+    "tbb",
+    "impi-rt",
+    "impi-devel",
+    "oneccl",
+    "oneccl-devel",
+    "intel-pti",
+    "umf",
+    "tcmlib",
+]
+
 CONDA_PATTERNS = [
     "cudatoolkit",
     "soumith",
@@ -131,7 +157,7 @@ def run_and_return_first_line(run_lambda, command):
 
 def get_conda_packages(run_lambda, patterns=None):
     if patterns is None:
-        patterns = CONDA_PATTERNS + COMMON_PATTERNS + NVIDIA_PATTERNS
+        patterns = CONDA_PATTERNS + COMMON_PATTERNS + NVIDIA_PATTERNS + ONEAPI_PATTERNS
     conda = os.environ.get('CONDA_EXE', 'conda')
     out = run_and_read_all(run_lambda, "{} list".format(conda))
     if out is None:
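The pattern lists are plain substrings matched against each line of `conda list` (and, below, `pip list`) output; appending ONEAPI_PATTERNS lets oneAPI runtime packages survive the filter. A simplified sketch of that filtering (the helper name is illustrative, not the file's exact code):

    ONEAPI_PATTERNS = ["dpcpp-cpp-rt", "mkl", "tbb"]  # excerpt

    def filter_package_lines(out, patterns):
        # Keep only listing lines that mention one of the tracked packages.
        return "\n".join(
            line for line in out.splitlines()
            if any(name in line for name in patterns)
        )

    sample = "mkl  2025.0.0  pypi\nnumpy  2.1.0  pypi\ntbb  2022.0.0  pypi"
    print(filter_package_lines(sample, ONEAPI_PATTERNS))
    # mkl  2025.0.0  pypi
    # tbb  2022.0.0  pypi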
@@ -243,6 +269,152 @@ def get_nvidia_smi():
     return smi
 
 
+def _detect_linux_pkg_manager():
+    if get_platform() != "linux":
+        return "N/A"
+    for mgr_name in ["dpkg", "dnf", "yum", "zypper"]:
+        rc, _, _ = run(f"which {mgr_name}")
+        if rc == 0:
+            return mgr_name
+    return "N/A"
+
+
+def get_linux_pkg_version(run_lambda, pkg_name):
+    pkg_mgr = _detect_linux_pkg_manager()
+    if pkg_mgr == "N/A":
+        return "N/A"
+
+    grep_version = {
+        "dpkg": {
+            "field_index": 2,
+            "command": "dpkg -l | grep {}",
+        },
+        "dnf": {
+            "field_index": 1,
+            "command": "dnf list | grep {}",
+        },
+        "yum": {
+            "field_index": 1,
+            "command": "yum list | grep {}",
+        },
+        "zypper": {
+            "field_index": 2,
+            "command": "zypper info {} | grep Version",
+        },
+    }
+
+    field_index: int = int(_cast(int, grep_version[pkg_mgr]["field_index"]))
+    cmd: str = str(grep_version[pkg_mgr]["command"])
+    cmd = cmd.format(pkg_name)
+    ret = run_and_read_all(run_lambda, cmd)
+    if ret is None or ret == "":
+        return "N/A"
+    lst = re.sub(" +", " ", ret).split(" ")
+    if len(lst) <= field_index:
+        return "N/A"
+    return lst[field_index]
+
+
+def get_intel_gpu_driver_version(run_lambda):
+    lst = []
+    platform = get_platform()
+    if platform == "linux":
+        pkgs = {  # type: ignore[var-annotated]
+            "dpkg": {
+                "intel-opencl-icd",
+                "libze1",
+                "level-zero",
+            },
+            "dnf": {
+                "intel-opencl",
+                "level-zero",
+            },
+            "yum": {
+                "intel-opencl",
+                "level-zero",
+            },
+            "zypper": {
+                "intel-opencl",
+                "level-zero",
+            },
+        }.get(_detect_linux_pkg_manager(), {})
+        for pkg in pkgs:
+            ver = get_linux_pkg_version(run_lambda, pkg)
+            if ver != "N/A":
+                lst.append(f"* {pkg}:\t{ver}")
+    if platform in ["win32", "cygwin"]:
+        txt = run_and_read_all(
+            run_lambda,
+            'powershell.exe "gwmi -Class Win32_PnpSignedDriver | where{$_.DeviceClass -eq \\"DISPLAY\\"\
+ -and $_.Manufacturer -match \\"Intel\\"} | Select-Object -Property DeviceName,DriverVersion,DriverDate\
+ | ConvertTo-Json"',
+        )
+        try:
+            obj = json.loads(txt)
+            if type(obj) is list:
+                for o in obj:
+                    lst.append(
+                        f'* {o["DeviceName"]}: {o["DriverVersion"]} ({o["DriverDate"]})'
+                    )
+            else:
+                lst.append(f'* {obj["DriverVersion"]} ({obj["DriverDate"]})')
+        except ValueError as e:
+            lst.append(txt)
+            lst.append(str(e))
+    return "\n".join(lst)
+
+
+def get_intel_gpu_onboard(run_lambda):
+    lst: list[str] = []
+    platform = get_platform()
+    if platform == "linux":
+        txt = run_and_read_all(run_lambda, "xpu-smi discovery -j")
+        if txt:
+            try:
+                obj = json.loads(txt)
+                device_list = obj.get("device_list", [])
+                if isinstance(device_list, list) and device_list:
+                    lst.extend(f'* {device["device_name"]}' for device in device_list)
+                else:
+                    lst.append("N/A")
+            except (ValueError, TypeError) as e:
+                lst.append(txt)
+                lst.append(str(e))
+        else:
+            lst.append("N/A")
+    if platform in ["win32", "cygwin"]:
+        txt = run_and_read_all(
+            run_lambda,
+            'powershell.exe "gwmi -Class Win32_PnpSignedDriver | where{$_.DeviceClass -eq \\"DISPLAY\\"\
+ -and $_.Manufacturer -match \\"Intel\\"} | Select-Object -Property DeviceName | ConvertTo-Json"',
+        )
+        if txt:
+            try:
+                obj = json.loads(txt)
+                if isinstance(obj, list) and obj:
+                    lst.extend(f'* {device["DeviceName"]}' for device in obj)
+                else:
+                    lst.append(f'* {obj.get("DeviceName", "N/A")}')
+            except ValueError as e:
+                lst.append(txt)
+                lst.append(str(e))
+        else:
+            lst.append("N/A")
+    return "\n".join(lst)
+
+
+def get_intel_gpu_detected(run_lambda):
+    if not TORCH_AVAILABLE or not hasattr(torch, "xpu"):
+        return "N/A"
+
+    device_count = torch.xpu.device_count()
+    if device_count == 0:
+        return "N/A"
+
+    devices = [f"* [{i}] {torch.xpu.get_device_properties(i)}" for i in range(device_count)]
+    return "\n".join(devices)
+
+
 # example outputs of CPU infos
 #  * linux
 #    Architecture:          x86_64
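The field_index values in get_linux_pkg_version encode where the version column lands in each package manager's listing once runs of spaces are collapsed. A worked example (the sample rows are illustrative, not captured output):

    import re

    # dpkg -l rows look roughly like "ii  <name>  <version>  <arch>  <description>",
    # so after collapsing whitespace the version is field 2.
    dpkg_line = "ii intel-opencl-icd 24.13.29138.7 amd64 Intel OpenCL ICD"
    print(re.sub(" +", " ", dpkg_line).split(" ")[2])   # 24.13.29138.7

    # dnf/yum list rows look roughly like "<name>.<arch>  <version>  <repo>",
    # so the version is field 1.
    dnf_line = "level-zero.x86_64 1.17.6 @System"
    print(re.sub(" +", " ", dnf_line).split(" ")[1])    # 1.17.6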
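On Linux, get_intel_gpu_onboard parses `xpu-smi discovery -j`. A sketch of the JSON shape the code consumes, with an invented device entry (real payloads carry many more fields):

    import json

    # Invented example payload in the shape get_intel_gpu_onboard expects.
    txt = '{"device_list": [{"device_id": 0, "device_name": "Intel(R) Data Center GPU Max 1550"}]}'

    obj = json.loads(txt)
    for device in obj.get("device_list", []):
        print(f'* {device["device_name"]}')
    # * Intel(R) Data Center GPU Max 1550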
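The detected-device path uses only public torch.xpu calls, so the same probe can be run standalone (guarded, since builds without XPU support report no devices):

    import torch

    if hasattr(torch, "xpu") and torch.xpu.is_available():
        for i in range(torch.xpu.device_count()):
            # The properties object's repr includes the device name and memory.
            print(f"* [{i}] {torch.xpu.get_device_properties(i)}")
    else:
        print("N/A")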
@@ -396,7 +568,7 @@ def get_os(run_lambda):
     from platform import machine
     platform = get_platform()
 
-    if platform == 'win32' or platform == 'cygwin':
+    if platform in ["win32", "cygwin"]:
         return get_windows_version(run_lambda)
 
     if platform == 'darwin':
@@ -437,7 +609,7 @@ def get_libc_version():
 def get_pip_packages(run_lambda, patterns=None):
     """Return `pip list` output. Note: will also find conda-installed pytorch and numpy packages."""
     if patterns is None:
-        patterns = PIP_PATTERNS + COMMON_PATTERNS + NVIDIA_PATTERNS
+        patterns = PIP_PATTERNS + COMMON_PATTERNS + NVIDIA_PATTERNS + ONEAPI_PATTERNS
 
     pip_version = 'pip3' if sys.version_info.major == 3 else 'pip'
 
@@ -504,6 +676,13 @@ def get_env_info():
         debug_mode_str = str(torch.version.debug)
         cuda_available_str = str(torch.cuda.is_available())
         cuda_version_str = torch.version.cuda
+        xpu_available_str = str(torch.xpu.is_available())
+        if torch.xpu.is_available():
+            xpu_available_str = f'{xpu_available_str}\n' + \
+                f'XPU used to build PyTorch: {torch.version.xpu}\n' + \
+                f'Intel GPU driver version:\n{get_intel_gpu_driver_version(run_lambda)}\n' + \
+                f'Intel GPU models onboard:\n{get_intel_gpu_onboard(run_lambda)}\n' + \
+                f'Intel GPU models detected:\n{get_intel_gpu_detected(run_lambda)}'
         if not hasattr(torch.version, 'hip') or torch.version.hip is None:  # cuda version
             hip_compiled_version = hip_runtime_version = miopen_runtime_version = 'N/A'
         else:  # HIP version
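With an XPU present, the bare 'True' expands into a multi-line block in the final report. An illustrative rendering (all values invented, shown only to convey the layout the f-string chain above produces):

    Is XPU available: True
    XPU used to build PyTorch: 20250101
    Intel GPU driver version:
    * intel-opencl-icd:	24.39.31294.12
    Intel GPU models onboard:
    * Intel(R) Arc(TM) A770 Graphics
    Intel GPU models detected:
    * [0] _XpuDeviceProperties(name='Intel(R) Arc(TM) A770 Graphics', ...)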
@@ -517,7 +696,7 @@ def get_env_info():
             cuda_version_str = 'N/A'
             hip_compiled_version = torch.version.hip
     else:
-        version_str = debug_mode_str = cuda_available_str = cuda_version_str = 'N/A'
+        version_str = debug_mode_str = cuda_available_str = cuda_version_str = xpu_available_str = 'N/A'
         hip_compiled_version = hip_runtime_version = miopen_runtime_version = 'N/A'
 
     sys_version = sys.version.replace("\n", " ")
@@ -536,6 +715,7 @@ def get_env_info():
         nvidia_gpu_models=get_gpu_info(run_lambda),
         nvidia_driver_version=get_nvidia_driver_version(run_lambda),
         cudnn_version=get_cudnn_version(run_lambda),
+        is_xpu_available=xpu_available_str,
         hip_compiled_version=hip_compiled_version,
         hip_runtime_version=hip_runtime_version,
         miopen_runtime_version=miopen_runtime_version,
@@ -572,6 +752,7 @@ CUDA_MODULE_LOADING set to: {cuda_module_loading}
 GPU models and configuration: {nvidia_gpu_models}
 Nvidia driver version: {nvidia_driver_version}
 cuDNN version: {cudnn_version}
+Is XPU available: {is_xpu_available}
 HIP runtime version: {hip_runtime_version}
 MIOpen runtime version: {miopen_runtime_version}
 Is XNNPACK available: {is_xnnpack_available}