Always filter devices (#12108)

* Always filter devices

Avoid crashing on unsupported AMD iGPUs

* Remove cuda device filtering

This interferes with mixed setups
This commit is contained in:
Daniel Hiltgen 2025-08-29 12:17:31 -07:00 committed by GitHub
parent 4383a3ab7a
commit ead4a9a1d0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 65 additions and 70 deletions

View File

@ -277,6 +277,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
FreeMemory: (totalMemory - usedMemory),
},
ID: ID,
filterID: gpuOrdinalID,
Name: name,
Compute: fmt.Sprintf("gfx%d%x%x", major, minor, patch),
MinimumMemory: rocmMinimumMemory,
@ -394,7 +395,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
// Check for env var workarounds
if name == "1002:687f" { // Vega RX 56
gpuInfo.EnvWorkarounds = append(gpuInfo.EnvWorkarounds, [2]string{"HSA_ENABLE_SDMA", "0"})
gpuInfo.EnvWorkarounds = append(gpuInfo.EnvWorkarounds, "HSA_ENABLE_SDMA=0")
}
// The GPU has passed all the verification steps and is supported
@ -523,19 +524,26 @@ func verifyKFDDriverAccess() error {
return nil
}
func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) string {
ids := []string{}
for _, info := range gpuInfo {
if info.Library != "rocm" {
// TODO shouldn't happen if things are wired correctly...
slog.Debug("rocmGetVisibleDevicesEnv skipping over non-rocm device", "library", info.Library)
continue
}
ids = append(ids, info.ID)
// If the devices requires a numeric ID, for filtering purposes, we use the unfiltered ID number
if _, err := strconv.Atoi(info.ID); err == nil {
ids = append(ids, fmt.Sprintf("%d", info.filterID))
} else {
ids = append(ids, info.ID)
}
}
if len(ids) == 0 {
return ""
}
// There are 3 potential env vars to use to select GPUs.
// ROCR_VISIBLE_DEVICES supports UUID or numeric so is our preferred on linux
// GPU_DEVICE_ORDINAL supports numeric IDs only
// HIP_VISIBLE_DEVICES supports numeric IDs only
return "ROCR_VISIBLE_DEVICES", strings.Join(ids, ",")
return "ROCR_VISIBLE_DEVICES=" + strings.Join(ids, ",")
}

View File

@ -111,6 +111,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
UnreliableFreeMemory: true,
ID: strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices
filterID: i,
DependencyPath: []string{libDir},
MinimumMemory: rocmMinimumMemory,
Name: name,
@ -200,19 +201,26 @@ func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
return nil
}
func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) string {
ids := []string{}
for _, info := range gpuInfo {
if info.Library != "rocm" {
// TODO shouldn't happen if things are wired correctly...
slog.Debug("rocmGetVisibleDevicesEnv skipping over non-rocm device", "library", info.Library)
continue
}
ids = append(ids, info.ID)
// If the devices requires a numeric ID, for filtering purposes, we use the unfiltered ID number
if _, err := strconv.Atoi(info.ID); err == nil {
ids = append(ids, fmt.Sprintf("%d", info.filterID))
} else {
ids = append(ids, info.ID)
}
}
if len(ids) == 0 {
return ""
}
// There are 3 potential env vars to use to select GPUs.
// ROCR_VISIBLE_DEVICES supports UUID or numeric but does not work on Windows
// HIP_VISIBLE_DEVICES supports numeric IDs only
// GPU_DEVICE_ORDINAL supports numeric IDs only
return "HIP_VISIBLE_DEVICES", strings.Join(ids, ",")
return "HIP_VISIBLE_DEVICES=" + strings.Join(ids, ",")
}

View File

@ -16,19 +16,6 @@ import (
// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
var CudaTegra string = os.Getenv("JETSON_JETPACK")
func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
ids := []string{}
for _, info := range gpuInfo {
if info.Library != "cuda" {
// TODO shouldn't happen if things are wired correctly...
slog.Debug("cudaGetVisibleDevicesEnv skipping over non-cuda device", "library", info.Library)
continue
}
ids = append(ids, info.ID)
}
return "CUDA_VISIBLE_DEVICES", strings.Join(ids, ",")
}
func cudaVariant(gpuInfo CudaGPUInfo) string {
if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" {
if CudaTegra != "" {

View File

@ -371,6 +371,15 @@ func GetGPUInfo() GpuInfoList {
}
rocmGPUs, err = AMDGetGPUInfo()
// The ID field is used in context of the filtered set of GPUS
// so we have to replace any of these numeric IDs with their
// placement in this set of GPUs
for i := range rocmGPUs {
if _, err := strconv.Atoi(rocmGPUs[i].ID); err == nil {
rocmGPUs[i].ID = strconv.Itoa(i)
}
}
if err != nil {
bootstrapErrors = append(bootstrapErrors, err)
}
@ -680,23 +689,16 @@ func getVerboseState() C.uint16_t {
// Given the list of GPUs this instantiation is targeted for,
// figure out the visible devices environment variable
//
// If different libraries are detected, the first one is what we use
func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
func (l GpuInfoList) GetVisibleDevicesEnv() []string {
if len(l) == 0 {
return "", ""
return nil
}
switch l[0].Library {
case "cuda":
return cudaGetVisibleDevicesEnv(l)
case "rocm":
return rocmGetVisibleDevicesEnv(l)
case "oneapi":
return oneapiGetVisibleDevicesEnv(l)
default:
slog.Debug("no filter required for library " + l[0].Library)
return "", ""
vd := []string{}
// Only filter the AMD GPUs at this level, let all NVIDIA devices through
if tmp := rocmGetVisibleDevicesEnv(l); tmp != "" {
vd = append(vd, tmp)
}
return vd
}
func GetSystemInfo() SystemInfo {

View File

@ -62,9 +62,9 @@ func GetCPUMem() (memInfo, error) {
}, nil
}
func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
func (l GpuInfoList) GetVisibleDevicesEnv() []string {
// No-op on darwin
return "", ""
return nil
}
func GetSystemInfo() SystemInfo {

View File

@ -1,21 +0,0 @@
//go:build linux || windows
package discover
import (
"log/slog"
"strings"
)
func oneapiGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
ids := []string{}
for _, info := range gpuInfo {
if info.Library != "oneapi" {
// TODO shouldn't happen if things are wired correctly...
slog.Debug("oneapiGetVisibleDevicesEnv skipping over non-sycl device", "library", info.Library)
continue
}
ids = append(ids, info.ID)
}
return "ONEAPI_DEVICE_SELECTOR", "level_zero:" + strings.Join(ids, ",")
}

View File

@ -27,8 +27,8 @@ type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
// Any extra PATH/LD_LIBRARY_PATH dependencies required for the Library to operate properly
DependencyPath []string `json:"lib_path,omitempty"`
// Extra environment variables specific to the GPU as list of [key,value]
EnvWorkarounds [][2]string `json:"envs,omitempty"`
// Extra environment variables specific to the GPU as list of [key=value]
EnvWorkarounds []string `json:"envs,omitempty"`
// Set to true if we can NOT reliably discover FreeMemory. A value of true indicates
// the FreeMemory is best effort, and may over or under report actual memory usage
@ -36,9 +36,10 @@ type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
UnreliableFreeMemory bool
// GPU information
ID string `json:"gpu_id"` // string to use for selection of this specific GPU
Name string `json:"name"` // user friendly name if available
Compute string `json:"compute"` // Compute Capability or gfx
ID string `json:"gpu_id"` // string to use for selection of this specific GPU
filterID int //nolint:unused,nolintlint // AMD Workaround: The numeric ID of the device used to filter out other devices
Name string `json:"name"` // user friendly name if available
Compute string `json:"compute"` // Compute Capability or gfx
// Driver Information - TODO no need to put this on each GPU
DriverMajor int `json:"driver_major,omitempty"`

View File

@ -360,23 +360,28 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
s.cmd.Env = append(s.cmd.Env, "OLLAMA_LIBRARY_PATH="+strings.Join(ggmlPaths, string(filepath.ListSeparator)))
envWorkarounds := [][2]string{}
envWorkarounds := []string{}
for _, gpu := range gpus {
envWorkarounds = append(envWorkarounds, gpu.EnvWorkarounds...)
}
// Always filter down the set of GPUs in case there are any unsupported devices that might crash
envWorkarounds = append(envWorkarounds, gpus.GetVisibleDevicesEnv()...)
pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
// Update or add the path variable with our adjusted version
pathNeeded := true
envWorkaroundDone := make([]bool, len(envWorkarounds))
for i := range s.cmd.Env {
cmp := strings.SplitN(s.cmd.Env[i], "=", 2)
if strings.EqualFold(cmp[0], pathEnv) {
s.cmd.Env[i] = pathEnv + "=" + pathEnvVal
pathNeeded = false
} else if len(envWorkarounds) != 0 {
for _, kv := range envWorkarounds {
if strings.EqualFold(cmp[0], kv[0]) {
s.cmd.Env[i] = kv[0] + "=" + kv[1]
for j, kv := range envWorkarounds {
tmp := strings.SplitN(kv, "=", 2)
if strings.EqualFold(cmp[0], tmp[0]) {
s.cmd.Env[i] = kv
envWorkaroundDone[j] = true
}
}
}
@ -384,6 +389,11 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
if pathNeeded {
s.cmd.Env = append(s.cmd.Env, pathEnv+"="+pathEnvVal)
}
for i, done := range envWorkaroundDone {
if !done {
s.cmd.Env = append(s.cmd.Env, envWorkarounds[i])
}
}
slog.Info("starting runner", "cmd", s.cmd)
slog.Debug("subprocess", "", filteredEnv(s.cmd.Env))