Always filter devices (#12108)

* Always filter devices

Avoid crashing on unsupported AMD iGPUs

* Remove cuda device filtering

This interferes with mixed setups
This commit is contained in:
Daniel Hiltgen 2025-08-29 12:17:31 -07:00 committed by GitHub
parent 4383a3ab7a
commit ead4a9a1d0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 65 additions and 70 deletions

View File

@ -277,6 +277,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
FreeMemory: (totalMemory - usedMemory), FreeMemory: (totalMemory - usedMemory),
}, },
ID: ID, ID: ID,
filterID: gpuOrdinalID,
Name: name, Name: name,
Compute: fmt.Sprintf("gfx%d%x%x", major, minor, patch), Compute: fmt.Sprintf("gfx%d%x%x", major, minor, patch),
MinimumMemory: rocmMinimumMemory, MinimumMemory: rocmMinimumMemory,
@ -394,7 +395,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
// Check for env var workarounds // Check for env var workarounds
if name == "1002:687f" { // Vega RX 56 if name == "1002:687f" { // Vega RX 56
gpuInfo.EnvWorkarounds = append(gpuInfo.EnvWorkarounds, [2]string{"HSA_ENABLE_SDMA", "0"}) gpuInfo.EnvWorkarounds = append(gpuInfo.EnvWorkarounds, "HSA_ENABLE_SDMA=0")
} }
// The GPU has passed all the verification steps and is supported // The GPU has passed all the verification steps and is supported
@ -523,19 +524,26 @@ func verifyKFDDriverAccess() error {
return nil return nil
} }
func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) { func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) string {
ids := []string{} ids := []string{}
for _, info := range gpuInfo { for _, info := range gpuInfo {
if info.Library != "rocm" { if info.Library != "rocm" {
// TODO shouldn't happen if things are wired correctly...
slog.Debug("rocmGetVisibleDevicesEnv skipping over non-rocm device", "library", info.Library)
continue continue
} }
ids = append(ids, info.ID) // If the devices requires a numeric ID, for filtering purposes, we use the unfiltered ID number
if _, err := strconv.Atoi(info.ID); err == nil {
ids = append(ids, fmt.Sprintf("%d", info.filterID))
} else {
ids = append(ids, info.ID)
}
} }
if len(ids) == 0 {
return ""
}
// There are 3 potential env vars to use to select GPUs. // There are 3 potential env vars to use to select GPUs.
// ROCR_VISIBLE_DEVICES supports UUID or numeric so is our preferred on linux // ROCR_VISIBLE_DEVICES supports UUID or numeric so is our preferred on linux
// GPU_DEVICE_ORDINAL supports numeric IDs only // GPU_DEVICE_ORDINAL supports numeric IDs only
// HIP_VISIBLE_DEVICES supports numeric IDs only // HIP_VISIBLE_DEVICES supports numeric IDs only
return "ROCR_VISIBLE_DEVICES", strings.Join(ids, ",") return "ROCR_VISIBLE_DEVICES=" + strings.Join(ids, ",")
} }

View File

@ -111,6 +111,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
UnreliableFreeMemory: true, UnreliableFreeMemory: true,
ID: strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices ID: strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices
filterID: i,
DependencyPath: []string{libDir}, DependencyPath: []string{libDir},
MinimumMemory: rocmMinimumMemory, MinimumMemory: rocmMinimumMemory,
Name: name, Name: name,
@ -200,19 +201,26 @@ func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
return nil return nil
} }
func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) { func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) string {
ids := []string{} ids := []string{}
for _, info := range gpuInfo { for _, info := range gpuInfo {
if info.Library != "rocm" { if info.Library != "rocm" {
// TODO shouldn't happen if things are wired correctly...
slog.Debug("rocmGetVisibleDevicesEnv skipping over non-rocm device", "library", info.Library)
continue continue
} }
ids = append(ids, info.ID) // If the devices requires a numeric ID, for filtering purposes, we use the unfiltered ID number
if _, err := strconv.Atoi(info.ID); err == nil {
ids = append(ids, fmt.Sprintf("%d", info.filterID))
} else {
ids = append(ids, info.ID)
}
} }
if len(ids) == 0 {
return ""
}
// There are 3 potential env vars to use to select GPUs. // There are 3 potential env vars to use to select GPUs.
// ROCR_VISIBLE_DEVICES supports UUID or numeric but does not work on Windows // ROCR_VISIBLE_DEVICES supports UUID or numeric but does not work on Windows
// HIP_VISIBLE_DEVICES supports numeric IDs only // HIP_VISIBLE_DEVICES supports numeric IDs only
// GPU_DEVICE_ORDINAL supports numeric IDs only // GPU_DEVICE_ORDINAL supports numeric IDs only
return "HIP_VISIBLE_DEVICES", strings.Join(ids, ",") return "HIP_VISIBLE_DEVICES=" + strings.Join(ids, ",")
} }

View File

@ -16,19 +16,6 @@ import (
// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices. // Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
var CudaTegra string = os.Getenv("JETSON_JETPACK") var CudaTegra string = os.Getenv("JETSON_JETPACK")
func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
ids := []string{}
for _, info := range gpuInfo {
if info.Library != "cuda" {
// TODO shouldn't happen if things are wired correctly...
slog.Debug("cudaGetVisibleDevicesEnv skipping over non-cuda device", "library", info.Library)
continue
}
ids = append(ids, info.ID)
}
return "CUDA_VISIBLE_DEVICES", strings.Join(ids, ",")
}
func cudaVariant(gpuInfo CudaGPUInfo) string { func cudaVariant(gpuInfo CudaGPUInfo) string {
if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" { if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" {
if CudaTegra != "" { if CudaTegra != "" {

View File

@ -371,6 +371,15 @@ func GetGPUInfo() GpuInfoList {
} }
rocmGPUs, err = AMDGetGPUInfo() rocmGPUs, err = AMDGetGPUInfo()
// The ID field is used in context of the filtered set of GPUS
// so we have to replace any of these numeric IDs with their
// placement in this set of GPUs
for i := range rocmGPUs {
if _, err := strconv.Atoi(rocmGPUs[i].ID); err == nil {
rocmGPUs[i].ID = strconv.Itoa(i)
}
}
if err != nil { if err != nil {
bootstrapErrors = append(bootstrapErrors, err) bootstrapErrors = append(bootstrapErrors, err)
} }
@ -680,23 +689,16 @@ func getVerboseState() C.uint16_t {
// Given the list of GPUs this instantiation is targeted for, // Given the list of GPUs this instantiation is targeted for,
// figure out the visible devices environment variable // figure out the visible devices environment variable
// func (l GpuInfoList) GetVisibleDevicesEnv() []string {
// If different libraries are detected, the first one is what we use
func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
if len(l) == 0 { if len(l) == 0 {
return "", "" return nil
} }
switch l[0].Library { vd := []string{}
case "cuda": // Only filter the AMD GPUs at this level, let all NVIDIA devices through
return cudaGetVisibleDevicesEnv(l) if tmp := rocmGetVisibleDevicesEnv(l); tmp != "" {
case "rocm": vd = append(vd, tmp)
return rocmGetVisibleDevicesEnv(l)
case "oneapi":
return oneapiGetVisibleDevicesEnv(l)
default:
slog.Debug("no filter required for library " + l[0].Library)
return "", ""
} }
return vd
} }
func GetSystemInfo() SystemInfo { func GetSystemInfo() SystemInfo {

View File

@ -62,9 +62,9 @@ func GetCPUMem() (memInfo, error) {
}, nil }, nil
} }
func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) { func (l GpuInfoList) GetVisibleDevicesEnv() []string {
// No-op on darwin // No-op on darwin
return "", "" return nil
} }
func GetSystemInfo() SystemInfo { func GetSystemInfo() SystemInfo {

View File

@ -1,21 +0,0 @@
//go:build linux || windows
package discover
import (
"log/slog"
"strings"
)
func oneapiGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
ids := []string{}
for _, info := range gpuInfo {
if info.Library != "oneapi" {
// TODO shouldn't happen if things are wired correctly...
slog.Debug("oneapiGetVisibleDevicesEnv skipping over non-sycl device", "library", info.Library)
continue
}
ids = append(ids, info.ID)
}
return "ONEAPI_DEVICE_SELECTOR", "level_zero:" + strings.Join(ids, ",")
}

View File

@ -27,8 +27,8 @@ type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
// Any extra PATH/LD_LIBRARY_PATH dependencies required for the Library to operate properly // Any extra PATH/LD_LIBRARY_PATH dependencies required for the Library to operate properly
DependencyPath []string `json:"lib_path,omitempty"` DependencyPath []string `json:"lib_path,omitempty"`
// Extra environment variables specific to the GPU as list of [key,value] // Extra environment variables specific to the GPU as list of [key=value]
EnvWorkarounds [][2]string `json:"envs,omitempty"` EnvWorkarounds []string `json:"envs,omitempty"`
// Set to true if we can NOT reliably discover FreeMemory. A value of true indicates // Set to true if we can NOT reliably discover FreeMemory. A value of true indicates
// the FreeMemory is best effort, and may over or under report actual memory usage // the FreeMemory is best effort, and may over or under report actual memory usage
@ -36,9 +36,10 @@ type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
UnreliableFreeMemory bool UnreliableFreeMemory bool
// GPU information // GPU information
ID string `json:"gpu_id"` // string to use for selection of this specific GPU ID string `json:"gpu_id"` // string to use for selection of this specific GPU
Name string `json:"name"` // user friendly name if available filterID int //nolint:unused,nolintlint // AMD Workaround: The numeric ID of the device used to filter out other devices
Compute string `json:"compute"` // Compute Capability or gfx Name string `json:"name"` // user friendly name if available
Compute string `json:"compute"` // Compute Capability or gfx
// Driver Information - TODO no need to put this on each GPU // Driver Information - TODO no need to put this on each GPU
DriverMajor int `json:"driver_major,omitempty"` DriverMajor int `json:"driver_major,omitempty"`

View File

@ -360,23 +360,28 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
s.cmd.Env = append(s.cmd.Env, "OLLAMA_LIBRARY_PATH="+strings.Join(ggmlPaths, string(filepath.ListSeparator))) s.cmd.Env = append(s.cmd.Env, "OLLAMA_LIBRARY_PATH="+strings.Join(ggmlPaths, string(filepath.ListSeparator)))
envWorkarounds := [][2]string{} envWorkarounds := []string{}
for _, gpu := range gpus { for _, gpu := range gpus {
envWorkarounds = append(envWorkarounds, gpu.EnvWorkarounds...) envWorkarounds = append(envWorkarounds, gpu.EnvWorkarounds...)
} }
// Always filter down the set of GPUs in case there are any unsupported devices that might crash
envWorkarounds = append(envWorkarounds, gpus.GetVisibleDevicesEnv()...)
pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator)) pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
// Update or add the path variable with our adjusted version // Update or add the path variable with our adjusted version
pathNeeded := true pathNeeded := true
envWorkaroundDone := make([]bool, len(envWorkarounds))
for i := range s.cmd.Env { for i := range s.cmd.Env {
cmp := strings.SplitN(s.cmd.Env[i], "=", 2) cmp := strings.SplitN(s.cmd.Env[i], "=", 2)
if strings.EqualFold(cmp[0], pathEnv) { if strings.EqualFold(cmp[0], pathEnv) {
s.cmd.Env[i] = pathEnv + "=" + pathEnvVal s.cmd.Env[i] = pathEnv + "=" + pathEnvVal
pathNeeded = false pathNeeded = false
} else if len(envWorkarounds) != 0 { } else if len(envWorkarounds) != 0 {
for _, kv := range envWorkarounds { for j, kv := range envWorkarounds {
if strings.EqualFold(cmp[0], kv[0]) { tmp := strings.SplitN(kv, "=", 2)
s.cmd.Env[i] = kv[0] + "=" + kv[1] if strings.EqualFold(cmp[0], tmp[0]) {
s.cmd.Env[i] = kv
envWorkaroundDone[j] = true
} }
} }
} }
@ -384,6 +389,11 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
if pathNeeded { if pathNeeded {
s.cmd.Env = append(s.cmd.Env, pathEnv+"="+pathEnvVal) s.cmd.Env = append(s.cmd.Env, pathEnv+"="+pathEnvVal)
} }
for i, done := range envWorkaroundDone {
if !done {
s.cmd.Env = append(s.cmd.Env, envWorkarounds[i])
}
}
slog.Info("starting runner", "cmd", s.cmd) slog.Info("starting runner", "cmd", s.cmd)
slog.Debug("subprocess", "", filteredEnv(s.cmd.Env)) slog.Debug("subprocess", "", filteredEnv(s.cmd.Env))