DRY out the runner lifecycle code (#12540)

* DRY out the runner lifecycle code

Now that discovery uses the runners as well, this unifies the runner spawning code
into a single place. This also unifies the GPU discovery types with the newer ml.DeviceInfo.

* win: make incremental builds better

Place build artifacts in discrete directories so incremental builds don't have to start fresh

* Adjust sort order to consider iGPUs

* handle cpu inference oom scenarios

* review comments
Daniel Hiltgen 2025-10-23 11:20:02 -07:00 committed by GitHub
parent 1c093e97af
commit 3258a89b6e
16 changed files with 720 additions and 924 deletions
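
The bulk of the diff below implements the first bullet: GPU discovery (discover.bootstrapDevices) and model serving (NewLlamaServer) now spawn runners through a single llm.StartRunner helper instead of two copies of the port/env/exec logic. A minimal sketch of the discovery-side call, assuming only the StartRunner signature added later in this diff; the library directory is a hypothetical placeholder:

package main

import (
	"log/slog"
	"os"

	"github.com/ollama/ollama/llm"
)

func main() {
	// Hypothetical GPU library directory; real values come from discovery.
	libDirs := []string{"/usr/local/lib/ollama/cuda_v13"}

	cmd, port, err := llm.StartRunner(
		true,      // --ollama-engine
		"",        // no model: discovery-only runner
		libDirs,   // prepended to the loader path and OLLAMA_LIBRARY_PATH
		os.Stderr, // where to send runner stdout/stderr
		map[string]string{"GGML_CUDA_INIT": "1"}, // force deep init so unsupported GPUs fail fast
	)
	if err != nil {
		slog.Error("failed to start runner", "error", err)
		return
	}
	defer cmd.Process.Kill()
	slog.Info("runner listening", "port", port)
}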

View File

@ -2065,12 +2065,6 @@ power management:
cpus := linuxCPUDetails(buf)
slog.Info("example", "scenario", k, "cpus", cpus)
si := SystemInfo{
System: CPUInfo{
CPUs: cpus,
},
}
threadCount := si.GetOptimalThreadCount()
if len(v.expCPUs) != len(cpus) {
t.Fatalf("incorrect number of sockets: expected:%v got:%v", v.expCPUs, cpus)
}
@ -2085,10 +2079,6 @@ power management:
t.Fatalf("incorrect number of threads: expected:%v got:%v", v.expCPUs[i], c)
}
}
if threadCount != v.expThreadCount {
t.Fatalf("incorrect thread count expected:%d got:%d", v.expThreadCount, threadCount)
}
})
}
}

View File

@ -1,16 +1,13 @@
package discover
import (
"context"
"log/slog"
"os"
"path/filepath"
"regexp"
"runtime"
"strconv"
"strings"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/ml"
)
@ -18,159 +15,28 @@ import (
// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
var CudaTegra string = os.Getenv("JETSON_JETPACK")
func GetCPUInfo() GpuInfo {
mem, err := GetCPUMem()
if err != nil {
slog.Warn("error looking up system memory", "error", err)
}
return GpuInfo{
memInfo: mem,
DeviceID: ml.DeviceID{
Library: "cpu",
ID: "0",
},
}
}
func GetGPUInfo(ctx context.Context, runners []FilteredRunnerDiscovery) GpuInfoList {
devs := GPUDevices(ctx, runners)
return devInfoToInfoList(devs)
}
func devInfoToInfoList(devs []ml.DeviceInfo) GpuInfoList {
resp := []GpuInfo{}
// Our current packaging model places ggml-hip in the main directory
// but keeps rocm in an isolated directory. We have to add it to
// the [LD_LIBRARY_]PATH so ggml-hip will load properly
rocmDir := filepath.Join(LibOllamaPath, "rocm")
if _, err := os.Stat(rocmDir); err != nil {
rocmDir = ""
}
for _, dev := range devs {
info := GpuInfo{
DeviceID: dev.DeviceID,
filterID: dev.FilteredID,
Name: dev.Description,
memInfo: memInfo{
TotalMemory: dev.TotalMemory,
FreeMemory: dev.FreeMemory,
},
// TODO can we avoid variant
DependencyPath: dev.LibraryPath,
DriverMajor: dev.DriverMajor,
DriverMinor: dev.DriverMinor,
ComputeMajor: dev.ComputeMajor,
ComputeMinor: dev.ComputeMinor,
}
if dev.Library == "CUDA" || dev.Library == "ROCm" {
info.MinimumMemory = 457 * format.MebiByte
}
if dev.Library == "ROCm" && rocmDir != "" {
info.DependencyPath = append(info.DependencyPath, rocmDir)
}
// TODO any special processing of Vulkan devices?
resp = append(resp, info)
}
if len(resp) == 0 {
mem, err := GetCPUMem()
if err != nil {
slog.Warn("error looking up system memory", "error", err)
}
resp = append(resp, GpuInfo{
memInfo: mem,
DeviceID: ml.DeviceID{
Library: "cpu",
ID: "0",
},
})
}
return resp
}
// Given the list of GPUs this instantiation is targeted for,
// figure out the visible devices environment variable
//
// If different libraries are detected, the first one is what we use
func (l GpuInfoList) GetVisibleDevicesEnv() []string {
if len(l) == 0 {
return nil
}
res := []string{}
envVar := rocmGetVisibleDevicesEnv(l)
if envVar != "" {
res = append(res, envVar)
}
envVar = vkGetVisibleDevicesEnv(l)
if envVar != "" {
res = append(res, envVar)
}
return res
}
func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) string {
ids := []string{}
for _, info := range gpuInfo {
if info.Library != "ROCm" {
continue
}
// If the device requires a numeric ID, for filtering purposes, we use the unfiltered ID number
if info.filterID != "" {
ids = append(ids, info.filterID)
} else {
ids = append(ids, info.ID)
}
}
if len(ids) == 0 {
return ""
}
envVar := "ROCR_VISIBLE_DEVICES="
if runtime.GOOS != "linux" {
envVar = "HIP_VISIBLE_DEVICES="
}
// There are 3 potential env vars to use to select GPUs.
// ROCR_VISIBLE_DEVICES supports UUID or numeric but does not work on Windows
// HIP_VISIBLE_DEVICES supports numeric IDs only
// GPU_DEVICE_ORDINAL supports numeric IDs only
return envVar + strings.Join(ids, ",")
}
func vkGetVisibleDevicesEnv(gpuInfo []GpuInfo) string {
ids := []string{}
for _, info := range gpuInfo {
if info.Library != "Vulkan" {
continue
}
if info.filterID != "" {
ids = append(ids, info.filterID)
} else {
ids = append(ids, info.ID)
}
}
if len(ids) == 0 {
return ""
}
envVar := "GGML_VK_VISIBLE_DEVICES="
return envVar + strings.Join(ids, ",")
}
// GetSystemInfo returns the last cached state of the GPUs on the system
func GetSystemInfo() SystemInfo {
deviceMu.Lock()
defer deviceMu.Unlock()
gpus := devInfoToInfoList(devices)
if len(gpus) == 1 && gpus[0].Library == "cpu" {
gpus = []GpuInfo{}
func GetSystemInfo() ml.SystemInfo {
memInfo, err := GetCPUMem()
if err != nil {
slog.Warn("error looking up system memory", "error", err)
}
var threadCount int
cpus := GetCPUDetails()
for _, c := range cpus {
threadCount += c.CoreCount - c.EfficiencyCoreCount
}
return SystemInfo{
System: CPUInfo{
CPUs: GetCPUDetails(),
GpuInfo: GetCPUInfo(),
},
GPUs: gpus,
if threadCount == 0 {
// Fall back to Go's num CPU
threadCount = runtime.NumCPU()
}
return ml.SystemInfo{
ThreadCount: threadCount,
TotalMemory: memInfo.TotalMemory,
FreeMemory: memInfo.FreeMemory,
FreeSwap: memInfo.FreeSwap,
}
}
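
The deleted and added bodies of GetSystemInfo are interleaved above: the old version returned the nested SystemInfo{System: CPUInfo{...}, GPUs: ...} shape, while the new one returns the flat ml.SystemInfo and folds in the old GetOptimalThreadCount logic (performance cores only, CoreCount minus EfficiencyCoreCount per package, falling back to runtime.NumCPU when detection yields nothing). A small caller sketch, assuming only the fields shown in this diff:

package main

import (
	"log/slog"

	"github.com/ollama/ollama/discover"
	"github.com/ollama/ollama/format"
)

func main() {
	si := discover.GetSystemInfo() // now an ml.SystemInfo
	slog.Info("system",
		"threads", si.ThreadCount, // performance cores, or runtime.NumCPU() fallback
		"total", format.HumanBytes2(si.TotalMemory),
		"free", format.HumanBytes2(si.FreeMemory),
		"free_swap", format.HumanBytes2(si.FreeSwap),
	)
}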

View File

@ -4,13 +4,8 @@ package discover
import (
"context"
"encoding/json"
"fmt"
"io"
"log/slog"
"math/rand"
"net"
"net/http"
"os"
"os/exec"
"path/filepath"
@ -23,6 +18,7 @@ import (
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/logutil"
"github.com/ollama/ollama/ml"
)
@ -36,7 +32,7 @@ var (
bootstrapped bool
)
func GPUDevices(ctx context.Context, runners []FilteredRunnerDiscovery) []ml.DeviceInfo {
func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo {
deviceMu.Lock()
defer deviceMu.Unlock()
startDiscovery := time.Now()
@ -154,9 +150,9 @@ func GPUDevices(ctx context.Context, runners []FilteredRunnerDiscovery) []ml.Dev
slog.Error("Unknown Library:" + devices[i].Library)
}
extraEnvs := []string{
"GGML_CUDA_INIT=1", // force deep initialization to trigger crash on unsupported GPUs
envVar + "=" + id, // Filter to just this one GPU
extraEnvs := map[string]string{
"GGML_CUDA_INIT": "1", // force deep initialization to trigger crash on unsupported GPUs
envVar: id, // Filter to just this one GPU
}
if len(bootstrapDevices(ctx2ndPass, devices[i].LibraryPath, extraEnvs)) == 0 {
needsDelete[i] = true
@ -449,100 +445,35 @@ func (r *bootstrapRunner) HasExited() bool {
return false
}
func bootstrapDevices(ctx context.Context, ollamaLibDirs []string, extraEnvs []string) []ml.DeviceInfo {
// TODO DRY out with llm/server.go
slog.Debug("spawning runner with", "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "extra_envs", extraEnvs)
func bootstrapDevices(ctx context.Context, ollamaLibDirs []string, extraEnvs map[string]string) []ml.DeviceInfo {
var out io.Writer
if envconfig.LogLevel() == logutil.LevelTrace {
out = os.Stderr
}
start := time.Now()
defer func() {
slog.Debug("bootstrap discovery took", "duration", time.Since(start), "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "extra_envs", extraEnvs)
}()
port := 0
if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
var l *net.TCPListener
if l, err = net.ListenTCP("tcp", a); err == nil {
port = l.Addr().(*net.TCPAddr).Port
l.Close()
}
}
if port == 0 {
slog.Debug("ResolveTCPAddr failed, using random port")
port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
}
params := []string{"runner", "--ollama-engine", "--port", strconv.Itoa(port)}
var pathEnv string
switch runtime.GOOS {
case "windows":
pathEnv = "PATH"
case "darwin":
pathEnv = "DYLD_LIBRARY_PATH"
default:
pathEnv = "LD_LIBRARY_PATH"
}
libraryPaths := append([]string{LibOllamaPath}, ollamaLibDirs...)
if rocmDir != "" {
libraryPaths = append(libraryPaths, rocmDir)
}
// Note: we always put our dependency paths first
// since these are the exact version we compiled/linked against
if libraryPath, ok := os.LookupEnv(pathEnv); ok {
libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
}
cmd := exec.Command(exe, params...)
cmd.Env = os.Environ()
if envconfig.LogLevel() == logutil.LevelTrace {
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr
}
// cmd.SysProcAttr = llm.LlamaServerSysProcAttr // circular dependency - bring back once refactored
pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
pathNeeded := true
ollamaPathNeeded := true
extraDone := make([]bool, len(extraEnvs))
for i := range cmd.Env {
cmp := strings.SplitN(cmd.Env[i], "=", 2)
if strings.EqualFold(cmp[0], pathEnv) {
cmd.Env[i] = pathEnv + "=" + pathEnvVal
pathNeeded = false
} else if strings.EqualFold(cmp[0], "OLLAMA_LIBRARY_PATH") {
cmd.Env[i] = "OLLAMA_LIBRARY_PATH=" + strings.Join(ollamaLibDirs, string(filepath.ListSeparator))
ollamaPathNeeded = false
} else {
for j := range extraEnvs {
if extraDone[j] {
continue
}
extra := strings.SplitN(extraEnvs[j], "=", 2)
if cmp[0] == extra[0] {
cmd.Env[i] = extraEnvs[j]
extraDone[j] = true
}
}
}
}
if pathNeeded {
cmd.Env = append(cmd.Env, pathEnv+"="+pathEnvVal)
}
if ollamaPathNeeded {
cmd.Env = append(cmd.Env, "OLLAMA_LIBRARY_PATH="+strings.Join(ollamaLibDirs, string(filepath.ListSeparator)))
}
for i := range extraDone {
if !extraDone[i] {
cmd.Env = append(cmd.Env, extraEnvs[i])
}
}
logutil.Trace("starting runner for device discovery", "env", cmd.Env, "cmd", cmd)
if err := cmd.Start(); err != nil {
slog.Warn("unable to start discovery subprocess", "cmd", cmd, "error", err)
logutil.Trace("starting runner for device discovery", "libDirs", ollamaLibDirs, "extraEnvs", extraEnvs)
cmd, port, err := llm.StartRunner(
true, // ollama engine
"", // no model
ollamaLibDirs,
out,
extraEnvs,
)
if err != nil {
slog.Debug("failed to start runner to discovery GPUs", "error", err)
return nil
}
go func() {
cmd.Wait() // exit status ignored
}()
defer cmd.Process.Kill()
devices, err := GetDevicesFromRunner(ctx, &bootstrapRunner{port: port, cmd: cmd})
devices, err := ml.GetDevicesFromRunner(ctx, &bootstrapRunner{port: port, cmd: cmd})
if err != nil {
if cmd.ProcessState != nil && cmd.ProcessState.ExitCode() >= 0 {
// Expected during bootstrapping while we filter out unsupported AMD GPUs
@ -555,52 +486,3 @@ func bootstrapDevices(ctx context.Context, ollamaLibDirs []string, extraEnvs []s
return devices
}
func GetDevicesFromRunner(ctx context.Context, runner BaseRunner) ([]ml.DeviceInfo, error) {
var moreDevices []ml.DeviceInfo
port := runner.GetPort()
tick := time.Tick(10 * time.Millisecond)
for {
select {
case <-ctx.Done():
return nil, fmt.Errorf("failed to finish discovery before timeout")
case <-tick:
r, err := http.NewRequestWithContext(ctx, http.MethodGet, fmt.Sprintf("http://127.0.0.1:%d/info", port), nil)
if err != nil {
return nil, fmt.Errorf("failed to create request: %w", err)
}
r.Header.Set("Content-Type", "application/json")
resp, err := http.DefaultClient.Do(r)
if err != nil {
// slog.Warn("failed to send request", "error", err)
if runner.HasExited() {
return nil, fmt.Errorf("runner crashed")
}
continue
}
defer resp.Body.Close()
if resp.StatusCode == http.StatusNotFound {
// old runner, fall back to bootstrapping model
return nil, fmt.Errorf("llamarunner free vram reporting not supported")
}
body, err := io.ReadAll(resp.Body)
if err != nil {
slog.Warn("failed to read response", "error", err)
continue
}
if resp.StatusCode != 200 {
logutil.Trace("runner failed to discover free VRAM", "status", resp.StatusCode, "response", body)
return nil, fmt.Errorf("runner error: %s", string(body))
}
if err := json.Unmarshal(body, &moreDevices); err != nil {
slog.Warn("unmarshal encode response", "error", err)
continue
}
return moreDevices, nil
}
}
}
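
The GetDevicesFromRunner loop removed above (poll the runner's /info endpoint until it answers, bail out if the process exits or the context times out) moves to the ml package; the call sites in this diff use ml.GetDevicesFromRunner. A hedged sketch of a caller, where myRunner is a hypothetical adapter mirroring the bootstrapRunner used earlier in this file:

package main

import (
	"context"
	"log/slog"
	"os/exec"
	"time"

	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/ml"
)

// myRunner adapts an already-started runner subprocess to the
// GetPort/HasExited interface that ml.GetDevicesFromRunner polls against.
type myRunner struct {
	port int
	cmd  *exec.Cmd
}

func (r *myRunner) GetPort() int    { return r.port }
func (r *myRunner) HasExited() bool { return r.cmd != nil && r.cmd.ProcessState != nil }

func discoverDevices(port int, cmd *exec.Cmd) []ml.DeviceInfo {
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	devices, err := ml.GetDevicesFromRunner(ctx, &myRunner{port: port, cmd: cmd})
	if err != nil {
		// During bootstrap a crash here typically means the filtered device is unsupported.
		slog.Debug("device discovery failed", "error", err)
		return nil
	}
	for _, d := range devices {
		slog.Info("discovered device", "library", d.Library, "id", d.ID,
			"free", format.HumanBytes2(d.FreeMemory))
	}
	return devices
}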

View File

@ -1,10 +1,8 @@
package discover
import (
"context"
"log/slog"
"path/filepath"
"runtime"
"strings"
"github.com/ollama/ollama/format"
@ -17,50 +15,6 @@ type memInfo struct {
FreeSwap uint64 `json:"free_swap,omitempty"` // TODO split this out for system only
}
// Beginning of an `ollama info` command
type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
ml.DeviceID
memInfo
// Optional variant to select (e.g. versions, cpu feature flags)
Variant string `json:"variant"`
// MinimumMemory represents the minimum memory required to use the GPU
MinimumMemory uint64 `json:"-"`
// Any extra PATH/LD_LIBRARY_PATH dependencies required for the Library to operate properly
DependencyPath []string `json:"lib_path,omitempty"`
// Set to true if we can NOT reliably discover FreeMemory. A value of true indicates
// the FreeMemory is best effort, and may over or under report actual memory usage
// False indicates FreeMemory can generally be trusted on this GPU
UnreliableFreeMemory bool
// GPU information
filterID string // AMD/Vulkan Workaround: The numeric ID of the device used to filter out other devices
Name string `json:"name"` // user friendly name if available
ComputeMajor int `json:"compute_major"` // Compute Capability or gfx
ComputeMinor int `json:"compute_minor"`
// Driver Information - TODO no need to put this on each GPU
DriverMajor int `json:"driver_major,omitempty"`
DriverMinor int `json:"driver_minor,omitempty"`
// TODO other performance capability info to help in scheduling decisions
}
func (gpu GpuInfo) RunnerName() string {
if gpu.Variant != "" {
return gpu.Library + "_" + gpu.Variant
}
return gpu.Library
}
type CPUInfo struct {
GpuInfo
CPUs []CPU
}
// CPU type represents a CPU Package occupying a socket
type CPU struct {
ID string `cpuinfo:"processor"`
@ -71,32 +25,6 @@ type CPU struct {
ThreadCount int
}
type GpuInfoList []GpuInfo
func (l GpuInfoList) ByLibrary() []GpuInfoList {
resp := []GpuInfoList{}
libs := []string{}
for _, info := range l {
found := false
requested := info.Library
if info.Variant != "" {
requested += "_" + info.Variant
}
for i, lib := range libs {
if lib == requested {
resp[i] = append(resp[i], info)
found = true
break
}
}
if !found {
libs = append(libs, requested)
resp = append(resp, []GpuInfo{info})
}
}
return resp
}
func LogDetails(devices []ml.DeviceInfo) {
for _, dev := range devices {
var libs []string
@ -141,74 +69,3 @@ func LogDetails(devices []ml.DeviceInfo) {
)
}
}
// Sort by Free Space
type ByFreeMemory []GpuInfo
func (a ByFreeMemory) Len() int { return len(a) }
func (a ByFreeMemory) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a ByFreeMemory) Less(i, j int) bool { return a[i].FreeMemory < a[j].FreeMemory }
type SystemInfo struct {
System CPUInfo `json:"system"`
GPUs []GpuInfo `json:"gpus"`
}
// Return the optimal number of threads to use for inference
func (si SystemInfo) GetOptimalThreadCount() int {
if len(si.System.CPUs) == 0 {
// Fall back to Go's num CPU
return runtime.NumCPU()
}
coreCount := 0
for _, c := range si.System.CPUs {
coreCount += c.CoreCount - c.EfficiencyCoreCount
}
return coreCount
}
// For each GPU, check if it does NOT support flash attention
func (l GpuInfoList) FlashAttentionSupported() bool {
for _, gpu := range l {
supportsFA := gpu.Library == "cpu" ||
gpu.Name == "Metal" || gpu.Library == "Metal" ||
(gpu.Library == "CUDA" && gpu.DriverMajor >= 7 && !(gpu.ComputeMajor == 7 && gpu.ComputeMinor == 2)) || // We don't have kernels for Jetson Xavier
gpu.Library == "ROCm" ||
gpu.Library == "Vulkan"
if !supportsFA {
return false
}
}
return true
}
type BaseRunner interface {
// GetPort returns the localhost port number the runner is running on
GetPort() int
// HasExited indicates if the runner is no longer running. This can be used during
// bootstrap to detect if a given filtered device is incompatible and triggered an assert
HasExited() bool
}
type RunnerDiscovery interface {
BaseRunner
// GetDeviceInfos will perform a query of the underlying device libraries
// for device identification and free VRAM information
// During bootstrap scenarios, this routine may take seconds to complete
GetDeviceInfos(ctx context.Context) []ml.DeviceInfo
}
type FilteredRunnerDiscovery interface {
RunnerDiscovery
// GetActiveDeviceIDs returns the filtered set of devices actively in
// use by this runner for running models. If the runner is a bootstrap runner, no devices
// will be active yet so no device IDs are returned.
// This routine will not query the underlying device and will return immediately
GetActiveDeviceIDs() []ml.DeviceID
}
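
These runner-discovery interfaces also migrate to the ml package (GPUDevices now takes []ml.FilteredRunnerDiscovery, per the hunk above). A hedged sketch of the scheduler-side shape this enables, assuming the running servers satisfy the interface:

package sched

import (
	"context"
	"time"

	"github.com/ollama/ollama/discover"
	"github.com/ollama/ollama/ml"
)

// refreshGPUs asks the already-running runners for current device info (free
// VRAM included) instead of spawning fresh bootstrap runners on every refresh.
func refreshGPUs(ctx context.Context, running []ml.FilteredRunnerDiscovery) []ml.DeviceInfo {
	ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
	defer cancel()
	return discover.GPUDevices(ctx, running)
}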

View File

@ -4,27 +4,28 @@ import (
"fmt"
"log/slog"
"os"
"slices"
"sort"
"strings"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/discover"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/ml"
)
// pickBestFullFitByLibrary will try to find the optimal placement of the model in the available GPUs where the model fully fits
// The list of GPUs returned will always be the same brand (library)
// If the model can not be fit fully within the available GPU(s) nil is returned
func pickBestFullFitByLibrary(f *ggml.GGML, modelPath string, projectors []string, adapters []string, opts api.Options, gpus discover.GpuInfoList, numParallel int) discover.GpuInfoList {
for _, gl := range gpus.ByLibrary() {
sgl := append(make(discover.GpuInfoList, 0, len(gl)), gl...)
func pickBestFullFitByLibrary(f *ggml.GGML, modelPath string, projectors []string, adapters []string, opts api.Options, gpus []ml.DeviceInfo, numParallel int) []ml.DeviceInfo {
for _, gl := range ml.ByLibrary(gpus) {
sgl := append(make([]ml.DeviceInfo, 0, len(gl)), gl...)
// TODO - potentially sort by performance capability, existing models loaded, etc.
// TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them
// Note: at present, this will favor most current available VRAM descending and ignoring faster GPU speed in mixed setups
sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl)))
sort.Sort(sort.Reverse(ml.ByFreeMemory(sgl)))
if !envconfig.SchedSpread() {
// Try to pack into as few GPUs as possible, starting from 1 GPU
@ -63,8 +64,8 @@ func pickBestFullFitByLibrary(f *ggml.GGML, modelPath string, projectors []strin
}
// If multiple Libraries are detected, pick the Library which loads the most layers for the model
func pickBestPartialFitByLibrary(f *ggml.GGML, projectors []string, adapters []string, opts api.Options, gpus discover.GpuInfoList, numParallel int) discover.GpuInfoList {
byLibrary := gpus.ByLibrary()
func pickBestPartialFitByLibrary(f *ggml.GGML, projectors []string, adapters []string, opts api.Options, gpus []ml.DeviceInfo, numParallel int) []ml.DeviceInfo {
byLibrary := ml.ByLibrary(gpus)
if len(byLibrary) <= 1 {
return gpus
}
@ -81,10 +82,10 @@ func pickBestPartialFitByLibrary(f *ggml.GGML, projectors []string, adapters []s
}
// This algorithm looks for a complete fit to determine if we need to unload other models
func predictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) {
func predictServerFit(allGpus []ml.DeviceInfo, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) {
// Split up the GPUs by type and try them
var estimatedVRAM uint64
for _, gpus := range allGpus.ByLibrary() {
for _, gpus := range ml.ByLibrary(allGpus) {
var layerCount int
estimate := estimateGPULayers(gpus, f, projectors, opts, numParallel)
layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize
@ -97,14 +98,23 @@ func predictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, proj
return true, estimatedVRAM
}
}
if len(gpus) == 1 && gpus[0].Library == "cpu" && estimate.TotalSize <= gpus[0].FreeMemory {
return true, estimatedVRAM
}
}
return false, estimatedVRAM
}
func verifyCPUFit(f *ggml.GGML, modelPath string, projectors []string, adapters []string, opts api.Options, systemInfo ml.SystemInfo, numParallel int) bool {
estimate := estimateGPULayers(nil, f, projectors, opts, numParallel)
if estimate.TotalSize > systemInfo.FreeMemory {
return false
}
slog.Info("new model will fit in available system memory for CPU inference, loading",
"model", modelPath,
"parallel", numParallel,
"required", format.HumanBytes2(estimate.TotalSize),
)
return true
}
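
verifyCPUFit is the "handle cpu inference oom scenarios" bullet: with no usable GPUs, Load (further down in this diff) refuses to start a runner whose full estimate exceeds free system memory and instead returns an error wrapping ErrLoadRequiredFull. A hedged sketch of how a scheduler-side caller might react; evictOneModel is a hypothetical stand-in for the eviction policy:

package sched

import (
	"context"
	"errors"

	"github.com/ollama/ollama/llm"
	"github.com/ollama/ollama/ml"
)

// evictOneModel is a hypothetical stand-in for the scheduler's eviction policy.
func evictOneModel() {}

// loadOrEvict shows the scheduler-facing contract: Load wraps ErrLoadRequiredFull
// when memory (GPU or, with this commit, CPU) is insufficient, and the caller
// frees memory and retries rather than letting the runner OOM.
func loadOrEvict(ctx context.Context, server llm.LlamaServer, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo) error {
	_, err := server.Load(ctx, systemInfo, gpus, true)
	if errors.Is(err, llm.ErrLoadRequiredFull) {
		evictOneModel()
		// ...retry Load once memory has been released...
	}
	return err
}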
type MemoryEstimate struct {
// How many layers we predict we can load
Layers int
@ -141,7 +151,7 @@ type MemoryEstimate struct {
// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
// The GPUs provided must all be the same Library
func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []string, opts api.Options, numParallel int) MemoryEstimate {
func estimateGPULayers(gpus []ml.DeviceInfo, f *ggml.GGML, projectors []string, opts api.Options, numParallel int) MemoryEstimate {
// Graph size for a partial offload, applies to all GPUs
var graphPartialOffload uint64
@ -175,10 +185,17 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
overhead := envconfig.GpuOverhead()
availableList := make([]string, len(gpus))
libraries := []string{}
for i, gpu := range gpus {
availableList[i] = format.HumanBytes2(gpu.FreeMemory)
if !slices.Contains(libraries, gpu.Library) {
libraries = append(libraries, gpu.Library)
}
slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)
}
if len(libraries) == 0 {
libraries = []string{"cpu"}
}
slog.Debug("evaluating", "library", strings.Join(libraries, ","), "gpu_count", len(gpus), "available", availableList)
for _, projector := range projectors {
llamaEngineProjectorWeights += projectorMemoryRequirements(projector)
@ -196,7 +213,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
}
useFlashAttention := envconfig.FlashAttention(f.FlashAttention()) &&
(discover.GpuInfoList)(gpus).FlashAttentionSupported() &&
ml.FlashAttentionSupported(gpus) &&
f.SupportsFlashAttention()
var kvct string
@ -231,7 +248,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
}
// on metal there's no partial offload overhead
if gpus[0].Library == "Metal" {
if len(gpus) > 0 && gpus[0].Library == "Metal" {
graphPartialOffload = graphFullOffload
} else if len(gpus) > 1 {
// multigpu should always use the partial graph size
@ -256,7 +273,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
gpuAllocations := make([]uint64, len(gpus))
type gs struct {
i int
g *discover.GpuInfo
g *ml.DeviceInfo
}
gpusWithSpace := []gs{}
for i := range gpus {
@ -265,19 +282,11 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
gzo = gpuZeroOverhead
}
// Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least one more layer
if gpus[i].FreeMemory < overhead+gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
var compute string
if gpus[i].Library == "ROCm" {
compute = fmt.Sprintf("gfx%x%02x", gpus[i].ComputeMajor, gpus[i].ComputeMinor)
} else {
compute = fmt.Sprintf("%d.%d", gpus[i].ComputeMajor, gpus[i].ComputeMinor)
}
if gpus[i].FreeMemory < overhead+gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory()+2*layerSize {
slog.Debug("gpu has too little memory to allocate any layers",
"id", gpus[i].ID,
"library", gpus[i].Library,
"variant", gpus[i].Variant,
"compute", compute,
"compute", gpus[i].Compute(),
"driver", fmt.Sprintf("%d.%d", gpus[i].DriverMajor, gpus[i].DriverMinor),
"name", gpus[i].Name,
"total", format.HumanBytes2(gpus[i].TotalMemory),
@ -291,7 +300,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
continue
}
gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]})
gpuAllocations[i] += gpus[i].MinimumMemory + layerSize // We hold off on graph until we know partial vs. full
gpuAllocations[i] += gpus[i].MinimumMemory() + layerSize // We hold off on graph until we know partial vs. full
}
var gpuZeroID int
@ -397,7 +406,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
VRAMSize: 0,
GPUSizes: []uint64{},
inferenceLibrary: gpus[0].Library,
inferenceLibrary: strings.Join(libraries, ","),
layersRequested: opts.NumGPU,
layersModel: int(f.KV().BlockCount()) + 1,
availableList: availableList,
@ -411,7 +420,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
projectorGraph: ollamaEngineProjectorGraph,
}
if gpus[0].Library == "cpu" {
if len(gpus) == 0 {
return estimate
}
if layerCount == 0 {

View File

@ -10,7 +10,7 @@ import (
"github.com/stretchr/testify/require"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/discover"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/ml"
)
@ -54,13 +54,7 @@ func TestEstimateGPULayers(t *testing.T) {
}
// Simple CPU scenario
gpus := []discover.GpuInfo{
{
DeviceID: ml.DeviceID{
Library: "cpu",
},
},
}
gpus := []ml.DeviceInfo{}
projectors := []string{}
opts := api.DefaultOptions()
t.Run("cpu", func(t *testing.T) {
@ -77,19 +71,17 @@ func TestEstimateGPULayers(t *testing.T) {
memoryLayerOutput := uint64(4)
// Dual CUDA scenario with asymmetry
gpuMinimumMemory := uint64(2048)
gpus = []discover.GpuInfo{
gpuMinimumMemory := uint64(457 * format.MebiByte)
gpus = []ml.DeviceInfo{
{
DeviceID: ml.DeviceID{
Library: "cuda",
Library: "CUDA",
},
MinimumMemory: gpuMinimumMemory,
},
{
DeviceID: ml.DeviceID{
Library: "cuda",
Library: "CUDA",
},
MinimumMemory: gpuMinimumMemory,
},
}
// Nested array: GPU0 layer space, GPU1 layer space, expected gpu0, expected gpu1

View File

@ -27,7 +27,6 @@ import (
"golang.org/x/sync/semaphore"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/discover"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/fs/ggml"
@ -66,7 +65,7 @@ func (e filteredEnv) LogValue() slog.Value {
type LlamaServer interface {
ModelPath() string
Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error)
Load(ctx context.Context, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) ([]ml.DeviceID, error)
Ping(ctx context.Context) error
WaitUntilRunning(ctx context.Context) error
Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
@ -115,7 +114,7 @@ type llamaServer struct {
llmServer
ggml *ggml.GGML
gpus discover.GpuInfoList // The set of GPUs covered by the memory estimate
gpus []ml.DeviceInfo // The set of GPUs covered by the memory estimate
estimate MemoryEstimate
}
@ -146,7 +145,7 @@ func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) {
}
// NewLlamaServer will run a server for the given GPUs
func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath string, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
var llamaModel *llama.Model
var textProcessor model.TextProcessor
var err error
@ -179,7 +178,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
loadRequest := LoadRequest{LoraPath: adapters, KvSize: opts.NumCtx * numParallel, BatchSize: opts.NumBatch, Parallel: numParallel, MultiUserCache: envconfig.MultiUserCache()}
defaultThreads := discover.GetSystemInfo().GetOptimalThreadCount()
defaultThreads := systemInfo.ThreadCount
if opts.NumThread > 0 {
loadRequest.NumThreads = opts.NumThread
} else if defaultThreads > 0 {
@ -200,7 +199,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
// This will disable flash attention unless all GPUs on the system support it, even if we end up selecting a subset
// that can handle it.
if fa && !gpus.FlashAttentionSupported() {
if fa && !ml.FlashAttentionSupported(gpus) {
slog.Warn("flash attention enabled but not supported by gpu")
fa = false
}
@ -227,120 +226,20 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
slog.Warn("quantized kv cache requested but flash attention disabled", "type", kvct)
}
availableLibs := make(map[string]string)
if entries, err := os.ReadDir(discover.LibOllamaPath); err == nil {
for _, entry := range entries {
availableLibs[entry.Name()] = filepath.Join(discover.LibOllamaPath, entry.Name())
}
}
var gpuLibs []string
for _, gpu := range gpus {
gpuLibs = append(gpuLibs, gpu.RunnerName())
}
requested := envconfig.LLMLibrary()
if availableLibs[requested] != "" {
slog.Info("using requested gpu library", "requested", requested)
gpuLibs = []string{requested}
}
var compatible []string
for _, gpuLib := range gpuLibs {
var matchingLibs []string
for k := range availableLibs {
// exact match first
if k == gpuLib {
matchingLibs = append([]string{k}, matchingLibs...)
continue
}
// then match the family (e.g. 'cuda')
if strings.Split(k, "_")[0] == strings.Split(gpuLib, "_")[0] {
matchingLibs = append(matchingLibs, k)
}
}
if len(matchingLibs) > 0 {
compatible = append(compatible, matchingLibs[0])
}
}
exe, err := os.Executable()
if err != nil {
return nil, fmt.Errorf("unable to lookup executable path: %w", err)
}
if eval, err := filepath.EvalSymlinks(exe); err == nil {
exe = eval
}
// iterate through compatible GPU libraries such as 'cuda_v12', 'rocm', etc.
// adding each library's respective path to the LD_LIBRARY_PATH, until finally running
// without any LD_LIBRARY_PATH flags
for {
port := 0
if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
var l *net.TCPListener
if l, err = net.ListenTCP("tcp", a); err == nil {
port = l.Addr().(*net.TCPAddr).Port
l.Close()
}
}
if port == 0 {
slog.Debug("ResolveTCPAddr failed, using random port")
port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
}
params := []string{"runner"}
if textProcessor != nil {
// New engine
// TODO - if we have failure to load scenarios, add logic to retry with the old runner
params = append(params, "--ollama-engine")
}
params = append(params, "--model", modelPath)
params = append(params, "--port", strconv.Itoa(port))
var pathEnv string
switch runtime.GOOS {
case "windows":
pathEnv = "PATH"
case "darwin":
pathEnv = "DYLD_LIBRARY_PATH"
default:
pathEnv = "LD_LIBRARY_PATH"
}
// Note: we always put our dependency paths first
// since these are the exact version we compiled/linked against
libraryPaths := []string{discover.LibOllamaPath}
if libraryPath, ok := os.LookupEnv(pathEnv); ok {
libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
}
ggmlPaths := []string{discover.LibOllamaPath}
for _, c := range compatible {
if libpath, ok := availableLibs[c]; ok {
slog.Debug("adding gpu library", "path", libpath)
libraryPaths = append([]string{libpath}, libraryPaths...)
ggmlPaths = append(ggmlPaths, libpath)
}
}
for _, gpu := range gpus {
if gpu.DependencyPath != nil {
slog.Debug("adding gpu dependency paths", "paths", gpu.DependencyPath)
libraryPaths = append(gpu.DependencyPath, libraryPaths...)
ggmlPaths = append(ggmlPaths, gpu.DependencyPath...)
}
}
// finally, add the root library path
libraryPaths = append(libraryPaths, discover.LibOllamaPath)
gpuLibs := ml.LibraryPaths(gpus)
status := NewStatusWriter(os.Stderr)
cmd, port, err := StartRunner(
textProcessor != nil,
modelPath,
gpuLibs,
status,
ml.GetVisibleDevicesEnv(gpus),
)
s := llmServer{
port: port,
cmd: exec.Command(exe, params...),
status: NewStatusWriter(os.Stderr),
cmd: cmd,
status: status,
options: opts,
modelPath: modelPath,
loadRequest: loadRequest,
@ -354,70 +253,18 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
done: make(chan error, 1),
}
s.cmd.Env = os.Environ()
s.cmd.Stdout = os.Stdout
s.cmd.Stderr = s.status
s.cmd.SysProcAttr = LlamaServerSysProcAttr
// Always filter down the set of GPUs in case there are any unsupported devices that might crash
envWorkarounds := gpus.GetVisibleDevicesEnv()
pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
// Update or add the path variable with our adjusted version
pathNeeded := true
ollamaPathNeeded := true
envWorkaroundDone := make([]bool, len(envWorkarounds))
for i := range s.cmd.Env {
cmp := strings.SplitN(s.cmd.Env[i], "=", 2)
if strings.EqualFold(cmp[0], pathEnv) {
s.cmd.Env[i] = pathEnv + "=" + pathEnvVal
pathNeeded = false
} else if strings.EqualFold(cmp[0], "OLLAMA_LIBRARY_PATH") {
s.cmd.Env[i] = "OLLAMA_LIBRARY_PATH=" + strings.Join(ggmlPaths, string(filepath.ListSeparator))
ollamaPathNeeded = false
} else if len(envWorkarounds) != 0 {
for j, kv := range envWorkarounds {
tmp := strings.SplitN(kv, "=", 2)
if strings.EqualFold(cmp[0], tmp[0]) {
s.cmd.Env[i] = kv
envWorkaroundDone[j] = true
}
}
}
}
if pathNeeded {
s.cmd.Env = append(s.cmd.Env, pathEnv+"="+pathEnvVal)
}
if ollamaPathNeeded {
s.cmd.Env = append(s.cmd.Env, "OLLAMA_LIBRARY_PATH="+strings.Join(ggmlPaths, string(filepath.ListSeparator)))
}
for i, done := range envWorkaroundDone {
if !done {
s.cmd.Env = append(s.cmd.Env, envWorkarounds[i])
}
}
slog.Info("starting runner", "cmd", s.cmd)
slog.Debug("subprocess", "", filteredEnv(s.cmd.Env))
if err = s.cmd.Start(); err != nil {
if err != nil {
var msg string
if s.status != nil && s.status.LastErrMsg != "" {
msg = s.status.LastErrMsg
}
err := fmt.Errorf("error starting runner: %v %s", err, msg)
if len(compatible) == 0 {
if llamaModel != nil {
llama.FreeModel(llamaModel)
}
return nil, err
}
slog.Warn("unable to start runner with compatible gpu", "error", err, "compatible", compatible)
compatible = compatible[1:]
continue
}
// reap subprocess when it exits
go func() {
err := s.cmd.Wait()
@ -439,6 +286,110 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
return &llamaServer{llmServer: s, ggml: f}, nil
}
}
func StartRunner(ollamaEngine bool, modelPath string, gpuLibs []string, out io.Writer, extraEnvs map[string]string) (cmd *exec.Cmd, port int, err error) {
var exe string
exe, err = os.Executable()
if err != nil {
return nil, 0, fmt.Errorf("unable to lookup executable path: %w", err)
}
if eval, err := filepath.EvalSymlinks(exe); err == nil {
exe = eval
}
port = 0
if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
var l *net.TCPListener
if l, err = net.ListenTCP("tcp", a); err == nil {
port = l.Addr().(*net.TCPAddr).Port
l.Close()
}
}
if port == 0 {
slog.Debug("ResolveTCPAddr failed, using random port")
port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
}
params := []string{"runner"}
if ollamaEngine {
params = append(params, "--ollama-engine")
}
if modelPath != "" {
params = append(params, "--model", modelPath)
}
params = append(params, "--port", strconv.Itoa(port))
var pathEnv string
switch runtime.GOOS {
case "windows":
pathEnv = "PATH"
case "darwin":
pathEnv = "DYLD_LIBRARY_PATH"
default:
pathEnv = "LD_LIBRARY_PATH"
}
// Note: we always put our dependency paths first
// since these are the exact version we compiled/linked against
libraryPaths := append([]string{}, gpuLibs...)
if libraryPath, ok := os.LookupEnv(pathEnv); ok {
libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
}
cmd = exec.Command(exe, params...)
cmd.Env = os.Environ()
cmd.Stdout = out
cmd.Stderr = out
cmd.SysProcAttr = LlamaServerSysProcAttr
// Always filter down the set of GPUs in case there are any unsupported devices that might crash
pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
// Update or add the path variable with our adjusted version
pathNeeded := true
ollamaPathNeeded := true
extraEnvsDone := map[string]bool{}
for k := range extraEnvs {
extraEnvsDone[k] = false
}
for i := range cmd.Env {
cmp := strings.SplitN(cmd.Env[i], "=", 2)
if strings.EqualFold(cmp[0], pathEnv) {
cmd.Env[i] = pathEnv + "=" + pathEnvVal
pathNeeded = false
} else if strings.EqualFold(cmp[0], "OLLAMA_LIBRARY_PATH") {
cmd.Env[i] = "OLLAMA_LIBRARY_PATH=" + strings.Join(gpuLibs, string(filepath.ListSeparator))
ollamaPathNeeded = false
} else if len(extraEnvs) != 0 {
for k, v := range extraEnvs {
if strings.EqualFold(cmp[0], k) {
cmd.Env[i] = k + "=" + v
extraEnvsDone[k] = true
}
}
}
}
if pathNeeded {
cmd.Env = append(cmd.Env, pathEnv+"="+pathEnvVal)
}
if ollamaPathNeeded {
cmd.Env = append(cmd.Env, "OLLAMA_LIBRARY_PATH="+strings.Join(gpuLibs, string(filepath.ListSeparator)))
}
for k, done := range extraEnvsDone {
if !done {
cmd.Env = append(cmd.Env, k+"="+extraEnvs[k])
}
}
slog.Info("starting runner", "cmd", cmd)
slog.Debug("subprocess", "", filteredEnv(cmd.Env))
if err = cmd.Start(); err != nil {
return nil, 0, err
}
err = nil
return
}
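
StartRunner is the single spawning path the commit message describes: it resolves the executable, picks a port, builds the runner arguments, then merges the library directories and extraEnvs into the inherited environment (our dependency paths first on PATH/LD_LIBRARY_PATH/DYLD_LIBRARY_PATH, OLLAMA_LIBRARY_PATH pinned to gpuLibs, and extraEnvs overriding matching inherited variables case-insensitively). The serving-side call is scattered through the NewLlamaServer hunk above; a condensed, hedged fragment using the same in-package names:

// Condensed from NewLlamaServer above; ml.LibraryPaths and ml.GetVisibleDevicesEnv
// supply the per-GPU library dirs and device-filter env vars (e.g. ROCR_VISIBLE_DEVICES).
gpuLibs := ml.LibraryPaths(gpus)
status := NewStatusWriter(os.Stderr)
cmd, port, err := StartRunner(
	textProcessor != nil, // new engine only when a text processor is available
	modelPath,
	gpuLibs,
	status,
	ml.GetVisibleDevicesEnv(gpus),
)
if err != nil {
	// NewLlamaServer surfaces status.LastErrMsg here; condensed for the sketch.
	return nil, err
}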
func (s *llmServer) ModelPath() string {
@ -497,13 +448,18 @@ type LoadResponse struct {
var ErrLoadRequiredFull = errors.New("unable to load full model on GPU")
func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error) {
systemInfo := discover.GetSystemInfo()
systemTotalMemory := systemInfo.System.TotalMemory
systemFreeMemory := systemInfo.System.FreeMemory
systemSwapFreeMemory := systemInfo.System.FreeSwap
func (s *llamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) ([]ml.DeviceID, error) {
systemTotalMemory := systemInfo.TotalMemory
systemFreeMemory := systemInfo.FreeMemory
systemSwapFreeMemory := systemInfo.FreeSwap
slog.Info("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory))
if len(gpus) == 0 || s.options.NumGPU == 0 {
if !verifyCPUFit(s.ggml, s.modelPath, []string{s.loadRequest.ProjectorPath}, s.loadRequest.LoraPath, s.options, systemInfo, s.numParallel) {
slog.Info("model requires more memory than is currently available, evicting a model to make space", "estimate", s.estimate)
return nil, fmt.Errorf("model requires more system memory than is currently available %w", ErrLoadRequiredFull)
}
} else {
g := pickBestFullFitByLibrary(s.ggml, s.modelPath, []string{s.loadRequest.ProjectorPath}, s.loadRequest.LoraPath, s.options, gpus, s.numParallel)
if g == nil {
if !requireFull {
@ -513,31 +469,37 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
return nil, ErrLoadRequiredFull
}
}
gpus = g
}
s.estimate = estimateGPULayers(gpus, s.ggml, []string{s.loadRequest.ProjectorPath}, s.options, s.numParallel)
if len(gpus) > 1 || gpus[0].Library != "cpu" {
if len(gpus) >= 1 {
switch {
case gpus[0].Library == "Metal" && s.estimate.VRAMSize > systemInfo.System.TotalMemory:
case s.options.NumGPU == 0:
gpus = []ml.DeviceInfo{}
case gpus[0].Library == "Metal" && s.estimate.VRAMSize > systemInfo.TotalMemory:
// disable partial offloading when model is greater than total system memory as this
// can lead to locking up the system
s.options.NumGPU = 0
gpus = []ml.DeviceInfo{}
case gpus[0].Library != "Metal" && s.estimate.Layers == 0:
// Don't bother loading into the GPU if no layers can fit
gpus = discover.GpuInfoList{discover.GetCPUInfo()}
case s.options.NumGPU < 0 && s.estimate.Layers > 0 && gpus[0].Library != "cpu":
gpus = []ml.DeviceInfo{}
case s.options.NumGPU < 0 && s.estimate.Layers > 0:
s.options.NumGPU = s.estimate.Layers
}
} else {
s.options.NumGPU = 0
}
// On linux and windows, over-allocating CPU memory will almost always result in an error
// Darwin has fully dynamic swap so has no direct concept of free swap space
if runtime.GOOS != "darwin" {
systemMemoryRequired := s.estimate.TotalSize - s.estimate.VRAMSize
available := systemInfo.System.FreeMemory + systemInfo.System.FreeSwap
available := systemInfo.FreeMemory + systemInfo.FreeSwap
if systemMemoryRequired > available {
slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "available", format.HumanBytes2(available), "total", format.HumanBytes2(systemInfo.System.TotalMemory), "free", format.HumanBytes2(systemInfo.System.FreeMemory), "swap", format.HumanBytes2(systemInfo.System.FreeSwap))
slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "available", format.HumanBytes2(available), "total", format.HumanBytes2(systemInfo.TotalMemory), "free", format.HumanBytes2(systemInfo.FreeMemory), "swap", format.HumanBytes2(systemInfo.FreeSwap))
return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(available))
}
}
@ -564,10 +526,10 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
// Windows CUDA should not use mmap for best performance
// Linux with a model larger than free space, mmap leads to thrashing
// For CPU loads we want the memory to be allocated, not FS cache
if (runtime.GOOS == "windows" && gpus[0].Library == "CUDA" && s.options.UseMMap == nil) ||
(runtime.GOOS == "linux" && systemInfo.System.FreeMemory < s.estimate.TotalSize && s.options.UseMMap == nil) ||
(gpus[0].Library == "cpu" && s.options.UseMMap == nil) ||
(gpus[0].Library == "Vulkan" && s.options.UseMMap == nil) ||
if (runtime.GOOS == "windows" && len(gpus) > 0 && gpus[0].Library == "CUDA" && s.options.UseMMap == nil) ||
(runtime.GOOS == "linux" && systemInfo.FreeMemory < s.estimate.TotalSize && s.options.UseMMap == nil) ||
(len(gpus) == 0 && s.options.UseMMap == nil) ||
(len(gpus) > 0 && gpus[0].Library == "Vulkan" && s.options.UseMMap == nil) ||
(s.options.UseMMap != nil && !*s.options.UseMMap) {
s.loadRequest.UseMmap = false
}
@ -605,8 +567,8 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
// createGPULayers maps from the tensor splits assigned by the memory estimates to explicit assignment
// of particular layers onto GPUs
func createGPULayers(estimate MemoryEstimate, ggml *ggml.GGML, gpus discover.GpuInfoList, numGPU int) ml.GPULayersList {
if numGPU <= 0 {
func createGPULayers(estimate MemoryEstimate, ggml *ggml.GGML, gpus []ml.DeviceInfo, numGPU int) ml.GPULayersList {
if numGPU <= 0 || len(gpus) == 0 {
return nil
}
@ -662,7 +624,7 @@ func createGPULayers(estimate MemoryEstimate, ggml *ggml.GGML, gpus discover.Gpu
// allowing for faster iteration, but may return less information.
//
// Returns the list of GPU IDs that were used in the final allocation on success
func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error) {
func (s *ollamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) ([]ml.DeviceID, error) {
var success bool
defer func() {
if !success {
@ -675,25 +637,22 @@ func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requ
slog.Info("loading model", "model layers", s.totalLayers, "requested", s.options.NumGPU)
systemInfo := discover.GetSystemInfo()
systemTotalMemory := systemInfo.System.TotalMemory
systemFreeMemory := systemInfo.System.FreeMemory
systemSwapFreeMemory := systemInfo.System.FreeSwap
systemTotalMemory := systemInfo.TotalMemory
systemFreeMemory := systemInfo.FreeMemory
systemSwapFreeMemory := systemInfo.FreeSwap
slog.Info("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory))
if !(len(gpus) == 1 && gpus[0].Library == "cpu") {
for _, gpu := range gpus {
available := gpu.FreeMemory - envconfig.GpuOverhead() - gpu.MinimumMemory
if gpu.FreeMemory < envconfig.GpuOverhead()+gpu.MinimumMemory {
available := gpu.FreeMemory - envconfig.GpuOverhead() - gpu.MinimumMemory()
if gpu.FreeMemory < envconfig.GpuOverhead()+gpu.MinimumMemory() {
available = 0
}
slog.Info("gpu memory", "id", gpu.ID, "library", gpu.Library,
"available", format.HumanBytes2(available),
"free", format.HumanBytes2(gpu.FreeMemory),
"minimum", format.HumanBytes2(gpu.MinimumMemory),
"minimum", format.HumanBytes2(gpu.MinimumMemory()),
"overhead", format.HumanBytes2(envconfig.GpuOverhead()))
}
}
pastAllocations := make(map[uint64]struct{})
var backoff float32
@ -762,7 +721,6 @@ nextOperation:
if err != nil {
return nil, err
}
slog.Debug("new layout created", "layers", newGPULayers)
s.loadRequest.GPULayers = newGPULayers
@ -864,20 +822,27 @@ func uniqueDeviceIDs(gpuLayers ml.GPULayersList) []ml.DeviceID {
// - Calculating how much space each GPU has available for layers, based on free memory and space occupied by the graph
// - Assigning layers
// - Ensuring that we don't exceed limits, such as requirements about partial offloading or system memory
func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs discover.GpuInfoList, memory *ml.BackendMemory, requireFull bool, backoff float32) (ml.GPULayersList, error) {
if s.totalLayers == 0 || s.options.NumGPU == 0 || len(systemGPUs) == 0 || (len(systemGPUs) == 1 && systemGPUs[0].Library == "cpu") {
return ml.GPULayersList{}, nil
}
gpus := append(make(discover.GpuInfoList, 0, len(systemGPUs)), systemGPUs...)
sort.Sort(sort.Reverse(discover.ByFreeMemory(gpus)))
func (s *ollamaServer) createLayout(systemInfo ml.SystemInfo, systemGPUs []ml.DeviceInfo, memory *ml.BackendMemory, requireFull bool, backoff float32) (ml.GPULayersList, error) {
if memory == nil {
memory = &ml.BackendMemory{CPU: ml.DeviceMemory{
Weights: make([]uint64, s.totalLayers),
Cache: make([]uint64, s.totalLayers),
}}
}
gpuLayers, layers, err := s.buildLayout(systemGPUs, memory, requireFull, backoff)
if err != nil {
return nil, err
}
err = s.verifyLayout(systemInfo, memory, requireFull, gpuLayers, layers)
if err != nil {
return nil, err
}
return gpuLayers, nil
}
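
createLayout is now a thin wrapper: on the first pass, when no BackendMemory has been reported yet, it synthesizes an empty one sized to the model's layer count, then delegates to buildLayout (split layers across the GPUs of each library) and verifyLayout (the partial-offload and system-memory checks that used to be inlined). A minimal sketch of the first-pass call, assuming the receiver fields used above:

// First allocation attempt: no memory report and no backoff yet.
gpuLayers, err := s.createLayout(systemInfo, gpus, nil, requireFull, 0)
if err != nil {
	return nil, err // includes ErrLoadRequiredFull when a full fit was required
}
s.loadRequest.GPULayers = gpuLayers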
func (s *ollamaServer) buildLayout(systemGPUs []ml.DeviceInfo, memory *ml.BackendMemory, requireFull bool, backoff float32) (ml.GPULayersList, []uint64, error) {
gpus := append(make([]ml.DeviceInfo, 0, len(systemGPUs)), systemGPUs...)
sort.Sort(sort.Reverse(ml.ByFreeMemory(gpus)))
layers := make([]uint64, len(memory.CPU.Weights))
for i := range layers {
@ -891,7 +856,7 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d
}
gpuLayers := ml.GPULayersList{}
for _, gl := range gpus.ByLibrary() {
for _, gl := range ml.ByLibrary(gpus) {
// If a GPU already has a graph allocated on it, then we should continue to use it.
// Otherwise, we lose information that we got from previous allocations, which can
// cause cycling. Plus, we get more information about required allocation from each
@ -905,7 +870,7 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d
lastUsedGPU = i
}
reserved := uint64(float32(gl[i].FreeMemory)*backoff) + gl[i].MinimumMemory + envconfig.GpuOverhead() + memory.GPUs[j].Graph
reserved := uint64(float32(gl[i].FreeMemory)*backoff) + gl[i].MinimumMemory() + envconfig.GpuOverhead() + memory.GPUs[j].Graph
if gl[i].FreeMemory > reserved {
gl[i].FreeMemory -= reserved
} else {
@ -914,7 +879,7 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d
slog.Debug("available gpu", "id", gl[i].ID, "library", gl[i].Library,
"available layer vram", format.HumanBytes2(gl[i].FreeMemory),
"backoff", fmt.Sprintf("%.2f", backoff), "minimum", format.HumanBytes2(gl[i].MinimumMemory),
"backoff", fmt.Sprintf("%.2f", backoff), "minimum", format.HumanBytes2(gl[i].MinimumMemory()),
"overhead", format.HumanBytes2(envconfig.GpuOverhead()),
"graph", format.HumanBytes2(memory.GPUs[j].Graph))
@ -933,7 +898,11 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d
gpuLayers = libraryGpuLayers
}
}
return gpuLayers, layers, nil
}
// verifyLayout ensures that we don't exceed limits, such as requirements about partial offloading or system memory
func (s *ollamaServer) verifyLayout(systemInfo ml.SystemInfo, memory *ml.BackendMemory, requireFull bool, gpuLayers ml.GPULayersList, layers []uint64) error {
// These sizes will only increase as we go through additional iterations and get additional information.
cpuSize := memory.InputWeights + memory.CPU.Graph
var vramSize uint64
@ -961,24 +930,24 @@ nextLayer:
if requireFull {
if gpuLayers.Sum() < len(layers) && (s.options.NumGPU < 0 || gpuLayers.Sum() < s.options.NumGPU) {
return nil, ErrLoadRequiredFull
return ErrLoadRequiredFull
}
if cpuSize > systemInfo.System.FreeMemory {
return nil, ErrLoadRequiredFull
if cpuSize > systemInfo.FreeMemory {
return ErrLoadRequiredFull
}
}
// On linux and windows, over-allocating CPU memory will almost always result in an error
// Darwin has fully dynamic swap so has no direct concept of free swap space
if runtime.GOOS != "darwin" {
available := systemInfo.System.FreeMemory + systemInfo.System.FreeSwap
available := systemInfo.FreeMemory + systemInfo.FreeSwap
if cpuSize > available {
slog.Warn("model request too large for system", "requested", format.HumanBytes2(cpuSize), "available", format.HumanBytes2(available), "total", format.HumanBytes2(systemInfo.System.TotalMemory), "free", format.HumanBytes2(systemInfo.System.FreeMemory), "swap", format.HumanBytes2(systemInfo.System.FreeSwap))
return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(cpuSize), format.HumanBytes2(available))
slog.Warn("model request too large for system", "requested", format.HumanBytes2(cpuSize), "available", format.HumanBytes2(available), "total", format.HumanBytes2(systemInfo.TotalMemory), "free", format.HumanBytes2(systemInfo.FreeMemory), "swap", format.HumanBytes2(systemInfo.FreeSwap))
return fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(cpuSize), format.HumanBytes2(available))
}
} else {
if vramSize > systemInfo.System.TotalMemory {
if vramSize > systemInfo.TotalMemory {
// disable partial offloading when model is greater than total system memory as this
// can lead to locking up the system
s.options.NumGPU = 0
@ -990,11 +959,11 @@ nextLayer:
slog.Debug("insufficient VRAM to load any model layers")
}
return gpuLayers, nil
return nil
}
// assignLayers packs the maximum number of layers onto the smallest set of GPUs and comes up with a layer assignment
func assignLayers(layers []uint64, gpus discover.GpuInfoList, requireFull bool, requestedLayers int, lastUsedGPU int) (gpuLayers ml.GPULayersList) {
func assignLayers(layers []uint64, gpus []ml.DeviceInfo, requireFull bool, requestedLayers int, lastUsedGPU int) (gpuLayers ml.GPULayersList) {
// If we can't fit everything then prefer offloading layers other than the output layer
for range 2 {
// requestedLayers may be -1 if nothing was requested
@ -1028,7 +997,7 @@ func assignLayers(layers []uint64, gpus discover.GpuInfoList, requireFull bool,
// findBestFit binary searches to find the smallest capacity factor that can fit
// the max number of layers. The capacity factor is multiplied by the free space on
// each GPU and a small one will force even balancing.
func findBestFit(layers []uint64, gpus discover.GpuInfoList, requestedLayers int, forceRequest bool) (gpuLayers ml.GPULayersList) {
func findBestFit(layers []uint64, gpus []ml.DeviceInfo, requestedLayers int, forceRequest bool) (gpuLayers ml.GPULayersList) {
var high float32 = 1
var low float32 = 0
@ -1053,12 +1022,11 @@ func findBestFit(layers []uint64, gpus discover.GpuInfoList, requestedLayers int
low = mid
}
}
return bestAssignments
}
// greedyFit assigns layers incrementally to GPUs, spilling over as each runs out of free space
func greedyFit(layers []uint64, gpus discover.GpuInfoList, capacity float32, requestedLayers int) (gpuLayers ml.GPULayersList) {
func greedyFit(layers []uint64, gpus []ml.DeviceInfo, capacity float32, requestedLayers int) (gpuLayers ml.GPULayersList) {
device := len(gpus) - 1
gpuLayers = ml.GPULayersList{{DeviceID: gpus[device].DeviceID}}
freeSpace := uint64(float32(gpus[device].FreeMemory) * capacity)
@ -1082,7 +1050,6 @@ func greedyFit(layers []uint64, gpus discover.GpuInfoList, capacity float32, req
freeSpace = uint64(float32(gpus[device].FreeMemory) * capacity)
}
}
return gpuLayers
}
@ -1814,7 +1781,7 @@ func (s *ollamaServer) VRAMByGPU(id ml.DeviceID) uint64 {
}
func (s *ollamaServer) GetDeviceInfos(ctx context.Context) []ml.DeviceInfo {
devices, err := discover.GetDevicesFromRunner(ctx, s)
devices, err := ml.GetDevicesFromRunner(ctx, s)
if err != nil {
if s.cmd != nil && s.cmd.ProcessState == nil {
// Still running but hit an error, log

View File

@ -8,7 +8,6 @@ import (
"testing"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/discover"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/ml"
"golang.org/x/sync/semaphore"
@ -20,6 +19,8 @@ func TestLLMServerFitGPU(t *testing.T) {
free int
}
minMemory := 457 * format.MebiByte
tests := []struct {
name string
gpus []gpu
@ -37,91 +38,91 @@ func TestLLMServerFitGPU(t *testing.T) {
},
{
name: "Full single GPU",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256*format.MebiByte + minMemory}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{0, 1, 2}}},
},
{
name: "Partial single GPU",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256*format.MebiByte + minMemory}},
layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1, 2}}},
},
{
name: "Single GPU with numGPU 1",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256*format.MebiByte + minMemory}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1}}},
},
{
name: "Single GPU with numGPU 0",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256*format.MebiByte + minMemory}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 0,
expected: ml.GPULayersList{},
},
{
name: "Single GPU with numGPU 999",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256*format.MebiByte + minMemory}},
layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
numGPU: 999,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{0, 1, 2, 3}}},
},
{
name: "Multi GPU fits on one",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{ID: "gpu1"}, free: 256*format.MebiByte + minMemory}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0, 1, 2}}},
},
{
name: "Multi GPU split",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{ID: "gpu1"}, free: 256*format.MebiByte + minMemory}},
layers: []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1, 2}}},
},
{
name: "Multi GPU partial",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{ID: "gpu1"}, free: 256*format.MebiByte + minMemory}},
layers: []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{1}}},
},
{
name: "Multi GPU numGPU 1",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{ID: "gpu1"}, free: 256*format.MebiByte + minMemory}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{1}}},
},
{
name: "Multi GPU numGPU 2",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{ID: "gpu1"}, free: 256*format.MebiByte + minMemory}},
layers: []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 2,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1}}},
},
{
name: "Multi GPU numGPU 999",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{ID: "gpu1"}, free: 256*format.MebiByte + minMemory}},
layers: []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
numGPU: 999,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0, 1}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{2}}},
},
{
name: "Multi GPU different libraries",
gpus: []gpu{{id: ml.DeviceID{Library: "CUDA", ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{Library: "ROCm", ID: "gpu1"}, free: 256 * format.MebiByte}},
gpus: []gpu{{id: ml.DeviceID{Library: "CUDA", ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{Library: "ROCm", ID: "gpu1"}, free: 256*format.MebiByte + minMemory}},
layers: []int{128 * format.MebiByte, 128 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1", Library: "ROCm"}, Layers: []int{0, 1}}},
},
{
name: "requireFull",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256*format.MebiByte + minMemory}},
layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
numGPU: -1,
requireFull: true,
@ -139,12 +140,12 @@ func TestLLMServerFitGPU(t *testing.T) {
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
var systemInfo discover.SystemInfo
systemInfo.System.TotalMemory = format.GibiByte
systemInfo.System.FreeMemory = 512 * format.MebiByte
systemInfo.System.FreeSwap = 256 * format.MebiByte
var systemInfo ml.SystemInfo
systemInfo.TotalMemory = format.GibiByte
systemInfo.FreeMemory = 512 * format.MebiByte
systemInfo.FreeSwap = 256 * format.MebiByte
gpus := make(discover.GpuInfoList, len(tt.gpus))
gpus := make([]ml.DeviceInfo, len(tt.gpus))
for i := range tt.gpus {
gpus[i].DeviceID = tt.gpus[i].id
gpus[i].FreeMemory = uint64(tt.gpus[i].free)

View File

@ -3,15 +3,21 @@ package ml
import (
"context"
"encoding/binary"
"encoding/json"
"fmt"
"hash/maphash"
"io"
"log/slog"
"net/http"
"runtime"
"slices"
"sort"
"strconv"
"strings"
"time"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/logutil"
)
// GPULayers is a set of layers to be allocated on a single GPU
@ -282,6 +288,20 @@ type DeviceInfo struct {
LibraryPath []string
}
type SystemInfo struct {
// ThreadCount is the optimal number of threads to use for inference
ThreadCount int `json:"threads,omitempty"`
// TotalMemory is the total amount of system memory
TotalMemory uint64 `json:"total_memory,omitempty"`
// FreeMemory is the amount of memory currently available on the system for loading models
FreeMemory uint64 `json:"free_memory,omitempty"`
// FreeSwap is the amount of system swap space reported as available
FreeSwap uint64 `json:"free_swap,omitempty"`
}
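For illustration, a minimal sketch (values invented, separate from this change) of how the json tags above serialize, with zero-valued fields such as FreeSwap omitted:

package main

import (
	"encoding/json"
	"fmt"

	"github.com/ollama/ollama/ml"
)

func main() {
	si := ml.SystemInfo{ThreadCount: 16, TotalMemory: 32 << 30, FreeMemory: 26 << 30}
	b, _ := json.Marshal(si)
	// Prints {"threads":16,"total_memory":34359738368,"free_memory":27917287424}
	fmt.Println(string(b))
}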
func (d DeviceInfo) Compute() string {
// AMD gfx is encoded into the major minor in hex form
if strings.EqualFold(d.Library, "ROCm") {
@ -294,6 +314,71 @@ func (d DeviceInfo) Driver() string {
return strconv.Itoa(d.DriverMajor) + "." + strconv.Itoa(d.DriverMinor)
}
// MinimumMemory reports the amount of memory that should be set aside
// on the device for overhead (e.g. VRAM consumed by context structures independent
// of model allocations)
func (d DeviceInfo) MinimumMemory() uint64 {
if d.Library == "Metal" {
return 512 * format.MebiByte
}
return 457 * format.MebiByte
}
// ByFreeMemory sorts devices by free memory.
// iGPUs are reported first, thus Reverse() yields the largest discrete GPU first
type ByFreeMemory []DeviceInfo
func (a ByFreeMemory) Len() int { return len(a) }
func (a ByFreeMemory) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a ByFreeMemory) Less(i, j int) bool {
if a[i].Integrated && !a[j].Integrated {
return true
} else if !a[i].Integrated && a[j].Integrated {
return false
}
return a[i].FreeMemory < a[j].FreeMemory
}
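For illustration, a minimal sketch (device values invented, separate from this change) of selecting the largest discrete GPU via the sort order above:

package main

import (
	"fmt"
	"sort"

	"github.com/ollama/ollama/ml"
)

func main() {
	devs := make([]ml.DeviceInfo, 3)
	devs[0].ID, devs[0].FreeMemory, devs[0].Integrated = "igpu", 8<<30, true
	devs[1].ID, devs[1].FreeMemory = "gpu0", 12<<30
	devs[2].ID, devs[2].FreeMemory = "gpu1", 24<<30
	// iGPUs sort first, so Reverse puts the discrete GPU with the most free memory at index 0
	sort.Sort(sort.Reverse(ml.ByFreeMemory(devs)))
	fmt.Println(devs[0].ID) // gpu1
}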
// ByLibrary groups devices by their Library name, preserving discovery order
func ByLibrary(l []DeviceInfo) [][]DeviceInfo {
resp := [][]DeviceInfo{}
libs := []string{}
for _, info := range l {
found := false
requested := info.Library
for i, lib := range libs {
if lib == requested {
resp[i] = append(resp[i], info)
found = true
break
}
}
if !found {
libs = append(libs, requested)
resp = append(resp, []DeviceInfo{info})
}
}
return resp
}
// LibraryPaths returns the de-duplicated union of LibraryPath directories across the given devices
func LibraryPaths(l []DeviceInfo) []string {
var gpuLibs []string
for _, gpu := range l {
for _, dir := range gpu.LibraryPath {
needed := true
for _, existing := range gpuLibs {
if dir == existing {
needed = false
break
}
}
if needed {
gpuLibs = append(gpuLibs, dir)
}
}
}
return gpuLibs
}
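For illustration, a minimal sketch (directories invented, separate from this change) of joining the de-duplicated paths into a loader search path for a runner process:

package main

import (
	"fmt"
	"path/filepath"
	"strings"

	"github.com/ollama/ollama/ml"
)

func main() {
	devs := make([]ml.DeviceInfo, 2)
	devs[0].LibraryPath = []string{"/usr/lib/ollama", "/usr/lib/ollama/cuda_v12"}
	devs[1].LibraryPath = []string{"/usr/lib/ollama", "/usr/lib/ollama/rocm"}
	// Shared directories appear once; order of first appearance is preserved
	fmt.Println(strings.Join(ml.LibraryPaths(devs), string(filepath.ListSeparator)))
	// /usr/lib/ollama:/usr/lib/ollama/cuda_v12:/usr/lib/ollama/rocm (on Linux)
}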
type DeviceComparison int
const (
@ -336,3 +421,133 @@ func (a DeviceInfo) IsBetter(b DeviceInfo) bool {
sort.Sort(sort.Reverse(sort.StringSlice(cmp)))
return cmp[0] == bLibSplit[1]
}
// FlashAttentionSupported reports whether every device in the list supports flash attention
func FlashAttentionSupported(l []DeviceInfo) bool {
for _, gpu := range l {
supportsFA := gpu.Library == "cpu" ||
gpu.Name == "Metal" || gpu.Library == "Metal" ||
(gpu.Library == "CUDA" && gpu.DriverMajor >= 7 && !(gpu.ComputeMajor == 7 && gpu.ComputeMinor == 2)) ||
gpu.Library == "ROCm"
if !supportsFA {
return false
}
}
return true
}
// Given the list of GPUs this instantiation is targeted for,
// figure out the visible devices environment variables
func GetVisibleDevicesEnv(l []DeviceInfo) map[string]string {
if len(l) == 0 {
return nil
}
env := map[string]string{}
for _, d := range l {
d.updateVisibleDevicesEnv(env)
}
return env
}
func (d DeviceInfo) updateVisibleDevicesEnv(env map[string]string) {
var envVar string
switch d.Library {
case "ROCm":
envVar = "ROCR_VISIBLE_DEVICES"
if runtime.GOOS != "linux" {
envVar = "HIP_VISIBLE_DEVICES"
}
case "Vulkan":
envVar = "GGML_VK_VISIBLE_DEVICES"
default:
return
}
v, existing := env[envVar]
if existing {
v = v + ","
}
if d.FilteredID != "" {
v = v + d.FilteredID
} else {
v = v + d.ID
}
env[envVar] = v
}
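For illustration, a minimal sketch (device IDs invented, separate from this change) of the environment a caller would hand to a runner process:

package main

import (
	"fmt"

	"github.com/ollama/ollama/ml"
)

func main() {
	devs := make([]ml.DeviceInfo, 2)
	devs[0].Library, devs[0].ID = "ROCm", "0"
	devs[1].Library, devs[1].ID = "ROCm", "2"
	devs[1].FilteredID = "1" // remapped index after filtering
	// On Linux this prints map[ROCR_VISIBLE_DEVICES:0,1]; each entry becomes a
	// KEY=VALUE pair in the runner's environment
	fmt.Println(ml.GetVisibleDevicesEnv(devs))
}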
type BaseRunner interface {
// GetPort returns the localhost port number the runner is running on
GetPort() int
// HasExited indicates if the runner is no longer running. This can be used during
// bootstrap to detect if a given filtered device is incompatible and triggered an assert
HasExited() bool
}
type RunnerDiscovery interface {
BaseRunner
// GetDeviceInfos will perform a query of the underlying device libraries
// for device identification and free VRAM information
// During bootstrap scenarios, this routine may take seconds to complete
GetDeviceInfos(ctx context.Context) []DeviceInfo
}
type FilteredRunnerDiscovery interface {
RunnerDiscovery
// GetActiveDeviceIDs returns the filtered set of devices actively in
// use by this runner for running models. If the runner is a bootstrap runner, no devices
// will be active yet so no device IDs are returned.
// This routine will not query the underlying device and will return immediately
GetActiveDeviceIDs() []DeviceID
}
func GetDevicesFromRunner(ctx context.Context, runner BaseRunner) ([]DeviceInfo, error) {
var moreDevices []DeviceInfo
port := runner.GetPort()
tick := time.Tick(10 * time.Millisecond)
for {
select {
case <-ctx.Done():
return nil, fmt.Errorf("failed to finish discovery before timeout")
case <-tick:
r, err := http.NewRequestWithContext(ctx, http.MethodGet, fmt.Sprintf("http://127.0.0.1:%d/info", port), nil)
if err != nil {
return nil, fmt.Errorf("failed to create request: %w", err)
}
r.Header.Set("Content-Type", "application/json")
resp, err := http.DefaultClient.Do(r)
if err != nil {
// slog.Warn("failed to send request", "error", err)
if runner.HasExited() {
return nil, fmt.Errorf("runner crashed")
}
continue
}
defer resp.Body.Close()
if resp.StatusCode == http.StatusNotFound {
// old runner, fall back to bootstrapping model
return nil, fmt.Errorf("llamarunner free vram reporting not supported")
}
body, err := io.ReadAll(resp.Body)
if err != nil {
slog.Warn("failed to read response", "error", err)
continue
}
if resp.StatusCode != 200 {
logutil.Trace("runner failed to discover free VRAM", "status", resp.StatusCode, "response", body)
return nil, fmt.Errorf("runner error: %s", string(body))
}
if err := json.Unmarshal(body, &moreDevices); err != nil {
slog.Warn("unmarshal encode response", "error", err)
continue
}
return moreDevices, nil
}
}
}
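For illustration, a hedged usage sketch (the stub runner and port are invented, separate from this change) showing how a caller bounds discovery with a context deadline:

package main

import (
	"context"
	"log/slog"
	"time"

	"github.com/ollama/ollama/ml"
)

// stubRunner is a hypothetical BaseRunner used only for this sketch
type stubRunner struct{ port int }

func (s stubRunner) GetPort() int    { return s.port }
func (s stubRunner) HasExited() bool { return false }

func main() {
	// Bound discovery so a wedged runner cannot stall startup indefinitely
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	devs, err := ml.GetDevicesFromRunner(ctx, stubRunner{port: 12345})
	if err != nil {
		slog.Warn("device discovery failed", "error", err)
		return
	}
	slog.Info("discovered devices", "count", len(devs))
}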

View File

@ -84,11 +84,11 @@ function buildCPU() {
Remove-Item -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}"
New-Item "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ItemType Directory -ea 0
& cmake --fresh --preset CPU --install-prefix $script:DIST_DIR
& cmake -B build\cpu --preset CPU --install-prefix $script:DIST_DIR
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --build --preset CPU --config Release --parallel $script:JOBS
& cmake --build build\cpu --target ggml-cpu --config Release --parallel $script:JOBS
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --install build --component CPU --strip
& cmake --install build\cpu --component CPU --strip
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
}
@ -105,11 +105,11 @@ function buildCUDA11() {
$hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V11")) { $x=$hashEnv[$_]; if (test-path -literalpath "$x\bin\nvcc.exe" ) { $cuda=$x} }}
write-host "Building CUDA v11 backend libraries $cuda"
$env:CUDAToolkit_ROOT=$cuda
& cmake --fresh --preset "CUDA 11" -T cuda="$cuda" -DCMAKE_CUDA_COMPILER="$cuda\bin\nvcc.exe" -G "Visual Studio 16 2019" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v11"
& cmake -B build\cuda_v11 --preset "CUDA 11" -T cuda="$cuda" -DCMAKE_CUDA_COMPILER="$cuda\bin\nvcc.exe" -G "Visual Studio 16 2019" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v11"
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --build --preset "CUDA 11" --config Release --parallel $script:JOBS
& cmake --build build\cuda_v11 --target ggml-cuda --config Release --parallel $script:JOBS
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --install build --component "CUDA" --strip
& cmake --install build\cuda_v11 --component "CUDA" --strip
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
}
@ -124,11 +124,11 @@ function buildCUDA12() {
$hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V12_8")) { $x=$hashEnv[$_]; if (test-path -literalpath "$x\bin\nvcc.exe" ) { $cuda=$x} }}
write-host "Building CUDA v12 backend libraries $cuda"
$env:CUDAToolkit_ROOT=$cuda
& cmake --fresh --preset "CUDA 12" -T cuda="$cuda" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v12"
& cmake -B build\cuda_v12 --preset "CUDA 12" -T cuda="$cuda" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v12"
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --build --preset "CUDA 12" --config Release --parallel $script:JOBS
& cmake --build build\cuda_v12 --target ggml-cuda --config Release --parallel $script:JOBS
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --install build --component "CUDA" --strip
& cmake --install build\cuda_v12 --component "CUDA" --strip
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
}
@ -143,11 +143,11 @@ function buildCUDA13() {
$hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V13")) { $x=$hashEnv[$_]; if (test-path -literalpath "$x\bin\nvcc.exe" ) { $cuda=$x} }}
$env:CUDAToolkit_ROOT=$cuda
write-host "Building CUDA v13 backend libraries $cuda"
& cmake --fresh --preset "CUDA 13" -T cuda="$cuda" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v13"
& cmake -B build\cuda_v13 --preset "CUDA 13" -T cuda="$cuda" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v13"
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --build --preset "CUDA 13" --config Release --parallel $script:JOBS
& cmake --build build\cuda_v13 --target ggml-cuda --config Release --parallel $script:JOBS
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --install build --component "CUDA" --strip
& cmake --install build\cuda_v13 --component "CUDA" --strip
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
}
@ -165,7 +165,7 @@ function buildROCm() {
$env:HIPCXX="${env:HIP_PATH}\bin\clang++.exe"
$env:HIP_PLATFORM="amd"
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
& cmake --fresh --preset "ROCm 6" -G Ninja -DOLLAMA_RUNNER_DIR="rocm" `
& cmake --fresh -B build\rocm --preset "ROCm 6" -G Ninja -DOLLAMA_RUNNER_DIR="rocm" `
-DCMAKE_C_COMPILER=clang `
-DCMAKE_CXX_COMPILER=clang++ `
-DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" `
@ -175,9 +175,9 @@ function buildROCm() {
$env:HIPCXX=""
$env:HIP_PLATFORM=""
$env:CMAKE_PREFIX_PATH=""
& cmake --build --preset "ROCm 6" --config Release --parallel $script:JOBS
& cmake --build build\rocm --target ggml-hip --config Release --parallel $script:JOBS
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --install build --component "HIP" --strip
& cmake --install build\rocm --component "HIP" --strip
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
Remove-Item -Path $script:DIST_DIR\lib\ollama\rocm\rocblas\library\*gfx906* -ErrorAction SilentlyContinue
}

View File

@ -9,9 +9,9 @@ import (
"github.com/gin-gonic/gin"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/discover"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/ml"
)
func TestGenerateDebugRenderOnly(t *testing.T) {
@ -37,9 +37,9 @@ func TestGenerateDebugRenderOnly(t *testing.T) {
loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(&mock),
getGpuFn: getGpuFn,
getCpuFn: getCpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
// add small delay to simulate loading
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{
@ -230,9 +230,9 @@ func TestChatDebugRenderOnly(t *testing.T) {
loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(&mock),
getGpuFn: getGpuFn,
getCpuFn: getCpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
// add small delay to simulate loading
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{

View File

@ -12,9 +12,9 @@ import (
"github.com/google/go-cmp/cmp"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/discover"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/ml"
)
// TestGenerateWithBuiltinRenderer tests that api/generate uses built-in renderers
@ -42,9 +42,9 @@ func TestGenerateWithBuiltinRenderer(t *testing.T) {
loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(&mock),
getGpuFn: getGpuFn,
getCpuFn: getCpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{
llama: &mock,
@ -226,9 +226,9 @@ func TestGenerateWithDebugRenderOnly(t *testing.T) {
loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(&mock),
getGpuFn: getGpuFn,
getCpuFn: getCpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{
llama: &mock,

View File

@ -17,9 +17,9 @@ import (
"github.com/google/go-cmp/cmp"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/discover"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/ml"
)
type mockRunner struct {
@ -48,8 +48,8 @@ func (mockRunner) Tokenize(_ context.Context, s string) (tokens []int, err error
return
}
func newMockServer(mock *mockRunner) func(discover.GpuInfoList, string, *ggml.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
return func(_ discover.GpuInfoList, _ string, _ *ggml.GGML, _, _ []string, _ api.Options, _ int) (llm.LlamaServer, error) {
func newMockServer(mock *mockRunner) func(ml.SystemInfo, []ml.DeviceInfo, string, *ggml.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
return func(_ ml.SystemInfo, _ []ml.DeviceInfo, _ string, _ *ggml.GGML, _, _ []string, _ api.Options, _ int) (llm.LlamaServer, error) {
return mock, nil
}
}
@ -157,9 +157,9 @@ func TestGenerateChat(t *testing.T) {
loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(&mock),
getGpuFn: getGpuFn,
getCpuFn: getCpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
// add small delay to simulate loading
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{
@ -768,9 +768,9 @@ func TestGenerate(t *testing.T) {
loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(&mock),
getGpuFn: getGpuFn,
getCpuFn: getCpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
// add small delay to simulate loading
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{
@ -1193,9 +1193,9 @@ func TestChatWithPromptEndingInThinkTag(t *testing.T) {
loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(mock),
getGpuFn: getGpuFn,
getCpuFn: getCpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{llama: mock}
return false

View File

@ -14,9 +14,9 @@ import (
"github.com/gin-gonic/gin"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/discover"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/ml"
)
func getTestTools() []api.Tool {
@ -275,9 +275,9 @@ func TestChatHarmonyParserStreamingRealtime(t *testing.T) {
loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(&mock),
getGpuFn: getGpuFn,
getCpuFn: getCpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 100 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
req.successCh <- &runnerRef{
llama: &mock,
}
@ -426,9 +426,9 @@ func TestChatHarmonyParserStreamingSimple(t *testing.T) {
loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(&mock),
getGpuFn: getGpuFn,
getCpuFn: getCpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 100 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
req.successCh <- &runnerRef{
llama: &mock,
}
@ -608,9 +608,9 @@ func TestChatHarmonyParserStreaming(t *testing.T) {
loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(&mock),
getGpuFn: getGpuFn,
getCpuFn: getCpuFn,
getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
req.successCh <- &runnerRef{
llama: &mock,
}

View File

@ -5,12 +5,9 @@ import (
"errors"
"fmt"
"log/slog"
"os"
"reflect"
"runtime"
"slices"
"sort"
"strconv"
"strings"
"sync"
"time"
@ -52,12 +49,10 @@ type Scheduler struct {
activeLoading llm.LlamaServer
loaded map[string]*runnerRef
loadFn func(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, requireFull bool) bool
newServerFn func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
getGpuFn func(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList
getCpuFn func() discover.GpuInfo
// waitForRecovery sets the limit for how long to wait for memory usage to recover after unload before scheduling the next model
loadFn func(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool
newServerFn func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
getGpuFn func(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo
getSystemInfoFn func() ml.SystemInfo
waitForRecovery time.Duration
}
@ -77,8 +72,8 @@ func InitScheduler(ctx context.Context) *Scheduler {
unloadedCh: make(chan any, maxQueue),
loaded: make(map[string]*runnerRef),
newServerFn: llm.NewLlamaServer,
getGpuFn: discover.GetGPUInfo,
getCpuFn: discover.GetCPUInfo,
getGpuFn: discover.GPUDevices,
getSystemInfoFn: discover.GetSystemInfo,
waitForRecovery: 5 * time.Second,
}
sched.loadFn = sched.load
@ -133,6 +128,8 @@ func (s *Scheduler) Run(ctx context.Context) {
}
func (s *Scheduler) processPending(ctx context.Context) {
maxRunners := envconfig.MaxRunners()
for {
select {
case <-ctx.Done():
@ -152,7 +149,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
s.loadedMu.Lock()
runner := s.loaded[pending.model.ModelPath]
loadedCount := len(s.loaded)
runnersSnapshot := make([]discover.FilteredRunnerDiscovery, 0, len(s.loaded))
runnersSnapshot := make([]ml.FilteredRunnerDiscovery, 0, len(s.loaded))
for _, r := range s.loaded {
runnersSnapshot = append(runnersSnapshot, r)
}
@ -167,39 +164,29 @@ func (s *Scheduler) processPending(ctx context.Context) {
pending.useLoadedRunner(runner, s.finishedReqCh)
break
}
} else if envconfig.MaxRunners() > 0 && loadedCount >= int(envconfig.MaxRunners()) {
} else if maxRunners > 0 && loadedCount >= int(maxRunners) {
slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount)
runnerToExpire = s.findRunnerToUnload()
} else {
// Either no models are loaded or below envconfig.MaxRunners
// Get a refreshed GPU list
var gpus discover.GpuInfoList
var gpus []ml.DeviceInfo
if pending.opts.NumGPU == 0 {
gpus = discover.GpuInfoList{s.getCpuFn()}
gpus = []ml.DeviceInfo{}
} else {
gpus = s.getGpuFn(ctx, runnersSnapshot)
}
if envconfig.MaxRunners() <= 0 {
// No user specified MaxRunners, so figure out what automatic setting to use
// If all GPUs have reliable free memory reporting, defaultModelsPerGPU * the number of GPUs
// if any GPU has unreliable free memory reporting, 1x the number of GPUs
allReliable := true
for _, gpu := range gpus {
if gpu.UnreliableFreeMemory {
allReliable = false
break
}
}
if allReliable {
// HACK
os.Setenv("OLLAMA_MAX_LOADED_MODELS", strconv.Itoa(defaultModelsPerGPU*len(gpus)))
slog.Debug("updating default concurrency", "OLLAMA_MAX_LOADED_MODELS", envconfig.MaxRunners(), "gpu_count", len(gpus))
systemInfo := s.getSystemInfoFn()
if maxRunners <= 0 {
// No user specified MaxRunners, so figure out what automatic setting to use for the next load attempt
if pending.opts.NumGPU == 0 {
// Need to get actual GPU list to set the correct default max models
g := s.getGpuFn(ctx, runnersSnapshot)
maxRunners = uint(defaultModelsPerGPU * max(len(g), 1))
} else {
// HACK
os.Setenv("OLLAMA_MAX_LOADED_MODELS", strconv.Itoa(len(gpus)))
slog.Info("one or more GPUs detected that are unable to accurately report free memory - disabling default concurrency")
maxRunners = uint(defaultModelsPerGPU * max(len(gpus), 1))
}
slog.Debug("updating default concurrency", "OLLAMA_MAX_LOADED_MODELS", maxRunners, "gpu_count", len(gpus))
}
// Load model for fitting
@ -215,14 +202,14 @@ func (s *Scheduler) processPending(ctx context.Context) {
if loadedCount == 0 {
// No models loaded. Load the model but prefer the best fit.
slog.Debug("loading first model", "model", pending.model.ModelPath)
s.loadFn(pending, ggml, gpus, false)
s.loadFn(pending, ggml, systemInfo, gpus, false)
break
}
// More than one loaded model, so we have to see if the
// new one fits
needEvict := s.loadFn(pending, ggml, gpus, true)
needEvict := s.loadFn(pending, ggml, systemInfo, gpus, true)
if !needEvict {
slog.Debug("new model fits with existing models, loading")
break
@ -353,7 +340,7 @@ func (s *Scheduler) processCompleted(ctx context.Context) {
runner.refMu.Unlock()
} else {
slog.Debug("starting background wait for VRAM recovery", "runner", runner)
runnersSnapshot := make([]discover.FilteredRunnerDiscovery, 0, len(s.loaded))
runnersSnapshot := make([]ml.FilteredRunnerDiscovery, 0, len(s.loaded))
for _, r := range s.loaded {
runnersSnapshot = append(runnersSnapshot, r)
}
@ -395,7 +382,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
// load creates a new model based on req and loads it. If requireFull is true then the model must be loaded fully onto GPUs
// (if any). Returns whether the scheduler needs to evict a model to make this one fit.
func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, requireFull bool) bool {
func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool {
numParallel := max(int(envconfig.NumParallel()), 1)
// Embedding models should always be loaded with parallel=1
@ -420,7 +407,7 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
if llama == nil {
var err error
llama, err = s.newServerFn(gpus, req.model.ModelPath, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
llama, err = s.newServerFn(systemInfo, gpus, req.model.ModelPath, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
if err != nil {
// some older models are not compatible with newer versions of llama.cpp
// show a generalized compatibility error until there is a better way to
@ -443,9 +430,16 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
s.loadedMu.Unlock()
gpuIDs, err := llama.Load(req.ctx, gpus, requireFull)
gpuIDs, err := llama.Load(req.ctx, systemInfo, gpus, requireFull)
if err != nil {
if errors.Is(err, llm.ErrLoadRequiredFull) {
if !requireFull {
// No other models loaded, yet we still don't fit, so report an error
slog.Info("model is too large for system memory", "requireFull", requireFull)
s.activeLoading.Close()
s.activeLoading = nil
req.errCh <- err
}
return true
}
@ -456,6 +450,20 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
return false
}
// Determine if we have discrete GPUs which we should monitor VRAM usage on during shutdown
discreteGPUs := false
iGPUScan:
for _, devid := range gpuIDs {
for _, dev := range gpus {
if dev.DeviceID == devid {
if !dev.Integrated {
discreteGPUs = true
break iGPUScan
}
}
}
}
runner := &runnerRef{
model: req.model,
modelPath: req.model.ModelPath,
@ -463,6 +471,7 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
Options: &req.opts,
sessionDuration: sessionDuration,
gpus: gpuIDs,
discreteGPUs: discreteGPUs,
vramSize: llama.VRAMSize(),
totalSize: llama.TotalSize(),
loading: true,
@ -510,7 +519,10 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
return false
}
func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) {
func (s *Scheduler) updateFreeSpace(allGpus []ml.DeviceInfo) {
if len(allGpus) == 0 {
return
}
predMap := map[ml.DeviceID]uint64{} // Sum up the total predicted usage per GPU for all runners
s.loadedMu.Lock()
runners := make([]*runnerRef, 0, len(s.loaded))
@ -558,6 +570,7 @@ type runnerRef struct {
pid int
loading bool // True only during initial load, then false forever
gpus []ml.DeviceID // Recorded at time of provisioning
discreteGPUs bool // True if all devices are discrete GPUs - used to skip VRAM recovery check for iGPUs
vramSize uint64
totalSize uint64
@ -627,14 +640,12 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool
// a before and after GPU memory allocation. The returned channel
// will be notified when we're done waiting, or have timed out and should
// proceed anyway
func (s *Scheduler) waitForVRAMRecovery(runner *runnerRef, runners []discover.FilteredRunnerDiscovery) chan any {
func (s *Scheduler) waitForVRAMRecovery(runner *runnerRef, runners []ml.FilteredRunnerDiscovery) chan any {
finished := make(chan any, 1)
// CPU or Metal don't need checking, so no waiting required
// windows can page VRAM, only cuda currently can report accurate used vram usage
if len(runner.gpus) == 0 ||
(len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "Metal")) ||
(runtime.GOOS == "windows" && runner.gpus[0].Library != "CUDA") {
// CPU, Metal and iGPUs don't need checking, so no waiting required
if len(runner.gpus) == 0 || !runner.discreteGPUs ||
(len(runner.gpus) == 1 && runner.gpus[0].Library == "Metal") {
finished <- struct{}{}
slog.Debug("no need to wait for VRAM recovery", "runner", runner)
return finished
@ -668,7 +679,11 @@ func (s *Scheduler) waitForVRAMRecovery(runner *runnerRef, runners []discover.Fi
totalMemoryNow += gpu.TotalMemory
freeMemoryNow += gpu.FreeMemory
}
logutil.Trace("gpu VRAM convergence", "percent", int(max(float32(freeMemoryNow-freeMemoryBefore), 0.0)/float32(runner.vramSize)*100))
if freeMemoryNow > freeMemoryBefore {
logutil.Trace("gpu VRAM convergence", "percent", int(float32(freeMemoryNow-freeMemoryBefore)/float32(runner.vramSize)*100))
} else {
logutil.Trace("gpu VRAM convergence", "percent", 0)
}
// If we're within ~75% of the estimated memory usage recovered, bail out
if float32(freeMemoryNow-freeMemoryBefore) > float32(runner.vramSize)*0.75 {
slog.Debug(fmt.Sprintf("gpu VRAM free memory converged after %0.2f seconds", time.Since(start).Seconds()), "free_before", format.HumanBytes2(freeMemoryBefore), "free_now", format.HumanBytes2(freeMemoryNow), "runner", runner)

View File

@ -13,7 +13,6 @@ import (
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/app/lifecycle"
"github.com/ollama/ollama/discover"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/llm"
@ -50,11 +49,12 @@ func TestSchedLoad(t *testing.T) {
sessionDuration: &api.Duration{Duration: 2 * time.Second},
}
// Fail to load model first
s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
return nil, errors.New("something failed to load model blah")
}
gpus := discover.GpuInfoList{}
s.load(req, f, gpus, false)
gpus := []ml.DeviceInfo{}
systemInfo := ml.SystemInfo{}
s.load(req, f, systemInfo, gpus, false)
require.Empty(t, req.successCh)
require.Len(t, req.errCh, 1)
s.loadedMu.Lock()
@ -64,11 +64,11 @@ func TestSchedLoad(t *testing.T) {
require.Contains(t, err.Error(), "this model may be incompatible")
server := &mockLlm{vramSize: 10, vramByGPU: map[ml.DeviceID]uint64{}}
s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
server.modelPath = model
return server, nil
}
s.load(req, f, gpus, false)
s.load(req, f, systemInfo, gpus, false)
select {
case err := <-req.errCh:
require.NoError(t, err)
@ -82,7 +82,7 @@ func TestSchedLoad(t *testing.T) {
req.model.ModelPath = "dummy_model_path"
server.waitResp = errors.New("wait failure")
s.load(req, f, gpus, false)
s.load(req, f, systemInfo, gpus, false)
select {
case err := <-req.errCh:
require.Contains(t, err.Error(), "wait failure")
@ -106,7 +106,7 @@ type reqBundle struct {
f *ggml.GGML
}
func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
func (scenario *reqBundle) newServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
scenario.srv.modelPath = model
return scenario.srv, nil
}
@ -152,20 +152,20 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, vra
return b
}
func getGpuFn(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList {
func getGpuFn(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo {
slog.Info("test getGpuFn called", "runners", runners)
g := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "metal"}}
g := ml.DeviceInfo{DeviceID: ml.DeviceID{Library: "Metal"}}
g.TotalMemory = 24 * format.GigaByte
g.FreeMemory = 12 * format.GigaByte
return []discover.GpuInfo{g}
return []ml.DeviceInfo{g}
}
func getCpuFn() discover.GpuInfo {
slog.Info("test getCpuFn called")
g := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "cpu"}}
g.TotalMemory = 32 * format.GigaByte
g.FreeMemory = 26 * format.GigaByte
return g
func getSystemInfoFn() ml.SystemInfo {
slog.Info("test getSystemInfoFn called")
return ml.SystemInfo{
TotalMemory: 32 * format.GigaByte,
FreeMemory: 26 * format.GigaByte,
}
}
func TestSchedRequestsSameModelSameRequest(t *testing.T) {
@ -174,7 +174,7 @@ func TestSchedRequestsSameModelSameRequest(t *testing.T) {
s := InitScheduler(ctx)
s.waitForRecovery = 10 * time.Millisecond
s.getGpuFn = getGpuFn
s.getCpuFn = getCpuFn
s.getSystemInfoFn = getSystemInfoFn
a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond}, nil)
b := newScenarioRequest(t, ctx, "ollama-model-1", 11, &api.Duration{Duration: 0}, nil)
b.req.model = a.req.model
@ -218,7 +218,7 @@ func TestSchedRequestsSimpleReloadSameModel(t *testing.T) {
s := InitScheduler(ctx)
s.waitForRecovery = 10 * time.Millisecond
s.getGpuFn = getGpuFn
s.getCpuFn = getCpuFn
s.getSystemInfoFn = getSystemInfoFn
a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond}, nil)
b := newScenarioRequest(t, ctx, "ollama-model-1", 20, &api.Duration{Duration: 5 * time.Millisecond}, nil)
tmpModel := *a.req.model
@ -251,12 +251,12 @@ func TestSchedRequestsSimpleReloadSameModel(t *testing.T) {
a.ctxDone()
// Report recovered VRAM usage
time.Sleep(1 * time.Millisecond)
s.getGpuFn = func(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList {
slog.Info("XXX altered getGpuFn called")
g := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "metal"}}
s.getGpuFn = func(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo {
slog.Info("altered getGpuFn called")
g := ml.DeviceInfo{DeviceID: ml.DeviceID{Library: "Metal"}}
g.TotalMemory = 24 * format.GigaByte
g.FreeMemory = 24 * format.GigaByte
return []discover.GpuInfo{g}
return []ml.DeviceInfo{g}
}
select {
case resp := <-b.req.successCh:
@ -271,26 +271,26 @@ func TestSchedRequestsSimpleReloadSameModel(t *testing.T) {
}
func TestSchedRequestsMultipleLoadedModels(t *testing.T) {
ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
slog.Info("TestRequestsMultipleLoadedModels")
ctx, done := context.WithTimeout(t.Context(), 1000*time.Millisecond)
defer done()
s := InitScheduler(ctx)
s.waitForRecovery = 10 * time.Millisecond
s.getGpuFn = getGpuFn // 1 metal GPU
s.getCpuFn = getCpuFn // 1 CPU
s.getGpuFn = getGpuFn // 1 Metal GPU
s.getSystemInfoFn = getSystemInfoFn
// Multiple loaded models
a := newScenarioRequest(t, ctx, "model-a-1g-gpu", 1*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "metal"}: 1 * format.GigaByte})
a := newScenarioRequest(t, ctx, "model-a-1g-gpu", 1*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "Metal"}: 1 * format.GigaByte})
a.req.sessionDuration = &api.Duration{Duration: 5 * time.Millisecond}
b := newScenarioRequest(t, ctx, "model-b-10g-gpu", 10*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "metal"}: 10 * format.GigaByte})
b := newScenarioRequest(t, ctx, "model-b-10g-gpu", 10*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "Metal"}: 10 * format.GigaByte})
b.req.sessionDuration = &api.Duration{Duration: 5 * time.Millisecond}
c := newScenarioRequest(t, ctx, "model-c-10g-cpu", 10*format.GigaByte, nil, nil /* No GPU load */)
c.req.opts.NumGPU = 0 // CPU load, will be allowed
b.req.sessionDuration = &api.Duration{Duration: 10 * time.Millisecond} // longer than b to cause the scheduler to favor unloading b over c
d := newScenarioRequest(t, ctx, "model-d-10g-gpu", 13*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "metal"}: 13 * format.GigaByte}) // Needs prior unloaded
d := newScenarioRequest(t, ctx, "model-d-10g-gpu", 13*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "Metal"}: 13 * format.GigaByte}) // Needs prior unloaded
t.Setenv("OLLAMA_MAX_LOADED_MODELS", "1")
s.newServerFn = a.newServer
slog.Info("a")
slog.Info("Loading A")
s.pendingReqCh <- a.req
s.Run(ctx)
select {
@ -309,7 +309,7 @@ func TestSchedRequestsMultipleLoadedModels(t *testing.T) {
t.Setenv("OLLAMA_MAX_LOADED_MODELS", "0")
s.newServerFn = b.newServer
slog.Info("b")
slog.Info("Loading B")
s.pendingReqCh <- b.req
select {
case resp := <-b.req.successCh:
@ -327,7 +327,7 @@ func TestSchedRequestsMultipleLoadedModels(t *testing.T) {
// This is a CPU load with NumGPU = 0 so it should load
s.newServerFn = c.newServer
slog.Info("c")
slog.Info("Loading C")
s.pendingReqCh <- c.req
select {
case resp := <-c.req.successCh:
@ -337,6 +337,7 @@ func TestSchedRequestsMultipleLoadedModels(t *testing.T) {
case err := <-c.req.errCh:
t.Fatal(err.Error())
case <-ctx.Done():
slog.Info("FAIL: scheduler state", "s.loaded", s.loaded)
t.Fatal("timeout")
}
s.loadedMu.Lock()
@ -361,11 +362,11 @@ func TestSchedRequestsMultipleLoadedModels(t *testing.T) {
b.ctxDone()
// Report recovered VRAM usage so scheduler will finish waiting and unload
time.Sleep(1 * time.Millisecond)
s.getGpuFn = func(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList {
g := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "metal"}}
s.getGpuFn = func(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo {
g := ml.DeviceInfo{DeviceID: ml.DeviceID{Library: "Metal"}}
g.TotalMemory = 24 * format.GigaByte
g.FreeMemory = 24 * format.GigaByte
return []discover.GpuInfo{g}
return []ml.DeviceInfo{g}
}
select {
case resp := <-d.req.successCh:
@ -404,7 +405,7 @@ func TestSchedGetRunner(t *testing.T) {
s := InitScheduler(ctx)
s.waitForRecovery = 10 * time.Millisecond
s.getGpuFn = getGpuFn
s.getCpuFn = getCpuFn
s.getSystemInfoFn = getSystemInfoFn
s.newServerFn = a.newServer
slog.Info("a")
successCh1a, errCh1a := s.GetRunner(a.ctx, a.req.model, a.req.opts, a.req.sessionDuration)
@ -462,13 +463,14 @@ func TestSchedExpireRunner(t *testing.T) {
}
var f *ggml.GGML
gpus := discover.GpuInfoList{}
gpus := []ml.DeviceInfo{}
systemInfo := ml.SystemInfo{}
server := &mockLlm{vramSize: 10, vramByGPU: map[ml.DeviceID]uint64{}}
s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
server.modelPath = model
return server, nil
}
s.load(req, f, gpus, false)
s.load(req, f, systemInfo, gpus, false)
select {
case err := <-req.errCh:
@ -497,19 +499,15 @@ func TestSchedExpireRunner(t *testing.T) {
// TODO - add one scenario that triggers the bogus finished event with positive ref count
func TestSchedPrematureExpired(t *testing.T) {
ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
ctx, done := context.WithTimeout(t.Context(), 1000*time.Millisecond)
defer done()
// Same model, same request
scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, nil, nil)
scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, &api.Duration{Duration: 100 * time.Millisecond}, nil)
s := InitScheduler(ctx)
s.waitForRecovery = 10 * time.Millisecond
s.getGpuFn = func(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList {
g := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "metal"}}
g.TotalMemory = 24 * format.GigaByte
g.FreeMemory = 12 * format.GigaByte
return []discover.GpuInfo{g}
}
s.getGpuFn = getGpuFn
s.getSystemInfoFn = getSystemInfoFn
s.newServerFn = scenario1a.newServer
successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
require.Len(t, s.pendingReqCh, 1)
@ -574,7 +572,7 @@ func TestSchedUseLoadedRunner(t *testing.T) {
func TestSchedUpdateFreeSpace(t *testing.T) {
ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
defer done()
gpus := discover.GpuInfoList{
gpus := []ml.DeviceInfo{
{
DeviceID: ml.DeviceID{
ID: "1",
@ -756,8 +754,12 @@ func (s *mockLlm) ModelPath() string {
return s.modelPath
}
func (s *mockLlm) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error) {
func (s *mockLlm) Load(ctx context.Context, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) ([]ml.DeviceID, error) {
if requireFull {
if len(gpus) == 0 {
slog.Info("mockLlm.Load CPU based load")
return nil, nil
}
for _, g := range gpus {
if g.FreeMemory >= s.vramSize {
return []ml.DeviceID{g.DeviceID}, nil