mirror of https://github.com/zebrajr/ollama.git (synced 2025-12-06 00:19:51 +01:00)

DRY out the runner lifecycle code (#12540)

* DRY out the runner lifecycle code
  Now that discovery uses the runners as well, this unifies the runner spawning code into a single place. This also unifies GPU discovery types with the newer ml.DeviceInfo.
* win: make incremental builds better
  Place build artifacts in discrete directories so incremental builds don't have to start fresh.
* Adjust sort order to consider iGPUs
* handle cpu inference oom scenarios
* review comments

commit 3258a89b6e (parent 1c093e97af)
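Editor's note on the refactor described above: after this commit both GPU discovery and model serving spawn the runner subprocess through a single llm.StartRunner helper (its signature appears in the llm/server.go hunk below). The following is a minimal sketch of the discovery-side call shape only; the function name discoverySpawn, the package name, and the library-directory/env values are illustrative placeholders, not code from the commit.

package discoverysketch

import (
	"log/slog"
	"os"

	"github.com/ollama/ollama/llm"
)

// discoverySpawn sketches the discovery-side use of the unified helper:
// ollama engine enabled, no model path, and per-probe env overrides such
// as GGML_CUDA_INIT (as bootstrapDevices does in the diff below).
func discoverySpawn(libDirs []string) error {
	cmd, port, err := llm.StartRunner(
		true,      // --ollama-engine
		"",        // no model: discovery-only runner
		libDirs,   // GPU library directories to place on the library path
		os.Stderr, // where runner output goes when trace logging is enabled
		map[string]string{"GGML_CUDA_INIT": "1"},
	)
	if err != nil {
		return err
	}
	defer cmd.Process.Kill()
	slog.Info("runner started for discovery", "port", port)
	// the caller then polls http://127.0.0.1:<port>/info for device info
	return nil
}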
@@ -2065,12 +2065,6 @@ power management:
       cpus := linuxCPUDetails(buf)
       slog.Info("example", "scenario", k, "cpus", cpus)
-      si := SystemInfo{
-        System: CPUInfo{
-          CPUs: cpus,
-        },
-      }
-      threadCount := si.GetOptimalThreadCount()
       if len(v.expCPUs) != len(cpus) {
         t.Fatalf("incorrect number of sockets: expected:%v got:%v", v.expCPUs, cpus)
       }
 
@@ -2085,10 +2079,6 @@ power management:
           t.Fatalf("incorrect number of threads: expected:%v got:%v", v.expCPUs[i], c)
         }
       }
-
-      if threadCount != v.expThreadCount {
-        t.Fatalf("incorrect thread count expected:%d got:%d", v.expThreadCount, threadCount)
-      }
     })
   }
 }

172  discover/gpu.go
@@ -1,16 +1,13 @@
 package discover
 
 import (
-  "context"
   "log/slog"
   "os"
-  "path/filepath"
   "regexp"
   "runtime"
   "strconv"
   "strings"
 
-  "github.com/ollama/ollama/format"
   "github.com/ollama/ollama/ml"
 )
 
@@ -18,159 +15,28 @@ import (
 // Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
 var CudaTegra string = os.Getenv("JETSON_JETPACK")
 
-func GetCPUInfo() GpuInfo {
-  mem, err := GetCPUMem()
-  if err != nil {
-    slog.Warn("error looking up system memory", "error", err)
-  }
-
-  return GpuInfo{
-    memInfo: mem,
-    DeviceID: ml.DeviceID{
-      Library: "cpu",
-      ID:      "0",
-    },
-  }
-}
-
-func GetGPUInfo(ctx context.Context, runners []FilteredRunnerDiscovery) GpuInfoList {
-  devs := GPUDevices(ctx, runners)
-  return devInfoToInfoList(devs)
-}
-
-func devInfoToInfoList(devs []ml.DeviceInfo) GpuInfoList {
-  resp := []GpuInfo{}
-  // Our current packaging model places ggml-hip in the main directory
-  // but keeps rocm in an isolated directory. We have to add it to
-  // the [LD_LIBRARY_]PATH so ggml-hip will load properly
-  rocmDir := filepath.Join(LibOllamaPath, "rocm")
-  if _, err := os.Stat(rocmDir); err != nil {
-    rocmDir = ""
-  }
-
-  for _, dev := range devs {
-    info := GpuInfo{
-      DeviceID: dev.DeviceID,
-      filterID: dev.FilteredID,
-      Name:     dev.Description,
-      memInfo: memInfo{
-        TotalMemory: dev.TotalMemory,
-        FreeMemory:  dev.FreeMemory,
-      },
-      // TODO can we avoid variant
-      DependencyPath: dev.LibraryPath,
-      DriverMajor:    dev.DriverMajor,
-      DriverMinor:    dev.DriverMinor,
-      ComputeMajor:   dev.ComputeMajor,
-      ComputeMinor:   dev.ComputeMinor,
-    }
-    if dev.Library == "CUDA" || dev.Library == "ROCm" {
-      info.MinimumMemory = 457 * format.MebiByte
-    }
-    if dev.Library == "ROCm" && rocmDir != "" {
-      info.DependencyPath = append(info.DependencyPath, rocmDir)
-    }
-    // TODO any special processing of Vulkan devices?
-    resp = append(resp, info)
-  }
-  if len(resp) == 0 {
-    mem, err := GetCPUMem()
-    if err != nil {
-      slog.Warn("error looking up system memory", "error", err)
-    }
-
-    resp = append(resp, GpuInfo{
-      memInfo: mem,
-      DeviceID: ml.DeviceID{
-        Library: "cpu",
-        ID:      "0",
-      },
-    })
-  }
-  return resp
-}
-
-// Given the list of GPUs this instantiation is targeted for,
-// figure out the visible devices environment variable
-//
-// If different libraries are detected, the first one is what we use
-func (l GpuInfoList) GetVisibleDevicesEnv() []string {
-  if len(l) == 0 {
-    return nil
-  }
-  res := []string{}
-  envVar := rocmGetVisibleDevicesEnv(l)
-  if envVar != "" {
-    res = append(res, envVar)
-  }
-  envVar = vkGetVisibleDevicesEnv(l)
-  if envVar != "" {
-    res = append(res, envVar)
-  }
-  return res
-}
-
-func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) string {
-  ids := []string{}
-  for _, info := range gpuInfo {
-    if info.Library != "ROCm" {
-      continue
-    }
-    // If the devices requires a numeric ID, for filtering purposes, we use the unfiltered ID number
-    if info.filterID != "" {
-      ids = append(ids, info.filterID)
-    } else {
-      ids = append(ids, info.ID)
-    }
-  }
-  if len(ids) == 0 {
-    return ""
-  }
-  envVar := "ROCR_VISIBLE_DEVICES="
-  if runtime.GOOS != "linux" {
-    envVar = "HIP_VISIBLE_DEVICES="
-  }
-  // There are 3 potential env vars to use to select GPUs.
-  // ROCR_VISIBLE_DEVICES supports UUID or numeric but does not work on Windows
-  // HIP_VISIBLE_DEVICES supports numeric IDs only
-  // GPU_DEVICE_ORDINAL supports numeric IDs only
-  return envVar + strings.Join(ids, ",")
-}
-
-func vkGetVisibleDevicesEnv(gpuInfo []GpuInfo) string {
-  ids := []string{}
-  for _, info := range gpuInfo {
-    if info.Library != "Vulkan" {
-      continue
-    }
-    if info.filterID != "" {
-      ids = append(ids, info.filterID)
-    } else {
-      ids = append(ids, info.ID)
-    }
-  }
-  if len(ids) == 0 {
-    return ""
-  }
-  envVar := "GGML_VK_VISIBLE_DEVICES="
-  return envVar + strings.Join(ids, ",")
-}
-
 // GetSystemInfo returns the last cached state of the GPUs on the system
-func GetSystemInfo() SystemInfo {
-  deviceMu.Lock()
-  defer deviceMu.Unlock()
-  gpus := devInfoToInfoList(devices)
-  if len(gpus) == 1 && gpus[0].Library == "cpu" {
-    gpus = []GpuInfo{}
+func GetSystemInfo() ml.SystemInfo {
+  memInfo, err := GetCPUMem()
+  if err != nil {
+    slog.Warn("error looking up system memory", "error", err)
+  }
+  var threadCount int
+  cpus := GetCPUDetails()
+  for _, c := range cpus {
+    threadCount += c.CoreCount - c.EfficiencyCoreCount
   }
 
-  return SystemInfo{
-    System: CPUInfo{
-      CPUs:    GetCPUDetails(),
-      GpuInfo: GetCPUInfo(),
-    },
-    GPUs: gpus,
+  if threadCount == 0 {
+    // Fall back to Go's num CPU
+    threadCount = runtime.NumCPU()
+  }
+
+  return ml.SystemInfo{
+    ThreadCount: threadCount,
+    TotalMemory: memInfo.TotalMemory,
+    FreeMemory:  memInfo.FreeMemory,
+    FreeSwap:    memInfo.FreeSwap,
   }
 }
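Editor's note: GetSystemInfo above now returns an ml.SystemInfo value directly (thread count plus host memory) instead of the old SystemInfo{System: CPUInfo{...}, GPUs: ...} shape. A minimal sketch of how a caller might consume it follows; the field names and helpers are taken from the hunks in this commit, while the program wrapper itself is illustrative.

package main

import (
	"fmt"

	"github.com/ollama/ollama/discover"
	"github.com/ollama/ollama/format"
)

func main() {
	// ThreadCount already excludes efficiency cores and falls back to
	// runtime.NumCPU() when core details are unavailable.
	si := discover.GetSystemInfo()
	fmt.Println("threads:", si.ThreadCount)
	fmt.Println("free memory:", format.HumanBytes2(si.FreeMemory))
	fmt.Println("free swap:", format.HumanBytes2(si.FreeSwap))
}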
@@ -4,13 +4,8 @@ package discover
 
 import (
   "context"
-  "encoding/json"
-  "fmt"
   "io"
   "log/slog"
-  "math/rand"
-  "net"
-  "net/http"
   "os"
   "os/exec"
   "path/filepath"
@@ -23,6 +18,7 @@ import (
 
   "github.com/ollama/ollama/envconfig"
   "github.com/ollama/ollama/format"
+  "github.com/ollama/ollama/llm"
   "github.com/ollama/ollama/logutil"
   "github.com/ollama/ollama/ml"
 )
@@ -36,7 +32,7 @@ var (
   bootstrapped bool
 )
 
-func GPUDevices(ctx context.Context, runners []FilteredRunnerDiscovery) []ml.DeviceInfo {
+func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo {
   deviceMu.Lock()
   defer deviceMu.Unlock()
   startDiscovery := time.Now()
@@ -154,9 +150,9 @@ func GPUDevices(ctx context.Context, runners []FilteredRunnerDiscovery) []ml.Dev
       slog.Error("Unknown Library:" + devices[i].Library)
     }
 
-    extraEnvs := []string{
-      "GGML_CUDA_INIT=1", // force deep initialization to trigger crash on unsupported GPUs
-      envVar + "=" + id,  // Filter to just this one GPU
+    extraEnvs := map[string]string{
+      "GGML_CUDA_INIT": "1", // force deep initialization to trigger crash on unsupported GPUs
+      envVar:           id,  // Filter to just this one GPU
     }
     if len(bootstrapDevices(ctx2ndPass, devices[i].LibraryPath, extraEnvs)) == 0 {
       needsDelete[i] = true
@@ -449,100 +445,35 @@ func (r *bootstrapRunner) HasExited() bool {
   return false
 }
 
-func bootstrapDevices(ctx context.Context, ollamaLibDirs []string, extraEnvs []string) []ml.DeviceInfo {
-  // TODO DRY out with llm/server.go
-  slog.Debug("spawning runner with", "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "extra_envs", extraEnvs)
+func bootstrapDevices(ctx context.Context, ollamaLibDirs []string, extraEnvs map[string]string) []ml.DeviceInfo {
+  var out io.Writer
+  if envconfig.LogLevel() == logutil.LevelTrace {
+    out = os.Stderr
+  }
   start := time.Now()
   defer func() {
     slog.Debug("bootstrap discovery took", "duration", time.Since(start), "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "extra_envs", extraEnvs)
   }()
-  port := 0
-  if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
-    var l *net.TCPListener
-    if l, err = net.ListenTCP("tcp", a); err == nil {
-      port = l.Addr().(*net.TCPAddr).Port
-      l.Close()
-    }
-  }
-  if port == 0 {
-    slog.Debug("ResolveTCPAddr failed, using random port")
-    port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
-  }
-  params := []string{"runner", "--ollama-engine", "--port", strconv.Itoa(port)}
-  var pathEnv string
-  switch runtime.GOOS {
-  case "windows":
-    pathEnv = "PATH"
-  case "darwin":
-    pathEnv = "DYLD_LIBRARY_PATH"
-  default:
-    pathEnv = "LD_LIBRARY_PATH"
-  }
-  libraryPaths := append([]string{LibOllamaPath}, ollamaLibDirs...)
-  if rocmDir != "" {
-    libraryPaths = append(libraryPaths, rocmDir)
-  }
-  // Note: we always put our dependency paths first
-  // since these are the exact version we compiled/linked against
-  if libraryPath, ok := os.LookupEnv(pathEnv); ok {
-    libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
-  }
-
-  cmd := exec.Command(exe, params...)
-  cmd.Env = os.Environ()
-  if envconfig.LogLevel() == logutil.LevelTrace {
-    cmd.Stdout = os.Stdout
-    cmd.Stderr = os.Stderr
-  }
-
-  // cmd.SysProcAttr = llm.LlamaServerSysProcAttr // circular dependency - bring back once refactored
-  pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
-  pathNeeded := true
-  ollamaPathNeeded := true
-  extraDone := make([]bool, len(extraEnvs))
-  for i := range cmd.Env {
-    cmp := strings.SplitN(cmd.Env[i], "=", 2)
-    if strings.EqualFold(cmp[0], pathEnv) {
-      cmd.Env[i] = pathEnv + "=" + pathEnvVal
-      pathNeeded = false
-    } else if strings.EqualFold(cmp[0], "OLLAMA_LIBRARY_PATH") {
-      cmd.Env[i] = "OLLAMA_LIBRARY_PATH=" + strings.Join(ollamaLibDirs, string(filepath.ListSeparator))
-      ollamaPathNeeded = false
-    } else {
-      for j := range extraEnvs {
-        if extraDone[j] {
-          continue
-        }
-        extra := strings.SplitN(extraEnvs[j], "=", 2)
-        if cmp[0] == extra[0] {
-          cmd.Env[i] = extraEnvs[j]
-          extraDone[j] = true
-        }
-      }
-    }
-  }
-  if pathNeeded {
-    cmd.Env = append(cmd.Env, pathEnv+"="+pathEnvVal)
-  }
-  if ollamaPathNeeded {
-    cmd.Env = append(cmd.Env, "OLLAMA_LIBRARY_PATH="+strings.Join(ollamaLibDirs, string(filepath.ListSeparator)))
-  }
-  for i := range extraDone {
-    if !extraDone[i] {
-      cmd.Env = append(cmd.Env, extraEnvs[i])
-    }
-  }
-  logutil.Trace("starting runner for device discovery", "env", cmd.Env, "cmd", cmd)
-  if err := cmd.Start(); err != nil {
-    slog.Warn("unable to start discovery subprocess", "cmd", cmd, "error", err)
+  logutil.Trace("starting runner for device discovery", "libDirs", ollamaLibDirs, "extraEnvs", extraEnvs)
+  cmd, port, err := llm.StartRunner(
+    true, // ollama engine
+    "",   // no model
+    ollamaLibDirs,
+    out,
+    extraEnvs,
+  )
+  if err != nil {
+    slog.Debug("failed to start runner to discovery GPUs", "error", err)
     return nil
   }
 
   go func() {
     cmd.Wait() // exit status ignored
   }()
 
   defer cmd.Process.Kill()
-  devices, err := GetDevicesFromRunner(ctx, &bootstrapRunner{port: port, cmd: cmd})
+  devices, err := ml.GetDevicesFromRunner(ctx, &bootstrapRunner{port: port, cmd: cmd})
   if err != nil {
     if cmd.ProcessState != nil && cmd.ProcessState.ExitCode() >= 0 {
       // Expected during bootstrapping while we filter out unsupported AMD GPUs
@@ -555,52 +486,3 @@ func bootstrapDevices(ctx context.Context, ollamaLibDirs []string, extraEnvs []s
 
   return devices
 }
-
-func GetDevicesFromRunner(ctx context.Context, runner BaseRunner) ([]ml.DeviceInfo, error) {
-  var moreDevices []ml.DeviceInfo
-  port := runner.GetPort()
-  tick := time.Tick(10 * time.Millisecond)
-  for {
-    select {
-    case <-ctx.Done():
-      return nil, fmt.Errorf("failed to finish discovery before timeout")
-    case <-tick:
-      r, err := http.NewRequestWithContext(ctx, http.MethodGet, fmt.Sprintf("http://127.0.0.1:%d/info", port), nil)
-      if err != nil {
-        return nil, fmt.Errorf("failed to create request: %w", err)
-      }
-      r.Header.Set("Content-Type", "application/json")
-
-      resp, err := http.DefaultClient.Do(r)
-      if err != nil {
-        // slog.Warn("failed to send request", "error", err)
-        if runner.HasExited() {
-          return nil, fmt.Errorf("runner crashed")
-        }
-        continue
-      }
-      defer resp.Body.Close()
-
-      if resp.StatusCode == http.StatusNotFound {
-        // old runner, fall back to bootstrapping model
-        return nil, fmt.Errorf("llamarunner free vram reporting not supported")
-      }
-
-      body, err := io.ReadAll(resp.Body)
-      if err != nil {
-        slog.Warn("failed to read response", "error", err)
-        continue
-      }
-      if resp.StatusCode != 200 {
-        logutil.Trace("runner failed to discover free VRAM", "status", resp.StatusCode, "response", body)
-        return nil, fmt.Errorf("runner error: %s", string(body))
-      }
-
-      if err := json.Unmarshal(body, &moreDevices); err != nil {
-        slog.Warn("unmarshal encode response", "error", err)
-        continue
-      }
-      return moreDevices, nil
-    }
-  }
-}
@@ -1,10 +1,8 @@
 package discover
 
 import (
-  "context"
   "log/slog"
   "path/filepath"
-  "runtime"
   "strings"
 
   "github.com/ollama/ollama/format"
@@ -17,50 +15,6 @@ type memInfo struct {
   FreeSwap uint64 `json:"free_swap,omitempty"` // TODO split this out for system only
 }
 
-// Beginning of an `ollama info` command
-type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
-  ml.DeviceID
-  memInfo
-
-  // Optional variant to select (e.g. versions, cpu feature flags)
-  Variant string `json:"variant"`
-
-  // MinimumMemory represents the minimum memory required to use the GPU
-  MinimumMemory uint64 `json:"-"`
-
-  // Any extra PATH/LD_LIBRARY_PATH dependencies required for the Library to operate properly
-  DependencyPath []string `json:"lib_path,omitempty"`
-
-  // Set to true if we can NOT reliably discover FreeMemory. A value of true indicates
-  // the FreeMemory is best effort, and may over or under report actual memory usage
-  // False indicates FreeMemory can generally be trusted on this GPU
-  UnreliableFreeMemory bool
-
-  // GPU information
-  filterID string // AMD/Vulkan Workaround: The numeric ID of the device used to filter out other devices
-  Name     string `json:"name"` // user friendly name if available
-  ComputeMajor int `json:"compute_major"` // Compute Capability or gfx
-  ComputeMinor int `json:"compute_minor"`
-
-  // Driver Information - TODO no need to put this on each GPU
-  DriverMajor int `json:"driver_major,omitempty"`
-  DriverMinor int `json:"driver_minor,omitempty"`
-
-  // TODO other performance capability info to help in scheduling decisions
-}
-
-func (gpu GpuInfo) RunnerName() string {
-  if gpu.Variant != "" {
-    return gpu.Library + "_" + gpu.Variant
-  }
-  return gpu.Library
-}
-
-type CPUInfo struct {
-  GpuInfo
-  CPUs []CPU
-}
-
 // CPU type represents a CPU Package occupying a socket
 type CPU struct {
   ID string `cpuinfo:"processor"`
@@ -71,32 +25,6 @@ type CPU struct {
   ThreadCount int
 }
 
-type GpuInfoList []GpuInfo
-
-func (l GpuInfoList) ByLibrary() []GpuInfoList {
-  resp := []GpuInfoList{}
-  libs := []string{}
-  for _, info := range l {
-    found := false
-    requested := info.Library
-    if info.Variant != "" {
-      requested += "_" + info.Variant
-    }
-    for i, lib := range libs {
-      if lib == requested {
-        resp[i] = append(resp[i], info)
-        found = true
-        break
-      }
-    }
-    if !found {
-      libs = append(libs, requested)
-      resp = append(resp, []GpuInfo{info})
-    }
-  }
-  return resp
-}
-
 func LogDetails(devices []ml.DeviceInfo) {
   for _, dev := range devices {
     var libs []string
@@ -141,74 +69,3 @@ func LogDetails(devices []ml.DeviceInfo) {
     )
   }
 }
-
-// Sort by Free Space
-type ByFreeMemory []GpuInfo
-
-func (a ByFreeMemory) Len() int           { return len(a) }
-func (a ByFreeMemory) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
-func (a ByFreeMemory) Less(i, j int) bool { return a[i].FreeMemory < a[j].FreeMemory }
-
-type SystemInfo struct {
-  System CPUInfo   `json:"system"`
-  GPUs   []GpuInfo `json:"gpus"`
-}
-
-// Return the optimal number of threads to use for inference
-func (si SystemInfo) GetOptimalThreadCount() int {
-  if len(si.System.CPUs) == 0 {
-    // Fall back to Go's num CPU
-    return runtime.NumCPU()
-  }
-
-  coreCount := 0
-  for _, c := range si.System.CPUs {
-    coreCount += c.CoreCount - c.EfficiencyCoreCount
-  }
-
-  return coreCount
-}
-
-// For each GPU, check if it does NOT support flash attention
-func (l GpuInfoList) FlashAttentionSupported() bool {
-  for _, gpu := range l {
-    supportsFA := gpu.Library == "cpu" ||
-      gpu.Name == "Metal" || gpu.Library == "Metal" ||
-      (gpu.Library == "CUDA" && gpu.DriverMajor >= 7 && !(gpu.ComputeMajor == 7 && gpu.ComputeMinor == 2)) || // We don't have kernels for Jetson Xavier
-      gpu.Library == "ROCm" ||
-      gpu.Library == "Vulkan"
-
-    if !supportsFA {
-      return false
-    }
-  }
-  return true
-}
-
-type BaseRunner interface {
-  // GetPort returns the localhost port number the runner is running on
-  GetPort() int
-
-  // HasExited indicates if the runner is no longer running. This can be used during
-  // bootstrap to detect if a given filtered device is incompatible and triggered an assert
-  HasExited() bool
-}
-
-type RunnerDiscovery interface {
-  BaseRunner
-
-  // GetDeviceInfos will perform a query of the underlying device libraries
-  // for device identification and free VRAM information
-  // During bootstrap scenarios, this routine may take seconds to complete
-  GetDeviceInfos(ctx context.Context) []ml.DeviceInfo
-}
-
-type FilteredRunnerDiscovery interface {
-  RunnerDiscovery
-
-  // GetActiveDeviceIDs returns the filtered set of devices actively in
-  // use by this runner for running models. If the runner is a bootstrap runner, no devices
-  // will be active yet so no device IDs are returned.
-  // This routine will not query the underlying device and will return immediately
-  GetActiveDeviceIDs() []ml.DeviceID
-}
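Editor's note: the list helpers that used to hang off discover.GpuInfoList (ByLibrary, ByFreeMemory, FlashAttentionSupported) are now free functions in the ml package operating on []ml.DeviceInfo, as the llm/memory.go hunks below show. The sketch below illustrates the new call shape; the function and package names are illustrative, and the exact return type of ml.ByLibrary is inferred from its call sites in this commit.

package llmsketch

import (
	"sort"

	"github.com/ollama/ollama/ml"
)

// groupAndSort groups devices by library and sorts each group by free VRAM,
// descending, mirroring the scheduler's use of the relocated helpers.
func groupAndSort(gpus []ml.DeviceInfo) ([][]ml.DeviceInfo, bool) {
	groups := ml.ByLibrary(gpus) // was: gpus.ByLibrary()
	for _, gl := range groups {
		sort.Sort(sort.Reverse(ml.ByFreeMemory(gl))) // was: discover.ByFreeMemory
	}
	return groups, ml.FlashAttentionSupported(gpus) // was: gpus.FlashAttentionSupported()
}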
@@ -4,27 +4,28 @@ import (
   "fmt"
   "log/slog"
   "os"
+  "slices"
   "sort"
   "strings"
 
   "github.com/ollama/ollama/api"
-  "github.com/ollama/ollama/discover"
   "github.com/ollama/ollama/envconfig"
   "github.com/ollama/ollama/format"
   "github.com/ollama/ollama/fs/ggml"
+  "github.com/ollama/ollama/ml"
 )
 
 // pickBestFullFitByLibrary will try to find the optimal placement of the model in the available GPUs where the model fully fits
 // The list of GPUs returned will always be the same brand (library)
 // If the model can not be fit fully within the available GPU(s) nil is returned
-func pickBestFullFitByLibrary(f *ggml.GGML, modelPath string, projectors []string, adapters []string, opts api.Options, gpus discover.GpuInfoList, numParallel int) discover.GpuInfoList {
-  for _, gl := range gpus.ByLibrary() {
-    sgl := append(make(discover.GpuInfoList, 0, len(gl)), gl...)
+func pickBestFullFitByLibrary(f *ggml.GGML, modelPath string, projectors []string, adapters []string, opts api.Options, gpus []ml.DeviceInfo, numParallel int) []ml.DeviceInfo {
+  for _, gl := range ml.ByLibrary(gpus) {
+    sgl := append(make([]ml.DeviceInfo, 0, len(gl)), gl...)
 
     // TODO - potentially sort by performance capability, existing models loaded, etc.
     // TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them
     // Note: at present, this will favor most current available VRAM descending and ignoring faster GPU speed in mixed setups
-    sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl)))
+    sort.Sort(sort.Reverse(ml.ByFreeMemory(sgl)))
 
     if !envconfig.SchedSpread() {
       // Try to pack into as few GPUs as possible, starting from 1 GPU
@@ -63,8 +64,8 @@ func pickBestFullFitByLibrary(f *ggml.GGML, modelPath string, projectors []strin
 }
 
 // If multiple Libraries are detected, pick the Library which loads the most layers for the model
-func pickBestPartialFitByLibrary(f *ggml.GGML, projectors []string, adapters []string, opts api.Options, gpus discover.GpuInfoList, numParallel int) discover.GpuInfoList {
-  byLibrary := gpus.ByLibrary()
+func pickBestPartialFitByLibrary(f *ggml.GGML, projectors []string, adapters []string, opts api.Options, gpus []ml.DeviceInfo, numParallel int) []ml.DeviceInfo {
+  byLibrary := ml.ByLibrary(gpus)
   if len(byLibrary) <= 1 {
     return gpus
   }
@@ -81,10 +82,10 @@ func pickBestPartialFitByLibrary(f *ggml.GGML, projectors []string, adapters []s
 }
 
 // This algorithm looks for a complete fit to determine if we need to unload other models
-func predictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) {
+func predictServerFit(allGpus []ml.DeviceInfo, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) {
   // Split up the GPUs by type and try them
   var estimatedVRAM uint64
-  for _, gpus := range allGpus.ByLibrary() {
+  for _, gpus := range ml.ByLibrary(allGpus) {
     var layerCount int
     estimate := estimateGPULayers(gpus, f, projectors, opts, numParallel)
     layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize
@@ -97,14 +98,23 @@ func predictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, proj
       return true, estimatedVRAM
     }
   }
-
-  if len(gpus) == 1 && gpus[0].Library == "cpu" && estimate.TotalSize <= gpus[0].FreeMemory {
-    return true, estimatedVRAM
-  }
   }
   return false, estimatedVRAM
 }
 
+func verifyCPUFit(f *ggml.GGML, modelPath string, projectors []string, adapters []string, opts api.Options, systemInfo ml.SystemInfo, numParallel int) bool {
+  estimate := estimateGPULayers(nil, f, projectors, opts, numParallel)
+  if estimate.TotalSize > systemInfo.FreeMemory {
+    return false
+  }
+  slog.Info("new model will fit in available system memory for CPU inference, loading",
+    "model", modelPath,
+    "parallel", numParallel,
+    "required", format.HumanBytes2(estimate.TotalSize),
+  )
+  return true
+}
+
 type MemoryEstimate struct {
   // How many layers we predict we can load
   Layers int
@@ -141,7 +151,7 @@ type MemoryEstimate struct {
 
 // Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
 // The GPUs provided must all be the same Library
-func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []string, opts api.Options, numParallel int) MemoryEstimate {
+func estimateGPULayers(gpus []ml.DeviceInfo, f *ggml.GGML, projectors []string, opts api.Options, numParallel int) MemoryEstimate {
   // Graph size for a partial offload, applies to all GPUs
   var graphPartialOffload uint64
 
@@ -175,10 +185,17 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 
   overhead := envconfig.GpuOverhead()
   availableList := make([]string, len(gpus))
+  libraries := []string{}
   for i, gpu := range gpus {
     availableList[i] = format.HumanBytes2(gpu.FreeMemory)
+    if !slices.Contains(libraries, gpu.Library) {
+      libraries = append(libraries, gpu.Library)
+    }
   }
-  slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)
+  if len(libraries) == 0 {
+    libraries = []string{"cpu"}
+  }
+  slog.Debug("evaluating", "library", strings.Join(libraries, ","), "gpu_count", len(gpus), "available", availableList)
 
   for _, projector := range projectors {
     llamaEngineProjectorWeights += projectorMemoryRequirements(projector)
@@ -196,7 +213,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
   }
 
   useFlashAttention := envconfig.FlashAttention(f.FlashAttention()) &&
-    (discover.GpuInfoList)(gpus).FlashAttentionSupported() &&
+    ml.FlashAttentionSupported(gpus) &&
     f.SupportsFlashAttention()
 
   var kvct string
@@ -231,7 +248,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
   }
 
   // on metal there's no partial offload overhead
-  if gpus[0].Library == "Metal" {
+  if len(gpus) > 0 && gpus[0].Library == "Metal" {
     graphPartialOffload = graphFullOffload
   } else if len(gpus) > 1 {
     // multigpu should always use the partial graph size
@@ -256,7 +273,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
   gpuAllocations := make([]uint64, len(gpus))
   type gs struct {
     i int
-    g *discover.GpuInfo
+    g *ml.DeviceInfo
   }
   gpusWithSpace := []gs{}
   for i := range gpus {
@@ -265,19 +282,11 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
       gzo = gpuZeroOverhead
     }
     // Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least more layer
-    if gpus[i].FreeMemory < overhead+gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
-      var compute string
-      if gpus[i].Library == "ROCm" {
-        compute = fmt.Sprintf("gfx%x%02x", gpus[i].ComputeMajor, gpus[i].ComputeMinor)
-      } else {
-        compute = fmt.Sprintf("%d.%d", gpus[i].ComputeMajor, gpus[i].ComputeMinor)
-      }
-
+    if gpus[i].FreeMemory < overhead+gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory()+2*layerSize {
       slog.Debug("gpu has too little memory to allocate any layers",
         "id", gpus[i].ID,
         "library", gpus[i].Library,
-        "variant", gpus[i].Variant,
-        "compute", compute,
+        "compute", gpus[i].Compute(),
         "driver", fmt.Sprintf("%d.%d", gpus[i].DriverMajor, gpus[i].DriverMinor),
         "name", gpus[i].Name,
         "total", format.HumanBytes2(gpus[i].TotalMemory),
@@ -291,7 +300,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
       continue
     }
     gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]})
-    gpuAllocations[i] += gpus[i].MinimumMemory + layerSize // We hold off on graph until we know partial vs. full
+    gpuAllocations[i] += gpus[i].MinimumMemory() + layerSize // We hold off on graph until we know partial vs. full
   }
 
   var gpuZeroID int
@@ -397,7 +406,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
     VRAMSize: 0,
     GPUSizes: []uint64{},
 
-    inferenceLibrary: gpus[0].Library,
+    inferenceLibrary: strings.Join(libraries, ","),
     layersRequested:  opts.NumGPU,
     layersModel:      int(f.KV().BlockCount()) + 1,
     availableList:    availableList,
@@ -411,7 +420,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
     projectorGraph: ollamaEngineProjectorGraph,
   }
 
-  if gpus[0].Library == "cpu" {
+  if len(gpus) == 0 {
     return estimate
   }
   if layerCount == 0 {
@@ -10,7 +10,7 @@ import (
   "github.com/stretchr/testify/require"
 
   "github.com/ollama/ollama/api"
-  "github.com/ollama/ollama/discover"
+  "github.com/ollama/ollama/format"
   "github.com/ollama/ollama/fs/ggml"
   "github.com/ollama/ollama/ml"
 )
@@ -54,13 +54,7 @@ func TestEstimateGPULayers(t *testing.T) {
   }
 
   // Simple CPU scenario
-  gpus := []discover.GpuInfo{
-    {
-      DeviceID: ml.DeviceID{
-        Library: "cpu",
-      },
-    },
-  }
+  gpus := []ml.DeviceInfo{}
   projectors := []string{}
   opts := api.DefaultOptions()
   t.Run("cpu", func(t *testing.T) {
@@ -77,19 +71,17 @@ func TestEstimateGPULayers(t *testing.T) {
   memoryLayerOutput := uint64(4)
 
   // Dual CUDA scenario with asymmetry
-  gpuMinimumMemory := uint64(2048)
-  gpus = []discover.GpuInfo{
+  gpuMinimumMemory := uint64(457 * format.MebiByte)
+  gpus = []ml.DeviceInfo{
     {
       DeviceID: ml.DeviceID{
-        Library: "cuda",
+        Library: "CUDA",
       },
-      MinimumMemory: gpuMinimumMemory,
     },
     {
       DeviceID: ml.DeviceID{
-        Library: "cuda",
+        Library: "CUDA",
      },
-      MinimumMemory: gpuMinimumMemory,
     },
   }
   // Nested array: GPU0 layer space, GPU1 layer space, expected gpu0, expected gpu1
415
llm/server.go
415
llm/server.go
|
|
@ -27,7 +27,6 @@ import (
|
||||||
"golang.org/x/sync/semaphore"
|
"golang.org/x/sync/semaphore"
|
||||||
|
|
||||||
"github.com/ollama/ollama/api"
|
"github.com/ollama/ollama/api"
|
||||||
"github.com/ollama/ollama/discover"
|
|
||||||
"github.com/ollama/ollama/envconfig"
|
"github.com/ollama/ollama/envconfig"
|
||||||
"github.com/ollama/ollama/format"
|
"github.com/ollama/ollama/format"
|
||||||
"github.com/ollama/ollama/fs/ggml"
|
"github.com/ollama/ollama/fs/ggml"
|
||||||
|
|
@ -66,7 +65,7 @@ func (e filteredEnv) LogValue() slog.Value {
|
||||||
|
|
||||||
type LlamaServer interface {
|
type LlamaServer interface {
|
||||||
ModelPath() string
|
ModelPath() string
|
||||||
Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error)
|
Load(ctx context.Context, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) ([]ml.DeviceID, error)
|
||||||
Ping(ctx context.Context) error
|
Ping(ctx context.Context) error
|
||||||
WaitUntilRunning(ctx context.Context) error
|
WaitUntilRunning(ctx context.Context) error
|
||||||
Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
|
Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
|
||||||
|
|
@ -115,7 +114,7 @@ type llamaServer struct {
|
||||||
llmServer
|
llmServer
|
||||||
|
|
||||||
ggml *ggml.GGML
|
ggml *ggml.GGML
|
||||||
gpus discover.GpuInfoList // The set of GPUs covered by the memory estimate
|
gpus []ml.DeviceInfo // The set of GPUs covered by the memory estimate
|
||||||
estimate MemoryEstimate
|
estimate MemoryEstimate
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -146,7 +145,7 @@ func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewLlamaServer will run a server for the given GPUs
|
// NewLlamaServer will run a server for the given GPUs
|
||||||
func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
|
func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath string, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
|
||||||
var llamaModel *llama.Model
|
var llamaModel *llama.Model
|
||||||
var textProcessor model.TextProcessor
|
var textProcessor model.TextProcessor
|
||||||
var err error
|
var err error
|
||||||
|
|
@ -179,7 +178,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
|
||||||
|
|
||||||
loadRequest := LoadRequest{LoraPath: adapters, KvSize: opts.NumCtx * numParallel, BatchSize: opts.NumBatch, Parallel: numParallel, MultiUserCache: envconfig.MultiUserCache()}
|
loadRequest := LoadRequest{LoraPath: adapters, KvSize: opts.NumCtx * numParallel, BatchSize: opts.NumBatch, Parallel: numParallel, MultiUserCache: envconfig.MultiUserCache()}
|
||||||
|
|
||||||
defaultThreads := discover.GetSystemInfo().GetOptimalThreadCount()
|
defaultThreads := systemInfo.ThreadCount
|
||||||
if opts.NumThread > 0 {
|
if opts.NumThread > 0 {
|
||||||
loadRequest.NumThreads = opts.NumThread
|
loadRequest.NumThreads = opts.NumThread
|
||||||
} else if defaultThreads > 0 {
|
} else if defaultThreads > 0 {
|
||||||
|
|
@ -200,7 +199,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
|
||||||
|
|
||||||
// This will disable flash attention unless all GPUs on the system support it, even if we end up selecting a subset
|
// This will disable flash attention unless all GPUs on the system support it, even if we end up selecting a subset
|
||||||
// that can handle it.
|
// that can handle it.
|
||||||
if fa && !gpus.FlashAttentionSupported() {
|
if fa && !ml.FlashAttentionSupported(gpus) {
|
||||||
slog.Warn("flash attention enabled but not supported by gpu")
|
slog.Warn("flash attention enabled but not supported by gpu")
|
||||||
fa = false
|
fa = false
|
||||||
}
|
}
|
||||||
|
|
@ -227,120 +226,20 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
|
||||||
slog.Warn("quantized kv cache requested but flash attention disabled", "type", kvct)
|
slog.Warn("quantized kv cache requested but flash attention disabled", "type", kvct)
|
||||||
}
|
}
|
||||||
|
|
||||||
availableLibs := make(map[string]string)
|
gpuLibs := ml.LibraryPaths(gpus)
|
||||||
if entries, err := os.ReadDir(discover.LibOllamaPath); err == nil {
|
status := NewStatusWriter(os.Stderr)
|
||||||
for _, entry := range entries {
|
cmd, port, err := StartRunner(
|
||||||
availableLibs[entry.Name()] = filepath.Join(discover.LibOllamaPath, entry.Name())
|
textProcessor != nil,
|
||||||
}
|
modelPath,
|
||||||
}
|
gpuLibs,
|
||||||
|
status,
|
||||||
var gpuLibs []string
|
ml.GetVisibleDevicesEnv(gpus),
|
||||||
for _, gpu := range gpus {
|
)
|
||||||
gpuLibs = append(gpuLibs, gpu.RunnerName())
|
|
||||||
}
|
|
||||||
|
|
||||||
requested := envconfig.LLMLibrary()
|
|
||||||
if availableLibs[requested] != "" {
|
|
||||||
slog.Info("using requested gpu library", "requested", requested)
|
|
||||||
gpuLibs = []string{requested}
|
|
||||||
}
|
|
||||||
|
|
||||||
var compatible []string
|
|
||||||
for _, gpuLib := range gpuLibs {
|
|
||||||
var matchingLibs []string
|
|
||||||
for k := range availableLibs {
|
|
||||||
// exact match first
|
|
||||||
if k == gpuLib {
|
|
||||||
matchingLibs = append([]string{k}, matchingLibs...)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
// then match the family (e.g. 'cuda')
|
|
||||||
if strings.Split(k, "_")[0] == strings.Split(gpuLib, "_")[0] {
|
|
||||||
matchingLibs = append(matchingLibs, k)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(matchingLibs) > 0 {
|
|
||||||
compatible = append(compatible, matchingLibs[0])
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
exe, err := os.Executable()
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("unable to lookup executable path: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
if eval, err := filepath.EvalSymlinks(exe); err == nil {
|
|
||||||
exe = eval
|
|
||||||
}
|
|
||||||
|
|
||||||
// iterate through compatible GPU libraries such as 'cuda_v12', 'rocm', etc.
|
|
||||||
// adding each library's respective path to the LD_LIBRARY_PATH, until finally running
|
|
||||||
// without any LD_LIBRARY_PATH flags
|
|
||||||
for {
|
|
||||||
port := 0
|
|
||||||
if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
|
|
||||||
var l *net.TCPListener
|
|
||||||
if l, err = net.ListenTCP("tcp", a); err == nil {
|
|
||||||
port = l.Addr().(*net.TCPAddr).Port
|
|
||||||
l.Close()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if port == 0 {
|
|
||||||
slog.Debug("ResolveTCPAddr failed, using random port")
|
|
||||||
port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
|
|
||||||
}
|
|
||||||
params := []string{"runner"}
|
|
||||||
if textProcessor != nil {
|
|
||||||
// New engine
|
|
||||||
// TODO - if we have failure to load scenarios, add logic to retry with the old runner
|
|
||||||
params = append(params, "--ollama-engine")
|
|
||||||
}
|
|
||||||
params = append(params, "--model", modelPath)
|
|
||||||
params = append(params, "--port", strconv.Itoa(port))
|
|
||||||
|
|
||||||
var pathEnv string
|
|
||||||
switch runtime.GOOS {
|
|
||||||
case "windows":
|
|
||||||
pathEnv = "PATH"
|
|
||||||
case "darwin":
|
|
||||||
pathEnv = "DYLD_LIBRARY_PATH"
|
|
||||||
default:
|
|
||||||
pathEnv = "LD_LIBRARY_PATH"
|
|
||||||
}
|
|
||||||
|
|
||||||
// Note: we always put our dependency paths first
|
|
||||||
// since these are the exact version we compiled/linked against
|
|
||||||
libraryPaths := []string{discover.LibOllamaPath}
|
|
||||||
if libraryPath, ok := os.LookupEnv(pathEnv); ok {
|
|
||||||
libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
|
|
||||||
}
|
|
||||||
|
|
||||||
ggmlPaths := []string{discover.LibOllamaPath}
|
|
||||||
for _, c := range compatible {
|
|
||||||
if libpath, ok := availableLibs[c]; ok {
|
|
||||||
slog.Debug("adding gpu library", "path", libpath)
|
|
||||||
libraryPaths = append([]string{libpath}, libraryPaths...)
|
|
||||||
ggmlPaths = append(ggmlPaths, libpath)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, gpu := range gpus {
|
|
||||||
if gpu.DependencyPath != nil {
|
|
||||||
slog.Debug("adding gpu dependency paths", "paths", gpu.DependencyPath)
|
|
||||||
libraryPaths = append(gpu.DependencyPath, libraryPaths...)
|
|
||||||
ggmlPaths = append(ggmlPaths, gpu.DependencyPath...)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// finally, add the root library path
|
|
||||||
libraryPaths = append(libraryPaths, discover.LibOllamaPath)
|
|
||||||
|
|
||||||
s := llmServer{
|
s := llmServer{
|
||||||
port: port,
|
port: port,
|
||||||
cmd: exec.Command(exe, params...),
|
cmd: cmd,
|
||||||
status: NewStatusWriter(os.Stderr),
|
status: status,
|
||||||
options: opts,
|
options: opts,
|
||||||
modelPath: modelPath,
|
modelPath: modelPath,
|
||||||
loadRequest: loadRequest,
|
loadRequest: loadRequest,
|
||||||
|
|
@ -354,70 +253,18 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
|
||||||
done: make(chan error, 1),
|
done: make(chan error, 1),
|
||||||
}
|
}
|
||||||
|
|
||||||
s.cmd.Env = os.Environ()
|
if err != nil {
|
||||||
s.cmd.Stdout = os.Stdout
|
|
||||||
s.cmd.Stderr = s.status
|
|
||||||
s.cmd.SysProcAttr = LlamaServerSysProcAttr
|
|
||||||
|
|
||||||
// Always filter down the set of GPUs in case there are any unsupported devices that might crash
|
|
||||||
envWorkarounds := gpus.GetVisibleDevicesEnv()
|
|
||||||
pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
|
|
||||||
|
|
||||||
// Update or add the path variable with our adjusted version
|
|
||||||
pathNeeded := true
|
|
||||||
ollamaPathNeeded := true
|
|
||||||
envWorkaroundDone := make([]bool, len(envWorkarounds))
|
|
||||||
for i := range s.cmd.Env {
|
|
||||||
cmp := strings.SplitN(s.cmd.Env[i], "=", 2)
|
|
||||||
if strings.EqualFold(cmp[0], pathEnv) {
|
|
||||||
s.cmd.Env[i] = pathEnv + "=" + pathEnvVal
|
|
||||||
pathNeeded = false
|
|
||||||
} else if strings.EqualFold(cmp[0], "OLLAMA_LIBRARY_PATH") {
|
|
||||||
s.cmd.Env[i] = "OLLAMA_LIBRARY_PATH=" + strings.Join(ggmlPaths, string(filepath.ListSeparator))
|
|
||||||
ollamaPathNeeded = false
|
|
||||||
} else if len(envWorkarounds) != 0 {
|
|
||||||
for j, kv := range envWorkarounds {
|
|
||||||
tmp := strings.SplitN(kv, "=", 2)
|
|
||||||
if strings.EqualFold(cmp[0], tmp[0]) {
|
|
||||||
s.cmd.Env[i] = kv
|
|
||||||
envWorkaroundDone[j] = true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if pathNeeded {
|
|
||||||
s.cmd.Env = append(s.cmd.Env, pathEnv+"="+pathEnvVal)
|
|
||||||
}
|
|
||||||
if ollamaPathNeeded {
|
|
||||||
s.cmd.Env = append(s.cmd.Env, "OLLAMA_LIBRARY_PATH="+strings.Join(ggmlPaths, string(filepath.ListSeparator)))
|
|
||||||
}
|
|
||||||
for i, done := range envWorkaroundDone {
|
|
||||||
if !done {
|
|
||||||
s.cmd.Env = append(s.cmd.Env, envWorkarounds[i])
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
slog.Info("starting runner", "cmd", s.cmd)
|
|
||||||
slog.Debug("subprocess", "", filteredEnv(s.cmd.Env))
|
|
||||||
|
|
||||||
if err = s.cmd.Start(); err != nil {
|
|
||||||
var msg string
|
var msg string
|
||||||
if s.status != nil && s.status.LastErrMsg != "" {
|
if s.status != nil && s.status.LastErrMsg != "" {
|
||||||
msg = s.status.LastErrMsg
|
msg = s.status.LastErrMsg
|
||||||
}
|
}
|
||||||
err := fmt.Errorf("error starting runner: %v %s", err, msg)
|
err := fmt.Errorf("error starting runner: %v %s", err, msg)
|
||||||
if len(compatible) == 0 {
|
|
||||||
if llamaModel != nil {
|
if llamaModel != nil {
|
||||||
llama.FreeModel(llamaModel)
|
llama.FreeModel(llamaModel)
|
||||||
}
|
}
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
slog.Warn("unable to start runner with compatible gpu", "error", err, "compatible", compatible)
|
|
||||||
compatible = compatible[1:]
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
// reap subprocess when it exits
|
// reap subprocess when it exits
|
||||||
go func() {
|
go func() {
|
||||||
err := s.cmd.Wait()
|
err := s.cmd.Wait()
|
||||||
|
|
@ -438,7 +285,111 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
|
||||||
} else {
|
} else {
|
||||||
return &llamaServer{llmServer: s, ggml: f}, nil
|
return &llamaServer{llmServer: s, ggml: f}, nil
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func StartRunner(ollamaEngine bool, modelPath string, gpuLibs []string, out io.Writer, extraEnvs map[string]string) (cmd *exec.Cmd, port int, err error) {
|
||||||
|
var exe string
|
||||||
|
exe, err = os.Executable()
|
||||||
|
if err != nil {
|
||||||
|
return nil, 0, fmt.Errorf("unable to lookup executable path: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if eval, err := filepath.EvalSymlinks(exe); err == nil {
|
||||||
|
exe = eval
|
||||||
|
}
|
||||||
|
|
||||||
|
port = 0
|
||||||
|
if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
|
||||||
|
var l *net.TCPListener
|
||||||
|
if l, err = net.ListenTCP("tcp", a); err == nil {
|
||||||
|
port = l.Addr().(*net.TCPAddr).Port
|
||||||
|
l.Close()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if port == 0 {
|
||||||
|
slog.Debug("ResolveTCPAddr failed, using random port")
|
||||||
|
port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
|
||||||
|
}
|
||||||
|
params := []string{"runner"}
|
||||||
|
if ollamaEngine {
|
||||||
|
params = append(params, "--ollama-engine")
|
||||||
|
}
|
||||||
|
if modelPath != "" {
|
||||||
|
params = append(params, "--model", modelPath)
|
||||||
|
}
|
||||||
|
params = append(params, "--port", strconv.Itoa(port))
|
||||||
|
|
||||||
|
var pathEnv string
|
||||||
|
switch runtime.GOOS {
|
||||||
|
case "windows":
|
||||||
|
pathEnv = "PATH"
|
||||||
|
case "darwin":
|
||||||
|
pathEnv = "DYLD_LIBRARY_PATH"
|
||||||
|
default:
|
||||||
|
pathEnv = "LD_LIBRARY_PATH"
|
||||||
|
}
|
||||||
|
|
||||||
|
// Note: we always put our dependency paths first
|
||||||
|
// since these are the exact version we compiled/linked against
|
||||||
|
libraryPaths := append([]string{}, gpuLibs...)
|
||||||
|
if libraryPath, ok := os.LookupEnv(pathEnv); ok {
|
||||||
|
libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
|
||||||
|
}
|
||||||
|
|
||||||
|
cmd = exec.Command(exe, params...)
|
||||||
|
|
||||||
|
cmd.Env = os.Environ()
|
||||||
|
cmd.Stdout = out
|
||||||
|
cmd.Stderr = out
|
||||||
|
cmd.SysProcAttr = LlamaServerSysProcAttr
|
||||||
|
|
||||||
|
// Always filter down the set of GPUs in case there are any unsupported devices that might crash
|
||||||
|
pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
|
||||||
|
|
||||||
|
// Update or add the path variable with our adjusted version
|
||||||
|
pathNeeded := true
|
||||||
|
ollamaPathNeeded := true
|
||||||
|
extraEnvsDone := map[string]bool{}
|
||||||
|
for k := range extraEnvs {
|
||||||
|
extraEnvsDone[k] = false
|
||||||
|
}
|
||||||
|
for i := range cmd.Env {
|
||||||
|
cmp := strings.SplitN(cmd.Env[i], "=", 2)
|
||||||
|
if strings.EqualFold(cmp[0], pathEnv) {
|
||||||
|
cmd.Env[i] = pathEnv + "=" + pathEnvVal
|
||||||
|
pathNeeded = false
|
||||||
|
} else if strings.EqualFold(cmp[0], "OLLAMA_LIBRARY_PATH") {
|
||||||
|
cmd.Env[i] = "OLLAMA_LIBRARY_PATH=" + strings.Join(gpuLibs, string(filepath.ListSeparator))
|
||||||
|
ollamaPathNeeded = false
|
||||||
|
} else if len(extraEnvs) != 0 {
|
||||||
|
for k, v := range extraEnvs {
|
||||||
|
if strings.EqualFold(cmp[0], k) {
|
||||||
|
cmd.Env[i] = k + "=" + v
|
||||||
|
extraEnvsDone[k] = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if pathNeeded {
|
||||||
|
cmd.Env = append(cmd.Env, pathEnv+"="+pathEnvVal)
|
||||||
|
}
|
||||||
|
if ollamaPathNeeded {
|
||||||
|
cmd.Env = append(cmd.Env, "OLLAMA_LIBRARY_PATH="+strings.Join(gpuLibs, string(filepath.ListSeparator)))
|
||||||
|
}
|
||||||
|
for k, done := range extraEnvsDone {
|
||||||
|
if !done {
|
||||||
|
cmd.Env = append(cmd.Env, k+"="+extraEnvs[k])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
slog.Info("starting runner", "cmd", cmd)
|
||||||
|
slog.Debug("subprocess", "", filteredEnv(cmd.Env))
|
||||||
|
|
||||||
|
if err = cmd.Start(); err != nil {
|
||||||
|
return nil, 0, err
|
||||||
|
}
|
||||||
|
err = nil
|
||||||
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *llmServer) ModelPath() string {
|
func (s *llmServer) ModelPath() string {
|
||||||
|
|
@@ -497,13 +448,18 @@ type LoadResponse struct {

 var ErrLoadRequiredFull = errors.New("unable to load full model on GPU")

-func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error) {
-	systemInfo := discover.GetSystemInfo()
-	systemTotalMemory := systemInfo.System.TotalMemory
-	systemFreeMemory := systemInfo.System.FreeMemory
-	systemSwapFreeMemory := systemInfo.System.FreeSwap
+func (s *llamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) ([]ml.DeviceID, error) {
+	systemTotalMemory := systemInfo.TotalMemory
+	systemFreeMemory := systemInfo.FreeMemory
+	systemSwapFreeMemory := systemInfo.FreeSwap
 	slog.Info("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory))

+	if len(gpus) == 0 || s.options.NumGPU == 0 {
+		if !verifyCPUFit(s.ggml, s.modelPath, []string{s.loadRequest.ProjectorPath}, s.loadRequest.LoraPath, s.options, systemInfo, s.numParallel) {
+			slog.Info("model requires more memory than is currently available, evicting a model to make space", "estimate", s.estimate)
+			return nil, fmt.Errorf("model requires more system memory than is currently available %w", ErrLoadRequiredFull)
+		}
+	} else {
 		g := pickBestFullFitByLibrary(s.ggml, s.modelPath, []string{s.loadRequest.ProjectorPath}, s.loadRequest.LoraPath, s.options, gpus, s.numParallel)
 		if g == nil {
 			if !requireFull {
@@ -513,31 +469,37 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error) {
 				return nil, ErrLoadRequiredFull
 			}
 		}

 		gpus = g
+	}

 	s.estimate = estimateGPULayers(gpus, s.ggml, []string{s.loadRequest.ProjectorPath}, s.options, s.numParallel)

-	if len(gpus) > 1 || gpus[0].Library != "cpu" {
+	if len(gpus) >= 1 {
 		switch {
-		case gpus[0].Library == "Metal" && s.estimate.VRAMSize > systemInfo.System.TotalMemory:
+		case s.options.NumGPU == 0:
+			gpus = []ml.DeviceInfo{}
+		case gpus[0].Library == "Metal" && s.estimate.VRAMSize > systemInfo.TotalMemory:
 			// disable partial offloading when model is greater than total system memory as this
 			// can lead to locking up the system
 			s.options.NumGPU = 0
+			gpus = []ml.DeviceInfo{}
 		case gpus[0].Library != "Metal" && s.estimate.Layers == 0:
 			// Don't bother loading into the GPU if no layers can fit
-			gpus = discover.GpuInfoList{discover.GetCPUInfo()}
-		case s.options.NumGPU < 0 && s.estimate.Layers > 0 && gpus[0].Library != "cpu":
+			gpus = []ml.DeviceInfo{}
+		case s.options.NumGPU < 0 && s.estimate.Layers > 0:
 			s.options.NumGPU = s.estimate.Layers
 		}
+	} else {
+		s.options.NumGPU = 0
 	}

 	// On linux and windows, over-allocating CPU memory will almost always result in an error
 	// Darwin has fully dynamic swap so has no direct concept of free swap space
 	if runtime.GOOS != "darwin" {
 		systemMemoryRequired := s.estimate.TotalSize - s.estimate.VRAMSize
-		available := systemInfo.System.FreeMemory + systemInfo.System.FreeSwap
+		available := systemInfo.FreeMemory + systemInfo.FreeSwap
 		if systemMemoryRequired > available {
-			slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "available", format.HumanBytes2(available), "total", format.HumanBytes2(systemInfo.System.TotalMemory), "free", format.HumanBytes2(systemInfo.System.FreeMemory), "swap", format.HumanBytes2(systemInfo.System.FreeSwap))
+			slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "available", format.HumanBytes2(available), "total", format.HumanBytes2(systemInfo.TotalMemory), "free", format.HumanBytes2(systemInfo.FreeMemory), "swap", format.HumanBytes2(systemInfo.FreeSwap))
 			return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(available))
 		}
 	}
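The CPU-only branch above rejects the load when the model would not fit in free RAM plus free swap. verifyCPUFit itself is not shown in this diff; the sketch below is only a hypothetical stand-in for the shape of that check, with made-up sizes:

package main

import "fmt"

// fitsInSystemMemory is a hypothetical stand-in for the verifyCPUFit check
// referenced above: whatever stays on the CPU must fit in free RAM plus free
// swap, otherwise the load is refused up front instead of failing mid-load.
func fitsInSystemMemory(required, freeMemory, freeSwap uint64) bool {
	return required <= freeMemory+freeSwap
}

func main() {
	const gib = uint64(1) << 30
	fmt.Println(fitsInSystemMemory(9*gib, 8*gib, 2*gib))  // true: swap covers the shortfall
	fmt.Println(fitsInSystemMemory(12*gib, 8*gib, 2*gib)) // false: evict another model or error out
}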
@@ -564,10 +526,10 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error) {
 	// Windows CUDA should not use mmap for best performance
 	// Linux with a model larger than free space, mmap leads to thrashing
 	// For CPU loads we want the memory to be allocated, not FS cache
-	if (runtime.GOOS == "windows" && gpus[0].Library == "CUDA" && s.options.UseMMap == nil) ||
-		(runtime.GOOS == "linux" && systemInfo.System.FreeMemory < s.estimate.TotalSize && s.options.UseMMap == nil) ||
-		(gpus[0].Library == "cpu" && s.options.UseMMap == nil) ||
-		(gpus[0].Library == "Vulkan" && s.options.UseMMap == nil) ||
+	if (runtime.GOOS == "windows" && len(gpus) > 0 && gpus[0].Library == "CUDA" && s.options.UseMMap == nil) ||
+		(runtime.GOOS == "linux" && systemInfo.FreeMemory < s.estimate.TotalSize && s.options.UseMMap == nil) ||
+		(len(gpus) == 0 && s.options.UseMMap == nil) ||
+		(len(gpus) > 0 && gpus[0].Library == "Vulkan" && s.options.UseMMap == nil) ||
 		(s.options.UseMMap != nil && !*s.options.UseMMap) {
 		s.loadRequest.UseMmap = false
 	}
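The condition above now guards every gpus[0] access with a length check because a CPU-only load is represented by an empty device list rather than a synthetic "cpu" entry. A standalone restatement of the same decision as a predicate (a hypothetical helper, not code from the change):

package main

import "fmt"

// shouldUseMmap restates the conditions in the hunk above: an explicit user
// setting always wins; otherwise mmap is avoided on Windows+CUDA, on Linux
// when the model is larger than free memory, for CPU-only loads, and for
// Vulkan.
func shouldUseMmap(goos, library string, userSetting *bool, gpuCount int, freeMemory, totalSize uint64) bool {
	if userSetting != nil {
		return *userSetting
	}
	switch {
	case goos == "windows" && gpuCount > 0 && library == "CUDA":
		return false
	case goos == "linux" && freeMemory < totalSize:
		return false
	case gpuCount == 0:
		return false
	case gpuCount > 0 && library == "Vulkan":
		return false
	}
	return true
}

func main() {
	fmt.Println(shouldUseMmap("linux", "CUDA", nil, 1, 64<<30, 8<<30))   // true
	fmt.Println(shouldUseMmap("windows", "CUDA", nil, 1, 64<<30, 8<<30)) // false
}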
@@ -605,8 +567,8 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error) {

 // createGPULayers maps from the tensor splits assigned by the memory estimates to explicit assignment
 // of particular layers onto GPUs
-func createGPULayers(estimate MemoryEstimate, ggml *ggml.GGML, gpus discover.GpuInfoList, numGPU int) ml.GPULayersList {
-	if numGPU <= 0 {
+func createGPULayers(estimate MemoryEstimate, ggml *ggml.GGML, gpus []ml.DeviceInfo, numGPU int) ml.GPULayersList {
+	if numGPU <= 0 || len(gpus) == 0 {
 		return nil
 	}

@@ -662,7 +624,7 @@ func createGPULayers(estimate MemoryEstimate, ggml *ggml.GGML, gpus discover.GpuInfoList, numGPU int) ml.GPULayersList {
 // allowing for faster iteration, but may return less information.
 //
 // Returns the list of GPU IDs that were used in the final allocation on success
-func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error) {
+func (s *ollamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) ([]ml.DeviceID, error) {
 	var success bool
 	defer func() {
 		if !success {
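createGPULayers is described as turning per-GPU split fractions into explicit layer assignments; its body is not part of this hunk. The sketch below is only an illustrative mapping under that description, not the repository's implementation:

package main

import "fmt"

// splitLayers assigns layer indices to GPUs roughly in proportion to each
// GPU's share of the total split, so every offloaded layer lands on exactly
// one GPU (illustrative only).
func splitLayers(splits []float64, numLayers int) [][]int {
	total := 0.0
	for _, s := range splits {
		total += s
	}
	out := make([][]int, len(splits))
	cum, start := 0.0, 0
	for i, s := range splits {
		cum += s
		end := int(cum/total*float64(numLayers) + 0.5)
		if i == len(splits)-1 {
			end = numLayers // absorb rounding drift on the last GPU
		}
		for l := start; l < end; l++ {
			out[i] = append(out[i], l)
		}
		start = end
	}
	return out
}

func main() {
	fmt.Println(splitLayers([]float64{1, 2}, 6)) // [[0 1] [2 3 4 5]]
}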
@@ -675,25 +637,22 @@ func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error) {

 	slog.Info("loading model", "model layers", s.totalLayers, "requested", s.options.NumGPU)

-	systemInfo := discover.GetSystemInfo()
-	systemTotalMemory := systemInfo.System.TotalMemory
-	systemFreeMemory := systemInfo.System.FreeMemory
-	systemSwapFreeMemory := systemInfo.System.FreeSwap
+	systemTotalMemory := systemInfo.TotalMemory
+	systemFreeMemory := systemInfo.FreeMemory
+	systemSwapFreeMemory := systemInfo.FreeSwap
 	slog.Info("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory))

-	if !(len(gpus) == 1 && gpus[0].Library == "cpu") {
 	for _, gpu := range gpus {
-		available := gpu.FreeMemory - envconfig.GpuOverhead() - gpu.MinimumMemory
-		if gpu.FreeMemory < envconfig.GpuOverhead()+gpu.MinimumMemory {
+		available := gpu.FreeMemory - envconfig.GpuOverhead() - gpu.MinimumMemory()
+		if gpu.FreeMemory < envconfig.GpuOverhead()+gpu.MinimumMemory() {
 			available = 0
 		}
 		slog.Info("gpu memory", "id", gpu.ID, "library", gpu.Library,
 			"available", format.HumanBytes2(available),
 			"free", format.HumanBytes2(gpu.FreeMemory),
-			"minimum", format.HumanBytes2(gpu.MinimumMemory),
+			"minimum", format.HumanBytes2(gpu.MinimumMemory()),
 			"overhead", format.HumanBytes2(envconfig.GpuOverhead()))
 	}
-	}

 	pastAllocations := make(map[uint64]struct{})
 	var backoff float32

@@ -762,7 +721,6 @@ nextOperation:
 		if err != nil {
 			return nil, err
 		}

 		slog.Debug("new layout created", "layers", newGPULayers)

 		s.loadRequest.GPULayers = newGPULayers
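The per-GPU accounting above subtracts a fixed overhead and the device's minimum reservation from free VRAM, clamping at zero so the unsigned subtraction cannot wrap around. A minimal standalone sketch of that guard (the byte values are made up):

package main

import "fmt"

// availableVRAM clamps free-minus-reserved at zero rather than letting the
// uint64 subtraction underflow to a huge value.
func availableVRAM(free, overhead, minimum uint64) uint64 {
	if free < overhead+minimum {
		return 0
	}
	return free - overhead - minimum
}

func main() {
	const mib = uint64(1) << 20
	fmt.Println(availableVRAM(8192*mib, 256*mib, 457*mib)) // plenty left for layers
	fmt.Println(availableVRAM(512*mib, 256*mib, 457*mib))  // would underflow; reports 0
}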
@@ -864,20 +822,27 @@ func uniqueDeviceIDs(gpuLayers ml.GPULayersList) []ml.DeviceID {
 // - Calculating how much space each GPU has available for layers, based on free memory and space occupied by the graph
 // - Assigning layers
 // - Ensuring that we don't exceed limits, such as requirements about partial offloading or system memory
-func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs discover.GpuInfoList, memory *ml.BackendMemory, requireFull bool, backoff float32) (ml.GPULayersList, error) {
-	if s.totalLayers == 0 || s.options.NumGPU == 0 || len(systemGPUs) == 0 || (len(systemGPUs) == 1 && systemGPUs[0].Library == "cpu") {
-		return ml.GPULayersList{}, nil
-	}
-
-	gpus := append(make(discover.GpuInfoList, 0, len(systemGPUs)), systemGPUs...)
-	sort.Sort(sort.Reverse(discover.ByFreeMemory(gpus)))
-
+func (s *ollamaServer) createLayout(systemInfo ml.SystemInfo, systemGPUs []ml.DeviceInfo, memory *ml.BackendMemory, requireFull bool, backoff float32) (ml.GPULayersList, error) {
 	if memory == nil {
 		memory = &ml.BackendMemory{CPU: ml.DeviceMemory{
 			Weights: make([]uint64, s.totalLayers),
 			Cache: make([]uint64, s.totalLayers),
 		}}
 	}
+	gpuLayers, layers, err := s.buildLayout(systemGPUs, memory, requireFull, backoff)
+	if err != nil {
+		return nil, err
+	}
+	err = s.verifyLayout(systemInfo, memory, requireFull, gpuLayers, layers)
+	if err != nil {
+		return nil, err
+	}
+	return gpuLayers, nil
+}
+
+func (s *ollamaServer) buildLayout(systemGPUs []ml.DeviceInfo, memory *ml.BackendMemory, requireFull bool, backoff float32) (ml.GPULayersList, []uint64, error) {
+	gpus := append(make([]ml.DeviceInfo, 0, len(systemGPUs)), systemGPUs...)
+	sort.Sort(sort.Reverse(ml.ByFreeMemory(gpus)))
+
 	layers := make([]uint64, len(memory.CPU.Weights))
 	for i := range layers {
@@ -891,7 +856,7 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs discover.GpuInfoList, memory *ml.BackendMemory, requireFull bool, backoff float32) (ml.GPULayersList, error) {
 	}

 	gpuLayers := ml.GPULayersList{}
-	for _, gl := range gpus.ByLibrary() {
+	for _, gl := range ml.ByLibrary(gpus) {
 		// If a GPU already has a graph allocated on it, then we should continue to use it.
 		// Otherwise, we lose information that we got from previous allocations, which can
 		// cause cycling. Plus, we get more information about required allocation from each
@@ -905,7 +870,7 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs discover.GpuInfoList, memory *ml.BackendMemory, requireFull bool, backoff float32) (ml.GPULayersList, error) {
 				lastUsedGPU = i
 			}

-			reserved := uint64(float32(gl[i].FreeMemory)*backoff) + gl[i].MinimumMemory + envconfig.GpuOverhead() + memory.GPUs[j].Graph
+			reserved := uint64(float32(gl[i].FreeMemory)*backoff) + gl[i].MinimumMemory() + envconfig.GpuOverhead() + memory.GPUs[j].Graph
 			if gl[i].FreeMemory > reserved {
 				gl[i].FreeMemory -= reserved
 			} else {
@@ -914,7 +879,7 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs discover.GpuInfoList, memory *ml.BackendMemory, requireFull bool, backoff float32) (ml.GPULayersList, error) {

 			slog.Debug("available gpu", "id", gl[i].ID, "library", gl[i].Library,
 				"available layer vram", format.HumanBytes2(gl[i].FreeMemory),
-				"backoff", fmt.Sprintf("%.2f", backoff), "minimum", format.HumanBytes2(gl[i].MinimumMemory),
+				"backoff", fmt.Sprintf("%.2f", backoff), "minimum", format.HumanBytes2(gl[i].MinimumMemory()),
 				"overhead", format.HumanBytes2(envconfig.GpuOverhead()),
 				"graph", format.HumanBytes2(memory.GPUs[j].Graph))

@@ -933,7 +898,11 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs discover.GpuInfoList, memory *ml.BackendMemory, requireFull bool, backoff float32) (ml.GPULayersList, error) {
 			gpuLayers = libraryGpuLayers
 		}
 	}
+	return gpuLayers, layers, nil
+}
+
+// verifyLayout ensures that we don't exceed limits, such as requirements about partial offloading or system memory
+func (s *ollamaServer) verifyLayout(systemInfo ml.SystemInfo, memory *ml.BackendMemory, requireFull bool, gpuLayers ml.GPULayersList, layers []uint64) error {
 	// These sizes will only increase as we go through additional iterations and get additional information.
 	cpuSize := memory.InputWeights + memory.CPU.Graph
 	var vramSize uint64
@@ -961,24 +930,24 @@ nextLayer:

 	if requireFull {
 		if gpuLayers.Sum() < len(layers) && (s.options.NumGPU < 0 || gpuLayers.Sum() < s.options.NumGPU) {
-			return nil, ErrLoadRequiredFull
+			return ErrLoadRequiredFull
 		}

-		if cpuSize > systemInfo.System.FreeMemory {
-			return nil, ErrLoadRequiredFull
+		if cpuSize > systemInfo.FreeMemory {
+			return ErrLoadRequiredFull
 		}
 	}

 	// On linux and windows, over-allocating CPU memory will almost always result in an error
 	// Darwin has fully dynamic swap so has no direct concept of free swap space
 	if runtime.GOOS != "darwin" {
-		available := systemInfo.System.FreeMemory + systemInfo.System.FreeSwap
+		available := systemInfo.FreeMemory + systemInfo.FreeSwap
 		if cpuSize > available {
-			slog.Warn("model request too large for system", "requested", format.HumanBytes2(cpuSize), "available", format.HumanBytes2(available), "total", format.HumanBytes2(systemInfo.System.TotalMemory), "free", format.HumanBytes2(systemInfo.System.FreeMemory), "swap", format.HumanBytes2(systemInfo.System.FreeSwap))
-			return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(cpuSize), format.HumanBytes2(available))
+			slog.Warn("model request too large for system", "requested", format.HumanBytes2(cpuSize), "available", format.HumanBytes2(available), "total", format.HumanBytes2(systemInfo.TotalMemory), "free", format.HumanBytes2(systemInfo.FreeMemory), "swap", format.HumanBytes2(systemInfo.FreeSwap))
+			return fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(cpuSize), format.HumanBytes2(available))
 		}
 	} else {
-		if vramSize > systemInfo.System.TotalMemory {
+		if vramSize > systemInfo.TotalMemory {
 			// disable partial offloading when model is greater than total system memory as this
 			// can lead to locking up the system
 			s.options.NumGPU = 0
@@ -990,11 +959,11 @@ nextLayer:
 		slog.Debug("insufficient VRAM to load any model layers")
 	}

-	return gpuLayers, nil
+	return nil
 }

 // assignLayers packs the maximum number of layers onto the smallest set of GPUs and comes up with a layer assignment
-func assignLayers(layers []uint64, gpus discover.GpuInfoList, requireFull bool, requestedLayers int, lastUsedGPU int) (gpuLayers ml.GPULayersList) {
+func assignLayers(layers []uint64, gpus []ml.DeviceInfo, requireFull bool, requestedLayers int, lastUsedGPU int) (gpuLayers ml.GPULayersList) {
 	// If we can't fit everything then prefer offloading layers other than the output layer
 	for range 2 {
 		// requestedLayers may be -1 if nothing was requested
@@ -1028,7 +997,7 @@ func assignLayers(layers []uint64, gpus discover.GpuInfoList, requireFull bool, requestedLayers int, lastUsedGPU int) (gpuLayers ml.GPULayersList) {
 // findBestFit binary searches to find the smallest capacity factor that can fit
 // the max number of layers. The capacity factor is multiplied by the free space on
 // each GPU and a small one will force even balancing.
-func findBestFit(layers []uint64, gpus discover.GpuInfoList, requestedLayers int, forceRequest bool) (gpuLayers ml.GPULayersList) {
+func findBestFit(layers []uint64, gpus []ml.DeviceInfo, requestedLayers int, forceRequest bool) (gpuLayers ml.GPULayersList) {
 	var high float32 = 1
 	var low float32 = 0

@@ -1053,12 +1022,11 @@ func findBestFit(layers []uint64, gpus discover.GpuInfoList, requestedLayers int, forceRequest bool) (gpuLayers ml.GPULayersList) {
 			low = mid
 		}
 	}

 	return bestAssignments
 }

 // greedyFit assigns layers incrementally to GPUs, spilling over as each runs out of free space
-func greedyFit(layers []uint64, gpus discover.GpuInfoList, capacity float32, requestedLayers int) (gpuLayers ml.GPULayersList) {
+func greedyFit(layers []uint64, gpus []ml.DeviceInfo, capacity float32, requestedLayers int) (gpuLayers ml.GPULayersList) {
 	device := len(gpus) - 1
 	gpuLayers = ml.GPULayersList{{DeviceID: gpus[device].DeviceID}}
 	freeSpace := uint64(float32(gpus[device].FreeMemory) * capacity)

@@ -1082,7 +1050,6 @@ func greedyFit(layers []uint64, gpus discover.GpuInfoList, capacity float32, requestedLayers int) (gpuLayers ml.GPULayersList) {
 			freeSpace = uint64(float32(gpus[device].FreeMemory) * capacity)
 		}
 	}
-
 	return gpuLayers
 }
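The two functions above implement the layer packing strategy: a greedy pass fills GPUs at a given capacity factor, and a binary search shrinks that factor to the smallest value that still offloads the same number of layers, which balances the split. The following is a compact re-implementation of that idea for illustration only (simplified signatures, not the repository code):

package main

import "fmt"

// greedy packs layers onto GPUs back to front, spilling to the next GPU when
// the scaled budget runs out. capacity scales each GPU's free space.
func greedy(layers, free []uint64, capacity float64) [][]int {
	out := make([][]int, len(free))
	g := len(free) - 1
	budget := uint64(float64(free[g]) * capacity)
	for i := len(layers) - 1; i >= 0; i-- {
		for g >= 0 && layers[i] > budget {
			g--
			if g >= 0 {
				budget = uint64(float64(free[g]) * capacity)
			}
		}
		if g < 0 {
			break
		}
		budget -= layers[i]
		out[g] = append(out[g], i)
	}
	return out
}

func count(a [][]int) int {
	n := 0
	for _, l := range a {
		n += len(l)
	}
	return n
}

// bestFit binary searches for the smallest capacity factor that still fits the
// maximum number of layers, forcing a more even balance across GPUs.
func bestFit(layers, free []uint64) [][]int {
	best := greedy(layers, free, 1)
	want := count(best)
	low, high := 0.0, 1.0
	for range 20 {
		mid := (low + high) / 2
		if cand := greedy(layers, free, mid); count(cand) == want {
			best, high = cand, mid
		} else {
			low = mid
		}
	}
	return best
}

func main() {
	layers := []uint64{100, 100, 100, 100}
	free := []uint64{300, 500}
	fmt.Println(bestFit(layers, free)) // per-GPU layer assignment after balancing
}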
@@ -1814,7 +1781,7 @@ func (s *ollamaServer) VRAMByGPU(id ml.DeviceID) uint64 {
 }

 func (s *ollamaServer) GetDeviceInfos(ctx context.Context) []ml.DeviceInfo {
-	devices, err := discover.GetDevicesFromRunner(ctx, s)
+	devices, err := ml.GetDevicesFromRunner(ctx, s)
 	if err != nil {
 		if s.cmd != nil && s.cmd.ProcessState == nil {
 			// Still running but hit an error, log

@@ -8,7 +8,6 @@ import (
 	"testing"

 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/ml"
 	"golang.org/x/sync/semaphore"
@@ -20,6 +19,8 @@ func TestLLMServerFitGPU(t *testing.T) {
 		free int
 	}

+	minMemory := 457 * format.MebiByte
+
 	tests := []struct {
 		name string
 		gpus []gpu
@@ -37,91 +38,91 @@ func TestLLMServerFitGPU(t *testing.T) {
 		},
 		{
 			name: "Full single GPU",
-			gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
+			gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256*format.MebiByte + minMemory}},
 			layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
 			numGPU: -1,
 			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{0, 1, 2}}},
 		},
 		{
 			name: "Partial single GPU",
-			gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
+			gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256*format.MebiByte + minMemory}},
 			layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
 			numGPU: -1,
 			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1, 2}}},
 		},
 		{
 			name: "Single GPU with numGPU 1",
-			gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
+			gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256*format.MebiByte + minMemory}},
 			layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
 			numGPU: 1,
 			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1}}},
 		},
 		{
 			name: "Single GPU with numGPU 0",
-			gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
+			gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256*format.MebiByte + minMemory}},
 			layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
 			numGPU: 0,
 			expected: ml.GPULayersList{},
 		},
 		{
 			name: "Single GPU with numGPU 999",
-			gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
+			gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256*format.MebiByte + minMemory}},
 			layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
 			numGPU: 999,
 			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{0, 1, 2, 3}}},
 		},
 		{
 			name: "Multi GPU fits on one",
-			gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
+			gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{ID: "gpu1"}, free: 256*format.MebiByte + minMemory}},
 			layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
 			numGPU: -1,
 			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0, 1, 2}}},
 		},
 		{
 			name: "Multi GPU split",
-			gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
+			gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{ID: "gpu1"}, free: 256*format.MebiByte + minMemory}},
 			layers: []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
 			numGPU: -1,
 			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1, 2}}},
 		},
 		{
 			name: "Multi GPU partial",
-			gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
+			gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{ID: "gpu1"}, free: 256*format.MebiByte + minMemory}},
 			layers: []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
 			numGPU: -1,
 			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{1}}},
 		},
 		{
 			name: "Multi GPU numGPU 1",
-			gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
+			gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{ID: "gpu1"}, free: 256*format.MebiByte + minMemory}},
 			layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
 			numGPU: 1,
 			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{1}}},
 		},
 		{
 			name: "Multi GPU numGPU 2",
-			gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
+			gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{ID: "gpu1"}, free: 256*format.MebiByte + minMemory}},
 			layers: []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
 			numGPU: 2,
 			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1}}},
 		},
 		{
 			name: "Multi GPU numGPU 999",
-			gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
+			gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{ID: "gpu1"}, free: 256*format.MebiByte + minMemory}},
 			layers: []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
 			numGPU: 999,
 			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0, 1}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{2}}},
 		},
 		{
 			name: "Multi GPU different libraries",
-			gpus: []gpu{{id: ml.DeviceID{Library: "CUDA", ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{Library: "ROCm", ID: "gpu1"}, free: 256 * format.MebiByte}},
+			gpus: []gpu{{id: ml.DeviceID{Library: "CUDA", ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{Library: "ROCm", ID: "gpu1"}, free: 256*format.MebiByte + minMemory}},
 			layers: []int{128 * format.MebiByte, 128 * format.MebiByte, 50 * format.MebiByte},
 			numGPU: -1,
 			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1", Library: "ROCm"}, Layers: []int{0, 1}}},
 		},
 		{
 			name: "requireFull",
-			gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
+			gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256*format.MebiByte + minMemory}},
 			layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
 			numGPU: -1,
 			requireFull: true,
@@ -139,12 +140,12 @@ func TestLLMServerFitGPU(t *testing.T) {

 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			var systemInfo discover.SystemInfo
-			systemInfo.System.TotalMemory = format.GibiByte
-			systemInfo.System.FreeMemory = 512 * format.MebiByte
-			systemInfo.System.FreeSwap = 256 * format.MebiByte
+			var systemInfo ml.SystemInfo
+			systemInfo.TotalMemory = format.GibiByte
+			systemInfo.FreeMemory = 512 * format.MebiByte
+			systemInfo.FreeSwap = 256 * format.MebiByte

-			gpus := make(discover.GpuInfoList, len(tt.gpus))
+			gpus := make([]ml.DeviceInfo, len(tt.gpus))
 			for i := range tt.gpus {
 				gpus[i].DeviceID = tt.gpus[i].id
 				gpus[i].FreeMemory = uint64(tt.gpus[i].free)

215  ml/device.go
@@ -3,15 +3,21 @@ package ml
 import (
 	"context"
 	"encoding/binary"
+	"encoding/json"
 	"fmt"
 	"hash/maphash"
+	"io"
 	"log/slog"
+	"net/http"
+	"runtime"
 	"slices"
 	"sort"
 	"strconv"
 	"strings"
+	"time"

 	"github.com/ollama/ollama/format"
+	"github.com/ollama/ollama/logutil"
 )

 // GPULayers is a set of layers to be allocated on a single GPU
@@ -282,6 +288,20 @@ type DeviceInfo struct {
 	LibraryPath []string
 }

+type SystemInfo struct {
+	// ThreadCount is the optimal number of threads to use for inference
+	ThreadCount int `json:"threads,omitempty"`
+
+	// TotalMemory is the total amount of system memory
+	TotalMemory uint64 `json:"total_memory,omitempty"`
+
+	// FreeMemory is the amount of memory currently available on the system for loading models
+	FreeMemory uint64 `json:"free_memory,omitempty"`
+
+	// FreeSwap is the amount of system swap space reported as available
+	FreeSwap uint64 `json:"free_swap,omitempty"`
+}
+
 func (d DeviceInfo) Compute() string {
 	// AMD gfx is encoded into the major minor in hex form
 	if strings.EqualFold(d.Library, "ROCm") {
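SystemInfo carries json tags so the runner can report it over HTTP. The struct definition below is copied from this change; the surrounding round-trip program is only an illustration of how those tags serialize:

package main

import (
	"encoding/json"
	"fmt"
)

// SystemInfo mirrors the struct added above; only the JSON round-trip is illustrative.
type SystemInfo struct {
	ThreadCount int    `json:"threads,omitempty"`
	TotalMemory uint64 `json:"total_memory,omitempty"`
	FreeMemory  uint64 `json:"free_memory,omitempty"`
	FreeSwap    uint64 `json:"free_swap,omitempty"`
}

func main() {
	si := SystemInfo{ThreadCount: 8, TotalMemory: 32 << 30, FreeMemory: 20 << 30, FreeSwap: 4 << 30}
	b, _ := json.Marshal(si)
	fmt.Println(string(b)) // {"threads":8,"total_memory":...,"free_memory":...,"free_swap":...}

	var back SystemInfo
	_ = json.Unmarshal(b, &back)
	fmt.Println(back.FreeMemory == si.FreeMemory) // true
}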
@@ -294,6 +314,71 @@ func (d DeviceInfo) Driver() string {
 	return strconv.Itoa(d.DriverMajor) + "." + strconv.Itoa(d.DriverMinor)
 }

+// MinimumMemory reports the amount of memory that should be set aside
+// on the device for overhead (e.g. VRAM consumed by context structures independent
+// of model allocations)
+func (d DeviceInfo) MinimumMemory() uint64 {
+	if d.Library == "Metal" {
+		return 512 * format.MebiByte
+	}
+	return 457 * format.MebiByte
+}
+
+// Sort by Free Space.
+// iGPUs are reported first, thus Reverse() yields the largest discrete GPU first
+type ByFreeMemory []DeviceInfo
+
+func (a ByFreeMemory) Len() int      { return len(a) }
+func (a ByFreeMemory) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
+func (a ByFreeMemory) Less(i, j int) bool {
+	if a[i].Integrated && !a[j].Integrated {
+		return true
+	} else if !a[i].Integrated && a[j].Integrated {
+		return false
+	}
+	return a[i].FreeMemory < a[j].FreeMemory
+}
+
+func ByLibrary(l []DeviceInfo) [][]DeviceInfo {
+	resp := [][]DeviceInfo{}
+	libs := []string{}
+	for _, info := range l {
+		found := false
+		requested := info.Library
+		for i, lib := range libs {
+			if lib == requested {
+				resp[i] = append(resp[i], info)
+				found = true
+				break
+			}
+		}
+		if !found {
+			libs = append(libs, requested)
+			resp = append(resp, []DeviceInfo{info})
+		}
+	}
+	return resp
+}
+
+func LibraryPaths(l []DeviceInfo) []string {
+	var gpuLibs []string
+	for _, gpu := range l {
+		for _, dir := range gpu.LibraryPath {
+			needed := true
+			for _, existing := range gpuLibs {
+				if dir == existing {
+					needed = false
+					break
+				}
+			}
+			if needed {
+				gpuLibs = append(gpuLibs, dir)
+			}
+		}
+	}
+	return gpuLibs
+}
+
 type DeviceComparison int

 const (
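ByFreeMemory sorts integrated GPUs first and then by ascending free memory, so reversing the order puts the largest discrete GPU at the front even when an iGPU reports more (shared) memory. A small self-contained demonstration of that ordering, using a stand-in struct with just the two fields the comparator reads:

package main

import (
	"fmt"
	"sort"
)

// dev mirrors only the DeviceInfo fields the comparator uses.
type dev struct {
	ID         string
	Integrated bool
	FreeMemory uint64
}

// byFreeMemory reproduces the ordering added above: integrated first, then
// ascending free memory, so Reverse() yields the largest discrete GPU first.
type byFreeMemory []dev

func (a byFreeMemory) Len() int      { return len(a) }
func (a byFreeMemory) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a byFreeMemory) Less(i, j int) bool {
	if a[i].Integrated != a[j].Integrated {
		return a[i].Integrated
	}
	return a[i].FreeMemory < a[j].FreeMemory
}

func main() {
	gpus := []dev{
		{ID: "igpu", Integrated: true, FreeMemory: 64 << 30},
		{ID: "small", FreeMemory: 8 << 30},
		{ID: "big", FreeMemory: 24 << 30},
	}
	sort.Sort(sort.Reverse(byFreeMemory(gpus)))
	for _, g := range gpus {
		fmt.Println(g.ID)
	}
	// Prints big, small, igpu: the iGPU lands last despite reporting the most free memory.
}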
@@ -336,3 +421,133 @@ func (a DeviceInfo) IsBetter(b DeviceInfo) bool {
 	sort.Sort(sort.Reverse(sort.StringSlice(cmp)))
 	return cmp[0] == bLibSplit[1]
 }
+
+// For each GPU, check if it does NOT support flash attention
+func FlashAttentionSupported(l []DeviceInfo) bool {
+	for _, gpu := range l {
+		supportsFA := gpu.Library == "cpu" ||
+			gpu.Name == "Metal" || gpu.Library == "Metal" ||
+			(gpu.Library == "CUDA" && gpu.DriverMajor >= 7 && !(gpu.ComputeMajor == 7 && gpu.ComputeMinor == 2)) ||
+			gpu.Library == "ROCm"
+
+		if !supportsFA {
+			return false
+		}
+	}
+	return true
+}
+
+// Given the list of GPUs this instantiation is targeted for,
+// figure out the visible devices environment variables
+func GetVisibleDevicesEnv(l []DeviceInfo) map[string]string {
+	if len(l) == 0 {
+		return nil
+	}
+	env := map[string]string{}
+	for _, d := range l {
+		d.updateVisibleDevicesEnv(env)
+	}
+	return env
+}
+
+func (d DeviceInfo) updateVisibleDevicesEnv(env map[string]string) {
+	var envVar string
+	switch d.Library {
+	case "ROCm":
+		envVar = "ROCR_VISIBLE_DEVICES"
+		if runtime.GOOS != "linux" {
+			envVar = "HIP_VISIBLE_DEVICES"
+		}
+	case "Vulkan":
+		envVar = "GGML_VK_VISIBLE_DEVICES"
+	default:
+		return
+	}
+	v, existing := env[envVar]
+	if existing {
+		v = v + ","
+	}
+	if d.FilteredID != "" {
+		v = v + d.FilteredID
+	} else {
+		v = v + d.ID
+	}
+	env[envVar] = v
+}
+
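GetVisibleDevicesEnv accumulates a comma-separated device list per backend-specific environment variable. The snippet below only mirrors the shape of that output for two of the backends, with a simplified signature; the real logic lives on ml.DeviceInfo as shown above:

package main

import "fmt"

// visibleDevices is a simplified illustration: ROCm uses ROCR_VISIBLE_DEVICES
// on Linux (HIP_VISIBLE_DEVICES elsewhere), Vulkan uses GGML_VK_VISIBLE_DEVICES,
// and other libraries fall through without setting anything.
func visibleDevices(lib string, ids []string) map[string]string {
	var key string
	switch lib {
	case "ROCm":
		key = "ROCR_VISIBLE_DEVICES"
	case "Vulkan":
		key = "GGML_VK_VISIBLE_DEVICES"
	default:
		return nil
	}
	env := map[string]string{}
	for _, id := range ids {
		if env[key] != "" {
			env[key] += ","
		}
		env[key] += id
	}
	return env
}

func main() {
	fmt.Println(visibleDevices("ROCm", []string{"0", "2"})) // map[ROCR_VISIBLE_DEVICES:0,2]
}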
+type BaseRunner interface {
+	// GetPort returns the localhost port number the runner is running on
+	GetPort() int
+
+	// HasExited indicates if the runner is no longer running. This can be used during
+	// bootstrap to detect if a given filtered device is incompatible and triggered an assert
+	HasExited() bool
+}
+
+type RunnerDiscovery interface {
+	BaseRunner
+
+	// GetDeviceInfos will perform a query of the underlying device libraries
+	// for device identification and free VRAM information
+	// During bootstrap scenarios, this routine may take seconds to complete
+	GetDeviceInfos(ctx context.Context) []DeviceInfo
+}
+
+type FilteredRunnerDiscovery interface {
+	RunnerDiscovery
+
+	// GetActiveDeviceIDs returns the filtered set of devices actively in
+	// use by this runner for running models. If the runner is a bootstrap runner, no devices
+	// will be active yet so no device IDs are returned.
+	// This routine will not query the underlying device and will return immediately
+	GetActiveDeviceIDs() []DeviceID
+}
+
+func GetDevicesFromRunner(ctx context.Context, runner BaseRunner) ([]DeviceInfo, error) {
+	var moreDevices []DeviceInfo
+	port := runner.GetPort()
+	tick := time.Tick(10 * time.Millisecond)
+	for {
+		select {
+		case <-ctx.Done():
+			return nil, fmt.Errorf("failed to finish discovery before timeout")
+		case <-tick:
+			r, err := http.NewRequestWithContext(ctx, http.MethodGet, fmt.Sprintf("http://127.0.0.1:%d/info", port), nil)
+			if err != nil {
+				return nil, fmt.Errorf("failed to create request: %w", err)
+			}
+			r.Header.Set("Content-Type", "application/json")
+
+			resp, err := http.DefaultClient.Do(r)
+			if err != nil {
+				// slog.Warn("failed to send request", "error", err)
+				if runner.HasExited() {
+					return nil, fmt.Errorf("runner crashed")
+				}
+				continue
+			}
+			defer resp.Body.Close()
+
+			if resp.StatusCode == http.StatusNotFound {
+				// old runner, fall back to bootstrapping model
+				return nil, fmt.Errorf("llamarunner free vram reporting not supported")
+			}
+
+			body, err := io.ReadAll(resp.Body)
+			if err != nil {
+				slog.Warn("failed to read response", "error", err)
+				continue
+			}
+			if resp.StatusCode != 200 {
+				logutil.Trace("runner failed to discover free VRAM", "status", resp.StatusCode, "response", body)
+				return nil, fmt.Errorf("runner error: %s", string(body))
+			}
+
+			if err := json.Unmarshal(body, &moreDevices); err != nil {
+				slog.Warn("unmarshal encode response", "error", err)
+				continue
+			}
+			return moreDevices, nil
+		}
+	}
+}
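GetDevicesFromRunner polls the runner's /info endpoint on a ticker until the runner answers, the context expires, or the process exits. The same ticker-plus-context shape in a stripped-down, standalone form (the endpoint and interval here are placeholders, not the runner API):

package main

import (
	"context"
	"fmt"
	"net/http"
	"net/http/httptest"
	"time"
)

// pollUntilReady retries a GET until it succeeds or the context expires,
// mirroring the structure of the discovery loop above.
func pollUntilReady(ctx context.Context, url string) error {
	tick := time.Tick(10 * time.Millisecond)
	for {
		select {
		case <-ctx.Done():
			return fmt.Errorf("gave up waiting: %w", ctx.Err())
		case <-tick:
			resp, err := http.Get(url)
			if err != nil {
				continue // not listening yet; try again on the next tick
			}
			resp.Body.Close()
			if resp.StatusCode == http.StatusOK {
				return nil
			}
		}
	}
}

func main() {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {}))
	defer srv.Close()

	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()
	fmt.Println(pollUntilReady(ctx, srv.URL)) // <nil>
}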
@@ -84,11 +84,11 @@ function buildCPU() {
         Remove-Item -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}"
         New-Item "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ItemType Directory -ea 0

-        & cmake --fresh --preset CPU --install-prefix $script:DIST_DIR
+        & cmake -B build\cpu --preset CPU --install-prefix $script:DIST_DIR
         if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-        & cmake --build --preset CPU --config Release --parallel $script:JOBS
+        & cmake --build build\cpu --target ggml-cpu --config Release --parallel $script:JOBS
         if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-        & cmake --install build --component CPU --strip
+        & cmake --install build\cpu --component CPU --strip
         if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
     }
 }
@@ -105,11 +105,11 @@ function buildCUDA11() {
         $hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V11")) { $x=$hashEnv[$_]; if (test-path -literalpath "$x\bin\nvcc.exe" ) { $cuda=$x} }}
         write-host "Building CUDA v11 backend libraries $cuda"
         $env:CUDAToolkit_ROOT=$cuda
-        & cmake --fresh --preset "CUDA 11" -T cuda="$cuda" -DCMAKE_CUDA_COMPILER="$cuda\bin\nvcc.exe" -G "Visual Studio 16 2019" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v11"
+        & cmake -B build\cuda_v11 --preset "CUDA 11" -T cuda="$cuda" -DCMAKE_CUDA_COMPILER="$cuda\bin\nvcc.exe" -G "Visual Studio 16 2019" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v11"
         if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-        & cmake --build --preset "CUDA 11" --config Release --parallel $script:JOBS
+        & cmake --build build\cuda_v11 --target ggml-cuda --config Release --parallel $script:JOBS
         if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-        & cmake --install build --component "CUDA" --strip
+        & cmake --install build\cuda_v11 --component "CUDA" --strip
         if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
     }
 }
@@ -124,11 +124,11 @@ function buildCUDA12() {
         $hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V12_8")) { $x=$hashEnv[$_]; if (test-path -literalpath "$x\bin\nvcc.exe" ) { $cuda=$x} }}
         write-host "Building CUDA v12 backend libraries $cuda"
         $env:CUDAToolkit_ROOT=$cuda
-        & cmake --fresh --preset "CUDA 12" -T cuda="$cuda" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v12"
+        & cmake -B build\cuda_v12 --preset "CUDA 12" -T cuda="$cuda" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v12"
         if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-        & cmake --build --preset "CUDA 12" --config Release --parallel $script:JOBS
+        & cmake --build build\cuda_v12 --target ggml-cuda --config Release --parallel $script:JOBS
         if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-        & cmake --install build --component "CUDA" --strip
+        & cmake --install build\cuda_v12 --component "CUDA" --strip
         if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
     }
 }
@@ -143,11 +143,11 @@ function buildCUDA13() {
         $hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V13")) { $x=$hashEnv[$_]; if (test-path -literalpath "$x\bin\nvcc.exe" ) { $cuda=$x} }}
         $env:CUDAToolkit_ROOT=$cuda
         write-host "Building CUDA v13 backend libraries $cuda"
-        & cmake --fresh --preset "CUDA 13" -T cuda="$cuda" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v13"
+        & cmake -B build\cuda_v13 --preset "CUDA 13" -T cuda="$cuda" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v13"
         if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-        & cmake --build --preset "CUDA 13" --config Release --parallel $script:JOBS
+        & cmake --build build\cuda_v13 --target ggml-cuda --config Release --parallel $script:JOBS
         if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-        & cmake --install build --component "CUDA" --strip
+        & cmake --install build\cuda_v13 --component "CUDA" --strip
         if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
     }
 }
@@ -165,7 +165,7 @@ function buildROCm() {
         $env:HIPCXX="${env:HIP_PATH}\bin\clang++.exe"
         $env:HIP_PLATFORM="amd"
         $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
-        & cmake --fresh --preset "ROCm 6" -G Ninja -DOLLAMA_RUNNER_DIR="rocm" `
+        & cmake --fresh -B build\rocm --preset "ROCm 6" -G Ninja -DOLLAMA_RUNNER_DIR="rocm" `
             -DCMAKE_C_COMPILER=clang `
             -DCMAKE_CXX_COMPILER=clang++ `
            -DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" `
@@ -175,9 +175,9 @@ function buildROCm() {
         $env:HIPCXX=""
         $env:HIP_PLATFORM=""
         $env:CMAKE_PREFIX_PATH=""
-        & cmake --build --preset "ROCm 6" --config Release --parallel $script:JOBS
+        & cmake --build build\rocm --target ggml-hip --config Release --parallel $script:JOBS
         if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-        & cmake --install build --component "HIP" --strip
+        & cmake --install build\rocm --component "HIP" --strip
         if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
         Remove-Item -Path $script:DIST_DIR\lib\ollama\rocm\rocblas\library\*gfx906* -ErrorAction SilentlyContinue
     }
@ -9,9 +9,9 @@ import (
|
||||||
|
|
||||||
"github.com/gin-gonic/gin"
|
"github.com/gin-gonic/gin"
|
||||||
"github.com/ollama/ollama/api"
|
"github.com/ollama/ollama/api"
|
||||||
"github.com/ollama/ollama/discover"
|
|
||||||
"github.com/ollama/ollama/fs/ggml"
|
"github.com/ollama/ollama/fs/ggml"
|
||||||
"github.com/ollama/ollama/llm"
|
"github.com/ollama/ollama/llm"
|
||||||
|
"github.com/ollama/ollama/ml"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestGenerateDebugRenderOnly(t *testing.T) {
|
func TestGenerateDebugRenderOnly(t *testing.T) {
|
||||||
|
|
@ -37,9 +37,9 @@ func TestGenerateDebugRenderOnly(t *testing.T) {
|
||||||
loaded: make(map[string]*runnerRef),
|
loaded: make(map[string]*runnerRef),
|
||||||
newServerFn: newMockServer(&mock),
|
newServerFn: newMockServer(&mock),
|
||||||
getGpuFn: getGpuFn,
|
getGpuFn: getGpuFn,
|
||||||
getCpuFn: getCpuFn,
|
getSystemInfoFn: getSystemInfoFn,
|
||||||
waitForRecovery: 250 * time.Millisecond,
|
waitForRecovery: 250 * time.Millisecond,
|
||||||
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
|
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
|
||||||
// add small delay to simulate loading
|
// add small delay to simulate loading
|
||||||
time.Sleep(time.Millisecond)
|
time.Sleep(time.Millisecond)
|
||||||
req.successCh <- &runnerRef{
|
req.successCh <- &runnerRef{
|
||||||
|
|
@ -230,9 +230,9 @@ func TestChatDebugRenderOnly(t *testing.T) {
|
||||||
loaded: make(map[string]*runnerRef),
|
loaded: make(map[string]*runnerRef),
|
||||||
newServerFn: newMockServer(&mock),
|
newServerFn: newMockServer(&mock),
|
||||||
getGpuFn: getGpuFn,
|
getGpuFn: getGpuFn,
|
||||||
getCpuFn: getCpuFn,
|
getSystemInfoFn: getSystemInfoFn,
|
||||||
waitForRecovery: 250 * time.Millisecond,
|
waitForRecovery: 250 * time.Millisecond,
|
||||||
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
|
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
|
||||||
// add small delay to simulate loading
|
// add small delay to simulate loading
|
||||||
time.Sleep(time.Millisecond)
|
time.Sleep(time.Millisecond)
|
||||||
req.successCh <- &runnerRef{
|
req.successCh <- &runnerRef{
|
||||||
|
|
|
||||||
|
|
@@ -12,9 +12,9 @@ import (
 	"github.com/google/go-cmp/cmp"

 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/ml"
 )

 // TestGenerateWithBuiltinRenderer tests that api/generate uses built-in renderers

@@ -42,9 +42,9 @@ func TestGenerateWithBuiltinRenderer(t *testing.T) {
 		loaded: make(map[string]*runnerRef),
 		newServerFn: newMockServer(&mock),
 		getGpuFn: getGpuFn,
-		getCpuFn: getCpuFn,
+		getSystemInfoFn: getSystemInfoFn,
 		waitForRecovery: 250 * time.Millisecond,
-		loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
+		loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
 			time.Sleep(time.Millisecond)
 			req.successCh <- &runnerRef{
 				llama: &mock,

@@ -226,9 +226,9 @@ func TestGenerateWithDebugRenderOnly(t *testing.T) {
 		loaded: make(map[string]*runnerRef),
 		newServerFn: newMockServer(&mock),
 		getGpuFn: getGpuFn,
-		getCpuFn: getCpuFn,
+		getSystemInfoFn: getSystemInfoFn,
 		waitForRecovery: 250 * time.Millisecond,
-		loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
+		loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
 			time.Sleep(time.Millisecond)
 			req.successCh <- &runnerRef{
 				llama: &mock,
@@ -17,9 +17,9 @@ import (
 	"github.com/google/go-cmp/cmp"

 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/ml"
 )

 type mockRunner struct {

@@ -48,8 +48,8 @@ func (mockRunner) Tokenize(_ context.Context, s string) (tokens []int, err error
 	return
 }

-func newMockServer(mock *mockRunner) func(discover.GpuInfoList, string, *ggml.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
-	return func(_ discover.GpuInfoList, _ string, _ *ggml.GGML, _, _ []string, _ api.Options, _ int) (llm.LlamaServer, error) {
+func newMockServer(mock *mockRunner) func(ml.SystemInfo, []ml.DeviceInfo, string, *ggml.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
+	return func(_ ml.SystemInfo, _ []ml.DeviceInfo, _ string, _ *ggml.GGML, _, _ []string, _ api.Options, _ int) (llm.LlamaServer, error) {
 		return mock, nil
 	}
 }

@@ -157,9 +157,9 @@ func TestGenerateChat(t *testing.T) {
 		loaded: make(map[string]*runnerRef),
 		newServerFn: newMockServer(&mock),
 		getGpuFn: getGpuFn,
-		getCpuFn: getCpuFn,
+		getSystemInfoFn: getSystemInfoFn,
 		waitForRecovery: 250 * time.Millisecond,
-		loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
+		loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
 			// add small delay to simulate loading
 			time.Sleep(time.Millisecond)
 			req.successCh <- &runnerRef{

@@ -768,9 +768,9 @@ func TestGenerate(t *testing.T) {
 		loaded: make(map[string]*runnerRef),
 		newServerFn: newMockServer(&mock),
 		getGpuFn: getGpuFn,
-		getCpuFn: getCpuFn,
+		getSystemInfoFn: getSystemInfoFn,
 		waitForRecovery: 250 * time.Millisecond,
-		loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
+		loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
 			// add small delay to simulate loading
 			time.Sleep(time.Millisecond)
 			req.successCh <- &runnerRef{

@@ -1193,9 +1193,9 @@ func TestChatWithPromptEndingInThinkTag(t *testing.T) {
 		loaded: make(map[string]*runnerRef),
 		newServerFn: newMockServer(mock),
 		getGpuFn: getGpuFn,
-		getCpuFn: getCpuFn,
+		getSystemInfoFn: getSystemInfoFn,
 		waitForRecovery: 250 * time.Millisecond,
-		loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
+		loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
 			time.Sleep(time.Millisecond)
 			req.successCh <- &runnerRef{llama: mock}
 			return false
@@ -14,9 +14,9 @@ import (

 	"github.com/gin-gonic/gin"
 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/ml"
 )

 func getTestTools() []api.Tool {

@@ -275,9 +275,9 @@ func TestChatHarmonyParserStreamingRealtime(t *testing.T) {
 		loaded: make(map[string]*runnerRef),
 		newServerFn: newMockServer(&mock),
 		getGpuFn: getGpuFn,
-		getCpuFn: getCpuFn,
+		getSystemInfoFn: getSystemInfoFn,
 		waitForRecovery: 100 * time.Millisecond,
-		loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
+		loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
 			req.successCh <- &runnerRef{
 				llama: &mock,
 			}

@@ -426,9 +426,9 @@ func TestChatHarmonyParserStreamingSimple(t *testing.T) {
 		loaded: make(map[string]*runnerRef),
 		newServerFn: newMockServer(&mock),
 		getGpuFn: getGpuFn,
-		getCpuFn: getCpuFn,
+		getSystemInfoFn: getSystemInfoFn,
 		waitForRecovery: 100 * time.Millisecond,
-		loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
+		loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
 			req.successCh <- &runnerRef{
 				llama: &mock,
 			}

@@ -608,9 +608,9 @@ func TestChatHarmonyParserStreaming(t *testing.T) {
 		loaded: make(map[string]*runnerRef),
 		newServerFn: newMockServer(&mock),
 		getGpuFn: getGpuFn,
-		getCpuFn: getCpuFn,
+		getSystemInfoFn: getSystemInfoFn,
 		waitForRecovery: 250 * time.Millisecond,
-		loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
+		loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
 			req.successCh <- &runnerRef{
 				llama: &mock,
 			}
server/sched.go (111 changed lines)

@@ -5,12 +5,9 @@ import (
 	"errors"
 	"fmt"
 	"log/slog"
-	"os"
 	"reflect"
-	"runtime"
 	"slices"
 	"sort"
-	"strconv"
 	"strings"
 	"sync"
 	"time"

@@ -52,12 +49,10 @@ type Scheduler struct {
 	activeLoading llm.LlamaServer
 	loaded map[string]*runnerRef

-	loadFn func(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, requireFull bool) bool
-	newServerFn func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
-	getGpuFn func(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList
-	getCpuFn func() discover.GpuInfo
-
-	// waitForRecovery sets the limit for how long to wait for memory usage to recover after unload before scheduling the next model
+	loadFn func(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool
+	newServerFn func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
+	getGpuFn func(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo
+	getSystemInfoFn func() ml.SystemInfo
 	waitForRecovery time.Duration
 }
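The struct change above is the heart of the refactor: the scheduler no longer takes discover.GpuInfoList/discover.GpuInfo, but a host-memory snapshot (ml.SystemInfo) plus a flat device list ([]ml.DeviceInfo), and every collaborator is a function field so tests can swap it out. A minimal, self-contained sketch of that dependency-injection pattern follows; SystemInfo, DeviceInfo and scheduler here are local stand-ins for illustration, not the real ollama types.

package main

import "fmt"

// Stand-ins for ml.SystemInfo and ml.DeviceInfo (illustrative only).
type SystemInfo struct {
	TotalMemory, FreeMemory uint64
}

type DeviceInfo struct {
	ID         string
	Library    string
	FreeMemory uint64
	Integrated bool
}

// A scheduler-like type that receives discovery as injected functions,
// mirroring getGpuFn / getSystemInfoFn in the diff above.
type scheduler struct {
	getGpuFn        func() []DeviceInfo
	getSystemInfoFn func() SystemInfo
}

func (s *scheduler) plan() string {
	sys := s.getSystemInfoFn()
	gpus := s.getGpuFn()
	return fmt.Sprintf("system free=%d bytes, gpus=%d", sys.FreeMemory, len(gpus))
}

func main() {
	// Production wiring would point these at real discovery; a test simply stubs them.
	s := &scheduler{
		getGpuFn:        func() []DeviceInfo { return []DeviceInfo{{ID: "0", Library: "Metal", FreeMemory: 12 << 30}} },
		getSystemInfoFn: func() SystemInfo { return SystemInfo{TotalMemory: 32 << 30, FreeMemory: 26 << 30} },
	}
	fmt.Println(s.plan())
}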
@@ -77,8 +72,8 @@ func InitScheduler(ctx context.Context) *Scheduler {
 		unloadedCh: make(chan any, maxQueue),
 		loaded: make(map[string]*runnerRef),
 		newServerFn: llm.NewLlamaServer,
-		getGpuFn: discover.GetGPUInfo,
-		getCpuFn: discover.GetCPUInfo,
+		getGpuFn: discover.GPUDevices,
+		getSystemInfoFn: discover.GetSystemInfo,
 		waitForRecovery: 5 * time.Second,
 	}
 	sched.loadFn = sched.load

@@ -133,6 +128,8 @@ func (s *Scheduler) Run(ctx context.Context) {
 }

 func (s *Scheduler) processPending(ctx context.Context) {
+	maxRunners := envconfig.MaxRunners()
+
 	for {
 		select {
 		case <-ctx.Done():

@@ -152,7 +149,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 			s.loadedMu.Lock()
 			runner := s.loaded[pending.model.ModelPath]
 			loadedCount := len(s.loaded)
-			runnersSnapshot := make([]discover.FilteredRunnerDiscovery, 0, len(s.loaded))
+			runnersSnapshot := make([]ml.FilteredRunnerDiscovery, 0, len(s.loaded))
 			for _, r := range s.loaded {
 				runnersSnapshot = append(runnersSnapshot, r)
 			}
@@ -167,39 +164,29 @@ func (s *Scheduler) processPending(ctx context.Context) {
 					pending.useLoadedRunner(runner, s.finishedReqCh)
 					break
 				}
-			} else if envconfig.MaxRunners() > 0 && loadedCount >= int(envconfig.MaxRunners()) {
+			} else if maxRunners > 0 && loadedCount >= int(maxRunners) {
 				slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount)
 				runnerToExpire = s.findRunnerToUnload()
 			} else {
 				// Either no models are loaded or below envconfig.MaxRunners
 				// Get a refreshed GPU list
-				var gpus discover.GpuInfoList
+				var gpus []ml.DeviceInfo
 				if pending.opts.NumGPU == 0 {
-					gpus = discover.GpuInfoList{s.getCpuFn()}
+					gpus = []ml.DeviceInfo{}
 				} else {
 					gpus = s.getGpuFn(ctx, runnersSnapshot)
 				}
-
-				if envconfig.MaxRunners() <= 0 {
-					// No user specified MaxRunners, so figure out what automatic setting to use
-					// If all GPUs have reliable free memory reporting, defaultModelsPerGPU * the number of GPUs
-					// if any GPU has unreliable free memory reporting, 1x the number of GPUs
-					allReliable := true
-					for _, gpu := range gpus {
-						if gpu.UnreliableFreeMemory {
-							allReliable = false
-							break
-						}
-					}
-					if allReliable {
-						// HACK
-						os.Setenv("OLLAMA_MAX_LOADED_MODELS", strconv.Itoa(defaultModelsPerGPU*len(gpus)))
-						slog.Debug("updating default concurrency", "OLLAMA_MAX_LOADED_MODELS", envconfig.MaxRunners(), "gpu_count", len(gpus))
+				systemInfo := s.getSystemInfoFn()
+				if maxRunners <= 0 {
+					// No user specified MaxRunners, so figure out what automatic setting to use for the next load attempt
+					if pending.opts.NumGPU == 0 {
+						// Need to get actual GPU list to set the correct default max models
+						g := s.getGpuFn(ctx, runnersSnapshot)
+						maxRunners = uint(defaultModelsPerGPU * max(len(g), 1))
 					} else {
-						// HACK
-						os.Setenv("OLLAMA_MAX_LOADED_MODELS", strconv.Itoa(len(gpus)))
-						slog.Info("one or more GPUs detected that are unable to accurately report free memory - disabling default concurrency")
+						maxRunners = uint(defaultModelsPerGPU * max(len(gpus), 1))
 					}
+					slog.Debug("updating default concurrency", "OLLAMA_MAX_LOADED_MODELS", maxRunners, "gpu_count", len(gpus))
 				}

 				// Load model for fitting
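With the os.Setenv/OLLAMA_MAX_LOADED_MODELS hack removed, the default concurrency is now a plain local value: when the user has not set a limit, maxRunners becomes defaultModelsPerGPU times the number of discovered GPUs, never dropping below one GPU's worth so CPU-only hosts still get a sensible default, and NumGPU==0 requests fetch the real GPU list only for this calculation. A rough, self-contained sketch of that rule; defaultModelsPerGPU is the package constant referenced in the diff, and the value 3 below is an assumption for illustration only.

package main

import "fmt"

const defaultModelsPerGPU = 3 // assumed value, for illustration only

// defaultMaxRunners mirrors the logic in processPending: an explicit user
// limit wins, otherwise scale with the GPU count, treating zero GPUs as one.
func defaultMaxRunners(userLimit uint, gpuCount int) uint {
	if userLimit > 0 {
		return userLimit
	}
	return uint(defaultModelsPerGPU * max(gpuCount, 1))
}

func main() {
	fmt.Println(defaultMaxRunners(0, 0)) // CPU only -> 3
	fmt.Println(defaultMaxRunners(0, 2)) // two GPUs -> 6
	fmt.Println(defaultMaxRunners(4, 2)) // user override wins -> 4
}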
@@ -215,14 +202,14 @@ func (s *Scheduler) processPending(ctx context.Context) {
 			if loadedCount == 0 {
 				// No models loaded. Load the model but prefer the best fit.
 				slog.Debug("loading first model", "model", pending.model.ModelPath)
-				s.loadFn(pending, ggml, gpus, false)
+				s.loadFn(pending, ggml, systemInfo, gpus, false)
 				break
 			}

 			// More than one loaded model, so we have to see if the
 			// new one fits

-			needEvict := s.loadFn(pending, ggml, gpus, true)
+			needEvict := s.loadFn(pending, ggml, systemInfo, gpus, true)
 			if !needEvict {
 				slog.Debug("new model fits with existing models, loading")
 				break

@@ -353,7 +340,7 @@ func (s *Scheduler) processCompleted(ctx context.Context) {
 			runner.refMu.Unlock()
 		} else {
 			slog.Debug("starting background wait for VRAM recovery", "runner", runner)
-			runnersSnapshot := make([]discover.FilteredRunnerDiscovery, 0, len(s.loaded))
+			runnersSnapshot := make([]ml.FilteredRunnerDiscovery, 0, len(s.loaded))
 			for _, r := range s.loaded {
 				runnersSnapshot = append(runnersSnapshot, r)
 			}

@@ -395,7 +382,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm

 // load creates a new model based on req and loads it. If requireFull is true then the model must be loaded fully onto GPUs
 // (if any). Returns whether the scheduler needs to evict a model to make this one fit.
-func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, requireFull bool) bool {
+func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool {
 	numParallel := max(int(envconfig.NumParallel()), 1)

 	// Embedding models should always be loaded with parallel=1

@@ -420,7 +407,7 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis

 	if llama == nil {
 		var err error
-		llama, err = s.newServerFn(gpus, req.model.ModelPath, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
+		llama, err = s.newServerFn(systemInfo, gpus, req.model.ModelPath, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
 		if err != nil {
 			// some older models are not compatible with newer versions of llama.cpp
 			// show a generalized compatibility error until there is a better way to
@@ -443,9 +430,16 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis

 	s.loadedMu.Unlock()

-	gpuIDs, err := llama.Load(req.ctx, gpus, requireFull)
+	gpuIDs, err := llama.Load(req.ctx, systemInfo, gpus, requireFull)
 	if err != nil {
 		if errors.Is(err, llm.ErrLoadRequiredFull) {
+			if !requireFull {
+				// No other models loaded, yet we still don't fit, so report an error
+				slog.Info("model is too large for system memory", "requireFull", requireFull)
+				s.activeLoading.Close()
+				s.activeLoading = nil
+				req.errCh <- err
+			}
 			return true
 		}

@@ -456,6 +450,20 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
 		return false
 	}

+	// Determine if we have discrete GPUs which we should monitor VRAM usage on during shutdown
+	discreteGPUs := false
+iGPUScan:
+	for _, devid := range gpuIDs {
+		for _, dev := range gpus {
+			if dev.DeviceID == devid {
+				if !dev.Integrated {
+					discreteGPUs = true
+					break iGPUScan
+				}
+			}
+		}
+	}
+
 	runner := &runnerRef{
 		model: req.model,
 		modelPath: req.model.ModelPath,
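The labelled loop added above is what feeds the new discreteGPUs flag: it cross-references the device IDs the runner actually landed on against the discovered device list and flips the flag as soon as one non-integrated GPU is found, so VRAM-recovery waiting can later be skipped for iGPU-only loads. A self-contained sketch of the same scan, written with local stand-in types rather than the real ml package:

package main

import "fmt"

type DeviceID struct{ Library, ID string } // stand-in for ml.DeviceID
type DeviceInfo struct {                   // stand-in for ml.DeviceInfo
	DeviceID
	Integrated bool
}

// hasDiscreteGPU reports whether any allocated device ID refers to a
// discrete (non-integrated) GPU in the discovered device list.
func hasDiscreteGPU(allocated []DeviceID, discovered []DeviceInfo) bool {
	for _, id := range allocated {
		for _, dev := range discovered {
			if dev.DeviceID == id && !dev.Integrated {
				return true
			}
		}
	}
	return false
}

func main() {
	devs := []DeviceInfo{
		{DeviceID: DeviceID{Library: "ROCm", ID: "0"}, Integrated: true},  // iGPU
		{DeviceID: DeviceID{Library: "CUDA", ID: "1"}, Integrated: false}, // discrete card
	}
	fmt.Println(hasDiscreteGPU([]DeviceID{{Library: "ROCm", ID: "0"}}, devs)) // false
	fmt.Println(hasDiscreteGPU([]DeviceID{{Library: "CUDA", ID: "1"}}, devs)) // true
}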
@@ -463,6 +471,7 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
 		Options: &req.opts,
 		sessionDuration: sessionDuration,
 		gpus: gpuIDs,
+		discreteGPUs: discreteGPUs,
 		vramSize: llama.VRAMSize(),
 		totalSize: llama.TotalSize(),
 		loading: true,

@@ -510,7 +519,10 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
 	return false
 }

-func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) {
+func (s *Scheduler) updateFreeSpace(allGpus []ml.DeviceInfo) {
+	if len(allGpus) == 0 {
+		return
+	}
 	predMap := map[ml.DeviceID]uint64{} // Sum up the total predicted usage per GPU for all runners
 	s.loadedMu.Lock()
 	runners := make([]*runnerRef, 0, len(s.loaded))

@@ -558,6 +570,7 @@ type runnerRef struct {
 	pid int
 	loading bool // True only during initial load, then false forever
 	gpus []ml.DeviceID // Recorded at time of provisioning
+	discreteGPUs bool // True if all devices are discrete GPUs - used to skip VRAM recovery check for iGPUs
 	vramSize uint64
 	totalSize uint64
@@ -627,14 +640,12 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool
 // a before and after GPU memory allocation. The returned channel
 // will be notified when we're done waiting, or have timed out and should
 // proceed anyway
-func (s *Scheduler) waitForVRAMRecovery(runner *runnerRef, runners []discover.FilteredRunnerDiscovery) chan any {
+func (s *Scheduler) waitForVRAMRecovery(runner *runnerRef, runners []ml.FilteredRunnerDiscovery) chan any {
 	finished := make(chan any, 1)

-	// CPU or Metal don't need checking, so no waiting required
-	// windows can page VRAM, only cuda currently can report accurate used vram usage
-	if len(runner.gpus) == 0 ||
-		(len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "Metal")) ||
-		(runtime.GOOS == "windows" && runner.gpus[0].Library != "CUDA") {
+	// CPU, Metal and iGPUs don't need checking, so no waiting required
+	if len(runner.gpus) == 0 || !runner.discreteGPUs ||
+		(len(runner.gpus) == 1 && runner.gpus[0].Library == "Metal") {
 		finished <- struct{}{}
 		slog.Debug("no need to wait for VRAM recovery", "runner", runner)
 		return finished

@@ -668,7 +679,11 @@ func (s *Scheduler) waitForVRAMRecovery(runner *runnerRef, runners []discover.Fi
 				totalMemoryNow += gpu.TotalMemory
 				freeMemoryNow += gpu.FreeMemory
 			}
-			logutil.Trace("gpu VRAM convergence", "percent", int(max(float32(freeMemoryNow-freeMemoryBefore), 0.0)/float32(runner.vramSize)*100))
+			if freeMemoryNow > freeMemoryBefore {
+				logutil.Trace("gpu VRAM convergence", "percent", int(float32(freeMemoryNow-freeMemoryBefore)/float32(runner.vramSize)*100))
+			} else {
+				logutil.Trace("gpu VRAM convergence", "percent", 0)
+			}
 			// If we're within ~75% of the estimated memory usage recovered, bail out
 			if float32(freeMemoryNow-freeMemoryBefore) > float32(runner.vramSize)*0.75 {
 				slog.Debug(fmt.Sprintf("gpu VRAM free memory converged after %0.2f seconds", time.Since(start).Seconds()), "free_before", format.HumanBytes2(freeMemoryBefore), "free_now", format.HumanBytes2(freeMemoryNow), "runner", runner)
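The reworked trace above also sidesteps an unsigned-arithmetic pitfall: freeMemoryNow and freeMemoryBefore are uint64, so freeMemoryNow-freeMemoryBefore wraps around to a huge value whenever free memory went down rather than up, and the old max(..., 0.0) guard could not catch that because the wrap happens before the conversion to float32. A tiny sketch of the guarded percentage calculation; recoveredPercent is a hypothetical helper for illustration, not a function in the repository.

package main

import "fmt"

// recoveredPercent returns how much of the runner's estimated VRAM usage has
// been freed again, clamped to 0 when free memory has not increased. The
// subtraction only happens after the comparison, so uint64 wrap-around is avoided.
func recoveredPercent(freeBefore, freeNow, vramSize uint64) int {
	if vramSize == 0 || freeNow <= freeBefore {
		return 0
	}
	return int(float32(freeNow-freeBefore) / float32(vramSize) * 100)
}

func main() {
	fmt.Println(recoveredPercent(8<<30, 14<<30, 8<<30)) // 75% of the estimate recovered
	fmt.Println(recoveredPercent(14<<30, 8<<30, 8<<30)) // free memory dropped -> 0, no wrap-around
}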
@@ -13,7 +13,6 @@ import (

 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/app/lifecycle"
-	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/llm"

@@ -50,11 +49,12 @@ func TestSchedLoad(t *testing.T) {
 		sessionDuration: &api.Duration{Duration: 2 * time.Second},
 	}
 	// Fail to load model first
-	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		return nil, errors.New("something failed to load model blah")
 	}
-	gpus := discover.GpuInfoList{}
-	s.load(req, f, gpus, false)
+	gpus := []ml.DeviceInfo{}
+	systemInfo := ml.SystemInfo{}
+	s.load(req, f, systemInfo, gpus, false)
 	require.Empty(t, req.successCh)
 	require.Len(t, req.errCh, 1)
 	s.loadedMu.Lock()

@@ -64,11 +64,11 @@ func TestSchedLoad(t *testing.T) {
 	require.Contains(t, err.Error(), "this model may be incompatible")

 	server := &mockLlm{vramSize: 10, vramByGPU: map[ml.DeviceID]uint64{}}
-	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		server.modelPath = model
 		return server, nil
 	}
-	s.load(req, f, gpus, false)
+	s.load(req, f, systemInfo, gpus, false)
 	select {
 	case err := <-req.errCh:
 		require.NoError(t, err)

@@ -82,7 +82,7 @@ func TestSchedLoad(t *testing.T) {

 	req.model.ModelPath = "dummy_model_path"
 	server.waitResp = errors.New("wait failure")
-	s.load(req, f, gpus, false)
+	s.load(req, f, systemInfo, gpus, false)
 	select {
 	case err := <-req.errCh:
 		require.Contains(t, err.Error(), "wait failure")

@@ -106,7 +106,7 @@ type reqBundle struct {
 	f *ggml.GGML
 }

-func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+func (scenario *reqBundle) newServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 	scenario.srv.modelPath = model
 	return scenario.srv, nil
 }

@@ -152,20 +152,20 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, vra
 	return b
 }

-func getGpuFn(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList {
+func getGpuFn(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo {
 	slog.Info("test getGpuFn called", "runners", runners)
-	g := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "metal"}}
+	g := ml.DeviceInfo{DeviceID: ml.DeviceID{Library: "Metal"}}
 	g.TotalMemory = 24 * format.GigaByte
 	g.FreeMemory = 12 * format.GigaByte
-	return []discover.GpuInfo{g}
+	return []ml.DeviceInfo{g}
 }

-func getCpuFn() discover.GpuInfo {
-	slog.Info("test getCpuFn called")
-	g := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "cpu"}}
-	g.TotalMemory = 32 * format.GigaByte
-	g.FreeMemory = 26 * format.GigaByte
-	return g
+func getSystemInfoFn() ml.SystemInfo {
+	slog.Info("test getSystemInfoFn called")
+	return ml.SystemInfo{
+		TotalMemory: 32 * format.GigaByte,
+		FreeMemory: 26 * format.GigaByte,
+	}
 }

 func TestSchedRequestsSameModelSameRequest(t *testing.T) {

@@ -174,7 +174,7 @@ func TestSchedRequestsSameModelSameRequest(t *testing.T) {
 	s := InitScheduler(ctx)
 	s.waitForRecovery = 10 * time.Millisecond
 	s.getGpuFn = getGpuFn
-	s.getCpuFn = getCpuFn
+	s.getSystemInfoFn = getSystemInfoFn
 	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond}, nil)
 	b := newScenarioRequest(t, ctx, "ollama-model-1", 11, &api.Duration{Duration: 0}, nil)
 	b.req.model = a.req.model

@@ -218,7 +218,7 @@ func TestSchedRequestsSimpleReloadSameModel(t *testing.T) {
 	s := InitScheduler(ctx)
 	s.waitForRecovery = 10 * time.Millisecond
 	s.getGpuFn = getGpuFn
-	s.getCpuFn = getCpuFn
+	s.getSystemInfoFn = getSystemInfoFn
 	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond}, nil)
 	b := newScenarioRequest(t, ctx, "ollama-model-1", 20, &api.Duration{Duration: 5 * time.Millisecond}, nil)
 	tmpModel := *a.req.model

@@ -251,12 +251,12 @@ func TestSchedRequestsSimpleReloadSameModel(t *testing.T) {
 	a.ctxDone()
 	// Report recovered VRAM usage
 	time.Sleep(1 * time.Millisecond)
-	s.getGpuFn = func(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList {
-		slog.Info("XXX altered getGpuFn called")
-		g := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "metal"}}
+	s.getGpuFn = func(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo {
+		slog.Info("altered getGpuFn called")
+		g := ml.DeviceInfo{DeviceID: ml.DeviceID{Library: "Metal"}}
 		g.TotalMemory = 24 * format.GigaByte
 		g.FreeMemory = 24 * format.GigaByte
-		return []discover.GpuInfo{g}
+		return []ml.DeviceInfo{g}
 	}
 	select {
 	case resp := <-b.req.successCh:

@@ -271,26 +271,26 @@ func TestSchedRequestsSimpleReloadSameModel(t *testing.T) {
 }

 func TestSchedRequestsMultipleLoadedModels(t *testing.T) {
-	ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
+	slog.Info("TestRequestsMultipleLoadedModels")
+	ctx, done := context.WithTimeout(t.Context(), 1000*time.Millisecond)
 	defer done()
 	s := InitScheduler(ctx)
 	s.waitForRecovery = 10 * time.Millisecond
-	s.getGpuFn = getGpuFn // 1 metal GPU
-	s.getCpuFn = getCpuFn // 1 CPU
+	s.getGpuFn = getGpuFn // 1 Metal GPU
+	s.getSystemInfoFn = getSystemInfoFn

 	// Multiple loaded models
-	a := newScenarioRequest(t, ctx, "model-a-1g-gpu", 1*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "metal"}: 1 * format.GigaByte})
+	a := newScenarioRequest(t, ctx, "model-a-1g-gpu", 1*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "Metal"}: 1 * format.GigaByte})
 	a.req.sessionDuration = &api.Duration{Duration: 5 * time.Millisecond}
-	b := newScenarioRequest(t, ctx, "model-b-10g-gpu", 10*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "metal"}: 10 * format.GigaByte})
+	b := newScenarioRequest(t, ctx, "model-b-10g-gpu", 10*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "Metal"}: 10 * format.GigaByte})
 	b.req.sessionDuration = &api.Duration{Duration: 5 * time.Millisecond}
 	c := newScenarioRequest(t, ctx, "model-c-10g-cpu", 10*format.GigaByte, nil, nil /* No GPU load */)
 	c.req.opts.NumGPU = 0 // CPU load, will be allowed
 	b.req.sessionDuration = &api.Duration{Duration: 10 * time.Millisecond} // longer than b to cause the scheduler to favor unloading b over c
-	d := newScenarioRequest(t, ctx, "model-d-10g-gpu", 13*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "metal"}: 13 * format.GigaByte}) // Needs prior unloaded
+	d := newScenarioRequest(t, ctx, "model-d-10g-gpu", 13*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "Metal"}: 13 * format.GigaByte}) // Needs prior unloaded

-	t.Setenv("OLLAMA_MAX_LOADED_MODELS", "1")
 	s.newServerFn = a.newServer
-	slog.Info("a")
+	slog.Info("Loading A")
 	s.pendingReqCh <- a.req
 	s.Run(ctx)
 	select {

@@ -309,7 +309,7 @@ func TestSchedRequestsMultipleLoadedModels(t *testing.T) {

 	t.Setenv("OLLAMA_MAX_LOADED_MODELS", "0")
 	s.newServerFn = b.newServer
-	slog.Info("b")
+	slog.Info("Loading B")
 	s.pendingReqCh <- b.req
 	select {
 	case resp := <-b.req.successCh:

@@ -327,7 +327,7 @@ func TestSchedRequestsMultipleLoadedModels(t *testing.T) {

 	// This is a CPU load with NumGPU = 0 so it should load
 	s.newServerFn = c.newServer
-	slog.Info("c")
+	slog.Info("Loading C")
 	s.pendingReqCh <- c.req
 	select {
 	case resp := <-c.req.successCh:

@@ -337,6 +337,7 @@ func TestSchedRequestsMultipleLoadedModels(t *testing.T) {
 	case err := <-c.req.errCh:
 		t.Fatal(err.Error())
 	case <-ctx.Done():
+		slog.Info("FAIL: scheduler state", "s.loaded", s.loaded)
 		t.Fatal("timeout")
 	}
 	s.loadedMu.Lock()

@@ -361,11 +362,11 @@ func TestSchedRequestsMultipleLoadedModels(t *testing.T) {
 	b.ctxDone()
 	// Report recovered VRAM usage so scheduler will finish waiting and unload
 	time.Sleep(1 * time.Millisecond)
-	s.getGpuFn = func(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList {
-		g := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "metal"}}
+	s.getGpuFn = func(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo {
+		g := ml.DeviceInfo{DeviceID: ml.DeviceID{Library: "Metal"}}
 		g.TotalMemory = 24 * format.GigaByte
 		g.FreeMemory = 24 * format.GigaByte
-		return []discover.GpuInfo{g}
+		return []ml.DeviceInfo{g}
 	}
 	select {
 	case resp := <-d.req.successCh:

@@ -404,7 +405,7 @@ func TestSchedGetRunner(t *testing.T) {
 	s := InitScheduler(ctx)
 	s.waitForRecovery = 10 * time.Millisecond
 	s.getGpuFn = getGpuFn
-	s.getCpuFn = getCpuFn
+	s.getSystemInfoFn = getSystemInfoFn
 	s.newServerFn = a.newServer
 	slog.Info("a")
 	successCh1a, errCh1a := s.GetRunner(a.ctx, a.req.model, a.req.opts, a.req.sessionDuration)

@@ -462,13 +463,14 @@ func TestSchedExpireRunner(t *testing.T) {
 	}

 	var f *ggml.GGML
-	gpus := discover.GpuInfoList{}
+	gpus := []ml.DeviceInfo{}
+	systemInfo := ml.SystemInfo{}
 	server := &mockLlm{vramSize: 10, vramByGPU: map[ml.DeviceID]uint64{}}
-	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		server.modelPath = model
 		return server, nil
 	}
-	s.load(req, f, gpus, false)
+	s.load(req, f, systemInfo, gpus, false)

 	select {
 	case err := <-req.errCh:

@@ -497,19 +499,15 @@ func TestSchedExpireRunner(t *testing.T) {

 // TODO - add one scenario that triggers the bogus finished event with positive ref count
 func TestSchedPrematureExpired(t *testing.T) {
-	ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
+	ctx, done := context.WithTimeout(t.Context(), 1000*time.Millisecond)
 	defer done()

 	// Same model, same request
-	scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, nil, nil)
+	scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, &api.Duration{Duration: 100 * time.Millisecond}, nil)
 	s := InitScheduler(ctx)
 	s.waitForRecovery = 10 * time.Millisecond
-	s.getGpuFn = func(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList {
-		g := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "metal"}}
-		g.TotalMemory = 24 * format.GigaByte
-		g.FreeMemory = 12 * format.GigaByte
-		return []discover.GpuInfo{g}
-	}
+	s.getGpuFn = getGpuFn
+	s.getSystemInfoFn = getSystemInfoFn
 	s.newServerFn = scenario1a.newServer
 	successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
 	require.Len(t, s.pendingReqCh, 1)

@@ -574,7 +572,7 @@ func TestSchedUseLoadedRunner(t *testing.T) {
 func TestSchedUpdateFreeSpace(t *testing.T) {
 	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
 	defer done()
-	gpus := discover.GpuInfoList{
+	gpus := []ml.DeviceInfo{
 		{
 			DeviceID: ml.DeviceID{
 				ID: "1",

@@ -756,8 +754,12 @@ func (s *mockLlm) ModelPath() string {
 	return s.modelPath
 }

-func (s *mockLlm) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error) {
+func (s *mockLlm) Load(ctx context.Context, sytemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) ([]ml.DeviceID, error) {
 	if requireFull {
+		if len(gpus) == 0 {
+			slog.Info("mockLlm.Load CPU based load")
+			return nil, nil
+		}
 		for _, g := range gpus {
 			if g.FreeMemory >= s.vramSize {
 				return []ml.DeviceID{g.DeviceID}, nil
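The extra branch in the mock's Load above mirrors the behaviour the scheduler now relies on for CPU-only loads and the CPU-inference OOM handling: with an empty device list a requireFull load is still considered satisfiable (it simply returns no GPU IDs), while a GPU load only succeeds when some device has enough free memory. A condensed, stand-alone sketch of that placement decision, using mock types and a stand-in error rather than the real llm package:

package main

import (
	"errors"
	"fmt"
)

type DeviceID struct{ Library, ID string }
type DeviceInfo struct {
	DeviceID
	FreeMemory uint64
}

var errLoadRequiredFull = errors.New("model does not fit fully on GPU") // stand-in sentinel

// placeFull picks devices for a full load: no devices means CPU inference
// (nil IDs, no error); otherwise the model must fit in one device's free VRAM.
func placeFull(gpus []DeviceInfo, vramNeeded uint64) ([]DeviceID, error) {
	if len(gpus) == 0 {
		return nil, nil // CPU based load
	}
	for _, g := range gpus {
		if g.FreeMemory >= vramNeeded {
			return []DeviceID{g.DeviceID}, nil
		}
	}
	return nil, errLoadRequiredFull
}

func main() {
	fmt.Println(placeFull(nil, 10<<30)) // CPU load: no device IDs, no error
	fmt.Println(placeFull([]DeviceInfo{{DeviceID: DeviceID{"CUDA", "0"}, FreeMemory: 24 << 30}}, 10<<30))
}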