From a2cc8571c5b2b8f77a8a5e2f65cb7aaa56482dc4 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Tue, 13 May 2025 13:04:20 -0700 Subject: [PATCH] llm: Consistently track unassigned model data In some cases, if we fail to assign a piece of the model to a GPU then we lose track of this data. Although it doesn't change the memory allocation, it does affect the total size of the model reported by tools such as ollama ps (and also the percent offloaded). This makes it look like setting num_gpu isn't reflected in ollama ps, which isn't true but the offloading percent may appear to not change. Spreading the model across more GPUs will continue to impact the reported total size of the model. --- llm/memory.go | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/llm/memory.go b/llm/memory.go index b5a8dd5c..46472330 100644 --- a/llm/memory.go +++ b/llm/memory.go @@ -216,6 +216,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin if len(gpusWithSpace) > 0 { gpuZeroID = gpusWithSpace[0].i gpuAllocations[gpuZeroID] += gpuZeroOverhead + } else { + overflow += gpuZeroOverhead } // For all the layers, find where they can fit on the GPU(s) @@ -256,15 +258,17 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin } // Determine if we need to consider output then find where it fits - if memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) { - for j := len(gpusWithSpace); j > 0; j-- { - g := gpusWithSpace[layerCount%j] - used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload) - if g.g.FreeMemory > overhead+used+memoryLayerOutput { - gpuAllocations[g.i] += memoryLayerOutput - layerCounts[g.i]++ - layerCount++ - break + if memoryLayerOutput > 0 { + if opts.NumGPU < 0 || layerCount < opts.NumGPU { + for j := len(gpusWithSpace); j > 0; j-- { + g := gpusWithSpace[layerCount%j] + used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload) + if g.g.FreeMemory > overhead+used+memoryLayerOutput { + gpuAllocations[g.i] += memoryLayerOutput + layerCounts[g.i]++ + layerCount++ + break + } } }