From 3fe74fba42b8d496a1ab3e8298bdc9b8ffb0f336 Mon Sep 17 00:00:00 2001
From: Jesse Gross
Date: Mon, 19 May 2025 11:40:44 -0700
Subject: [PATCH] llm: Use first layer as memory buffer in estimation

This is a partial revert of 0478d44 "Fixed over vram allcation dure to
small initial layer sizes."

Previously we used the size of the first layer as an extra reserved
amount of space to buffer our memory estimates. The above commit
changed this to use the largest layer. However, this had performance
impacts on more models than the original commit was trying to fix.
This is just a heuristic with no ideal solution, so this goes back to
the historic behavior.

Fixes: #10765, #10756, #10752, #10726
---
 llm/memory.go | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/llm/memory.go b/llm/memory.go
index e9ed1738..05b3b2fd 100644
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -1,12 +1,9 @@
 package llm
 
 import (
-	"cmp"
 	"fmt"
 	"log/slog"
-	"maps"
 	"os"
-	"slices"
 	"strconv"
 	"strings"
 
@@ -125,10 +122,12 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	}
 
 	layers := f.Tensors().GroupLayers()
-	// add one layer (chosing the max layer) worth of memory as a buffer
-	layerSize = slices.MaxFunc(slices.Collect(maps.Values(layers)), func(a, b ggml.Layer) int {
-		return cmp.Compare(a.Size(), b.Size())
-	}).Size()
+	// add one layer worth of memory as a buffer
+	if blk0, ok := layers["blk.0"]; ok {
+		layerSize = blk0.Size()
+	} else {
+		slog.Warn("model missing blk.0 layer size")
+	}
 
 	var kvct string
 	if envconfig.FlashAttention() &&
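
To illustrate why the choice of buffer layer matters, here is a minimal,
self-contained Go sketch of the trade-off the commit message describes.
This is not the actual EstimateGPULayers implementation: the
estimateLayers helper, the layer sizes, and the free-VRAM figure are all
hypothetical.

    // estimate_sketch.go: a simplified model of the layer-fitting
    // heuristic. Real sizes come from the GGUF tensor metadata.
    package main

    import "fmt"

    // estimateLayers returns how many layers fit in freeVRAM after
    // reserving one layer's worth of memory as a safety buffer.
    func estimateLayers(layerSizes []uint64, buffer, freeVRAM uint64) int {
    	used := buffer // reserve the buffer up front
    	fit := 0
    	for _, size := range layerSizes {
    		if used+size > freeVRAM {
    			break
    		}
    		used += size
    		fit++
    	}
    	return fit
    }

    func main() {
    	// Hypothetical model: a small first layer (blk.0) followed by
    	// larger middle layers, with 1500 units of VRAM free.
    	layers := []uint64{100, 400, 400, 400, 400}
    	free := uint64(1500)

    	// Buffering with the largest layer (400) is conservative:
    	// fewer layers are offloaded, costing performance.
    	fmt.Println(estimateLayers(layers, 400, free)) // 3

    	// Buffering with the first layer (100) offloads more layers,
    	// at the risk of over-allocation when blk.0 is unusually small.
    	fmt.Println(estimateLayers(layers, 100, free)) // 4
    }

In this toy example the largest-layer buffer offloads 3 of 5 layers
while the first-layer buffer offloads 4, which is the performance
regression the patch reverts; neither choice is strictly correct, hence
the commit's framing of this as a heuristic.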