mirror of
https://github.com/zebrajr/ollama.git
synced 2025-12-06 00:19:51 +01:00
llm: Use first layer as memory buffer in estimation
This is a partial revert of 0478d44 "Fixed over vram allcation dure to
small initial layer sizes."
Previously we used the size of the first layer as an extra reserved
amount of space to buffer our memory estimates. The above commit
changed this to use the largest layer. However, this had performance
impacts on more models than the original commit was trying to fix.
There is just a heuristic without an ideal solution so this goes back
to the historic behavior.
Fixes: #10765, #10756, #10752, #10726
This commit is contained in:
parent
1a0cfd080a
commit
3fe74fba42
|
|
@ -1,12 +1,9 @@
|
|||
package llm
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"maps"
|
||||
"os"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
|
|
@ -125,10 +122,12 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
|||
}
|
||||
|
||||
layers := f.Tensors().GroupLayers()
|
||||
// add one layer (chosing the max layer) worth of memory as a buffer
|
||||
layerSize = slices.MaxFunc(slices.Collect(maps.Values(layers)), func(a, b ggml.Layer) int {
|
||||
return cmp.Compare(a.Size(), b.Size())
|
||||
}).Size()
|
||||
// add one layer worth of memory as a buffer
|
||||
if blk0, ok := layers["blk.0"]; ok {
|
||||
layerSize = blk0.Size()
|
||||
} else {
|
||||
slog.Warn("model missing blk.0 layer size")
|
||||
}
|
||||
|
||||
var kvct string
|
||||
if envconfig.FlashAttention() &&
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user