Mllama has large embeddings (~100 MB per image), and each embedding is represented as a single token when passed to llama.cpp. Embedding batches are pre-allocated at the token size times the batch size, so this results in allocations of over 50 GB at the default batch size, and on some systems these mallocs fail. Since an image is represented as a single token and mllama doesn't support more than one image per request, a batch size of 1 is sufficient and much more reasonable. In addition, for non-multimodal models, the embedding batches don't need to be allocated at all. Fixes #7464
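As a rough check of that arithmetic, here is a sketch with assumed values (~100 MB per image embedding and a batch size of 512, a common default); neither constant is read from the model or this change:

// Illustrative only: assumed sizes, not values taken from the code below.
const (
	bytesPerImageEmbed = 100 * 1024 * 1024 // ~100 MB behind a single image token (assumption)
	assumedBatchSize   = 512               // a common default batch size (assumption)
)

// Embedding batches are sized for the worst case of every slot holding an
// image token, so the pre-allocation is roughly:
//   bytesPerImageEmbed * assumedBatchSize ≈ 51 GB
// With a batch size of 1 for mllama, the same allocation is ~100 MB.
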
package main

import (
	"errors"
	"fmt"
	"hash/maphash"
	"log/slog"
	"slices"
	"sync"
	"time"

	"github.com/ollama/ollama/llama"
)

const imageCacheSize = 4

type ImageContext struct {
	// mu is required to be held when generating embeddings or accessing the cache
	mu sync.Mutex

	clip   *llama.ClipContext
	mllama *llama.MllamaContext

	// cache of images to embeddings
	images    []imageCache
	imageHash maphash.Hash
}

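// NewImageContext loads the vision projector at modelPath, initializing a
// CLIP context or an mllama context depending on the detected architecture.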
func NewImageContext(llamaContext *llama.Context, modelPath string) (*ImageContext, error) {
	arch, err := llama.GetModelArch(modelPath)
	if err != nil {
		return nil, fmt.Errorf("unable to determine vision architecture: %w (%s)", err, modelPath)
	}

	var c ImageContext
	if arch == "clip" {
		c.clip, err = llama.NewClipContext(llamaContext, modelPath)
	} else if arch == "mllama" {
		c.mllama, err = llama.NewMllamaContext(llamaContext, modelPath)
	} else {
		return nil, fmt.Errorf("unknown vision model architecture: %s", arch)
	}

	if err != nil {
		return nil, err
	}

	c.images = make([]imageCache, imageCacheSize)

	return &c, nil
}

func (c *ImageContext) Free(modelPath string) {
	if c == nil {
		return
	}

	if c.clip != nil {
		c.clip.Free()
	}
	if c.mllama != nil {
		c.mllama.Free()
	}
}

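// NewEmbed returns the embeddings for an image, reusing a cached result when
// the same image bytes have been seen recently.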
func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspectRatioId int) [][]float32 {
	if c == nil {
		return nil
	}

	c.mu.Lock()
	defer c.mu.Unlock()

	// imageHash is shared state, so hash only while holding the lock
	hash := c.hashImage(data)

	embed, err := c.findImage(hash)
	if err != nil {
		if c.mllama != nil {
			embed = c.mllama.NewEmbed(llamaContext, data, aspectRatioId)
		} else if c.clip != nil {
			embed = c.clip.NewEmbed(llamaContext, data)
		} else {
			return nil
		}

		c.addImage(hash, embed)
	}

	return embed
}

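// BatchSize returns the number of embedding slots to pre-allocate per batch
// for this model.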
func (c *ImageContext) BatchSize(configuredBatchSize int) int {
	// If images are not supported, we don't need to allocate embedding batches
	if c == nil {
		return 0
	}

	// Mllama maps an image to 1 embedding token (llava creates many tokens)
	// and doesn't support more than a single image per request.
	// The embeddings are large (100 MB), so allocating a big batch can fail
	// on some systems
	if c.mllama != nil {
		return 1
	}

	return configuredBatchSize
}

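// EmbedSize returns the per-token embedding width for image embeddings:
// mllama's cross-attention state size, or the model's n_embd otherwise.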
func (c *ImageContext) EmbedSize(llamaContext *llama.Context) int {
	if c != nil && c.mllama != nil {
		return c.mllama.EmbedSize(llamaContext)
	}
	return llamaContext.Model().NEmbd()
}

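// NeedCrossAttention reports whether any of the inputs carry an image
// embedding, which mllama consumes through cross-attention.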
func (c *ImageContext) NeedCrossAttention(inputs ...input) bool {
	if c == nil || c.mllama == nil {
		return false
	}

	return slices.ContainsFunc(inputs, func(input input) bool {
		return input.embed != nil
	})
}

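// imageCache is one slot in a small fixed-size cache keyed by image hash.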
type imageCache struct {
	key      uint64
	val      [][]float32
	lastUsed time.Time
}

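// hashImage returns a 64-bit maphash of the raw image bytes. imageHash is
// not safe for concurrent use, so callers must hold mu.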
func (c *ImageContext) hashImage(image []byte) uint64 {
	c.imageHash.Reset()
	_, _ = c.imageHash.Write(image)
	return c.imageHash.Sum64()
}

var errImageNotFound = errors.New("image not found in cache")

func (c *ImageContext) findImage(hash uint64) ([][]float32, error) {
	for i := range c.images {
		if c.images[i].key == hash {
			slog.Debug("loading image embeddings from cache", "entry", i)
			c.images[i].lastUsed = time.Now()
			return c.images[i].val, nil
		}
	}

	return nil, errImageNotFound
}

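// addImage stores embeddings in the slot already holding hash, or else evicts
// the least recently used entry.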
func (c *ImageContext) addImage(hash uint64, embed [][]float32) {
	best := time.Now()
	var bestImage int

	for i := range c.images {
		if c.images[i].key == hash {
			bestImage = i
			break
		}

		if c.images[i].lastUsed.Compare(best) < 0 {
			best = c.images[i].lastUsed
			bestImage = i
		}
	}

	slog.Debug("storing image embeddings in cache", "entry", bestImage, "used", c.images[bestImage].lastUsed)
	c.images[bestImage].key = hash
	c.images[bestImage].val = embed
	c.images[bestImage].lastUsed = time.Now()
}