mirror of
https://github.com/zebrajr/ollama.git
synced 2025-12-06 00:19:51 +01:00
When a model is partially offloaded to system RAM, we can either do the calculations on the CPU or we can temporarily transfer the data to the GPU to do the calculations there. Small batches tend to be better on the CPU, large batches on the GPU. The llamarunner used the GPU in most cases and the ollamarunner used the CPU. Although the ollamarunner saw an improvement in token generation performance, there was a large performance hit in prompt processing (3-10x). There is an existing heuristic to dynamically switch between these two modes but in practice it doesn't have enough information to accurately make that decision. This adds authoritative data to make the check work to get the best of both worlds. Fixes #12037
117 lines
3.1 KiB
Go
117 lines
3.1 KiB
Go
package ollamarunner
|
|
|
|
import (
|
|
"errors"
|
|
|
|
"github.com/ollama/ollama/ml"
|
|
"github.com/ollama/ollama/model/input"
|
|
)
|
|
|
|
// Tensors can't be used across multiple compute graphs. This is a problem
// if a single embedding is split across batches using views, since all of
// the views will have the same source tensor. We also don't want to
// recompute the entire embedding for each batch.
//
// To avoid this, we compute all of the tensors for the embedding on the
// first use and then store the result in system memory. When we need
// additional tensors, we recreate them from the stored data.
|
|
|
|
// multimodalEntry represents the embeddings of a single object (such
|
|
// as an image).
|
|
type multimodalEntry struct {
|
|
// mm is the original set of tensors created by EncodeMultimodal
|
|
mm []input.Multimodal
|
|
|
|
// data is the computed result of mm. Nil if not yet computed
|
|
data [][]float32
|
|
}
|
|
|
|
// multimodalStore maps from an individual tensor (of which there
|
|
// may be many in a single multimodal object) to its parent embedding
|
|
type multimodalStore map[ml.Tensor]*multimodalEntry
|
|
|
|
func newMultimodalStore() multimodalStore {
|
|
return make(multimodalStore)
|
|
}
|
|
|
|
// addMultimodal stores an embedding for later use in a compute graph
|
|
func (m multimodalStore) addMultimodal(embedding []input.Multimodal) {
|
|
entry := &multimodalEntry{mm: embedding}
|
|
|
|
for _, e := range embedding {
|
|
if e.Tensor != nil {
|
|
m[e.Tensor] = entry
|
|
}
|
|
}
|
|
}
|
|
|
|
// getMultimodal takes a source set of tensors (which may contain a whole or
|
|
// parts of one or more images) and returns the equivalent that can be used in
|
|
// the current context
|
|
func (m multimodalStore) getMultimodal(backend ml.Backend, ctx ml.Context, in []input.Multimodal, reserve bool) ([]input.Multimodal, error) {
|
|
out := make([]input.Multimodal, len(in))
|
|
for i := range out {
|
|
if in[i].Tensor != nil {
|
|
var err error
|
|
out[i].Tensor, err = m.getTensor(backend, ctx, in[i].Tensor, reserve)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
|
|
out[i].Data = in[i].Data
|
|
}
|
|
|
|
return out, nil
|
|
}
|
|
|
|
func (m multimodalStore) getTensor(backend ml.Backend, ctx ml.Context, in ml.Tensor, reserve bool) (ml.Tensor, error) {
|
|
entry := m[in]
|
|
|
|
if entry.data == nil {
|
|
computeCtx := backend.NewContext()
|
|
defer computeCtx.Close()
|
|
|
|
var tensors []ml.Tensor
|
|
for _, t := range entry.mm {
|
|
if t.Tensor != nil {
|
|
tensors = append(tensors, t.Tensor)
|
|
}
|
|
}
|
|
|
|
if len(tensors) == 0 {
|
|
return nil, nil
|
|
}
|
|
|
|
computeCtx.Forward(tensors...)
|
|
entry.data = make([][]float32, len(entry.mm))
|
|
|
|
// Multimodal processing is computationally intensive, so treat it similarly to a large batch
|
|
computeCtx.SetBatchSize(512)
|
|
|
|
if !reserve {
|
|
computeCtx.Compute(tensors...)
|
|
|
|
for i, t := range entry.mm {
|
|
if t.Tensor != nil {
|
|
entry.data[i] = t.Tensor.Floats()
|
|
}
|
|
}
|
|
} else {
|
|
computeCtx.Reserve()
|
|
}
|
|
}
|
|
|
|
for i, t := range entry.mm {
|
|
if in == t.Tensor {
|
|
if !reserve {
|
|
return ctx.Input().FromFloats(entry.data[i], t.Tensor.Shape()...), nil
|
|
} else {
|
|
return ctx.Input().Empty(t.Tensor.DType(), t.Tensor.Shape()...), nil
|
|
}
|
|
}
|
|
}
|
|
|
|
return nil, errors.New("multimodal tensor not found")
|
|
}
|