From 0d6e35d3c67cf37de1c425d178c71d7351083013 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Thu, 8 May 2025 13:17:30 -0700 Subject: [PATCH 001/108] fix: stream accumulator exits early (#10593) the stream accumulator exits as soon as it sees `api.ProgressResponse(status="success")` which isn't strictly correct since some requests may have multiple successes, e.g. `/api/create` when the source model needs to be pulled. --- server/routes.go | 22 +++++------ server/routes_test.go | 86 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+), 12 deletions(-) diff --git a/server/routes.go b/server/routes.go index 8886073c..8b0c7aca 100644 --- a/server/routes.go +++ b/server/routes.go @@ -1341,31 +1341,29 @@ func Serve(ln net.Listener) error { func waitForStream(c *gin.Context, ch chan any) { c.Header("Content-Type", "application/json") + var latest api.ProgressResponse for resp := range ch { switch r := resp.(type) { case api.ProgressResponse: - if r.Status == "success" { - c.JSON(http.StatusOK, r) - return - } + latest = r case gin.H: status, ok := r["status"].(int) if !ok { status = http.StatusInternalServerError } - if errorMsg, ok := r["error"].(string); ok { - c.JSON(status, gin.H{"error": errorMsg}) - return - } else { - c.JSON(status, gin.H{"error": "unexpected error format in progress response"}) - return + errorMsg, ok := r["error"].(string) + if !ok { + errorMsg = "unknown error" } + c.JSON(status, gin.H{"error": errorMsg}) + return default: - c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected progress response"}) + c.JSON(http.StatusInternalServerError, gin.H{"error": "unknown message type"}) return } } - c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected end of progress response"}) + + c.JSON(http.StatusOK, latest) } func streamResponse(c *gin.Context, ch chan any) { diff --git a/server/routes_test.go b/server/routes_test.go index fd63b78b..7c44bc95 100644 --- a/server/routes_test.go +++ b/server/routes_test.go @@ -21,6 +21,8 @@ import ( "testing" "unicode" + "github.com/gin-gonic/gin" + "github.com/google/go-cmp/cmp" "github.com/ollama/ollama/api" "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/openai" @@ -882,3 +884,87 @@ func TestFilterThinkTags(t *testing.T) { } } } + +func TestWaitForStream(t *testing.T) { + gin.SetMode(gin.TestMode) + + cases := []struct { + name string + messages []any + expectCode int + expectBody string + }{ + { + name: "error", + messages: []any{ + gin.H{"error": "internal server error"}, + }, + expectCode: http.StatusInternalServerError, + expectBody: `{"error":"internal server error"}`, + }, + { + name: "error status", + messages: []any{ + gin.H{"status": http.StatusNotFound, "error": "not found"}, + }, + expectCode: http.StatusNotFound, + expectBody: `{"error":"not found"}`, + }, + { + name: "unknown error", + messages: []any{ + gin.H{"msg": "something else"}, + }, + expectCode: http.StatusInternalServerError, + expectBody: `{"error":"unknown error"}`, + }, + { + name: "unknown type", + messages: []any{ + struct{}{}, + }, + expectCode: http.StatusInternalServerError, + expectBody: `{"error":"unknown message type"}`, + }, + { + name: "progress success", + messages: []any{ + api.ProgressResponse{Status: "success"}, + }, + expectCode: http.StatusOK, + expectBody: `{"status":"success"}`, + }, + { + name: "progress more than success", + messages: []any{ + api.ProgressResponse{Status: "success"}, + api.ProgressResponse{Status: "one more thing"}, + }, + expectCode: http.StatusOK, +
expectBody: `{"status":"one more thing"}`, + }, + } + + for _, tt := range cases { + t.Run(tt.name, func(t *testing.T) { + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + + ch := make(chan any, len(tt.messages)) + for _, msg := range tt.messages { + ch <- msg + } + close(ch) + + waitForStream(c, ch) + + if w.Code != tt.expectCode { + t.Errorf("expected status %d, got %d", tt.expectCode, w.Code) + } + + if diff := cmp.Diff(w.Body.String(), tt.expectBody); diff != "" { + t.Errorf("body mismatch (-want +got):\n%s", diff) + } + }) + } +} From 3fa78598a1e86faee9390ded4b43e78ca3bef816 Mon Sep 17 00:00:00 2001 From: Bruce MacDonald Date: Fri, 9 May 2025 18:05:43 -0700 Subject: [PATCH 002/108] cmd: strip single quotes from image page (#10636) --- cmd/interactive.go | 2 ++ cmd/interactive_test.go | 23 +++++++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/cmd/interactive.go b/cmd/interactive.go index 82a3bfcb..70222711 100644 --- a/cmd/interactive.go +++ b/cmd/interactive.go @@ -531,6 +531,8 @@ func extractFileData(input string) (string, []api.ImageData, error) { return "", imgs, err } fmt.Fprintf(os.Stderr, "Added image '%s'\n", nfp) + input = strings.ReplaceAll(input, "'"+nfp+"'", "") + input = strings.ReplaceAll(input, "'"+fp+"'", "") input = strings.ReplaceAll(input, fp, "") imgs = append(imgs, data) } diff --git a/cmd/interactive_test.go b/cmd/interactive_test.go index 3f60448b..ba833191 100644 --- a/cmd/interactive_test.go +++ b/cmd/interactive_test.go @@ -1,6 +1,8 @@ package cmd import ( + "os" + "path/filepath" "testing" "github.com/stretchr/testify/assert" @@ -50,3 +52,24 @@ d:\path with\spaces\seven.JPEG inbetween7 c:\users\jdoe\eight.png inbetween8 assert.Contains(t, res[9], "ten.PNG") assert.Contains(t, res[9], "E:") } + +// Ensure that file paths wrapped in single quotes are removed with the quotes. +func TestExtractFileDataRemovesQuotedFilepath(t *testing.T) { + dir := t.TempDir() + fp := filepath.Join(dir, "img.jpg") + data := make([]byte, 600) + copy(data, []byte{ + 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 'J', 'F', 'I', 'F', + 0x00, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xff, 0xd9, + }) + if err := os.WriteFile(fp, data, 0o600); err != nil { + t.Fatalf("failed to write test image: %v", err) + } + + input := "before '" + fp + "' after" + cleaned, imgs, err := extractFileData(input) + assert.NoError(t, err) + assert.Len(t, imgs, 1) + assert.Equal(t, cleaned, "before after") +} From 867d75b21e4cc13d3cf74f1db1707508103d0030 Mon Sep 17 00:00:00 2001 From: AliAhmedNada Date: Sat, 10 May 2025 20:36:40 +0300 Subject: [PATCH 003/108] readme: add ojira to community integrations (#10648) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index e99226ae..9ddcdacd 100644 --- a/README.md +++ b/README.md @@ -315,6 +315,7 @@ See the [API documentation](./docs/api.md) for all endpoints. 
- [Ollama-chats RPG](https://github.com/drazdra/ollama-chats) - [IntelliBar](https://intellibar.app/) (AI-powered assistant for macOS) - [Jirapt](https://github.com/AliAhmedNada/jirapt) (Jira Integration to generate issues, tasks, epics) +- [ojira](https://github.com/AliAhmedNada/ojira) (Jira chrome plugin to easily generate descriptions for tasks) - [QA-Pilot](https://github.com/reid41/QA-Pilot) (Interactive chat tool that can leverage Ollama models for rapid understanding and navigation of GitHub code repositories) - [ChatOllama](https://github.com/sugarforever/chat-ollama) (Open Source Chatbot based on Ollama with Knowledge Bases) - [CRAG Ollama Chat](https://github.com/Nagi-ovo/CRAG-Ollama-Chat) (Simple Web Search with Corrective RAG) From 5969674cf167a9c65aa3d556de0064cf9ad1572d Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Sat, 10 May 2025 11:27:15 -0700 Subject: [PATCH 004/108] feat: add threshold to dump options (#10639) ml.Dump will preserve default values if not specified --- ml/backend.go | 56 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 40 insertions(+), 16 deletions(-) diff --git a/ml/backend.go b/ml/backend.go index 0cd33bd8..ba24ecb4 100644 --- a/ml/backend.go +++ b/ml/backend.go @@ -5,6 +5,7 @@ import ( "context" "encoding/binary" "fmt" + "math" "os" "slices" "strconv" @@ -214,35 +215,58 @@ func mul[T number](s ...T) T { return p } -type DumpOptions struct { - // Items is the number of elements to print at the beginning and end of each dimension. - Items int +type DumpOptions func(*dumpOptions) - // Precision is the number of decimal places to print. Applies to float32 and float64. - Precision int +// DumpWithPrecision sets the number of decimal places to print. Applies to float32 and float64. +func DumpWithPrecision(n int) DumpOptions { + return func(opts *dumpOptions) { + opts.Precision = n + } } -func Dump(ctx Context, t Tensor, opts ...DumpOptions) string { - if len(opts) < 1 { - opts = append(opts, DumpOptions{ - Items: 3, - Precision: 4, - }) +// DumpWithThreshold sets the threshold for printing the entire tensor. If the number of elements +// is less than or equal to this value, the entire tensor will be printed. Otherwise, only the +// beginning and end of each dimension will be printed. +func DumpWithThreshold(n int) DumpOptions { + return func(opts *dumpOptions) { + opts.Threshold = n + } +} + +// DumpWithEdgeItems sets the number of elements to print at the beginning and end of each dimension. +func DumpWithEdgeItems(n int) DumpOptions { + return func(opts *dumpOptions) { + opts.EdgeItems = n + } +} + +type dumpOptions struct { + Precision, Threshold, EdgeItems int +} + +func Dump(ctx Context, t Tensor, optsFuncs ...DumpOptions) string { + opts := dumpOptions{Precision: 4, Threshold: 1000, EdgeItems: 3} + for _, optsFunc := range optsFuncs { + optsFunc(&opts) + } + + if mul(t.Shape()...) <= opts.Threshold { + opts.EdgeItems = math.MaxInt } switch t.DType() { case DTypeF32: - return dump[[]float32](ctx, t, opts[0].Items, func(f float32) string { - return strconv.FormatFloat(float64(f), 'f', opts[0].Precision, 32) + return dump[[]float32](ctx, t, opts.EdgeItems, func(f float32) string { + return strconv.FormatFloat(float64(f), 'f', opts.Precision, 32) }) case DTypeF16, DTypeQ80, DTypeQ40: f32 := ctx.Input().Empty(DTypeF32, t.Shape()...) 
f32 = t.Copy(ctx, f32) - return dump[[]float32](ctx, f32, opts[0].Items, func(f float32) string { - return strconv.FormatFloat(float64(f), 'f', opts[0].Precision, 32) + return dump[[]float32](ctx, f32, opts.EdgeItems, func(f float32) string { + return strconv.FormatFloat(float64(f), 'f', opts.Precision, 32) }) case DTypeI32: - return dump[[]int32](ctx, t, opts[0].Items, func(i int32) string { + return dump[[]int32](ctx, t, opts.EdgeItems, func(i int32) string { return strconv.FormatInt(int64(i), 10) }) default: From 69ce44b33c49ad7035d75cac6b35a96b1ccc2e33 Mon Sep 17 00:00:00 2001 From: frob Date: Sat, 10 May 2025 20:31:04 +0200 Subject: [PATCH 005/108] envconfig: Remove no longer supported max vram var (#10623) Co-authored-by: Richard Lyons --- envconfig/config.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/envconfig/config.go b/envconfig/config.go index b18e93f8..a1f05615 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -209,8 +209,6 @@ var ( MaxRunners = Uint("OLLAMA_MAX_LOADED_MODELS", 0) // MaxQueue sets the maximum number of queued requests. MaxQueue can be configured via the OLLAMA_MAX_QUEUE environment variable. MaxQueue = Uint("OLLAMA_MAX_QUEUE", 512) - // MaxVRAM sets a maximum VRAM override in bytes. MaxVRAM can be configured via the OLLAMA_MAX_VRAM environment variable. - MaxVRAM = Uint("OLLAMA_MAX_VRAM", 0) ) func Uint64(key string, defaultValue uint64) func() uint64 { From ecf14a220f852528bae860ec09b52590f06e6744 Mon Sep 17 00:00:00 2001 From: frob Date: Sat, 10 May 2025 20:57:30 +0200 Subject: [PATCH 006/108] llama: allocate grammar buffer based on schema length (#10649) --- llama/llama.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama/llama.go b/llama/llama.go index 3e157c0a..063eb7c8 100644 --- a/llama/llama.go +++ b/llama/llama.go @@ -637,8 +637,8 @@ func SchemaToGrammar(schema []byte) []byte { cStr := C.CString(string(schema)) defer C.free(unsafe.Pointer(cStr)) - // Allocate buffer for grammar output with reasonable size - const maxLen = 32768 // 32KB + // Allocate buffer for grammar based on schema length but with upper bound + maxLen := min(1024*1024, len(schema)*4) buf := make([]byte, maxLen) // Call C function to convert schema to grammar From 76724e2f2912537cc20a1b11175d21231563ffff Mon Sep 17 00:00:00 2001 From: HardCodeDev Date: Mon, 12 May 2025 00:40:41 +0400 Subject: [PATCH 007/108] readme: add OllamaPlusPlus C++ library to community integrations (#10664) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 9ddcdacd..65e79462 100644 --- a/README.md +++ b/README.md @@ -527,6 +527,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Abso](https://github.com/lunary-ai/abso) (OpenAI-compatible TypeScript SDK for any LLM provider) - [Nichey](https://github.com/goodreasonai/nichey) is a Python package for generating custom wikis for your research topic - [Ollama for D](https://github.com/kassane/ollama-d) +- [OllamaPlusPlus](https://github.com/HardCodeDev777/OllamaPlusPlus) (Very simple C++ library for Ollama) ### Mobile From 82a9e9462ab28415aab53fa47e33d70849274941 Mon Sep 17 00:00:00 2001 From: HardCodeDev Date: Mon, 12 May 2025 00:44:51 +0400 Subject: [PATCH 008/108] readme: add UnityCodeLama to community integrations (#10665) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 65e79462..7cb61ac2 100644 --- a/README.md +++ b/README.md @@ -584,6 +584,7 @@ See the [API documentation](./docs/api.md) for all endpoints. 
- [Simple-Discord-AI](https://github.com/zyphixor/simple-discord-ai) - [LLM Telegram Bot](https://github.com/innightwolfsleep/llm_telegram_bot) (telegram bot, primary for RP. Oobabooga-like buttons, [A1111](https://github.com/AUTOMATIC1111/stable-diffusion-webui) API integration e.t.c) - [mcp-llm](https://github.com/sammcj/mcp-llm) (MCP Server to allow LLMs to call other LLMs) +- [UnityCodeLama](https://github.com/HardCodeDev777/UnityCodeLama) (Unity Edtior tool to analyze scripts via Ollama) ### Supported backends From f95a1f2bef74cc5badd12b8a6f873d47298595ad Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Mon, 12 May 2025 11:43:00 -0700 Subject: [PATCH 009/108] feat: add trace log level (#10650) reduce prompt log to trace level --- app/lifecycle/logging.go | 22 ++----------- discover/gpu.go | 2 +- envconfig/config.go | 19 +++++++++-- envconfig/config_test.go | 33 ++++++++++++++++++++ llm/server.go | 59 +++++++++++++++++++++-------------- logutil/logutil.go | 29 +++++++++++++++++ ml/backend/ggml/ggml.go | 3 +- model/model.go | 3 +- model/process_text.go | 4 +++ model/process_text_spm.go | 9 ++++-- runner/llamarunner/runner.go | 24 +++----------- runner/ollamarunner/runner.go | 22 +++---------- server/routes.go | 26 ++------------- 13 files changed, 141 insertions(+), 114 deletions(-) create mode 100644 logutil/logutil.go diff --git a/app/lifecycle/logging.go b/app/lifecycle/logging.go index 9985fc3f..22e3de19 100644 --- a/app/lifecycle/logging.go +++ b/app/lifecycle/logging.go @@ -4,20 +4,14 @@ import ( "fmt" "log/slog" "os" - "path/filepath" "strconv" "strings" "github.com/ollama/ollama/envconfig" + "github.com/ollama/ollama/logutil" ) func InitLogging() { - level := slog.LevelInfo - - if envconfig.Debug() { - level = slog.LevelDebug - } - var logFile *os.File var err error // Detect if we're a GUI app on windows, and if not, send logs to console @@ -33,20 +27,8 @@ func InitLogging() { return } } - handler := slog.NewTextHandler(logFile, &slog.HandlerOptions{ - Level: level, - AddSource: true, - ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr { - if attr.Key == slog.SourceKey { - source := attr.Value.Any().(*slog.Source) - source.File = filepath.Base(source.File) - } - return attr - }, - }) - - slog.SetDefault(slog.New(handler)) + slog.SetDefault(logutil.NewLogger(logFile, envconfig.LogLevel())) slog.Info("ollama app started") } diff --git a/discover/gpu.go b/discover/gpu.go index ba906a18..15bad446 100644 --- a/discover/gpu.go +++ b/discover/gpu.go @@ -670,7 +670,7 @@ func loadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string, e } func getVerboseState() C.uint16_t { - if envconfig.Debug() { + if envconfig.LogLevel() < slog.LevelInfo { return C.uint16_t(1) } return C.uint16_t(0) diff --git a/envconfig/config.go b/envconfig/config.go index a1f05615..9d7c2e21 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -149,9 +149,22 @@ func Bool(k string) func() bool { } } +// LogLevel returns the log level for the application. +// Values are 0 or false INFO (Default), 1 or true DEBUG, 2 TRACE +func LogLevel() slog.Level { + level := slog.LevelInfo + if s := Var("OLLAMA_DEBUG"); s != "" { + if b, _ := strconv.ParseBool(s); b { + level = slog.LevelDebug + } else if i, _ := strconv.ParseInt(s, 10, 64); i != 0 { + level = slog.Level(i * -4) + } + } + + return level +} + var ( - // Debug enabled additional debug information. - Debug = Bool("OLLAMA_DEBUG") // FlashAttention enables the experimental flash attention feature. 
FlashAttention = Bool("OLLAMA_FLASH_ATTENTION") // KvCacheType is the quantization type for the K/V cache. @@ -236,7 +249,7 @@ type EnvVar struct { func AsMap() map[string]EnvVar { ret := map[string]EnvVar{ - "OLLAMA_DEBUG": {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"}, + "OLLAMA_DEBUG": {"OLLAMA_DEBUG", LogLevel(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"}, "OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"}, "OLLAMA_KV_CACHE_TYPE": {"OLLAMA_KV_CACHE_TYPE", KvCacheType(), "Quantization type for the K/V cache (default: f16)"}, "OLLAMA_GPU_OVERHEAD": {"OLLAMA_GPU_OVERHEAD", GpuOverhead(), "Reserve a portion of VRAM per GPU (bytes)"}, diff --git a/envconfig/config_test.go b/envconfig/config_test.go index 9e80645c..f232f1cd 100644 --- a/envconfig/config_test.go +++ b/envconfig/config_test.go @@ -1,11 +1,13 @@ package envconfig import ( + "log/slog" "math" "testing" "time" "github.com/google/go-cmp/cmp" + "github.com/ollama/ollama/logutil" ) func TestHost(t *testing.T) { @@ -292,3 +294,34 @@ func TestContextLength(t *testing.T) { }) } } + +func TestLogLevel(t *testing.T) { + cases := map[string]slog.Level{ + // Default to INFO + "": slog.LevelInfo, + "false": slog.LevelInfo, + "f": slog.LevelInfo, + "0": slog.LevelInfo, + + // True values enable Debug + "true": slog.LevelDebug, + "t": slog.LevelDebug, + + // Positive values increase verbosity + "1": slog.LevelDebug, + "2": logutil.LevelTrace, + + // Negative values decrease verbosity + "-1": slog.LevelWarn, + "-2": slog.LevelError, + } + + for k, v := range cases { + t.Run(k, func(t *testing.T) { + t.Setenv("OLLAMA_DEBUG", k) + if i := LogLevel(); i != v { + t.Errorf("%s: expected %d, got %d", k, v, i) + } + }) + } +} diff --git a/llm/server.go b/llm/server.go index 8884d105..085e0980 100644 --- a/llm/server.go +++ b/llm/server.go @@ -17,6 +17,7 @@ import ( "os/exec" "path/filepath" "runtime" + "slices" "strconv" "strings" "sync" @@ -30,9 +31,37 @@ import ( "github.com/ollama/ollama/format" "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/llama" + "github.com/ollama/ollama/logutil" "github.com/ollama/ollama/model" ) +type filteredEnv []string + +func (e filteredEnv) LogValue() slog.Value { + var attrs []slog.Attr + for _, env := range e { + if key, value, ok := strings.Cut(env, "="); ok { + switch { + case strings.HasPrefix(key, "OLLAMA_"), + strings.HasPrefix(key, "CUDA_"), + strings.HasPrefix(key, "ROCR_"), + strings.HasPrefix(key, "ROCM_"), + strings.HasPrefix(key, "HIP_"), + strings.HasPrefix(key, "GPU_"), + strings.HasPrefix(key, "HSA_"), + strings.HasPrefix(key, "GGML_"), + slices.Contains([]string{ + "PATH", + "LD_LIBRARY_PATH", + "DYLD_LIBRARY_PATH", + }, key): + attrs = append(attrs, slog.String(key, value)) + } + } + } + return slog.GroupValue(attrs...) 
+} + type LlamaServer interface { Ping(ctx context.Context) error WaitUntilRunning(ctx context.Context) error @@ -148,10 +177,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a params = append(params, "--n-gpu-layers", strconv.Itoa(opts.NumGPU)) } - if envconfig.Debug() { - params = append(params, "--verbose") - } - if opts.MainGPU > 0 { params = append(params, "--main-gpu", strconv.Itoa(opts.MainGPU)) } @@ -404,26 +429,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a } slog.Info("starting llama server", "cmd", s.cmd) - if envconfig.Debug() { - filteredEnv := []string{} - for _, ev := range s.cmd.Env { - if strings.HasPrefix(ev, "OLLAMA_") || - strings.HasPrefix(ev, "CUDA_") || - strings.HasPrefix(ev, "ROCR_") || - strings.HasPrefix(ev, "ROCM_") || - strings.HasPrefix(ev, "HIP_") || - strings.HasPrefix(ev, "GPU_") || - strings.HasPrefix(ev, "HSA_") || - strings.HasPrefix(ev, "GGML_") || - strings.HasPrefix(ev, "PATH=") || - strings.HasPrefix(ev, "LD_LIBRARY_PATH=") || - strings.HasPrefix(ev, "DYLD_LIBRARY_PATH=") { - filteredEnv = append(filteredEnv, ev) - } - } - // Log at debug as the environment is inherited and might contain sensitive information - slog.Debug("subprocess", "environment", filteredEnv) - } + slog.Debug("subprocess", "", filteredEnv(s.cmd.Env)) if err = s.cmd.Start(); err != nil { var msg string @@ -721,6 +727,9 @@ type CompletionResponse struct { } func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error { + slog.Debug("completion request", "images", len(req.Images), "prompt", len(req.Prompt), "format", string(req.Format)) + slog.Log(ctx, logutil.LevelTrace, "completion request", "prompt", req.Prompt) + if len(req.Format) > 0 { switch string(req.Format) { case `null`, `""`: @@ -884,6 +893,8 @@ type EmbeddingResponse struct { } func (s *llmServer) Embedding(ctx context.Context, input string) ([]float32, error) { + slog.Log(ctx, logutil.LevelTrace, "embedding request", "input", input) + if err := s.sem.Acquire(ctx, 1); err != nil { if errors.Is(err, context.Canceled) { slog.Info("aborting embedding request due to client closing the connection") diff --git a/logutil/logutil.go b/logutil/logutil.go new file mode 100644 index 00000000..406caf54 --- /dev/null +++ b/logutil/logutil.go @@ -0,0 +1,29 @@ +package logutil + +import ( + "io" + "log/slog" + "path/filepath" +) + +const LevelTrace slog.Level = -8 + +func NewLogger(w io.Writer, level slog.Level) *slog.Logger { + return slog.New(slog.NewTextHandler(w, &slog.HandlerOptions{ + Level: level, + AddSource: true, + ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr { + switch attr.Key { + case slog.LevelKey: + switch attr.Value.Any().(slog.Level) { + case LevelTrace: + attr.Value = slog.StringValue("TRACE") + } + case slog.SourceKey: + source := attr.Value.Any().(*slog.Source) + source.File = filepath.Base(source.File) + } + return attr + }, + })) +} diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index aace1335..ff340561 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -27,6 +27,7 @@ import ( "github.com/ollama/ollama/format" "github.com/ollama/ollama/fs" fsggml "github.com/ollama/ollama/fs/ggml" + "github.com/ollama/ollama/logutil" "github.com/ollama/ollama/ml" ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src" "golang.org/x/sync/errgroup" @@ -222,7 +223,7 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, tt := 
C.ggml_new_tensor(ctxs[bt], t.source.Kind, C.int(len(t.source.Shape)), (*C.int64_t)(unsafe.Pointer(&t.source.Shape[0]))) C.ggml_set_name(tt, cname) - slog.Debug("created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt))) + slog.Log(context.TODO(), logutil.LevelTrace, "created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt))) //nolint:staticcheck // TODO: check if buffer type supports this tensor return tt } diff --git a/model/model.go b/model/model.go index ab96c4c7..7883b851 100644 --- a/model/model.go +++ b/model/model.go @@ -19,6 +19,7 @@ import ( "github.com/ollama/ollama/fs" fsggml "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/kvcache" + "github.com/ollama/ollama/logutil" "github.com/ollama/ollama/ml" _ "github.com/ollama/ollama/ml/backend" "github.com/ollama/ollama/model/input" @@ -202,7 +203,7 @@ func populateFields(base Base, v reflect.Value, tags ...Tag) reflect.Value { names := fn(tagsCopy) for _, name := range names { if tensor := base.Backend().Get(strings.Join(name, ".")); tensor != nil { - slog.Debug("found tensor", "", tensor) + slog.Log(context.TODO(), logutil.LevelTrace, "found tensor", "", tensor) vv.Set(reflect.ValueOf(tensor)) break } diff --git a/model/process_text.go b/model/process_text.go index 90b220a2..7b87ccc3 100644 --- a/model/process_text.go +++ b/model/process_text.go @@ -2,6 +2,7 @@ package model import ( "cmp" + "context" "iter" "log/slog" "slices" @@ -10,6 +11,7 @@ import ( "github.com/dlclark/regexp2" heap "github.com/emirpasic/gods/v2/trees/binaryheap" + "github.com/ollama/ollama/logutil" ) type Special int32 @@ -322,6 +324,7 @@ func (bpe BytePairEncoding) Encode(s string, addSpecial bool) ([]int32, error) { } } + slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "ids", ids) return ids, nil } @@ -349,5 +352,6 @@ func (bpe BytePairEncoding) Decode(ids []int32) (string, error) { } } + slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "string", sb.String()) return sb.String(), nil } diff --git a/model/process_text_spm.go b/model/process_text_spm.go index 446d5d60..b1cff7d2 100644 --- a/model/process_text_spm.go +++ b/model/process_text_spm.go @@ -2,10 +2,13 @@ package model import ( "container/heap" + "context" "fmt" "log/slog" "strconv" "strings" + + "github.com/ollama/ollama/logutil" ) const spmWhitespaceSep = "▁" @@ -22,7 +25,7 @@ func (spm SentencePieceModel) Vocabulary() *Vocabulary { } func NewSentencePieceModel(vocab *Vocabulary) SentencePieceModel { - slog.Debug("Tokens", "num tokens", len(vocab.Values), "vals", vocab.Values[:5], "scores", vocab.Scores[:5], "types", vocab.Types[:5]) + slog.Log(context.TODO(), logutil.LevelTrace, "Tokens", "num tokens", len(vocab.Values), "vals", vocab.Values[:5], "scores", vocab.Scores[:5], "types", vocab.Types[:5]) counter := map[int]int{} var maxTokenLen int @@ -36,7 +39,7 @@ func NewSentencePieceModel(vocab *Vocabulary) SentencePieceModel { } } - slog.Debug("Token counts", "normal", counter[TOKEN_TYPE_NORMAL], "unknown", counter[TOKEN_TYPE_UNKNOWN], "control", counter[TOKEN_TYPE_CONTROL], + slog.Log(context.TODO(), logutil.LevelTrace, "Token counts", "normal", counter[TOKEN_TYPE_NORMAL], "unknown", counter[TOKEN_TYPE_UNKNOWN], "control", counter[TOKEN_TYPE_CONTROL], "user defined", counter[TOKEN_TYPE_USER_DEFINED], "unused", counter[TOKEN_TYPE_UNUSED], "byte", counter[TOKEN_TYPE_BYTE], "max token len", maxTokenLen) @@ -199,6 +202,7 
@@ func (spm SentencePieceModel) Encode(s string, addSpecial bool) ([]int32, error) } } + slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "ids", ids) return ids, nil } @@ -257,5 +261,6 @@ func (spm SentencePieceModel) Decode(ids []int32) (string, error) { } } + slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "string", sb.String()) return sb.String(), nil } diff --git a/runner/llamarunner/runner.go b/runner/llamarunner/runner.go index 5341d4fb..73e50ee0 100644 --- a/runner/llamarunner/runner.go +++ b/runner/llamarunner/runner.go @@ -11,7 +11,6 @@ import ( "net" "net/http" "os" - "path/filepath" "regexp" "runtime" "strconv" @@ -23,8 +22,10 @@ import ( "golang.org/x/sync/semaphore" "github.com/ollama/ollama/api" + "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/llama" "github.com/ollama/ollama/llm" + "github.com/ollama/ollama/logutil" "github.com/ollama/ollama/runner/common" ) @@ -680,8 +681,6 @@ func (s *Server) embeddings(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/json") - slog.Debug("embedding request", "content", req.Content) - seq, err := s.NewSequence(req.Content, nil, NewSequenceParams{embedding: true}) if err != nil { http.Error(w, fmt.Sprintf("Failed to create new sequence: %v", err), http.StatusInternalServerError) @@ -815,7 +814,7 @@ func Execute(args []string) error { kvCacheType := fs.String("kv-cache-type", "", "quantization type for KV cache (default: f16)") port := fs.Int("port", 8080, "Port to expose the server on") threads := fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation") - verbose := fs.Bool("verbose", false, "verbose output (default: disabled)") + _ = fs.Bool("verbose", false, "verbose output (default: disabled)") noMmap := fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)") tensorSplit := fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions") multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users") @@ -830,22 +829,7 @@ func Execute(args []string) error { if err := fs.Parse(args); err != nil { return err } - level := slog.LevelInfo - if *verbose { - level = slog.LevelDebug - } - handler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{ - Level: level, - AddSource: true, - ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr { - if attr.Key == slog.SourceKey { - source := attr.Value.Any().(*slog.Source) - source.File = filepath.Base(source.File) - } - return attr - }, - }) - slog.SetDefault(slog.New(handler)) + slog.SetDefault(logutil.NewLogger(os.Stderr, envconfig.LogLevel())) slog.Info("starting go runner") llama.BackendInit() diff --git a/runner/ollamarunner/runner.go b/runner/ollamarunner/runner.go index b028a721..9a522223 100644 --- a/runner/ollamarunner/runner.go +++ b/runner/ollamarunner/runner.go @@ -12,7 +12,6 @@ import ( "net" "net/http" "os" - "path/filepath" "regexp" "runtime" "strconv" @@ -24,7 +23,9 @@ import ( "golang.org/x/sync/semaphore" "github.com/ollama/ollama/api" + "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/llm" + "github.com/ollama/ollama/logutil" "github.com/ollama/ollama/ml" "github.com/ollama/ollama/model" "github.com/ollama/ollama/model/input" @@ -816,7 +817,7 @@ func Execute(args []string) error { kvCacheType := fs.String("kv-cache-type", "", "quantization type for KV cache (default: f16)") port := fs.Int("port", 8080, "Port to expose the server 
on") threads := fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation") - verbose := fs.Bool("verbose", false, "verbose output (default: disabled)") + _ = fs.Bool("verbose", false, "verbose output (default: disabled)") _ = fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)") tensorSplit := fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions") multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users") @@ -831,22 +832,7 @@ func Execute(args []string) error { if err := fs.Parse(args); err != nil { return err } - level := slog.LevelInfo - if *verbose { - level = slog.LevelDebug - } - handler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{ - Level: level, - AddSource: true, - ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr { - if attr.Key == slog.SourceKey { - source := attr.Value.Any().(*slog.Source) - source.File = filepath.Base(source.File) - } - return attr - }, - }) - slog.SetDefault(slog.New(handler)) + slog.SetDefault(logutil.NewLogger(os.Stderr, envconfig.LogLevel())) slog.Info("starting ollama engine") server := &Server{ diff --git a/server/routes.go b/server/routes.go index 8b0c7aca..f1706f74 100644 --- a/server/routes.go +++ b/server/routes.go @@ -17,7 +17,6 @@ import ( "net/netip" "os" "os/signal" - "path/filepath" "regexp" "slices" "strings" @@ -33,6 +32,7 @@ import ( "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/llm" + "github.com/ollama/ollama/logutil" "github.com/ollama/ollama/model/models/mllama" "github.com/ollama/ollama/openai" "github.com/ollama/ollama/server/internal/client/ollama" @@ -295,8 +295,6 @@ func (s *Server) GenerateHandler(c *gin.Context) { prompt = b.String() } - slog.Debug("generate request", "images", len(images), "prompt", prompt) - ch := make(chan any) go func() { // TODO (jmorganca): avoid building the response twice both here and below @@ -1226,26 +1224,8 @@ func (s *Server) GenerateRoutes(rc *ollama.Registry) (http.Handler, error) { } func Serve(ln net.Listener) error { - level := slog.LevelInfo - if envconfig.Debug() { - level = slog.LevelDebug - } - + slog.SetDefault(logutil.NewLogger(os.Stderr, envconfig.LogLevel())) slog.Info("server config", "env", envconfig.Values()) - handler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{ - Level: level, - AddSource: true, - ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr { - if attr.Key == slog.SourceKey { - source := attr.Value.Any().(*slog.Source) - source.File = filepath.Base(source.File) - } - - return attr - }, - }) - - slog.SetDefault(slog.New(handler)) blobsDir, err := GetBlobsPath("") if err != nil { @@ -1521,8 +1501,6 @@ func (s *Server) ChatHandler(c *gin.Context) { return } - slog.Debug("chat request", "images", len(images), "prompt", prompt) - ch := make(chan any) go func() { defer close(ch) From ad035ad595295d0026a5a94f8180962bbf0fa935 Mon Sep 17 00:00:00 2001 From: Bruce MacDonald Date: Mon, 12 May 2025 12:04:20 -0700 Subject: [PATCH 010/108] convert: quantize from safetensors needs kv (#10675) When creating a quantized model from safetensors we need the array KV values to be loaded. Changing the array size limit passed to ggml.Decode to -1 loads the KV values on the returned layer so they can be used and saved during quantization.
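A minimal sketch of the resulting call (the second argument is the array size limit described above; reading -1 as "no limit" is an assumption based on this change):

    f, _, err := ggml.Decode(bin, -1) // was ggml.Decode(bin, 1024); -1 keeps array KV values available for quantization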
--- server/create.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/create.go b/server/create.go index 7ffa60a2..41c8731c 100644 --- a/server/create.go +++ b/server/create.go @@ -295,7 +295,7 @@ func convertFromSafetensors(files map[string]string, baseLayers []*layerGGML, is } defer bin.Close() - f, _, err := ggml.Decode(bin, 1024) + f, _, err := ggml.Decode(bin, -1) if err != nil { return nil, err } From 0cefd46f23ed60d5b41a90e6b6a1864e4214da3b Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Mon, 12 May 2025 12:17:26 -0700 Subject: [PATCH 011/108] llama: update to commit de4c07f93 (#10655) --- Makefile.sync | 2 +- llama/build-info.cpp | 2 +- llama/llama.cpp/.rsync-filter | 10 +- llama/llama.cpp/common/common.cpp | 19 +- llama/llama.cpp/common/common.h | 18 +- llama/llama.cpp/common/sampling.cpp | 107 +- llama/llama.cpp/include/llama.h | 63 +- llama/llama.cpp/src/llama-adapter.cpp | 6 + llama/llama.cpp/src/llama-batch.cpp | 6 +- llama/llama.cpp/src/llama-batch.h | 3 +- llama/llama.cpp/src/llama-chat.cpp | 24 +- llama/llama.cpp/src/llama-chat.h | 1 + llama/llama.cpp/src/llama-context.cpp | 891 +++-- llama/llama.cpp/src/llama-context.h | 79 +- llama/llama.cpp/src/llama-cparams.h | 3 +- llama/llama.cpp/src/llama-graph.cpp | 58 +- llama/llama.cpp/src/llama-graph.h | 20 +- llama/llama.cpp/src/llama-kv-cache.cpp | 1814 ++++++++--- llama/llama.cpp/src/llama-kv-cache.h | 405 ++- llama/llama.cpp/src/llama-memory.h | 12 +- llama/llama.cpp/src/llama-model-loader.cpp | 24 +- llama/llama.cpp/src/llama-model-saver.cpp | 281 ++ llama/llama.cpp/src/llama-model-saver.h | 37 + llama/llama.cpp/src/llama-model.cpp | 153 +- llama/llama.cpp/src/llama-model.h | 9 +- llama/llama.cpp/src/llama-quant.cpp | 4 +- llama/llama.cpp/src/llama-sampling.cpp | 24 +- llama/llama.cpp/src/llama-vocab.cpp | 46 +- llama/llama.cpp/src/llama-vocab.h | 6 + llama/llama.cpp/src/llama.cpp | 9 + .../llava => tools/mtmd}/clip-impl.h | 30 +- .../{examples/llava => tools/mtmd}/clip.cpp | 2855 +++++++++-------- .../{examples/llava => tools/mtmd}/clip.h | 8 +- .../{examples/llava => tools/mtmd}/llava.cpp | 7 +- .../{examples/llava => tools/mtmd}/llava.h | 0 .../llava/llava.go => tools/mtmd/mtmd.go} | 2 +- llama/llama.go | 4 +- ...loc-and-free-using-the-same-compiler.patch | 34 +- llama/patches/0002-pretokenizer.patch | 8 +- llama/patches/0003-embeddings.patch | 8 +- llama/patches/0004-clip-unicode.patch | 20 +- llama/patches/0005-solar-pro.patch | 20 +- llama/patches/0006-add-mllama-support.patch | 303 +- llama/patches/0007-add-unpad-operator.patch | 38 +- .../0008-fix-deepseek-deseret-regex.patch | 4 +- ...nsure-KV-cache-is-fully-defragmented.patch | 332 +- ...target-ggml-cpu-for-all-cpu-variants.patch | 2 +- llama/patches/0013-remove-amx.patch | 2 +- .../0014-fix-string-arr-kv-loading.patch | 12 +- llama/patches/0015-ollama-debug-tensor.patch | 4 +- ...add-ollama-vocab-for-grammar-support.patch | 2 +- ...t-fail-when-tensor-data-changes-1322.patch | 38 - ml/backend/ggml/ggml.go | 1 + ml/backend/ggml/ggml/include/ggml-backend.h | 8 +- ml/backend/ggml/ggml/include/ggml-cpp.h | 2 +- ml/backend/ggml/ggml/include/ggml-opt.h | 75 +- ml/backend/ggml/ggml/include/ggml.h | 17 +- ml/backend/ggml/ggml/src/CMakeLists.txt | 2 +- ml/backend/ggml/ggml/src/ggml-backend.cpp | 14 +- .../ggml/ggml/src/ggml-cpu/CMakeLists.txt | 29 +- .../ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp | 2 - .../ggml/ggml/src/ggml-cpu/ggml-cpu-quants.c | 117 +- ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c | 13 - .../ggml/ggml/src/ggml-cpu/ggml-cpu.cpp | 45 
+- .../ggml/src/ggml-cpu/llamafile/sgemm.cpp | 501 +++ ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp | 13 - ml/backend/ggml/ggml/src/ggml-cpu/vec.cpp | 6 - .../ggml/ggml/src/ggml-cuda/CMakeLists.txt | 27 +- ml/backend/ggml/ggml/src/ggml-cuda/acc.cu | 66 +- ml/backend/ggml/ggml/src/ggml-cuda/common.cuh | 23 +- .../ggml/ggml/src/ggml-cuda/cp-async.cuh | 11 + ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu | 2 + .../ggml/ggml/src/ggml-cuda/fattn-common.cuh | 28 +- .../ggml/ggml/src/ggml-cuda/fattn-mma-f16.cuh | 965 +++--- .../ggml/ggml/src/ggml-cuda/fattn-tile-f16.cu | 4 +- .../ggml/ggml/src/ggml-cuda/fattn-tile-f32.cu | 4 +- .../ggml/ggml/src/ggml-cuda/fattn-vec-f16.cuh | 3 +- .../ggml/ggml/src/ggml-cuda/fattn-vec-f32.cuh | 2 +- .../ggml/ggml/src/ggml-cuda/fattn-wmma-f16.cu | 2 +- ml/backend/ggml/ggml/src/ggml-cuda/fattn.cu | 116 +- ml/backend/ggml/ggml/src/ggml-cuda/getrows.cu | 26 +- .../ggml/ggml/src/ggml-cuda/ggml-cuda.cu | 41 +- ml/backend/ggml/ggml/src/ggml-cuda/mmq.cu | 17 +- ml/backend/ggml/ggml/src/ggml-cuda/mmq.cuh | 62 +- ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cu | 11 + .../ggml/ggml/src/ggml-cuda/quantize.cu | 1 + ml/backend/ggml/ggml/src/ggml-cuda/sum.cu | 2 +- ...ttn-mma-f16-instance-ncols1_1-ncols2_16.cu | 5 + ...attn-mma-f16-instance-ncols1_1-ncols2_8.cu | 12 +- ...ttn-mma-f16-instance-ncols1_16-ncols2_1.cu | 12 +- ...ttn-mma-f16-instance-ncols1_16-ncols2_2.cu | 12 +- ...ttn-mma-f16-instance-ncols1_16-ncols2_4.cu | 12 +- ...ttn-mma-f16-instance-ncols1_2-ncols2_16.cu | 5 + ...attn-mma-f16-instance-ncols1_2-ncols2_4.cu | 12 +- ...attn-mma-f16-instance-ncols1_2-ncols2_8.cu | 12 +- ...ttn-mma-f16-instance-ncols1_32-ncols2_1.cu | 12 +- ...ttn-mma-f16-instance-ncols1_32-ncols2_2.cu | 12 +- ...ttn-mma-f16-instance-ncols1_4-ncols2_16.cu | 5 + ...attn-mma-f16-instance-ncols1_4-ncols2_2.cu | 12 +- ...attn-mma-f16-instance-ncols1_4-ncols2_4.cu | 12 +- ...attn-mma-f16-instance-ncols1_4-ncols2_8.cu | 12 +- ...ttn-mma-f16-instance-ncols1_64-ncols2_1.cu | 12 +- ...attn-mma-f16-instance-ncols1_8-ncols2_1.cu | 12 +- ...attn-mma-f16-instance-ncols1_8-ncols2_2.cu | 12 +- ...attn-mma-f16-instance-ncols1_8-ncols2_4.cu | 12 +- ...attn-mma-f16-instance-ncols1_8-ncols2_8.cu | 12 +- .../src/ggml-metal/ggml-metal-embed.metal | 566 ++-- .../ggml/src/ggml-metal/ggml-metal-impl.h | 47 +- .../ggml/ggml/src/ggml-metal/ggml-metal.m | 389 ++- .../ggml/ggml/src/ggml-metal/ggml-metal.metal | 519 ++- ml/backend/ggml/ggml/src/ggml-opt.cpp | 570 ++-- ml/backend/ggml/ggml/src/ggml-quants.c | 6 - ml/backend/ggml/ggml/src/ggml.c | 49 +- 113 files changed, 8097 insertions(+), 4383 deletions(-) create mode 100644 llama/llama.cpp/src/llama-model-saver.cpp create mode 100644 llama/llama.cpp/src/llama-model-saver.h rename llama/llama.cpp/{examples/llava => tools/mtmd}/clip-impl.h (90%) rename llama/llama.cpp/{examples/llava => tools/mtmd}/clip.cpp (58%) rename llama/llama.cpp/{examples/llava => tools/mtmd}/clip.h (96%) rename llama/llama.cpp/{examples/llava => tools/mtmd}/llava.cpp (99%) rename llama/llama.cpp/{examples/llava => tools/mtmd}/llava.h (100%) rename llama/llama.cpp/{examples/llava/llava.go => tools/mtmd/mtmd.go} (92%) delete mode 100644 llama/patches/0017-ggml-Don-t-assert-fail-when-tensor-data-changes-1322.patch create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu create mode 100644 
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu diff --git a/Makefile.sync b/Makefile.sync index 238d7627..bceae7f5 100644 --- a/Makefile.sync +++ b/Makefile.sync @@ -1,6 +1,6 @@ UPSTREAM=https://github.com/ggerganov/llama.cpp.git WORKDIR=llama/vendor -FETCH_HEAD=e1e8e0991ffd9e99a445c6812bb519d5bac9f4b5 +FETCH_HEAD=de4c07f93783a1a96456a44dc16b9db538ee1618 .PHONY: help help: diff --git a/llama/build-info.cpp b/llama/build-info.cpp index 27ce8e70..afef6b85 100644 --- a/llama/build-info.cpp +++ b/llama/build-info.cpp @@ -1,4 +1,4 @@ int LLAMA_BUILD_NUMBER = 0; -char const *LLAMA_COMMIT = "e1e8e0991ffd9e99a445c6812bb519d5bac9f4b5"; +char const *LLAMA_COMMIT = "de4c07f93783a1a96456a44dc16b9db538ee1618"; char const *LLAMA_COMPILER = ""; char const *LLAMA_BUILD_TARGET = ""; diff --git a/llama/llama.cpp/.rsync-filter b/llama/llama.cpp/.rsync-filter index 9bc06684..1f81b007 100644 --- a/llama/llama.cpp/.rsync-filter +++ b/llama/llama.cpp/.rsync-filter @@ -10,11 +10,11 @@ include common/stb_image.* include include/ include include/llama.* include include/llama-*.* -include examples/ -include examples/llava/ -include examples/llava/clip.* -include examples/llava/clip-impl.* -include examples/llava/llava.* +include tools/ +include tools/mtmd/ +include tools/mtmd/clip.* +include tools/mtmd/clip-impl.* +include tools/mtmd/llava.* include src/ include src/llama.* include src/llama-*.* diff --git a/llama/llama.cpp/common/common.cpp b/llama/llama.cpp/common/common.cpp index 94f545f8..2b1d8da5 100644 --- a/llama/llama.cpp/common/common.cpp +++ b/llama/llama.cpp/common/common.cpp @@ -1096,7 +1096,6 @@ struct llama_context_params common_context_params_to_llama(const common_params & cparams.n_threads = params.cpuparams.n_threads; cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ? 
params.cpuparams.n_threads : params.cpuparams_batch.n_threads; - cparams.logits_all = params.logits_all; cparams.embeddings = params.embedding; cparams.rope_scaling_type = params.rope_scaling_type; cparams.rope_freq_base = params.rope_freq_base; @@ -1114,6 +1113,7 @@ struct llama_context_params common_context_params_to_llama(const common_params & cparams.offload_kqv = !params.no_kv_offload; cparams.flash_attn = params.flash_attn; cparams.no_perf = params.no_perf; + cparams.op_offload = !params.no_op_offload; if (params.reranking) { cparams.embeddings = true; @@ -1565,3 +1565,20 @@ common_control_vector_data common_control_vector_load(const std::vector & tokens, int64_t stride) { + const int64_t ne_datapoint = llama_n_ctx(ctx); + const int64_t ndata = (tokens.size() - ne_datapoint - 1) / stride; + ggml_opt_dataset_t result = ggml_opt_dataset_init( + GGML_TYPE_I32, GGML_TYPE_I32, ne_datapoint, ne_datapoint, ndata, /*ndata_shard =*/ 1); + + llama_token * data = (llama_token *) ggml_opt_dataset_data(result)->data; + llama_token * labels = (llama_token *) ggml_opt_dataset_labels(result)->data; + + for (int64_t idata = 0; idata < ndata; ++idata) { + memcpy(data + idata*ne_datapoint, tokens.data() + idata*stride + 0, ne_datapoint*sizeof(llama_token)); + memcpy(labels + idata*ne_datapoint, tokens.data() + idata*stride + 1, ne_datapoint*sizeof(llama_token)); + } + + return result; +} diff --git a/llama/llama.cpp/common/common.h b/llama/llama.cpp/common/common.h index 0a9dc059..dea34267 100644 --- a/llama/llama.cpp/common/common.h +++ b/llama/llama.cpp/common/common.h @@ -66,7 +66,6 @@ enum llama_example { LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_MAIN, - LLAMA_EXAMPLE_INFILL, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL, @@ -96,6 +95,7 @@ enum common_sampler_type { COMMON_SAMPLER_TYPE_XTC = 8, COMMON_SAMPLER_TYPE_INFILL = 9, COMMON_SAMPLER_TYPE_PENALTIES = 10, + COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11, }; // dimensionality reduction methods, used by cvector-generator @@ -161,6 +161,7 @@ struct common_params_sampling { std::vector samplers = { COMMON_SAMPLER_TYPE_PENALTIES, COMMON_SAMPLER_TYPE_DRY, + COMMON_SAMPLER_TYPE_TOP_N_SIGMA, COMMON_SAMPLER_TYPE_TOP_K, COMMON_SAMPLER_TYPE_TYPICAL_P, COMMON_SAMPLER_TYPE_TOP_P, @@ -323,7 +324,6 @@ struct common_params { bool ctx_shift = true; // context shift on inifinite text generation bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix - bool logits_all = false; // return logits for all tokens in the batch bool use_mmap = true; // use mmap for faster loads bool use_mlock = false; // use mlock to keep model in memory bool verbose_prompt = false; // print prompt tokens before generation @@ -332,6 +332,7 @@ struct common_params { bool no_kv_offload = false; // disable KV offloading bool warmup = true; // warmup run bool check_tensors = false; // validate tensor data + bool no_op_offload = false; // globally disable offload host tensor operations to device bool single_turn = false; // single turn chat conversation @@ -340,7 +341,7 @@ struct common_params { common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO; - // multimodal models (see examples/llava) + // multimodal models (see tools/mtmd) struct common_params_model mmproj; bool mmproj_use_gpu = true; // use GPU for multimodal model bool no_mmproj = false; // explicitly disable multimodal model @@ -409,13 +410,14 @@ struct common_params { bool process_output = false; // collect data for the output tensor bool 
compute_ppl = true; // whether to compute perplexity + bool parse_special = false; // whether to parse special tokens during imatrix tokenization // cvector-generator params int n_pca_batch = 100; int n_pca_iterations = 1000; dimre_method cvector_dimre_method = DIMRE_METHOD_PCA; - std::string cvector_positive_file = "examples/cvector-generator/positive.txt"; - std::string cvector_negative_file = "examples/cvector-generator/negative.txt"; + std::string cvector_positive_file = "tools/cvector-generator/positive.txt"; + std::string cvector_negative_file = "tools/cvector-generator/negative.txt"; bool spm_infill = false; // suffix/prefix/middle pattern for infill @@ -664,3 +666,9 @@ const char * const LLM_KV_SPLIT_COUNT = "split.count"; const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count"; } + +// +// training utils +// + +ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector & tokens, int64_t stride); diff --git a/llama/llama.cpp/common/sampling.cpp b/llama/llama.cpp/common/sampling.cpp index 1735b650..28705e24 100644 --- a/llama/llama.cpp/common/sampling.cpp +++ b/llama/llama.cpp/common/sampling.cpp @@ -1,6 +1,7 @@ #include "sampling.h" #include "common.h" +#include "log.h" #include #include @@ -229,51 +230,48 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co params.logit_bias.data())); if (params.mirostat == 0) { - if (params.top_n_sigma >= 0) { - llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k)); - llama_sampler_chain_add(result->chain, llama_sampler_init_temp (params.temp)); - llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma)); - } else { - for (const auto & cnstr : params.samplers) { - switch (cnstr) { - case COMMON_SAMPLER_TYPE_DRY: - { - std::vector c_breakers; - c_breakers.reserve(params.dry_sequence_breakers.size()); - for (const auto & str : params.dry_sequence_breakers) { - c_breakers.push_back(str.c_str()); - } - - llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size())); + for (const auto & cnstr : params.samplers) { + switch (cnstr) { + case COMMON_SAMPLER_TYPE_DRY: + { + std::vector c_breakers; + c_breakers.reserve(params.dry_sequence_breakers.size()); + for (const auto & str : params.dry_sequence_breakers) { + c_breakers.push_back(str.c_str()); } - break; - case COMMON_SAMPLER_TYPE_TOP_K: - llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k)); - break; - case COMMON_SAMPLER_TYPE_TOP_P: - llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep)); - break; - case COMMON_SAMPLER_TYPE_MIN_P: - llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep)); - break; - case COMMON_SAMPLER_TYPE_XTC: - llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed)); - break; - case COMMON_SAMPLER_TYPE_TYPICAL_P: - llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep)); - break; - case COMMON_SAMPLER_TYPE_TEMPERATURE: - llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent)); - break; - case COMMON_SAMPLER_TYPE_INFILL: - llama_sampler_chain_add(result->chain, 
llama_sampler_init_infill (vocab)); - break; - case COMMON_SAMPLER_TYPE_PENALTIES: - llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present)); - break; - default: - GGML_ASSERT(false && "unknown sampler type"); - } + + llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size())); + } + break; + case COMMON_SAMPLER_TYPE_TOP_K: + llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k)); + break; + case COMMON_SAMPLER_TYPE_TOP_P: + llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep)); + break; + case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: + llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma)); + break; + case COMMON_SAMPLER_TYPE_MIN_P: + llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep)); + break; + case COMMON_SAMPLER_TYPE_XTC: + llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed)); + break; + case COMMON_SAMPLER_TYPE_TYPICAL_P: + llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep)); + break; + case COMMON_SAMPLER_TYPE_TEMPERATURE: + llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent)); + break; + case COMMON_SAMPLER_TYPE_INFILL: + llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab)); + break; + case COMMON_SAMPLER_TYPE_PENALTIES: + llama_sampler_chain_add(result->chain, llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present)); + break; + default: + GGML_ASSERT(false && "unknown sampler type"); } } llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed)); @@ -475,6 +473,7 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) { case COMMON_SAMPLER_TYPE_TOP_K: return 'k'; case COMMON_SAMPLER_TYPE_TYPICAL_P: return 'y'; case COMMON_SAMPLER_TYPE_TOP_P: return 'p'; + case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return 's'; case COMMON_SAMPLER_TYPE_MIN_P: return 'm'; case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't'; case COMMON_SAMPLER_TYPE_XTC: return 'x'; @@ -490,6 +489,7 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) { case COMMON_SAMPLER_TYPE_TOP_K: return "top_k"; case COMMON_SAMPLER_TYPE_TYPICAL_P: return "typ_p"; case COMMON_SAMPLER_TYPE_TOP_P: return "top_p"; + case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return "top_n_sigma"; case COMMON_SAMPLER_TYPE_MIN_P: return "min_p"; case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature"; case COMMON_SAMPLER_TYPE_XTC: return "xtc"; @@ -504,6 +504,7 @@ std::vector common_sampler_types_from_names(const std::vect { "dry", COMMON_SAMPLER_TYPE_DRY }, { "top_k", COMMON_SAMPLER_TYPE_TOP_K }, { "top_p", COMMON_SAMPLER_TYPE_TOP_P }, + { "top_n_sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA }, { "typ_p", COMMON_SAMPLER_TYPE_TYPICAL_P }, { "min_p", COMMON_SAMPLER_TYPE_MIN_P }, { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE }, @@ -517,6 +518,7 @@ std::vector common_sampler_types_from_names(const std::vect std::unordered_map sampler_alt_name_map { { "top-k", COMMON_SAMPLER_TYPE_TOP_K }, { "top-p", 
COMMON_SAMPLER_TYPE_TOP_P }, + { "top-n-sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA }, { "nucleus", COMMON_SAMPLER_TYPE_TOP_P }, { "typical-p", COMMON_SAMPLER_TYPE_TYPICAL_P }, { "typical", COMMON_SAMPLER_TYPE_TYPICAL_P }, @@ -533,14 +535,16 @@ std::vector common_sampler_types_from_names(const std::vect auto sampler = sampler_canonical_name_map.find(name); if (sampler != sampler_canonical_name_map.end()) { samplers.push_back(sampler->second); - } else { - if (allow_alt_names) { - sampler = sampler_alt_name_map.find(name); - if (sampler != sampler_alt_name_map.end()) { - samplers.push_back(sampler->second); - } + continue; + } + if (allow_alt_names) { + sampler = sampler_alt_name_map.find(name); + if (sampler != sampler_alt_name_map.end()) { + samplers.push_back(sampler->second); + continue; } } + LOG_WRN("%s: unable to match sampler by name '%s'\n", __func__, name.c_str()); } return samplers; @@ -552,6 +556,7 @@ std::vector common_sampler_types_from_chars(const std::stri { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K), COMMON_SAMPLER_TYPE_TOP_K }, { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P), COMMON_SAMPLER_TYPE_TYPICAL_P }, { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P), COMMON_SAMPLER_TYPE_TOP_P }, + { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_N_SIGMA), COMMON_SAMPLER_TYPE_TOP_N_SIGMA }, { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P), COMMON_SAMPLER_TYPE_MIN_P }, { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE }, { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC }, @@ -566,6 +571,8 @@ std::vector common_sampler_types_from_chars(const std::stri const auto sampler = sampler_name_map.find(c); if (sampler != sampler_name_map.end()) { samplers.push_back(sampler->second); + } else { + LOG_WRN("%s: unable to match sampler by char '%c'\n", __func__, c); } } diff --git a/llama/llama.cpp/include/llama.h b/llama/llama.cpp/include/llama.h index f1628e88..41beef21 100644 --- a/llama/llama.cpp/include/llama.h +++ b/llama/llama.cpp/include/llama.h @@ -4,6 +4,7 @@ #include "ggml.h" #include "ggml-cpu.h" #include "ggml-backend.h" +#include "ggml-opt.h" #include #include @@ -112,6 +113,7 @@ extern "C" { LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32, LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33, LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34, + LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35, }; enum llama_rope_type { @@ -352,20 +354,19 @@ extern "C" { enum ggml_type type_k; // data type for K cache [EXPERIMENTAL] enum ggml_type type_v; // data type for V cache [EXPERIMENTAL] - // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value. - // TODO: move at the end of the struct - bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) - bool embeddings; // if true, extract embeddings (together with logits) - bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU - bool flash_attn; // whether to use flash attention [EXPERIMENTAL] - bool no_perf; // whether to measure performance timings - bool cross_attn; // whether to use cross attention - // Abort callback // if it returns true, execution of llama_decode() will be aborted // currently works only with CPU execution ggml_abort_callback abort_callback; void * abort_callback_data; + + // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value. 
+ bool embeddings; // if true, extract embeddings (together with logits) + bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU + bool flash_attn; // whether to use flash attention [EXPERIMENTAL] + bool no_perf; // whether to measure performance timings + bool op_offload; // whether to offload host tensor operations to device + bool cross_attn; // whether to use cross attention }; // model quantization parameters @@ -447,6 +448,10 @@ extern "C" { size_t n_paths, struct llama_model_params params); + LLAMA_API void llama_model_save_to_file( + const struct llama_model * model, + const char * path_model); + DEPRECATED(LLAMA_API void llama_free_model(struct llama_model * model), "use llama_model_free instead"); @@ -930,14 +935,19 @@ extern "C" { // Frees a batch of tokens allocated with llama_batch_init() LLAMA_API void llama_batch_free(struct llama_batch batch); - // Processes a batch of tokens with the ecoder part of the encoder-decoder model. - // Stores the encoder output internally for later use by the decoder cross-attention layers. + // Process a batch of tokens. + // In contrast to llama_decode() - this call does not use KV cache. + // For encode-decoder contexts, processes the batch using the encoder. + // Can store the encoder output internally for later use by the decoder's cross-attention layers. // 0 - success // < 0 - error. the KV cache state is restored to the state before this call LLAMA_API int32_t llama_encode( struct llama_context * ctx, struct llama_batch batch); + // Process a batch of tokens. + // Requires KV cache. + // For encode-decoder contexts, processes the batch using the decoder. // Positive return values does not mean a fatal error, but rather a warning. // 0 - success // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context) @@ -1434,6 +1444,37 @@ extern "C" { LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain); LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain); + // + // training + // + + // function that returns whether or not a given tensor contains trainable parameters + typedef bool (*llama_opt_param_filter)(const struct ggml_tensor * tensor, void * userdata); + + // always returns true + LLAMA_API bool llama_opt_param_filter_all(const struct ggml_tensor * tensor, void * userdata); + + struct llama_opt_params { + uint32_t n_ctx_train; // assumed context size post training, use context size specified in llama_context if 0 + + llama_opt_param_filter param_filter; // callback for determining which tensors contain trainable parameters + void * param_filter_ud; // userdata for determining which tensors contain trainable parameters + + ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters + void * get_opt_pars_ud; // userdata for calculating optimizer parameters + }; + + LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params); + + LLAMA_API void llama_opt_epoch( + struct llama_context * lctx, + ggml_opt_dataset_t dataset, + ggml_opt_result_t result_train, + ggml_opt_result_t result_eval, + int64_t idata_split, + ggml_opt_epoch_callback callback_train, + ggml_opt_epoch_callback callback_eval); + #ifdef __cplusplus } #endif diff --git a/llama/llama.cpp/src/llama-adapter.cpp b/llama/llama.cpp/src/llama-adapter.cpp index 7ac54d23..8d94034a 100644 --- a/llama/llama.cpp/src/llama-adapter.cpp +++ b/llama/llama.cpp/src/llama-adapter.cpp @@ 
-253,6 +253,9 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
     std::vector buft_extra;
     {
         auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+        if (!cpu_dev) {
+            throw std::runtime_error(format("%s: no CPU backend found", __func__));
+        }
         auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
 
         auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
@@ -291,6 +294,9 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
                 LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));
 
                 auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+                if (!cpu_dev) {
+                    throw std::runtime_error(format("%s: no CPU backend found", __func__));
+                }
                 buft = ggml_backend_dev_buffer_type(cpu_dev);
 
                 break;
diff --git a/llama/llama.cpp/src/llama-batch.cpp b/llama/llama.cpp/src/llama-batch.cpp
index 8682b0e6..241b316e 100644
--- a/llama/llama.cpp/src/llama-batch.cpp
+++ b/llama/llama.cpp/src/llama-batch.cpp
@@ -189,7 +189,7 @@ llama_ubatch llama_sbatch::split_seq(size_t n_ubatch) {
     return ubatch;
 }
 
-void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
+llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
     GGML_ASSERT(batch.n_tokens >= 0);
     this->batch = &batch;
     this->n_embd = n_embd;
@@ -203,6 +203,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
     for (size_t i = 0; i < n_tokens; ++i) {
         ids[i] = i;
     }
+
     if (simple_split) {
         seq.resize(1);
         llama_sbatch_seq & s = seq[0];
@@ -212,6 +213,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
         s.length = n_tokens;
         return;
     }
+
     std::sort(ids.begin(), ids.end(),
         [&batch](size_t a, size_t b) {
             int32_t n_seq_a = batch.n_seq_id ? batch.n_seq_id[a] : 1;
@@ -239,6 +241,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
             return n_seq_a > n_seq_b;
         }
     );
+
     // init seq
     llama_sbatch_seq * last_seq = nullptr;
 
@@ -262,6 +265,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
         seq.push_back(new_seq);
         last_seq = &seq.back();
     }
+
     // keep shared prompts first at the end, then sort by length descending.
     std::sort(seq.begin(), seq.end(),
         [](llama_sbatch_seq & a, llama_sbatch_seq & b) {
diff --git a/llama/llama.cpp/src/llama-batch.h b/llama/llama.cpp/src/llama-batch.h
index f1df40d2..6305051b 100644
--- a/llama/llama.cpp/src/llama-batch.h
+++ b/llama/llama.cpp/src/llama-batch.h
@@ -70,7 +70,8 @@ struct llama_sbatch {
     // sequence-wise split
     llama_ubatch split_seq(size_t n_ubatch);
 
-    void from_batch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
+    llama_sbatch() = default;
+    llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
 };
 
 // temporary allocate memory for the input batch if needed
diff --git a/llama/llama.cpp/src/llama-chat.cpp b/llama/llama.cpp/src/llama-chat.cpp
index 735d2619..d12743e6 100644
--- a/llama/llama.cpp/src/llama-chat.cpp
+++ b/llama/llama.cpp/src/llama-chat.cpp
@@ -35,6 +35,7 @@ static const std::map LLM_CHAT_TEMPLATES = {
     { "mistral-v3", LLM_CHAT_TEMPLATE_MISTRAL_V3 },
     { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
     { "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 },
+    { "mistral-v7-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN },
     { "phi3", LLM_CHAT_TEMPLATE_PHI_3 },
     { "phi4", LLM_CHAT_TEMPLATE_PHI_4 },
     { "falcon3", LLM_CHAT_TEMPLATE_FALCON_3 },
@@ -202,19 +203,20 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|im_start|>assistant\n";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7 || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN) {
         // Official mistral 'v7' template
         // See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7
+        // https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503#basic-instruct-template-v7-tekken
+        const char * trailing_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7 ?
" " : ""; for (auto message : chat) { std::string role(message->role); std::string content(message->content); if (role == "system") { - ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]"; + ss << "[SYSTEM_PROMPT]" << trailing_space << content << "[/SYSTEM_PROMPT]"; } else if (role == "user") { - ss << "[INST] " << content << "[/INST]"; - } - else { - ss << " " << content << ""; + ss << "[INST]" << trailing_space << content << "[/INST]"; + } else { + ss << trailing_space << content << ""; } } } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1 @@ -447,8 +449,16 @@ int32_t llm_chat_apply_template( if (add_ass) { ss << "<|assistant|>"; } - } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4 || tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) { + } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4) { ss << "[gMASK]" << ""; + for (auto message : chat) { + std::string role(message->role); + ss << "<|" << role << "|>" << "\n" << message->content; + } + if (add_ass) { + ss << "<|assistant|>\n"; + } + } else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) { for (auto message : chat) { std::string role(message->role); ss << "<|" << role << "|>" << "\n" << message->content; diff --git a/llama/llama.cpp/src/llama-chat.h b/llama/llama.cpp/src/llama-chat.h index 3f584346..db24ade2 100644 --- a/llama/llama.cpp/src/llama-chat.h +++ b/llama/llama.cpp/src/llama-chat.h @@ -14,6 +14,7 @@ enum llm_chat_template { LLM_CHAT_TEMPLATE_MISTRAL_V3, LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN, LLM_CHAT_TEMPLATE_MISTRAL_V7, + LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN, LLM_CHAT_TEMPLATE_PHI_3, LLM_CHAT_TEMPLATE_PHI_4, LLM_CHAT_TEMPLATE_FALCON_3, diff --git a/llama/llama.cpp/src/llama-context.cpp b/llama/llama.cpp/src/llama-context.cpp index 77177c5e..c22687e4 100644 --- a/llama/llama.cpp/src/llama-context.cpp +++ b/llama/llama.cpp/src/llama-context.cpp @@ -6,11 +6,9 @@ #include "llama-model.h" #include "llama-kv-cache.h" -#include #include #include #include -#include // // llama_context @@ -95,6 +93,7 @@ llama_context::llama_context( } cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? 
params.n_batch : params.n_ubatch); + cparams.op_offload = params.op_offload; const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max; @@ -118,8 +117,6 @@ llama_context::llama_context( __func__, n_ctx_per_seq, hparams.n_ctx_train); } - logits_all = params.logits_all; - if (!hparams.vocab_only) { // GPU backends for (auto * dev : model.devices) { @@ -177,44 +174,13 @@ llama_context::llama_context( } // init the memory module - // TODO: for now, always create a unified KV cache if (!hparams.vocab_only) { - kv_self.reset(static_cast(model.create_memory())); + llama_memory_params params_mem = { + /*.type_k =*/ params.type_k, + /*.type_v =*/ params.type_v, + }; - LLAMA_LOG_DEBUG("%s: n_ctx = %u\n", __func__, cparams.n_ctx); - - cparams.n_ctx = GGML_PAD(cparams.n_ctx, kv_self->get_padding(cparams)); - - LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx); - - uint32_t kv_size = cparams.n_ctx; - ggml_type type_k = params.type_k; - ggml_type type_v = params.type_v; - - if (llama_model_is_recurrent(&model)) { - // Mamba needs at least as many KV cells as there are sequences kept at any time - kv_size = std::max((uint32_t) 1, params.n_seq_max); - // it's probably best to keep as much precision as possible for the states - type_k = GGML_TYPE_F32; // required by ggml_ssm_conv for Mamba's conv_states - type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_states - } - - GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0); - GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0); - - if (!kv_self->init(model, cparams, type_k, type_v, kv_size, cparams.offload_kqv)) { - throw std::runtime_error("failed to initialize self-attention cache"); - } - - { - const size_t memory_size_k = kv_self->size_k_bytes(); - const size_t memory_size_v = kv_self->size_v_bytes(); - - LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, - (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), - ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f), - ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); - } + memory.reset(model.create_memory(params_mem, cparams)); } // init backends @@ -278,7 +244,7 @@ llama_context::llama_context( } } - sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel)); + sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel, cparams.op_offload)); if (pipeline_parallel) { LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get())); @@ -286,7 +252,7 @@ llama_context::llama_context( } // reserve worst-case graph - if (!hparams.vocab_only) { + if (!hparams.vocab_only && memory) { const uint32_t n_seqs = 1; // TODO: worst-case number of sequences const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); @@ -305,7 +271,9 @@ llama_context::llama_context( int n_nodes_tg = -1; // simulate full KV cache - kv_self->n = kv_self->size; + llama_kv_cache * kv_self = static_cast(memory.get()); + + kv_self->set_full(); cross.v_embd.clear(); @@ -391,7 +359,9 @@ llama_context::llama_context( } } -llama_context::~llama_context() = default; +llama_context::~llama_context() { + ggml_opt_free(opt_ctx); +} void llama_context::synchronize() { ggml_backend_sched_synchronize(sched.get()); @@ -427,6 +397,18 @@ const llama_model & llama_context::get_model() const { return 
model; } +const llama_cparams & llama_context::get_cparams() const { + return cparams; +} + +ggml_backend_sched_t llama_context::get_sched() const { + return sched.get(); +} + +ggml_context * llama_context::get_ctx_compute() const { + return ctx_compute.get(); +} + uint32_t llama_context::n_ctx() const { return cparams.n_ctx; } @@ -456,318 +438,44 @@ uint32_t llama_context::n_threads_batch() const { } llama_kv_cache * llama_context::get_kv_self() { - return kv_self.get(); + llama_kv_cache * kv_self = static_cast(memory.get()); + return kv_self; } const llama_kv_cache * llama_context::get_kv_self() const { - return kv_self.get(); -} - -ggml_tensor * llama_context::build_rope_shift( - ggml_context * ctx0, - ggml_tensor * cur, - ggml_tensor * shift, - ggml_tensor * factors, - float freq_base, - float freq_scale) const { - const auto & n_ctx_orig = cparams.n_ctx_orig_yarn; - - const auto & yarn_ext_factor = cparams.yarn_ext_factor; - const auto & yarn_beta_fast = cparams.yarn_beta_fast; - const auto & yarn_beta_slow = cparams.yarn_beta_slow; - - const auto & hparams = model.hparams; - - const auto & n_rot = hparams.n_rot; - const auto & rope_type = hparams.rope_type; - - // See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly. - // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation. - const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2 ? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)) : cparams.yarn_attn_factor; - - ggml_tensor * tmp; - - if (ggml_is_quantized(cur->type)) { - // dequantize to f32 -> RoPE -> quantize back - tmp = ggml_cast(ctx0, cur, GGML_TYPE_F32); - - tmp = ggml_rope_ext(ctx0, tmp, - shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); - - tmp = ggml_cpy(ctx0, tmp, cur); - } else { - // we rotate only the first n_rot dimensions - tmp = ggml_rope_ext_inplace(ctx0, cur, - shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); - } - - return tmp; -} - -class llm_graph_input_k_shift : public llm_graph_input_i { -public: - llm_graph_input_k_shift(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {} - virtual ~llm_graph_input_k_shift() = default; - - void set_input(const llama_ubatch * ubatch) override; - - ggml_tensor * k_shift; // I32 [kv_size] - - const llama_kv_cache_unified * kv_self; -}; - -void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) { - GGML_UNUSED(ubatch); - - if (k_shift) { - assert(ggml_backend_buffer_is_host(k_shift->buffer)); - - int32_t * data = (int32_t *) k_shift->data; - - for (uint32_t i = 0; i < kv_self->size; ++i) { - data[i] = kv_self->cells[i].delta; - } - } -} - -llm_graph_result_ptr llama_context::build_kv_self_shift( - ggml_context * ctx0, - ggml_cgraph * gf) const { - auto res = std::make_unique(); - - const auto & hparams = model.hparams; - - const auto & n_layer = hparams.n_layer; - - const auto & n_embd_head_k = hparams.n_embd_head_k; - //const auto & n_embd_head_v = hparams.n_embd_head_v; - - //GGML_ASSERT(kv_self->size == n_ctx); - - auto inp = std::make_unique(kv_self.get()); - - inp->k_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, cparams.n_ctx); - ggml_set_input(inp->k_shift); - - for (uint32_t il = 0; il < n_layer; ++il) { - const int64_t n_head_kv = hparams.n_head_kv(il); - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - - const bool is_swa = 
hparams.is_swa(il); - - // note: the swa rope params could become part of the cparams in the future - // if we decide to make them configurable, like the non-sliding ones - const float freq_base_l = is_swa ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base; - const float freq_scale_l = is_swa ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale; - - ggml_tensor * rope_factors = kv_self->cbs.get_rope_factors(n_ctx_per_seq(), il); - - ggml_tensor * k = - ggml_view_3d(ctx0, kv_self->k_l[il], - n_embd_head_k, n_head_kv, kv_self->size, - ggml_row_size(kv_self->k_l[il]->type, n_embd_head_k), - ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa), - 0); - - ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l); - - ggml_build_forward_expand(gf, cur); - } - - res->add_input(std::move(inp)); - - return res; -} - -llm_graph_result_ptr llama_context::build_kv_self_defrag( - ggml_context * ctx0, - ggml_cgraph * gf, - const std::vector & moves) const { - auto res = std::make_unique(); - - const auto & hparams = model.hparams; - -#if 0 - // CPU defrag - // - // TODO: optimizations are possible: - // - multiple threads - // - avoid copying to the host memory when already there - // - // likely not worth the effort, as we have ggml_graph based defrag - // - - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); - - const uint32_t kv_size = size; - - std::vector buf_k; - std::vector buf_v; - - for (uint32_t il = 0; il < n_layer; ++il) { - const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa); - const size_t k_size = ggml_row_size(k_l[il]->type, n_embd_k_gqa*kv_size); - - const size_t v_size_el = ggml_type_size(v_l[il]->type); - const size_t v_size = ggml_row_size (v_l[il]->type, n_embd_v_gqa*kv_size); - - buf_k.resize(k_size); - buf_v.resize(v_size); - - ggml_backend_tensor_get(k_l[il], buf_k.data(), 0, buf_k.size()); - ggml_backend_tensor_get(v_l[il], buf_v.data(), 0, buf_v.size()); - - // batch move [i, i+nm) to [id, id+nm) - // note: cells can move only to a lower index - for (uint32_t i = 0; i < n_kv; ++i) { - const uint32_t id = ids[i]; - - if (i == id || id == n_kv) { - continue; - } - - uint32_t nm = 1; - - while (i + nm < n_kv && ids[i + nm] == id + nm) { - nm++; - } - - // move keys - { - const int64_t os = i*k_size_row; - const int64_t od = id*k_size_row; - - memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row); - } - - // move values (note: they are transposed) - { - const int64_t os = i; - const int64_t od = id; - - for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { - memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el); - } - } - - i += nm - 1; - } - - ggml_backend_tensor_set(k_l[il], buf_k.data(), 0, buf_k.size()); - ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size()); - } -#else - for (const auto & move : moves) { - for (uint32_t il = 0; il < hparams.n_layer; ++il) { // NOLINT - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); - - ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self->k_l[il], - n_embd_k_gqa, move.len, - ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*move.src)); - - ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self->k_l[il], - n_embd_k_gqa, move.len, - ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv_self->k_l[il]->type, 
n_embd_k_gqa*move.dst)); - - ggml_tensor * view_v_src; - ggml_tensor * view_v_dst; - - if (cparams.flash_attn) { - // NOTE: the V cache is not transposed when using flash attention - view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il], - n_embd_v_gqa, move.len, - ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa), - ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*move.src)); - - view_v_dst = ggml_view_2d(ctx0, kv_self->v_l[il], - n_embd_v_gqa, move.len, - ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa), - ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*move.dst)); - } else { - view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il], - move.len, n_embd_v_gqa, - ggml_row_size(kv_self->v_l[il]->type, kv_self->size), - ggml_row_size(kv_self->v_l[il]->type, move.src)); - - view_v_dst = ggml_view_2d(ctx0, kv_self->v_l[il], - move.len, n_embd_v_gqa, - ggml_row_size(kv_self->v_l[il]->type, kv_self->size), - ggml_row_size(kv_self->v_l[il]->type, move.dst)); - } - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst)); - } - } -#endif - - return res; + llama_kv_cache * kv_self = static_cast(memory.get()); + return kv_self; } void llama_context::kv_self_update() { - auto & kv = kv_self; + bool need_reserve = false; - if (kv->has_shift) { - if (!kv->get_can_shift()) { - GGML_ABORT("The current context does not support K-shift"); + llama_kv_cache * kv_self = static_cast(memory.get()); + + need_reserve = kv_self->update(*this); + + // reserve a worst case graph if needed + if (need_reserve) { + LLAMA_LOG_DEBUG("%s: reserving a worst case graph\n", __func__); + + // build worst-case graph + uint32_t n_seqs = 1; // TODO: worst-case number of sequences + uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); + + // simulate full KV cache + kv_self->set_full(); + + llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + + auto * gf = graph_init(); + graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT); + + // initialize scheduler with the worst-case graph + ggml_backend_sched_reset(sched.get()); + if (!ggml_backend_sched_reserve(sched.get(), gf)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); } - - LLAMA_LOG_DEBUG("%s: applying K-shift\n", __func__); - - // apply K-shift if needed - if (model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) { - ggml_backend_sched_reset(sched.get()); - - auto * gf = graph_init(); - - auto res = build_kv_self_shift(ctx_compute.get(), gf); - - ggml_backend_sched_alloc_graph(sched.get(), gf); - - res->set_inputs(nullptr); - - graph_compute(gf, false); - } - - { - kv->has_shift = false; - - for (uint32_t i = 0; i < kv->size; ++i) { - kv->cells[i].delta = 0; - } - } - } - - // defragment the KV cache if needed - if (kv->do_defrag) { - LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__); - const uint32_t n_max_nodes = graph_max_nodes(); - const uint32_t max_moves = (n_max_nodes - 2*model.hparams.n_layer)/(6*model.hparams.n_layer); - if (!kv->defrag_prepare(n_max_nodes)) { - LLAMA_LOG_ERROR("%s: failed to prepare defragmentation\n", __func__); - return; - } - - for (std::size_t i = 0; i < kv_self->defrag_info.moves.size(); i += max_moves) { - std::vector chunk; - auto end = std::min(i + max_moves, 
kv_self->defrag_info.moves.size()); - chunk.assign(kv_self->defrag_info.moves.begin() + i, kv_self->defrag_info.moves.begin() + end); - - ggml_backend_sched_reset(sched.get()); - auto * gf = graph_init(); - auto res = build_kv_self_defrag(ctx_compute.get(), gf, chunk); - ggml_backend_sched_alloc_graph(sched.get(), gf); - res->set_inputs(nullptr); - graph_compute(gf, false); - } - - kv->do_defrag = false; } } @@ -776,9 +484,6 @@ enum llama_pooling_type llama_context::pooling_type() const { } float * llama_context::get_logits() { - // reorder logits for backward compatibility - output_reorder(); - return logits; } @@ -821,9 +526,6 @@ float * llama_context::get_logits_ith(int32_t i) { } float * llama_context::get_embeddings() { - // reorder embeddings for backward compatibility - output_reorder(); - return embd; } @@ -979,8 +681,8 @@ int llama_context::encode(llama_batch & inp_batch) { } // temporary allocate memory for the input batch if needed - // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self->pos_max() + 1); + // note: during encode, we always pass the full sequence starting from pos = 0 + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : 0); const llama_batch & batch = batch_allocr.batch; const int32_t n_tokens = batch.n_tokens; @@ -1005,11 +707,13 @@ int llama_context::encode(llama_batch & inp_batch) { t_compute_start_us = ggml_time_us(); } + embd_seq.clear(); + n_queued_tokens += n_tokens; const int64_t n_embd = hparams.n_embd; - sbatch.from_batch(batch, batch.n_embd, /* simple_split */ true, /* logits_all */ true); + llama_sbatch sbatch = llama_sbatch(batch, batch.n_embd, /* simple_split */ true, /* logits_all */ true); const llama_ubatch ubatch = sbatch.split_simple(n_tokens); @@ -1066,12 +770,12 @@ int llama_context::encode(llama_batch & inp_batch) { ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); GGML_ASSERT(backend_embd != nullptr); - GGML_ASSERT(embd != nullptr); - switch (cparams.pooling_type) { case LLAMA_POOLING_TYPE_NONE: { // extract token embeddings + GGML_ASSERT(embd != nullptr); + GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size); ggml_backend_tensor_get_async(backend_embd, t_embd, embd, 0, n_tokens*n_embd*sizeof(float)); } break; @@ -1096,11 +800,18 @@ int llama_context::encode(llama_batch & inp_batch) { } break; case LLAMA_POOLING_TYPE_RANK: { - // TODO: this likely should be the same logic as in llama_decoder_internal, but better to - // wait for an encoder model that requires this pooling type in order to test it - // https://github.com/ggerganov/llama.cpp/pull/9510 - GGML_ABORT("RANK pooling not implemented yet"); - } + // extract the rerank score - a single float per sequence + auto & embd_seq_out = embd_seq; + + for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; + } + embd_seq_out[seq_id].resize(1); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float)); + } + } break; case LLAMA_POOLING_TYPE_UNSPECIFIED: { GGML_ABORT("unknown pooling type"); @@ -1138,14 +849,21 @@ int llama_context::encode(llama_batch & inp_batch) { } int llama_context::decode(llama_batch & inp_batch) { + if (!memory) { + LLAMA_LOG_WARN("%s: cannot decode batches with this context (use llama_encode() instead)\n", __func__); + return 
encode(inp_batch); + } + if (inp_batch.n_tokens == 0) { LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); return -1; } + llama_kv_cache * kv_self = static_cast(memory.get()); + // temporary allocate memory for the input batch if needed - // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self->pos_max() + 1); + // TODO: this is incorrect for multiple sequences because get_pos_max() is the maximum across all sequences + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self->get_pos_max() + 1); const llama_batch & batch = batch_allocr.batch; @@ -1156,7 +874,7 @@ int llama_context::decode(llama_batch & inp_batch) { const int64_t n_tokens_all = batch.n_tokens; const int64_t n_embd = hparams.n_embd; - llama_kv_cache_guard kv_guard(kv_self.get()); + llama_kv_cache_guard kv_guard(kv_self); GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT @@ -1190,18 +908,14 @@ int llama_context::decode(llama_batch & inp_batch) { for (uint32_t i = 0; i < n_tokens_all; ++i) { n_outputs_all += batch.logits[i] != 0; } - } else if (logits_all || embd_pooled) { + } else if (embd_pooled) { n_outputs_all = n_tokens_all; } else { // keep last output only n_outputs_all = 1; } - const bool logits_all = n_outputs_all == n_tokens_all; - - sbatch.from_batch(batch, batch.n_embd, - /* simple_split */ !kv_self->recurrent, - /* logits_all */ logits_all); + llama_sbatch sbatch = kv_self->sbatch_init(batch, /* logits_all */ n_outputs_all == n_tokens_all); // reserve output buffer if (output_reserve(n_outputs_all) < n_outputs_all) { @@ -1215,22 +929,7 @@ int llama_context::decode(llama_batch & inp_batch) { int64_t n_outputs_prev = 0; while (sbatch.n_tokens > 0) { - llama_ubatch ubatch = llama_ubatch(); - - const auto & n_ubatch = cparams.n_ubatch; - - if (kv_self->recurrent) { - if (embd_pooled) { - // Pooled embeddings cannot be split across ubatches (yet) - ubatch = sbatch.split_seq(cparams.n_ubatch); - } else { - // recurrent model architectures are easier to implement - // with equal-length sequences - ubatch = sbatch.split_equal(cparams.n_ubatch); - } - } else { - ubatch = sbatch.split_simple(n_ubatch); - } + llama_ubatch ubatch = kv_self->ubatch_next(sbatch, cparams.n_ubatch, embd_pooled); // count the outputs in this u_batch { @@ -1250,27 +949,12 @@ int llama_context::decode(llama_batch & inp_batch) { } // find KV slot - { - if (!kv_self->find_slot(ubatch)) { - kv_self->defrag(); - kv_self_update(); - if (!kv_self->find_slot(ubatch)) { - LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens); - return 1; - } - } + if (!kv_self->find_slot(ubatch)) { + LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens); - if (!kv_self->recurrent) { - // a heuristic, to avoid attending the full cache if it is not yet utilized - // after enough generations, the benefit from this heuristic disappears - // if we start defragmenting the cache, the benefit from this will be more important - const uint32_t pad = kv_self->get_padding(cparams); - kv_self->n = std::min(kv_self->size, std::max(pad, GGML_PAD(kv_self->cell_max(), pad))); - } + return 1; } - //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self->n, kv_self->used, kv_self->head); - ggml_backend_sched_reset(sched.get()); ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, 
cparams.cb_eval_user_data); @@ -1384,43 +1068,68 @@ int llama_context::decode(llama_batch & inp_batch) { // finalize the batch processing kv_guard.commit(); + // set to total number of outputs in the batch, for use in llama_get_logits_ith + n_outputs = n_outputs_all; + // set output mappings { bool sorted_output = true; - GGML_ASSERT(sbatch.out_ids.size() == (size_t) n_outputs_all); + auto & out_ids = sbatch.out_ids; + + GGML_ASSERT(out_ids.size() == (size_t) n_outputs_all); for (int64_t i = 0; i < n_outputs_all; ++i) { - int64_t out_id = sbatch.out_ids[i]; + int64_t out_id = out_ids[i]; output_ids[out_id] = i; if (out_id != i) { sorted_output = false; } } - if (sorted_output) { - sbatch.out_ids.clear(); + // make the outputs have the same order they had in the user-provided batch + // note: this is mostly relevant for recurrent models atm + if (!sorted_output) { + const uint32_t n_vocab = model.hparams.n_vocab; + const uint32_t n_embd = model.hparams.n_embd; + + GGML_ASSERT((size_t) n_outputs == out_ids.size()); + + // TODO: is there something more efficient which also minimizes swaps? + // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort) + for (int32_t i = 0; i < n_outputs - 1; ++i) { + int32_t j_min = i; + for (int32_t j = i + 1; j < n_outputs; ++j) { + if (out_ids[j] < out_ids[j_min]) { + j_min = j; + } + } + if (j_min == i) { continue; } + std::swap(out_ids[i], out_ids[j_min]); + if (logits_size > 0) { + for (uint32_t k = 0; k < n_vocab; k++) { + std::swap(logits[i*n_vocab + k], logits[j_min*n_vocab + k]); + } + } + if (embd_size > 0) { + for (uint32_t k = 0; k < n_embd; k++) { + std::swap(embd[i*n_embd + k], embd[j_min*n_embd + k]); + } + } + } + std::fill(output_ids.begin(), output_ids.end(), -1); + for (int32_t i = 0; i < n_outputs; ++i) { + output_ids[out_ids[i]] = i; + } } } - // set to total number of outputs in the batch, for use in llama_get_logits_ith - n_outputs = n_outputs_all; - // wait for the computation to finish (automatically done when obtaining the model output) //synchronize(); // decide if we need to defrag the kv cache - if (cparams.causal_attn && cparams.defrag_thold > 0.0f) { - // - do not defrag small contexts (i.e. < 2048 tokens) - // - count the padding towards the number of used tokens - const float fragmentation = kv_self->n >= 2048 ? std::max(0.0f, 1.0f - float(kv_self->used + kv_self->get_padding(cparams))/float(kv_self->n)) : 0.0f; - - // queue defragmentation for next llama_kv_cache_update - if (fragmentation > cparams.defrag_thold) { - LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation); - - kv_self->defrag(); - } + if (cparams.defrag_thold > 0.0f) { + kv_self->defrag_sched(cparams.defrag_thold); } // Reset state for the next token before backend sync, to allow the CPU activities in the reset to @@ -1505,44 +1214,6 @@ int32_t llama_context::output_reserve(int32_t n_outputs) { return n_outputs_max; } -void llama_context::output_reorder() { - auto & out_ids = sbatch.out_ids; - if (!out_ids.empty()) { - const uint32_t n_vocab = model.hparams.n_vocab; - const uint32_t n_embd = model.hparams.n_embd; - - GGML_ASSERT((size_t) n_outputs == out_ids.size()); - - // TODO: is there something more efficient which also minimizes swaps? 
- // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort) - for (int32_t i = 0; i < n_outputs - 1; ++i) { - int32_t j_min = i; - for (int32_t j = i + 1; j < n_outputs; ++j) { - if (out_ids[j] < out_ids[j_min]) { - j_min = j; - } - } - if (j_min == i) { continue; } - std::swap(out_ids[i], out_ids[j_min]); - if (logits_size > 0) { - for (uint32_t k = 0; k < n_vocab; k++) { - std::swap(logits[i*n_vocab + k], logits[j_min*n_vocab + k]); - } - } - if (embd_size > 0) { - for (uint32_t k = 0; k < n_embd; k++) { - std::swap(embd[i*n_embd + k], embd[j_min*n_embd + k]); - } - } - } - std::fill(output_ids.begin(), output_ids.end(), -1); - for (int32_t i = 0; i < n_outputs; ++i) { - output_ids[out_ids[i]] = i; - } - out_ids.clear(); - } -} - // // graph // @@ -1579,7 +1250,7 @@ llm_graph_result_ptr llama_context::graph_build( /*.backend_cpu =*/ backend_cpu, /*.cvec =*/ &cvec, /*.loras =*/ &loras, - /*.memory =*/ kv_self.get(), + /*.memory =*/ memory.get(), /*.cross =*/ &cross, /*.n_outputs =*/ n_outputs, /*.cb =*/ graph_get_cb(), @@ -1983,8 +1654,6 @@ size_t llama_context::state_write_data(llama_io_write_i & io) { { LLAMA_LOG_DEBUG("%s: - writing output ids\n", __func__); - output_reorder(); - const auto n_outputs = this->n_outputs; const auto & output_ids = this->output_ids; @@ -2038,6 +1707,8 @@ size_t llama_context::state_write_data(llama_io_write_i & io) { } LLAMA_LOG_DEBUG("%s: - writing KV self\n", __func__); + llama_kv_cache * kv_self = static_cast(memory.get()); + kv_self->state_write(io); return io.n_bytes(); @@ -2121,8 +1792,13 @@ size_t llama_context::state_read_data(llama_io_read_i & io) { } } - LLAMA_LOG_DEBUG("%s: - reading KV self\n", __func__); - kv_self->state_read(io); + if (memory) { + LLAMA_LOG_DEBUG("%s: - reading KV self\n", __func__); + + llama_kv_cache * kv_self = static_cast(memory.get()); + + kv_self->state_read(io); + } return io.n_bytes(); } @@ -2130,7 +1806,11 @@ size_t llama_context::state_read_data(llama_io_read_i & io) { size_t llama_context::state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id) { GGML_UNUSED(seq_id); - kv_self->state_write(io, seq_id); + if (memory) { + llama_kv_cache * kv_self = static_cast(memory.get()); + + kv_self->state_write(io, seq_id); + } return io.n_bytes(); } @@ -2138,7 +1818,11 @@ size_t llama_context::state_seq_write_data(llama_io_write_i & io, llama_seq_id s size_t llama_context::state_seq_read_data(llama_io_read_i & io, llama_seq_id seq_id) { GGML_UNUSED(seq_id); - kv_self->state_read(io, seq_id); + if (memory) { + llama_kv_cache * kv_self = static_cast(memory.get()); + + kv_self->state_read(io, seq_id); + } return io.n_bytes(); } @@ -2166,6 +1850,215 @@ void llama_context::perf_reset() { t_p_eval_us = n_p_eval = 0; } +// +// training +// + +static void llama_set_param(struct ggml_tensor * tensor, llama_opt_param_filter param_filter, void * userdata) { + if (!tensor || tensor->type != GGML_TYPE_F32) { + return; + } + if (!param_filter(tensor, userdata)) { + return; + } + if (strcmp(tensor->name, "token_embd.weight") == 0) { + return; // FIXME + } + if (strcmp(tensor->name, "rope_freqs.weight") == 0) { + return; // FIXME + } + ggml_set_param(tensor); +} + +void llama_context::opt_init(struct llama_model * model, struct llama_opt_params lopt_params) { + GGML_ASSERT(!opt_ctx); + model->hparams.n_ctx_train = lopt_params.n_ctx_train > 0 ? 
lopt_params.n_ctx_train : n_ctx(); + const uint32_t n_batch = std::min(this->n_batch(), model->hparams.n_ctx_train); + const uint32_t n_ubatch = std::min(this->n_ubatch(), n_batch); + GGML_ASSERT(model->hparams.n_ctx_train % n_batch == 0); + GGML_ASSERT(n_batch % n_ubatch == 0); + + ggml_opt_params opt_params = ggml_opt_default_params(sched.get(), GGML_OPT_LOSS_TYPE_CROSS_ENTROPY); + opt_params.opt_period = n_batch / n_ubatch; + opt_params.get_opt_pars = lopt_params.get_opt_pars; + opt_params.get_opt_pars_ud = lopt_params.get_opt_pars_ud; + + opt_ctx = ggml_opt_init(opt_params); + + llama_opt_param_filter param_filter = lopt_params.param_filter; + void * param_filter_ud = lopt_params.param_filter_ud; + + //llama_set_param(model->tok_embd, param_filter, param_filter_ud); // FIXME + llama_set_param(model->type_embd, param_filter, param_filter_ud); + llama_set_param(model->pos_embd, param_filter, param_filter_ud); + llama_set_param(model->tok_norm, param_filter, param_filter_ud); + llama_set_param(model->tok_norm_b, param_filter, param_filter_ud); + llama_set_param(model->output_norm, param_filter, param_filter_ud); + llama_set_param(model->output_norm_b, param_filter, param_filter_ud); + llama_set_param(model->output, param_filter, param_filter_ud); + llama_set_param(model->output_b, param_filter, param_filter_ud); + llama_set_param(model->output_norm_enc, param_filter, param_filter_ud); + llama_set_param(model->cls, param_filter, param_filter_ud); + llama_set_param(model->cls_b, param_filter, param_filter_ud); + llama_set_param(model->cls_out, param_filter, param_filter_ud); + llama_set_param(model->cls_out_b, param_filter, param_filter_ud); + + for (struct llama_layer & layer : model->layers) { + for (size_t i = 0; i < sizeof(layer)/sizeof(struct ggml_tensor *); ++i) { + llama_set_param(reinterpret_cast(&layer)[i], param_filter, param_filter_ud); + } + } +} + +void llama_context::opt_epoch_iter( + ggml_opt_dataset_t dataset, + ggml_opt_result_t result, + const std::vector & tokens, + const std::vector & labels_sparse, + llama_batch & batch, + ggml_opt_epoch_callback callback, + bool train, + int64_t idata_in_loop, + int64_t ndata_in_loop, + int64_t t_loop_start) { + GGML_ASSERT(opt_ctx); + const uint32_t n_ctx = llama_model_n_ctx_train(&model); + const uint32_t n_batch = std::min(this->n_batch(), n_ctx); + const uint32_t n_ubatch = std::min(this->n_ubatch(), n_batch); + + llama_kv_cache * kv_self = static_cast(memory.get()); + + kv_self->clear(); + llama_kv_cache_guard kv_guard(kv_self); + + for (uint32_t pos_ctx = 0; pos_ctx < n_ctx; pos_ctx += n_batch) { + batch.n_tokens = n_batch; + for (uint32_t pos_batch = 0; pos_batch < n_batch; ++pos_batch) { + batch.token [pos_batch] = tokens[pos_ctx + pos_batch]; + batch.pos [pos_batch] = pos_ctx + pos_batch; + batch.n_seq_id[pos_batch] = 1; + batch.seq_id [pos_batch][0] = 0; + batch.logits [pos_batch] = true; + } + + const auto n_tokens_all = batch.n_tokens; + + n_queued_tokens += n_tokens_all; + + // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens + const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; + + embd_seq.clear(); + + int64_t n_outputs_all = n_tokens_all; + + llama_sbatch sbatch = kv_self->sbatch_init(batch, /*logits_all =*/ true); + + // reserve output buffer + if (output_reserve(n_outputs_all) < n_outputs_all) { + LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all); + GGML_ABORT("TODO: 
handle this error"); + }; + + for (uint32_t pos_batch = 0; pos_batch < n_batch; pos_batch += n_ubatch) { + llama_ubatch ubatch = kv_self->ubatch_next(sbatch, cparams.n_ubatch, embd_pooled); + + n_outputs = ubatch.n_tokens; + + // TODO: not sure if this is needed + if (!kv_self->find_slot(ubatch)) { + LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens); + + GGML_ABORT("TODO: handle this error"); + } + + auto * gf = graph_init(); + auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT); + + struct ggml_context * ctx_compute_opt; + { + const size_t size_gf = ggml_graph_size(gf); + const size_t size_meta = 4*size_gf*ggml_tensor_overhead() + 2*ggml_graph_overhead_custom(size_gf, /*grads = */ true); + struct ggml_init_params params = { + /*.mem_size =*/ size_meta, + /*.mem_buffer =*/ nullptr, + /*.no_alloc =*/ true, + }; + ctx_compute_opt = ggml_init(params); + } + ggml_opt_prepare_alloc(opt_ctx, ctx_compute_opt, gf, res->get_tokens(), res->get_logits()); + ggml_opt_alloc(opt_ctx, train); + res->set_inputs(&ubatch); + { + struct ggml_tensor * labels = ggml_opt_labels(opt_ctx); + GGML_ASSERT(labels->ne[1] == n_ubatch); + ggml_set_zero(labels); + const float onef = 1.0f; + for (uint32_t pos_ubatch = 0; pos_ubatch < n_ubatch; ++pos_ubatch) { + const uint32_t ilabel = pos_ctx + pos_batch + pos_ubatch; + GGML_ASSERT(labels_sparse[ilabel] < labels->ne[0]); + ggml_backend_tensor_set(labels, &onef, (pos_ubatch*labels->ne[0] + labels_sparse[ilabel])*sizeof(float), sizeof(float)); + } + } + ggml_opt_eval(opt_ctx, result); + if (callback) { + callback(train, opt_ctx, dataset, result, idata_in_loop + (pos_ctx + pos_batch)/n_ubatch + 1, ndata_in_loop, t_loop_start); + } + ggml_free(ctx_compute_opt); + } + } + + kv_guard.commit(); +} + +void llama_context::opt_epoch( + ggml_opt_dataset_t dataset, + ggml_opt_result_t result_train, + ggml_opt_result_t result_eval, + int64_t idata_split, + ggml_opt_epoch_callback callback_train, + ggml_opt_epoch_callback callback_eval) { + const uint32_t n_ctx = this->n_ctx(); + const uint32_t n_batch = std::min(cparams.n_batch, n_ctx); + const uint32_t n_ubatch = std::min(cparams.n_ubatch, n_batch); + const int64_t ndata = ggml_opt_dataset_ndata(dataset); + + GGML_ASSERT(idata_split >= 0); + GGML_ASSERT(idata_split <= ndata); + + const uint32_t ubatch_per_ctx = n_ctx / n_ubatch; + + struct llama_batch batch = llama_batch_init(n_batch, 0, 1); + std::vector tokens(n_ctx); + std::vector labels_sparse(n_ctx); + + int64_t idata = 0; + + int64_t t_loop_start = ggml_time_us(); + int64_t ndata_in_loop = idata_split*ubatch_per_ctx; + for (; idata < idata_split; ++idata) { + constexpr bool train = true; + const int64_t idata_in_loop = idata*ubatch_per_ctx; + + ggml_opt_dataset_get_batch_host(dataset, tokens.data(), n_ctx*sizeof(llama_token), labels_sparse.data(), idata); + opt_epoch_iter(dataset, result_train, tokens, labels_sparse, batch, + callback_train, train, idata_in_loop, ndata_in_loop, t_loop_start); + } + + t_loop_start = ggml_time_us(); + ndata_in_loop = (ndata - idata_split)*ubatch_per_ctx; + for (; idata < ndata; ++idata) { + constexpr bool train = false; + const int64_t idata_in_loop = (idata - idata_split)*ubatch_per_ctx; + + ggml_opt_dataset_get_batch_host(dataset, tokens.data(), n_ctx*sizeof(llama_token), labels_sparse.data(), idata); + opt_epoch_iter(dataset, result_eval, tokens, labels_sparse, batch, + callback_eval, train, idata_in_loop, ndata_in_loop, t_loop_start); + } + + 
llama_batch_free(batch); +} + // // interface implementation // @@ -2193,14 +2086,14 @@ llama_context_params llama_context_default_params() { /*.cb_eval_user_data =*/ nullptr, /*.type_k =*/ GGML_TYPE_F16, /*.type_v =*/ GGML_TYPE_F16, - /*.logits_all =*/ false, + /*.abort_callback =*/ nullptr, + /*.abort_callback_data =*/ nullptr, /*.embeddings =*/ false, /*.offload_kqv =*/ true, /*.flash_attn =*/ false, /*.no_perf =*/ true, + /*.op_offload =*/ true, /*.cross_attn =*/ false, - /*.abort_callback =*/ nullptr, - /*.abort_callback_data =*/ nullptr, }; return result; @@ -2498,7 +2391,7 @@ void llama_kv_cache_seq_cp( llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { - return llama_kv_self_seq_cp(ctx, seq_id_src, seq_id_dst, p0, p1); + llama_kv_self_seq_cp(ctx, seq_id_src, seq_id_dst, p0, p1); } void llama_kv_self_seq_cp( @@ -2512,14 +2405,14 @@ void llama_kv_self_seq_cp( return; } - return kv->seq_cp(seq_id_src, seq_id_dst, p0, p1); + kv->seq_cp(seq_id_src, seq_id_dst, p0, p1); } // deprecated void llama_kv_cache_seq_keep( llama_context * ctx, llama_seq_id seq_id) { - return llama_kv_self_seq_keep(ctx, seq_id); + llama_kv_self_seq_keep(ctx, seq_id); } void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) { @@ -2528,7 +2421,7 @@ void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) { return; } - return kv->seq_keep(seq_id); + kv->seq_keep(seq_id); } // deprecated @@ -2538,7 +2431,7 @@ void llama_kv_cache_seq_add( llama_pos p0, llama_pos p1, llama_pos delta) { - return llama_kv_self_seq_add(ctx, seq_id, p0, p1, delta); + llama_kv_self_seq_add(ctx, seq_id, p0, p1, delta); } void llama_kv_self_seq_add( @@ -2552,7 +2445,7 @@ void llama_kv_self_seq_add( return; } - return kv->seq_add(seq_id, p0, p1, delta); + kv->seq_add(seq_id, p0, p1, delta); } // deprecated @@ -2562,7 +2455,7 @@ void llama_kv_cache_seq_div( llama_pos p0, llama_pos p1, int d) { - return llama_kv_self_seq_div(ctx, seq_id, p0, p1, d); + llama_kv_self_seq_div(ctx, seq_id, p0, p1, d); } void llama_kv_self_seq_div( @@ -2576,7 +2469,7 @@ void llama_kv_self_seq_div( return; } - return kv->seq_div(seq_id, p0, p1, d); + kv->seq_div(seq_id, p0, p1, d); } // deprecated @@ -2595,7 +2488,7 @@ llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) { // deprecated void llama_kv_cache_defrag(llama_context * ctx) { - return llama_kv_self_defrag(ctx); + llama_kv_self_defrag(ctx); } void llama_kv_self_defrag(llama_context * ctx) { @@ -2604,7 +2497,8 @@ void llama_kv_self_defrag(llama_context * ctx) { return; } - return kv->defrag(); + // force defrag + kv->defrag_sched(-1.0f); } // deprecated @@ -2788,3 +2682,34 @@ void llama_perf_context_print(const llama_context * ctx) { void llama_perf_context_reset(llama_context * ctx) { ctx->perf_reset(); } + +// +// training +// + +bool llama_opt_param_filter_all(const struct ggml_tensor * tensor, void * userdata) { + GGML_UNUSED(tensor); + GGML_UNUSED(userdata); + return true; +} + +void llama_opt_init(struct llama_context * ctx, struct llama_model * model, struct llama_opt_params lopt_params) { + ctx->opt_init(model, lopt_params); +} + +void llama_opt_epoch( + struct llama_context * ctx, + ggml_opt_dataset_t dataset, + ggml_opt_result_t result_train, + ggml_opt_result_t result_eval, + int64_t idata_split, + ggml_opt_epoch_callback callback_train, + ggml_opt_epoch_callback callback_eval) { + ctx->opt_epoch( + dataset, + result_train, + result_eval, + idata_split, + callback_train, + callback_eval); +} diff --git a/llama/llama.cpp/src/llama-context.h 
b/llama/llama.cpp/src/llama-context.h index 30f84bfd..9970dfc6 100644 --- a/llama/llama.cpp/src/llama-context.h +++ b/llama/llama.cpp/src/llama-context.h @@ -8,6 +8,7 @@ #include "llama-kv-cache.h" #include "ggml-cpp.h" +#include "ggml-opt.h" #include #include @@ -28,7 +29,12 @@ struct llama_context { void synchronize(); - const llama_model & get_model() const; + const llama_model & get_model() const; + const llama_cparams & get_cparams() const; + + ggml_backend_sched_t get_sched() const; + + ggml_context * get_ctx_compute() const; uint32_t n_ctx() const; uint32_t n_ctx_per_seq() const; @@ -130,6 +136,32 @@ struct llama_context { llama_perf_context_data perf_get_data() const; void perf_reset(); + // + // training + // + + void opt_init(struct llama_model * model, struct llama_opt_params lopt_params); + + void opt_epoch( + ggml_opt_dataset_t dataset, + ggml_opt_result_t result_train, + ggml_opt_result_t result_eval, + int64_t idata_split, + ggml_opt_epoch_callback callback_train, + ggml_opt_epoch_callback callback_eval); + + void opt_epoch_iter( + ggml_opt_dataset_t dataset, + ggml_opt_result_t result, + const std::vector & tokens, + const std::vector & labels_sparse, + llama_batch & batch, + ggml_opt_epoch_callback callback, + bool train, + int64_t idata_in_loop, + int64_t ndata_in_loop, + int64_t t_loop_start); + private: // // output @@ -139,50 +171,30 @@ private: // Returns max number of outputs for which space was reserved. int32_t output_reserve(int32_t n_outputs); - // make the outputs have the same order they had in the user-provided batch - // TODO: maybe remove this - void output_reorder(); - // // graph // +public: int32_t graph_max_nodes() const; // zero-out inputs and create the ctx_compute for the compute graph ggml_cgraph * graph_init(); - llm_graph_result_ptr graph_build( - ggml_context * ctx, - ggml_cgraph * gf, - const llama_ubatch & ubatch, - llm_graph_type gtype); - // returns the result of ggml_backend_sched_graph_compute_async execution ggml_status graph_compute( ggml_cgraph * gf, bool batched); +private: + llm_graph_result_ptr graph_build( + ggml_context * ctx, + ggml_cgraph * gf, + const llama_ubatch & ubatch, + llm_graph_type gtype); + llm_graph_cb graph_get_cb() const; - // used by kv_self_update() - ggml_tensor * build_rope_shift( - ggml_context * ctx0, - ggml_tensor * cur, - ggml_tensor * shift, - ggml_tensor * factors, - float freq_base, - float freq_scale) const; - - llm_graph_result_ptr build_kv_self_shift( - ggml_context * ctx0, - ggml_cgraph * gf) const; - - llm_graph_result_ptr build_kv_self_defrag( - ggml_context * ctx0, - ggml_cgraph * gf, - const std::vector & moves) const; - // TODO: read/write lora adapters and cvec size_t state_write_data(llama_io_write_i & io); size_t state_read_data (llama_io_read_i & io); @@ -199,14 +211,10 @@ private: llama_cparams cparams; llama_adapter_cvec cvec; llama_adapter_loras loras; - llama_sbatch sbatch; llama_cross cross; // TODO: tmp for handling cross-attention - need something better probably - std::unique_ptr kv_self; - - // TODO: remove - bool logits_all = false; + std::unique_ptr memory; // decode output (2-dimensional array: [n_outputs][n_vocab]) size_t logits_size = 0; // capacity (of floats) for logits @@ -233,6 +241,9 @@ private: ggml_context_ptr ctx_compute; + // training + ggml_opt_context_t opt_ctx = nullptr; + ggml_threadpool_t threadpool = nullptr; ggml_threadpool_t threadpool_batch = nullptr; diff --git a/llama/llama.cpp/src/llama-cparams.h b/llama/llama.cpp/src/llama-cparams.h index 85ad91b9..7a6156ce 
100644
--- a/llama/llama.cpp/src/llama-cparams.h
+++ b/llama/llama.cpp/src/llama-cparams.h
@@ -29,8 +29,9 @@ struct llama_cparams {
     bool offload_kqv;
     bool flash_attn;
     bool no_perf;
-    bool cross_attn;
     bool warmup;
+    bool op_offload;
+    bool cross_attn;
 
     enum llama_pooling_type pooling_type;
 
diff --git a/llama/llama.cpp/src/llama-graph.cpp b/llama/llama.cpp/src/llama-graph.cpp
index b67216a4..f14869cf 100644
--- a/llama/llama.cpp/src/llama-graph.cpp
+++ b/llama/llama.cpp/src/llama-graph.cpp
@@ -284,24 +284,7 @@ void llm_graph_input_s_copy::set_input(const llama_ubatch * ubatch) {
         // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
         for (uint32_t i = 0; i < n_kv; ++i) {
-            const uint32_t cell_id = i + kv_self->head;
-
-            //////////////////////////////////////////////
-            // TODO: this should not mutate the KV cache !
-            llama_kv_cell & kv_cell = const_cast(kv_self)->cells[i];
-
-            // prevent out-of-bound sources
-            if (kv_cell.src < 0 || (uint32_t) kv_cell.src >= kv_self->size) {
-                kv_cell.src = cell_id;
-            }
-
-            data[i] = kv_cell.src;
-
-            // TODO: do not mutate the KV cache
-            // ensure copy only happens once
-            if (kv_cell.src != (int32_t) cell_id) {
-                kv_cell.src = cell_id;
-            }
+            data[i] = kv_self->s_copy(i);
         }
     }
 }
@@ -317,18 +300,7 @@ void llm_graph_input_s_mask::set_input(const llama_ubatch * ubatch) {
         // clear unused states
         for (int i = 0; i < n_kv; ++i) {
-            const uint32_t cell_id = i + kv_self->head;
-
-            //////////////////////////////////////////////
-            // TODO: this should not mutate the KV cache !
-            llama_kv_cell & kv_cell = const_cast(kv_self)->cells[i];
-
-            data[i] = (float) (kv_cell.src >= 0);
-
-            // only clear once
-            if (kv_cell.src < 0) {
-                kv_cell.src = cell_id;
-            }
+            data[i] = kv_self->s_mask(i);
         }
     }
 }
@@ -816,7 +788,7 @@ ggml_tensor * llm_graph_context::build_ffn(
             } break;
     }
 
-    if (type_gate == LLM_FFN_PAR) {
+    if (gate && type_gate == LLM_FFN_PAR) {
         cur = ggml_mul(ctx0, cur, tmp);
         cb(cur, "ffn_gate_par", il);
     }
@@ -1005,6 +977,7 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
         inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
         //cb(inp->tokens, "inp_tokens", -1);
         ggml_set_input(inp->tokens);
+        res->t_tokens = inp->tokens;
 
         cur = ggml_get_rows(ctx0, tok_embd, inp->tokens);
 
@@ -1111,7 +1084,7 @@ ggml_tensor * llm_graph_context::build_inp_cls() const {
 }
 
 ggml_tensor * llm_graph_context::build_inp_s_copy() const {
-    const llama_kv_cache_unified * kv_self = static_cast(memory);
+    const llama_kv_cache_recurrent * kv_self = static_cast(memory);
 
     auto inp = std::make_unique(kv_self);
 
@@ -1128,7 +1101,7 @@ ggml_tensor * llm_graph_context::build_inp_s_copy() const {
 }
 
 ggml_tensor * llm_graph_context::build_inp_s_mask() const {
-    const llama_kv_cache_unified * kv_self = static_cast(memory);
+    const llama_kv_cache_recurrent * kv_self = static_cast(memory);
 
     auto inp = std::make_unique(kv_self);
 
@@ -1261,8 +1234,19 @@ ggml_tensor * llm_graph_context::build_attn_mha(
                 ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
 
         if (v_mla) {
+#if 0
+            // v_mla can be applied as a matrix-vector multiplication with broadcasting across dimension 3 == n_tokens.
+            // However, the code is optimized for dimensions 0 and 1 being large, so this is ineffient.
             cur = ggml_reshape_4d(ctx0, cur, v_mla->ne[0], 1, n_head, n_tokens);
             cur = ggml_mul_mat(ctx0, v_mla, cur);
+#else
+            // It's preferable to do the calculation as a matrix-matrix multiplication with n_tokens in dimension 1.
+            // The permutations are noops and only change how the tensor data is interpreted.
+            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+            cur = ggml_mul_mat(ctx0, v_mla, cur);
+            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+            cur = ggml_cont(ctx0, cur); // Needed because ggml_reshape_2d expects contiguous inputs.
+#endif
         }
 
         cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens);
@@ -1442,8 +1426,6 @@ ggml_tensor * llm_graph_context::build_attn(
     // store to KV cache
     {
-        GGML_ASSERT(!kv_self->recurrent);
-
         const auto kv_head = kv_self->head;
 
         GGML_ASSERT(kv_self->size == n_ctx);
@@ -1612,7 +1594,7 @@ ggml_tensor * llm_graph_context::build_copy_mask_state(
         ggml_tensor * state_mask,
         int32_t n_state,
         int32_t n_seqs) const {
-    const llama_kv_cache_unified * kv_self = static_cast(memory);
+    const llama_kv_cache_recurrent * kv_self = static_cast(memory);
 
     const auto n_kv = kv_self->n;
     const auto kv_head = kv_self->head;
@@ -1644,7 +1626,7 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(
         ggml_tensor * state_mask,
         const llama_ubatch & ubatch,
         int il) const {
-    const llama_kv_cache_unified * kv_self = static_cast(memory);
+    const llama_kv_cache_recurrent * kv_self = static_cast(memory);
 
     const auto token_shift_count = hparams.token_shift_count;
 
@@ -1665,7 +1647,7 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_store(
         ggml_tensor * token_shift,
         const llama_ubatch & ubatch,
         int il) const {
-    const llama_kv_cache_unified * kv_self = static_cast(memory);
+    const llama_kv_cache_recurrent * kv_self = static_cast(memory);
 
     const auto token_shift_count = hparams.token_shift_count;
     const auto n_embd = hparams.n_embd;
diff --git a/llama/llama.cpp/src/llama-graph.h b/llama/llama.cpp/src/llama-graph.h
index 0fe18150..5a322785 100644
--- a/llama/llama.cpp/src/llama-graph.h
+++ b/llama/llama.cpp/src/llama-graph.h
@@ -19,6 +19,7 @@ struct llama_cparams;
 
 class llama_memory_i;
 class llama_kv_cache_unified;
+class llama_kv_cache_recurrent;
 
 // certain models (typically multi-modal) can produce different types of graphs
 enum llm_graph_type {
@@ -187,26 +188,26 @@ public:
 
 class llm_graph_input_s_copy : public llm_graph_input_i {
 public:
-    llm_graph_input_s_copy(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {}
+    llm_graph_input_s_copy(const llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {}
     virtual ~llm_graph_input_s_copy() = default;
 
     void set_input(const llama_ubatch * ubatch) override;
 
     ggml_tensor * s_copy; // I32 [kv_size]
 
-    const llama_kv_cache_unified * kv_self;
+    const llama_kv_cache_recurrent * kv_self;
 };
 
 class llm_graph_input_s_mask : public llm_graph_input_i {
 public:
-    llm_graph_input_s_mask(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {}
+    llm_graph_input_s_mask(const llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {}
     virtual ~llm_graph_input_s_mask() = default;
 
     void set_input(const llama_ubatch * ubatch) override;
 
     ggml_tensor * s_mask; // F32 [1, n_kv]
 
-    const llama_kv_cache_unified * kv_self;
+    const llama_kv_cache_recurrent * kv_self;
 };
 
 class llm_graph_input_cross_embd : public llm_graph_input_i {
@@ -308,6 +309,7 @@ class llm_graph_result_i {
 public:
     virtual ~llm_graph_result_i() = default;
 
+    virtual ggml_tensor * get_tokens() = 0;
     virtual ggml_tensor * get_logits() = 0;
     virtual ggml_tensor * get_embd() = 0;
     virtual ggml_tensor * get_embd_pooled() = 0;
@@ -322,6 +324,7 @@ class llm_graph_result : public llm_graph_result_i {
 public:
     virtual ~llm_graph_result() = default;
 
+    ggml_tensor * get_tokens() override { return t_tokens; }
     ggml_tensor *
get_logits() override { return t_logits; } ggml_tensor * get_embd() override { return t_embd; } ggml_tensor * get_embd_pooled() override { return t_embd_pooled; } @@ -338,6 +341,7 @@ public: } // important graph nodes + ggml_tensor * t_tokens = nullptr; ggml_tensor * t_logits = nullptr; ggml_tensor * t_embd = nullptr; ggml_tensor * t_embd_pooled = nullptr; @@ -361,8 +365,8 @@ struct llm_graph_params { const llama_cparams & cparams; const llama_ubatch & ubatch; - ggml_backend_sched * sched; - ggml_backend * backend_cpu; + ggml_backend_sched_t sched; + ggml_backend_t backend_cpu; const llama_adapter_cvec * cvec; const llama_adapter_loras * loras; @@ -413,9 +417,9 @@ struct llm_graph_context { ggml_context * ctx0 = nullptr; - ggml_backend_sched * sched; + ggml_backend_sched_t sched; - ggml_backend * backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove? + ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove? const llama_adapter_cvec * cvec; const llama_adapter_loras * loras; diff --git a/llama/llama.cpp/src/llama-kv-cache.cpp b/llama/llama.cpp/src/llama-kv-cache.cpp index 35a750d3..1a50c034 100644 --- a/llama/llama.cpp/src/llama-kv-cache.cpp +++ b/llama/llama.cpp/src/llama-kv-cache.cpp @@ -4,33 +4,41 @@ #include "llama-batch.h" #include "llama-cparams.h" #include "llama-model.h" +#include "llama-context.h" #include #include +#include #include #include #include -llama_kv_cache_unified::llama_kv_cache_unified(const llama_hparams & hparams, callbacks cbs) : hparams(hparams), cbs(std::move(cbs)) { +// +// llama_kv_cache_unified +// + +uint32_t llama_kv_cache_unified::get_padding(const llama_cparams & cparams) { + // the FA kernels require padding to avoid extra runtime boundary checks + return cparams.flash_attn ? 
256u : 32u; } -bool llama_kv_cache_unified::init( +llama_kv_cache_unified::llama_kv_cache_unified( const llama_model & model, - const llama_cparams & cparams, ggml_type type_k, ggml_type type_v, + bool v_trans, + bool offload, uint32_t kv_size, - bool offload) { + uint32_t padding) : model(model), hparams(model.hparams), v_trans(v_trans), padding(padding) { const int32_t n_layer = hparams.n_layer; has_shift = false; + can_shift = true; - recurrent = llama_model_is_recurrent(&model); - v_trans = !recurrent && !cparams.flash_attn; - can_shift = !recurrent; + LLAMA_LOG_INFO("%s: kv_size = %d, type_k = '%s', type_v = '%s', n_layer = %d, can_shift = %d, padding = %d\n", + __func__, kv_size, ggml_type_name(type_k), ggml_type_name(type_v), n_layer, can_shift, padding); - LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d, can_shift = %d\n", - __func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer, can_shift); + GGML_ASSERT(kv_size % padding == 0 && "kv_size must be a multiple of padding"); head = 0; size = kv_size; @@ -76,23 +84,20 @@ bool llama_kv_cache_unified::init( const char * dev_name = "CPU"; - ggml_backend_buffer_type_t buft; + ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type(); + if (offload) { auto * dev = model.dev_layer(i); buft = ggml_backend_dev_buffer_type(dev); dev_name = ggml_backend_dev_name(dev); - } else { - buft = ggml_backend_cpu_buffer_type(); } - LLAMA_LOG_DEBUG("%s: layer %3d: n_embd_k_gqa = %d, n_embd_v_gqa = %d, dev = %s\n", __func__, - i, n_embd_k_gqa, n_embd_v_gqa, dev_name); + LLAMA_LOG_DEBUG("%s: layer %3d: dev = %s\n", __func__, i, dev_name); ggml_context * ctx = ctx_for_buft(buft); if (!ctx) { - LLAMA_LOG_ERROR("%s: failed to create ggml context for kv cache\n", __func__); - return false; + throw std::runtime_error("failed to create ggml context for kv cache"); } ggml_tensor * k, *v; @@ -118,55 +123,28 @@ bool llama_kv_cache_unified::init( ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); if (!buf) { - LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__); - return false; + throw std::runtime_error("failed to allocate buffer for kv cache"); } ggml_backend_buffer_clear(buf, 0); LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0); bufs.emplace_back(buf); } - return true; -} + { + const size_t memory_size_k = size_k_bytes(); + const size_t memory_size_v = size_v_bytes(); -int32_t llama_kv_cache_unified::get_n_tokens() const { - int32_t result = 0; - - for (uint32_t i = 0; i < size; i++) { - result += cells[i].seq_id.size(); + LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, + (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), + ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f), + ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); } - - return result; -} - -int32_t llama_kv_cache_unified::get_used_cells() const { - return used; -} - -size_t llama_kv_cache_unified::total_size() const { - size_t size = 0; - for (const auto & buf : bufs) { - size += ggml_backend_buffer_get_size(buf.get()); - } - - return size; -} - -llama_pos llama_kv_cache_unified::pos_max() const { - llama_pos pos_max = -1; - for (const auto & cell : cells) { - pos_max = std::max(pos_max, cell.pos); - } - - return pos_max; } void llama_kv_cache_unified::clear() { for (int32_t i = 0; i < 
(int32_t) size; ++i) { cells[i].pos = -1; cells[i].seq_id.clear(); - cells[i].src = -1; - cells[i].tail = -1; } head = 0; used = 0; @@ -187,35 +165,6 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1 = std::numeric_limits::max(); } - // models like Mamba or RWKV can't have a state partially erased - if (recurrent) { - if (seq_id >= (int64_t) size) { - // could be fatal - return false; - } - if (0 <= seq_id) { - int32_t & tail_id = cells[seq_id].tail; - if (tail_id >= 0) { - const llama_kv_cell & cell = cells[tail_id]; - // partial intersection is invalid - if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) { - return false; - } - // invalidate tails which will be cleared - if (p0 <= cell.pos && cell.pos < p1) { - tail_id = -1; - } - } - } else { - // seq_id is negative, then the range should include everything or nothing - if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits::max())) { - return false; - } - } - - return true; - } - for (uint32_t i = 0; i < size; ++i) { if (cells[i].pos >= p0 && cells[i].pos < p1) { if (seq_id < 0) { @@ -232,7 +181,6 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos } cells[i].pos = -1; - cells[i].src = -1; if (new_head == size) { new_head = i; @@ -262,34 +210,6 @@ void llama_kv_cache_unified::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id p1 = std::numeric_limits::max(); } - if (recurrent) { - if ((uint32_t) seq_id_dst < size && (uint32_t) seq_id_src < size) { - llama_kv_cell & tail_src = cells[seq_id_src]; - llama_kv_cell & tail_dst = cells[seq_id_dst]; - if (tail_dst.tail >= 0) { - // clear destination seq_id if it wasn't empty - llama_kv_cell & cell_dst = cells[tail_dst.tail]; - - cell_dst.seq_id.erase(seq_id_dst); - tail_dst.tail = -1; - if (cell_dst.seq_id.empty()) { - cell_dst.pos = -1; - cell_dst.delta = -1; - cell_dst.src = -1; - used -= 1; - } - } - if (tail_src.tail >= 0) { - llama_kv_cell & cell_src = cells[tail_src.tail]; - - cell_src.seq_id.insert(seq_id_dst); - tail_dst.tail = tail_src.tail; - } - } - - return; - } - // otherwise, this is the KV of a Transformer-like model head = 0; @@ -304,17 +224,12 @@ void llama_kv_cache_unified::seq_keep(llama_seq_id seq_id) { uint32_t new_head = size; for (uint32_t i = 0; i < size; ++i) { - if (recurrent && (llama_seq_id) i != seq_id) { - cells[i].tail = -1; - } - if (!cells[i].has_seq_id(seq_id)) { if (cells[i].pos >= 0) { used--; } cells[i].pos = -1; - cells[i].src = -1; cells[i].seq_id.clear(); if (new_head == size){ @@ -352,20 +267,6 @@ void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_po return; } - if (recurrent) { - // for Mamba-like or RWKV models, only the pos needs to be shifted - if (0 <= seq_id && seq_id < (int64_t) size) { - const int32_t tail_id = cells[seq_id].tail; - if (tail_id >= 0) { - llama_kv_cell & cell = cells[tail_id]; - if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { - cell.pos += delta; - } - } - } - return; - } - for (uint32_t i = 0; i < size; ++i) { if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) { has_shift = true; @@ -408,21 +309,6 @@ void llama_kv_cache_unified::seq_div(llama_seq_id seq_id, llama_pos p0, llama_po return; } - if (recurrent) { - // for Mamba-like or RWKV models, only the pos needs to be changed - if (0 <= seq_id && seq_id < (int64_t) size) { - const int32_t tail_id = cells[seq_id].tail; - if (tail_id >= 0) { - llama_kv_cell & cell = cells[tail_id]; - if (cell.has_seq_id(seq_id) && p0 <= cell.pos && 
cell.pos < p1) { - cell.pos /= d; - } - } - } - - return; - } - for (uint32_t i = 0; i < size; ++i) { if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) { has_shift = true; @@ -448,23 +334,11 @@ llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const { return result; } -void llama_kv_cache_unified::defrag() { - if (!recurrent) { - do_defrag = true; - } -} - void llama_kv_cache_unified::restore() { if (pending.ranges.empty()) { return; } - // TODO: tmp - move to llama_kv_cache_recurrent - if (recurrent) { - seq_rm(-1, -1, -1); - return; - } - uint32_t new_head = size; for (auto & range : pending.ranges) { @@ -477,7 +351,6 @@ void llama_kv_cache_unified::restore() { } cells[i].pos = -1; - cells[i].src = -1; } new_head = std::min(new_head, range.c0); @@ -489,11 +362,6 @@ void llama_kv_cache_unified::restore() { } void llama_kv_cache_unified::commit() { - // TODO: tmp - move to llama_kv_cache_recurrent - if (recurrent) { - return; - } - if (pending.ranges.empty()) { LLAMA_LOG_WARN("%s: no pending KV cache updates to commit - might indicate a bug (ref: %s)\n", __func__, "https://github.com/ggml-org/llama.cpp/pull/12695"); @@ -503,8 +371,103 @@ void llama_kv_cache_unified::commit() { pending.ranges.clear(); } -bool llama_kv_cache_unified::get_can_shift() const { - return can_shift; +bool llama_kv_cache_unified::update(llama_context & lctx) { + auto * sched = lctx.get_sched(); + + if (has_shift) { + if (!get_can_shift()) { + GGML_ABORT("The current KV cache / model configuration does not support K-shift"); + } + + LLAMA_LOG_DEBUG("%s: applying K-shift\n", __func__); + + // apply K-shift if needed + if (hparams.rope_type != LLAMA_ROPE_TYPE_NONE) { + ggml_backend_sched_reset(sched); + + auto * gf = lctx.graph_init(); + + auto res = build_graph_shift(lctx.get_cparams(), lctx.get_ctx_compute(), gf); + + ggml_backend_sched_alloc_graph(sched, gf); + + res->set_inputs(nullptr); + + lctx.graph_compute(gf, false); + } + + { + has_shift = false; + + for (uint32_t i = 0; i < size; ++i) { + cells[i].delta = 0; + } + } + } + + if (do_defrag) { + LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__); + const uint32_t n_max_nodes = lctx.graph_max_nodes(); + const uint32_t max_moves = (n_max_nodes - 2*model.hparams.n_layer)/(6*model.hparams.n_layer); + if (!defrag_prepare(n_max_nodes)) { + LLAMA_LOG_ERROR("%s: failed to prepare defragmentation\n", __func__); + return false; + } + + for (std::size_t i = 0; i < defrag_info.moves.size(); i += max_moves) { + std::vector chunk; + auto end = std::min(i + max_moves, defrag_info.moves.size()); + chunk.assign(defrag_info.moves.begin() + i, defrag_info.moves.begin() + end); + + ggml_backend_sched_reset(sched); + + auto * gf = lctx.graph_init(); + + auto res = build_graph_defrag(lctx.get_cparams(), lctx.get_ctx_compute(), gf, chunk); + + ggml_backend_sched_alloc_graph(sched, gf); + + res->set_inputs(nullptr); + + lctx.graph_compute(gf, false); + } + + do_defrag = false; + } + + // we never need to reserve a worst case graph + return false; +} + +void llama_kv_cache_unified::defrag_sched(float thold) { + // - do not defrag small contexts (i.e. < 2048 tokens) + // - count the padding towards the number of used tokens + const float fragmentation = n >= 2048 ? 
std::max(0.0f, 1.0f - (float(used + padding)/n)) : 0.0f; + + // queue defragmentation for next llama_kv_cache_update + if (fragmentation > thold) { + LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation); + + do_defrag = true; + } +} + +void llama_kv_cache_unified::set_full() { + n = size; +} + +llama_sbatch llama_kv_cache_unified::sbatch_init( + const llama_batch & batch, + bool logits_all) { + return llama_sbatch(batch, batch.n_embd, true, logits_all); +} + +llama_ubatch llama_kv_cache_unified::ubatch_next( + llama_sbatch & sbatch, + uint32_t n_ubatch, + bool embd_pooled) const { + GGML_UNUSED(embd_pooled); + return sbatch.split_simple(n_ubatch); } bool llama_kv_cache_unified::find_slot( @@ -519,169 +482,6 @@ bool llama_kv_cache_unified::find_slot( head = 0; } - if (recurrent) { - // For recurrent state architectures (like Mamba or RWKV), - // each cache cell can store the state for a whole sequence. - // A slot should be always be contiguous. - - // can only process batches with an equal number of new tokens in each sequence - GGML_ASSERT(ubatch.equal_seqs); - - int32_t min = size - 1; - int32_t max = 0; - - // everything should fit if all seq_ids are smaller than the max - for (uint32_t s = 0; s < n_seqs; ++s) { - const uint32_t n_seq_id = ubatch.n_seq_id[s]; - for (uint32_t j = 0; j < n_seq_id; ++j) { - const llama_seq_id seq_id = ubatch.seq_id[s][j]; - - if (seq_id < 0 || (uint32_t) seq_id >= size) { - // too big seq_id - // TODO: would it be possible to resize the cache instead? - LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, size); - return false; - } - if (j > 0) { - llama_kv_cell & seq = cells[seq_id]; - if (seq.tail >= 0) { - llama_kv_cell & cell = cells[seq.tail]; - // clear cells from seq_ids that become shared - // (should not normally happen, but let's handle it anyway) - cell.seq_id.erase(seq_id); - seq.tail = -1; - if (cell.seq_id.empty()) { - cell.pos = -1; - cell.src = -1; - used -= 1; - } - } - } - } - } - -#ifndef NDEBUG - { - std::vector tails_verif; - tails_verif.assign(size, -1); - for (uint32_t i = 0; i < size; ++i) { - llama_kv_cell & cell = cells[i]; - for (llama_seq_id seq_id : cell.seq_id) { - if (tails_verif[seq_id] != -1) { - LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tails_verif[seq_id]); - } - tails_verif[seq_id] = i; - } - } - for (uint32_t i = 0; i < size; ++i) { - if (tails_verif[i] != cells[i].tail) { - LLAMA_LOG_ERROR("%s: wrong tail for seq_id %d, (%d instead of %d)\n", __func__, i, cells[i].tail, tails_verif[i]); - } - } - } -#endif - - // find next empty cell - uint32_t next_empty_cell = head; - - for (uint32_t i = 0; i < size; ++i) { - if (next_empty_cell >= size) { next_empty_cell -= size; } - llama_kv_cell & cell = cells[next_empty_cell]; - if (cell.is_empty()) { break; } - next_empty_cell += 1; - } - - // find usable cell range - for (uint32_t s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - llama_kv_cell & seq_meta = cells[seq_id]; - bool has_cell = false; - if (seq_meta.tail >= 0) { - llama_kv_cell & cell = cells[seq_meta.tail]; - GGML_ASSERT(cell.has_seq_id(seq_id)); - // does this seq_id "own" the cell? 
- if (cell.seq_id.size() == 1) { has_cell = true; } - } - if (!has_cell) { - llama_kv_cell & empty_cell = cells[next_empty_cell]; - GGML_ASSERT(empty_cell.is_empty()); - // copy old tail into the empty cell - if (seq_meta.tail >= 0) { - llama_kv_cell & orig_cell = cells[seq_meta.tail]; - empty_cell.pos = orig_cell.pos; - empty_cell.src = orig_cell.src; - orig_cell.seq_id.erase(seq_id); - empty_cell.seq_id.insert(seq_id); // will be overwritten - } - seq_meta.tail = next_empty_cell; - // find next empty cell - if (s + 1 < n_seqs) { - next_empty_cell += 1; - for (uint32_t i = 0; i < size; ++i) { - if (next_empty_cell >= size) { next_empty_cell -= size; } - llama_kv_cell & cell = cells[next_empty_cell]; - if (cell.is_empty()) { break; } - next_empty_cell += 1; - } - } - } - if (min > seq_meta.tail) { min = seq_meta.tail; } - if (max < seq_meta.tail) { max = seq_meta.tail; } - } - - // gather and re-order - for (uint32_t s = 0; s < n_seqs; ++s) { - int32_t dst_id = s + min; - int32_t src_id = cells[ubatch.seq_id[s][0]].tail; - if (dst_id != src_id) { - llama_kv_cell & dst_cell = cells[dst_id]; - llama_kv_cell & src_cell = cells[src_id]; - - std::swap(dst_cell.pos, src_cell.pos); - std::swap(dst_cell.src, src_cell.src); - std::swap(dst_cell.seq_id, src_cell.seq_id); - - // swap tails (assuming they NEVER overlap) - for (const llama_seq_id seq_id : src_cell.seq_id) { - cells[seq_id].tail = src_id; - } - for (const llama_seq_id seq_id : dst_cell.seq_id) { - cells[seq_id].tail = dst_id; - } - } - } - - // update the pos of the used seqs - for (uint32_t s = 0; s < n_seqs; ++s) { - const llama_pos last_pos = ubatch.pos[n_seq_tokens * s + n_seq_tokens - 1]; - int32_t cell_id = s + min; - llama_kv_cell & cell = cells[cell_id]; - - if (cell.pos >= 0 && last_pos != cell.pos + (llama_pos) n_seq_tokens) { - // What should happen when the pos backtracks or skips a value? - // Clearing the state mid-batch would require special-casing which isn't done. - LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d with %u new tokens\n", - __func__, last_pos, cell.pos, ubatch.seq_id[s][0], n_seq_tokens); - } - cell.pos = last_pos; - cell.seq_id.clear(); - for (int32_t j = 0; j < ubatch.n_seq_id[s]; ++j) { - const llama_seq_id seq_id = ubatch.seq_id[s][j]; - cell.seq_id.insert(seq_id); - cells[seq_id].tail = cell_id; - } - } - - // allow getting the range of used cells, from head to head + n - head = min; - n = max - min + 1; - used = std::count_if(cells.begin(), cells.end(), - [](const llama_kv_cell& cell){ return !cell.is_empty(); }); - - // sanity check - return n >= n_seqs; - } - // otherwise, one cell per token. if (n_tokens > size) { @@ -733,24 +533,50 @@ bool llama_kv_cache_unified::find_slot( pending.ranges.push_back({head, head + n_tokens}); + // a heuristic, to avoid attending the full cache if it is not yet utilized + // after enough generations, the benefit from this heuristic disappears + // if we start defragmenting the cache, the benefit from this will be more important + n = std::min(size, std::max(padding, GGML_PAD(cell_max(), padding))); + + //printf("n = %5d, used = %5d, head = %5d\n", n, used, head); + return true; } -uint32_t llama_kv_cache_unified::get_padding(const llama_cparams & cparams) const { - // the FA kernels require padding to avoid extra runtime boundary checks - return cparams.flash_attn ? 
256u : 32u; -} +int32_t llama_kv_cache_unified::get_n_tokens() const { + int32_t result = 0; -uint32_t llama_kv_cache_unified::cell_max() const { - for (uint32_t i = size; i > 0; --i) { - const llama_kv_cell & cell = cells[i - 1]; - - if (cell.pos >= 0 && !cell.is_empty()) { - return i; - } + for (uint32_t i = 0; i < size; i++) { + result += cells[i].seq_id.size(); } - return 0; + return result; +} + +int32_t llama_kv_cache_unified::get_used_cells() const { + return used; +} + +bool llama_kv_cache_unified::get_can_shift() const { + return can_shift; +} + +llama_pos llama_kv_cache_unified::get_pos_max() const { + llama_pos pos_max = -1; + for (const auto & cell : cells) { + pos_max = std::max(pos_max, cell.pos); + } + + return pos_max; +} + +size_t llama_kv_cache_unified::total_size() const { + size_t size = 0; + for (const auto & buf : bufs) { + size += ggml_backend_buffer_get_size(buf.get()); + } + + return size; } size_t llama_kv_cache_unified::size_k_bytes() const { @@ -773,6 +599,254 @@ size_t llama_kv_cache_unified::size_v_bytes() const { return size_v_bytes; } +ggml_tensor * llama_kv_cache_unified::build_rope_shift( + const llama_cparams & cparams, + ggml_context * ctx, + ggml_tensor * cur, + ggml_tensor * shift, + ggml_tensor * factors, + float freq_base, + float freq_scale) const { + const auto & n_ctx_orig = cparams.n_ctx_orig_yarn; + + const auto & yarn_ext_factor = cparams.yarn_ext_factor; + const auto & yarn_beta_fast = cparams.yarn_beta_fast; + const auto & yarn_beta_slow = cparams.yarn_beta_slow; + + const auto & n_rot = hparams.n_rot; + const auto & rope_type = hparams.rope_type; + + // See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly. + // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation. + const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2 ? 
1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)) : cparams.yarn_attn_factor; + + ggml_tensor * tmp; + + if (ggml_is_quantized(cur->type)) { + // dequantize to f32 -> RoPE -> quantize back + tmp = ggml_cast(ctx, cur, GGML_TYPE_F32); + + tmp = ggml_rope_ext(ctx, tmp, + shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); + + tmp = ggml_cpy(ctx, tmp, cur); + } else { + // we rotate only the first n_rot dimensions + tmp = ggml_rope_ext_inplace(ctx, cur, + shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); + } + + return tmp; +} + +class llm_graph_input_k_shift : public llm_graph_input_i { +public: + llm_graph_input_k_shift(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {} + virtual ~llm_graph_input_k_shift() = default; + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * k_shift; // I32 [kv_size] + + const llama_kv_cache_unified * kv_self; +}; + +void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) { + GGML_UNUSED(ubatch); + + if (k_shift) { + assert(ggml_backend_buffer_is_host(k_shift->buffer)); + + int32_t * data = (int32_t *) k_shift->data; + + for (uint32_t i = 0; i < kv_self->size; ++i) { + data[i] = kv_self->cells[i].delta; + } + } +} + +llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift( + const llama_cparams & cparams, + ggml_context * ctx, + ggml_cgraph * gf) const { + auto res = std::make_unique(); + + const auto & n_layer = hparams.n_layer; + + const auto & n_embd_head_k = hparams.n_embd_head_k; + //const auto & n_embd_head_v = hparams.n_embd_head_v; + + const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max; + + //GGML_ASSERT(kv_self->size == n_ctx); + + auto inp = std::make_unique(this); + + inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, cparams.n_ctx); + ggml_set_input(inp->k_shift); + + for (uint32_t il = 0; il < n_layer; ++il) { + const int64_t n_head_kv = hparams.n_head_kv(il); + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + + const bool is_swa = hparams.is_swa(il); + + // note: the swa rope params could become part of the cparams in the future + // if we decide to make them configurable, like the non-sliding ones + const float freq_base_l = is_swa ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base; + const float freq_scale_l = is_swa ? 
hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale; + + ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il); + + ggml_tensor * k = + ggml_view_3d(ctx, k_l[il], + n_embd_head_k, n_head_kv, size, + ggml_row_size(k_l[il]->type, n_embd_head_k), + ggml_row_size(k_l[il]->type, n_embd_k_gqa), + 0); + + ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l); + + ggml_build_forward_expand(gf, cur); + } + + res->add_input(std::move(inp)); + + return res; +} + +llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag( + const llama_cparams & cparams, + ggml_context * ctx, + ggml_cgraph * gf, + const std::vector & moves) const { + auto res = std::make_unique(); + +#if 0 + // CPU defrag + // + // TODO: optimizations are possible: + // - multiple threads + // - avoid copying to the host memory when already there + // + // likely not worth the effort, as we have ggml_graph based defrag + // + + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + + const uint32_t kv_size = size; + + std::vector buf_k; + std::vector buf_v; + + for (uint32_t il = 0; il < n_layer; ++il) { + const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa); + const size_t k_size = ggml_row_size(k_l[il]->type, n_embd_k_gqa*kv_size); + + const size_t v_size_el = ggml_type_size(v_l[il]->type); + const size_t v_size = ggml_row_size (v_l[il]->type, n_embd_v_gqa*kv_size); + + buf_k.resize(k_size); + buf_v.resize(v_size); + + ggml_backend_tensor_get(k_l[il], buf_k.data(), 0, buf_k.size()); + ggml_backend_tensor_get(v_l[il], buf_v.data(), 0, buf_v.size()); + + // batch move [i, i+nm) to [id, id+nm) + // note: cells can move only to a lower index + for (uint32_t i = 0; i < n_kv; ++i) { + const uint32_t id = ids[i]; + + if (i == id || id == n_kv) { + continue; + } + + uint32_t nm = 1; + + while (i + nm < n_kv && ids[i + nm] == id + nm) { + nm++; + } + + // move keys + { + const int64_t os = i*k_size_row; + const int64_t od = id*k_size_row; + + memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row); + } + + // move values (note: they are transposed) + { + const int64_t os = i; + const int64_t od = id; + + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el); + } + } + + i += nm - 1; + } + + ggml_backend_tensor_set(k_l[il], buf_k.data(), 0, buf_k.size()); + ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size()); + } +#else + for (const auto & move : moves) { + for (uint32_t il = 0; il < hparams.n_layer; ++il) { // NOLINT + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + + ggml_tensor * view_k_src = ggml_view_2d(ctx, k_l[il], + n_embd_k_gqa, move.len, + ggml_row_size(k_l[il]->type, n_embd_k_gqa), + ggml_row_size(k_l[il]->type, n_embd_k_gqa*move.src)); + + ggml_tensor * view_k_dst = ggml_view_2d(ctx, k_l[il], + n_embd_k_gqa, move.len, + ggml_row_size(k_l[il]->type, n_embd_k_gqa), + ggml_row_size(k_l[il]->type, n_embd_k_gqa*move.dst)); + + ggml_tensor * view_v_src; + ggml_tensor * view_v_dst; + + if (cparams.flash_attn) { + // NOTE: the V cache is not transposed when using flash attention + view_v_src = ggml_view_2d(ctx, v_l[il], + n_embd_v_gqa, move.len, + ggml_row_size(v_l[il]->type, n_embd_v_gqa), + ggml_row_size(v_l[il]->type, n_embd_v_gqa*move.dst)); + + view_v_dst = ggml_view_2d(ctx, v_l[il], + move.len, 
n_embd_v_gqa, + ggml_row_size(v_l[il]->type, n_embd_v_gqa), + ggml_row_size(v_l[il]->type, move.src)); + } else { + view_v_src = ggml_view_2d(ctx, v_l[il], + move.len, n_embd_v_gqa, + ggml_row_size(v_l[il]->type, size), + ggml_row_size(v_l[il]->type, move.src)); + + view_v_dst = ggml_view_2d(ctx, v_l[il], + move.len, n_embd_v_gqa, + ggml_row_size(v_l[il]->type, size), + ggml_row_size(v_l[il]->type, move.dst)); + } + + ggml_build_forward_expand(gf, ggml_cpy(ctx, view_k_src, view_k_dst)); + ggml_build_forward_expand(gf, ggml_cpy(ctx, view_v_src, view_v_dst)); + } + } + + //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes); +#endif + + return res; +} + bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { const uint32_t n_layer = hparams.n_layer; @@ -854,7 +928,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { cells[i0 + nf] = cell1; // clear the old cell and move the head there - cell1 = llama_kv_cell(); + cell1 = kv_cell(); head = n_used; if (!cont) { @@ -885,6 +959,18 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { return true; } +uint32_t llama_kv_cache_unified::cell_max() const { + for (uint32_t i = size; i > 0; --i) { + const kv_cell & cell = cells[i - 1]; + + if (cell.pos >= 0 && !cell.is_empty()) { + return i; + } + } + + return 0; +} + void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id) const { std::vector> cell_ranges; // ranges, from inclusive, to exclusive uint32_t cell_count = 0; @@ -1093,7 +1179,7 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell clear(); for (uint32_t i = 0; i < cell_count; ++i) { - llama_kv_cell & cell = cells[i]; + kv_cell & cell = cells[i]; llama_pos pos; uint32_t n_seq_id; @@ -1116,15 +1202,6 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell } cell.seq_id.insert(seq_id); - - if (recurrent) { - int32_t & tail = cells[seq_id].tail; - if (tail != -1) { - LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tail); - return false; - } - tail = i; - } } } @@ -1132,14 +1209,6 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell used = cell_count; } - if (recurrent) { - for (uint32_t i = 0; i < cell_count; ++i) { - uint32_t cell_id = head + i; - // make sure the recurrent states will keep their restored state - cells[cell_id].src = cell_id; - } - } - return true; } @@ -1157,7 +1226,1034 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, size); return false; } - if (v_trans != (bool) v_trans) { + if (this->v_trans != (bool) v_trans) { + LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__); + return false; + } + + // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); + + // Read type of key + int32_t k_type_i_ref; + io.read_to(&k_type_i_ref, sizeof(k_type_i_ref)); + const int32_t k_type_i = (int32_t) k_l[il]->type; + if (k_type_i != k_type_i_ref) { + LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il); + return false; + } + + // Read row size of key + uint64_t k_size_row_ref; + io.read_to(&k_size_row_ref, sizeof(k_size_row_ref)); + const size_t k_size_row = 
ggml_row_size(k_l[il]->type, n_embd_k_gqa); + if (k_size_row != k_size_row_ref) { + LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il); + return false; + } + + if (cell_count) { + // Read and set the keys for the whole cell range + ggml_backend_tensor_set(k_l[il], io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row); + } + } + + if (!this->v_trans) { + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Read type of value + int32_t v_type_i_ref; + io.read_to(&v_type_i_ref, sizeof(v_type_i_ref)); + const int32_t v_type_i = (int32_t)v_l[il]->type; + if (v_type_i != v_type_i_ref) { + LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); + return false; + } + + // Read row size of value + uint64_t v_size_row_ref; + io.read_to(&v_size_row_ref, sizeof(v_size_row_ref)); + const size_t v_size_row = ggml_row_size(v_l[il]->type, n_embd_v_gqa); + if (v_size_row != v_size_row_ref) { + LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il); + return false; + } + + if (cell_count) { + // Read and set the values for the whole cell range + ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row); + } + } + } else { + // For each layer, read the values for each cell (transposed) + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Read type of value + int32_t v_type_i_ref; + io.read_to(&v_type_i_ref, sizeof(v_type_i_ref)); + const int32_t v_type_i = (int32_t)v_l[il]->type; + if (v_type_i != v_type_i_ref) { + LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); + return false; + } + + // Read element size of value + uint32_t v_size_el_ref; + io.read_to(&v_size_el_ref, sizeof(v_size_el_ref)); + const size_t v_size_el = ggml_type_size(v_l[il]->type); + if (v_size_el != v_size_el_ref) { + LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il); + return false; + } + + // Read GQA embedding size + uint32_t n_embd_v_gqa_ref; + io.read_to(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref)); + if (n_embd_v_gqa != n_embd_v_gqa_ref) { + LLAMA_LOG_ERROR("%s: mismatched GQA embedding size (%u != %u, layer %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref, il); + return false; + } + + if (cell_count) { + // For each row in the transposed matrix, read the values for the whole cell range + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + const size_t dst_offset = (head + j * size) * v_size_el; + ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el); + } + } + } + } + + return true; +} + +// +// llama_kv_cache_recurrent +// + +llama_kv_cache_recurrent::llama_kv_cache_recurrent( + const llama_model & model, + ggml_type type_k, + ggml_type type_v, + bool offload, + uint32_t kv_size) : hparams(model.hparams) { + const int32_t n_layer = hparams.n_layer; + + LLAMA_LOG_INFO("%s: kv_size = %d, type_k = '%s', type_v = '%s', n_layer = %d\n", + __func__, kv_size, ggml_type_name(type_k), ggml_type_name(type_v), n_layer); + + head = 0; + size = kv_size; + used = 0; + + this->type_k = type_k; + this->type_v = type_v; + + cells.clear(); + 
cells.resize(kv_size); + + // create a context for each buffer type + std::map ctx_map; + auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * { + auto it = ctx_map.find(buft); + if (it == ctx_map.end()) { + ggml_init_params params = { + /*.mem_size =*/ size_t(2u*n_layer*ggml_tensor_overhead()), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + + ggml_context * ctx = ggml_init(params); + if (!ctx) { + return nullptr; + } + + ctx_map[buft] = ctx; + ctxs.emplace_back(ctx); + + return ctx; + } + + return it->second; + }; + + k_l.reserve(n_layer); + v_l.reserve(n_layer); + + for (int i = 0; i < n_layer; i++) { + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s(); + + const char * dev_name = "CPU"; + + ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type(); + + if (offload) { + auto * dev = model.dev_layer(i); + buft = ggml_backend_dev_buffer_type(dev); + + dev_name = ggml_backend_dev_name(dev); + } + + LLAMA_LOG_DEBUG("%s, layer %3d: dev = %s\n", __func__, i, dev_name); + + ggml_context * ctx = ctx_for_buft(buft); + if (!ctx) { + throw std::runtime_error("failed to create ggml context for kv cache"); + } + + ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size); + ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size); + ggml_format_name(k, "cache_k_l%d", i); + ggml_format_name(v, "cache_v_l%d", i); + k_l.push_back(k); + v_l.push_back(v); + } + + // allocate tensors and initialize the buffers to avoid NaNs in the padding + for (auto it : ctx_map) { + auto * buft = it.first; + auto * ctx = it.second; + + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + if (!buf) { + throw std::runtime_error("failed to allocate buffer for kv cache"); + } + ggml_backend_buffer_clear(buf, 0); + LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0); + bufs.emplace_back(buf); + } + + { + const size_t memory_size_k = size_k_bytes(); + const size_t memory_size_v = size_v_bytes(); + + LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, + (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), + ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f), + ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); + } +} + +void llama_kv_cache_recurrent::clear() { + for (int32_t i = 0; i < (int32_t) size; ++i) { + cells[i].pos = -1; + cells[i].seq_id.clear(); + cells[i].src = -1; + cells[i].tail = -1; + } + head = 0; + used = 0; + + for (auto & buf : bufs) { + ggml_backend_buffer_clear(buf.get(), 0); + } +} + +bool llama_kv_cache_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { + uint32_t new_head = size; + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits::max(); + } + + // models like Mamba or RWKV can't have a state partially erased + if (seq_id >= (int64_t) size) { + // could be fatal + return false; + } + if (0 <= seq_id) { + int32_t & tail_id = cells[seq_id].tail; + if (tail_id >= 0) { + const kv_cell & cell = cells[tail_id]; + // partial intersection is invalid + if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) { + return false; + } + // invalidate tails which will be cleared + if (p0 <= cell.pos && cell.pos < p1) { + tail_id = -1; + } + } + } else { + // seq_id is negative, then the 
range should include everything or nothing + if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits::max())) { + return false; + } + } + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].pos >= p0 && cells[i].pos < p1) { + if (seq_id < 0) { + cells[i].seq_id.clear(); + } else if (cells[i].has_seq_id(seq_id)) { + cells[i].seq_id.erase(seq_id); + } else { + continue; + } + if (cells[i].is_empty()) { + // keep count of the number of used cells + if (cells[i].pos >= 0) { + used--; + } + cells[i].pos = -1; + cells[i].src = -1; + if (new_head == size) { + new_head = i; + } + } + } + } + + // If we freed up a slot, set head to it so searching can start there. + if (new_head != size && new_head < head) { + head = new_head; + } + + return true; +} + +void llama_kv_cache_recurrent::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { + if (seq_id_src == seq_id_dst) { + return; + } + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits::max(); + } + + if ((uint32_t) seq_id_dst < size && (uint32_t) seq_id_src < size) { + kv_cell & tail_src = cells[seq_id_src]; + kv_cell & tail_dst = cells[seq_id_dst]; + if (tail_dst.tail >= 0) { + // clear destination seq_id if it wasn't empty + kv_cell & cell_dst = cells[tail_dst.tail]; + + cell_dst.seq_id.erase(seq_id_dst); + tail_dst.tail = -1; + if (cell_dst.seq_id.empty()) { + cell_dst.pos = -1; + cell_dst.src = -1; + used -= 1; + } + } + if (tail_src.tail >= 0) { + kv_cell & cell_src = cells[tail_src.tail]; + + cell_src.seq_id.insert(seq_id_dst); + tail_dst.tail = tail_src.tail; + } + } +} + +void llama_kv_cache_recurrent::seq_keep(llama_seq_id seq_id) { + uint32_t new_head = size; + + for (uint32_t i = 0; i < size; ++i) { + if ((llama_seq_id) i != seq_id) { + cells[i].tail = -1; + } + + if (!cells[i].has_seq_id(seq_id)) { + if (cells[i].pos >= 0) { + used--; + } + + cells[i].pos = -1; + cells[i].src = -1; + cells[i].seq_id.clear(); + + if (new_head == size){ + new_head = i; + } + } else { + cells[i].seq_id.clear(); + cells[i].seq_id.insert(seq_id); + } + } + + // If we freed up a slot, set head to it so searching can start there. + if (new_head != size && new_head < head) { + head = new_head; + } +} + +void llama_kv_cache_recurrent::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { + if (delta == 0) { + return; + } + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits::max(); + } + + // If there is no range then return early to avoid looping over the + if (p0 == p1) { + return; + } + + // for Mamba-like or RWKV models, only the pos needs to be shifted + if (0 <= seq_id && seq_id < (int64_t) size) { + const int32_t tail_id = cells[seq_id].tail; + if (tail_id >= 0) { + kv_cell & cell = cells[tail_id]; + if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { + cell.pos += delta; + } + } + } +} + +void llama_kv_cache_recurrent::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { + if (d == 1) { + return; + } + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits::max(); + } + + // If there is no range then return early to avoid looping over the cache. 
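// A minimal illustrative sketch (not part of this patch) of the "tail cell" model that
// seq_add() and seq_div() rely on: each sequence's whole recurrent state lives in a
// single cell, and cells[seq_id].tail records which cell that is, so shifting or
// dividing positions is a metadata-only edit of that one cell. The names below
// (sketch_cell, sketch_seq_div) are hypothetical and heavily simplified: one struct of
// plain integers stands in for the real bookkeeping and there is no ggml involvement.
#include <cstdint>
#include <set>
#include <vector>

struct sketch_cell {
    int32_t pos  = -1;  // position of the last token folded into this state
    int32_t tail = -1;  // when indexed by seq_id: the cell holding that sequence's state
    std::set<int32_t> seq_id;
};

// mirrors seq_div(): divide the stored position of one sequence's tail cell
static void sketch_seq_div(std::vector<sketch_cell> & cells, int32_t seq_id,
                           int32_t p0, int32_t p1, int d) {
    if (d == 1 || seq_id < 0 || seq_id >= (int32_t) cells.size()) {
        return;
    }
    const int32_t tail_id = cells[seq_id].tail;
    if (tail_id >= 0) {
        sketch_cell & cell = cells[tail_id];
        if (cell.seq_id.count(seq_id) && p0 <= cell.pos && cell.pos < p1) {
            cell.pos /= d;  // only metadata changes; the state data itself is untouched
        }
    }
}
// The real seq_div() below additionally returns early when p0 == p1, since an empty
// range can never contain the tail cell's position.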
+ if (p0 == p1) { + return; + } + + // for Mamba-like or RWKV models, only the pos needs to be changed + if (0 <= seq_id && seq_id < (int64_t) size) { + const int32_t tail_id = cells[seq_id].tail; + if (tail_id >= 0) { + kv_cell & cell = cells[tail_id]; + if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { + cell.pos /= d; + } + } + } +} + +llama_pos llama_kv_cache_recurrent::seq_pos_max(llama_seq_id seq_id) const { + llama_pos result = 0; + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].has_seq_id(seq_id)) { + result = std::max(result, cells[i].pos); + } + } + + return result; +} + +void llama_kv_cache_recurrent::restore() { + if (pending.ranges.empty()) { + return; + } + + seq_rm(-1, -1, -1); +} + +void llama_kv_cache_recurrent::commit() { + pending.ranges.clear(); +} + +bool llama_kv_cache_recurrent::update(llama_context & lctx) { + GGML_UNUSED(lctx); + return false; +} + +void llama_kv_cache_recurrent::defrag_sched(float thold) { + GGML_UNUSED(thold); + // noop +} + +void llama_kv_cache_recurrent::set_full() { + n = size; +} + +llama_sbatch llama_kv_cache_recurrent::sbatch_init( + const llama_batch & batch, + bool logits_all) { + return llama_sbatch(batch, hparams.n_embd, false, logits_all); +} + +llama_ubatch llama_kv_cache_recurrent::ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const { + if (embd_pooled) { + // Pooled embeddings cannot be split across ubatches (yet) + return sbatch.split_seq(n_ubatch); + } + + return sbatch.split_equal(n_ubatch); +} + +bool llama_kv_cache_recurrent::find_slot( + const llama_ubatch & ubatch) { + const uint32_t n_tokens = ubatch.n_tokens; + const uint32_t n_seqs = ubatch.n_seqs; + + const uint32_t n_seq_tokens = ubatch.n_seq_tokens; + + // if we have enough unused cells before the current head -> + // better to start searching from the beginning of the cache, hoping to fill it + if (head > used + 2*n_tokens) { + head = 0; + } + + // For recurrent state architectures (like Mamba or RWKV), + // each cache cell can store the state for a whole sequence. + // A slot should be always be contiguous. + + // can only process batches with an equal number of new tokens in each sequence + GGML_ASSERT(ubatch.equal_seqs); + + int32_t min = size - 1; + int32_t max = 0; + + // everything should fit if all seq_ids are smaller than the max + for (uint32_t s = 0; s < n_seqs; ++s) { + const uint32_t n_seq_id = ubatch.n_seq_id[s]; + for (uint32_t j = 0; j < n_seq_id; ++j) { + const llama_seq_id seq_id = ubatch.seq_id[s][j]; + + if (seq_id < 0 || (uint32_t) seq_id >= size) { + // too big seq_id + // TODO: would it be possible to resize the cache instead? 
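// A short illustrative sketch (not part of this patch): in the recurrent cache a cell
// holds the state of an entire sequence, so kv_size acts as the maximum number of
// parallel sequences rather than a token budget. A seq_id >= size can therefore never
// be placed, which is why the check below reports an error instead of searching for a
// free slot. The helper name is hypothetical.
#include <cstdint>

static bool sketch_seq_id_fits(uint32_t kv_size, int32_t seq_id) {
    // every sequence needs its own dedicated state cell
    return seq_id >= 0 && (uint32_t) seq_id < kv_size;
}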
+ LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, size); + return false; + } + if (j > 0) { + kv_cell & seq = cells[seq_id]; + if (seq.tail >= 0) { + kv_cell & cell = cells[seq.tail]; + // clear cells from seq_ids that become shared + // (should not normally happen, but let's handle it anyway) + cell.seq_id.erase(seq_id); + seq.tail = -1; + if (cell.seq_id.empty()) { + cell.pos = -1; + cell.src = -1; + used -= 1; + } + } + } + } + } + +#ifndef NDEBUG + { + std::vector tails_verif; + tails_verif.assign(size, -1); + for (uint32_t i = 0; i < size; ++i) { + kv_cell & cell = cells[i]; + for (llama_seq_id seq_id : cell.seq_id) { + if (tails_verif[seq_id] != -1) { + LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tails_verif[seq_id]); + } + tails_verif[seq_id] = i; + } + } + for (uint32_t i = 0; i < size; ++i) { + if (tails_verif[i] != cells[i].tail) { + LLAMA_LOG_ERROR("%s: wrong tail for seq_id %d, (%d instead of %d)\n", __func__, i, cells[i].tail, tails_verif[i]); + } + } + } +#endif + + // find next empty cell + uint32_t next_empty_cell = head; + + for (uint32_t i = 0; i < size; ++i) { + if (next_empty_cell >= size) { next_empty_cell -= size; } + kv_cell & cell = cells[next_empty_cell]; + if (cell.is_empty()) { break; } + next_empty_cell += 1; + } + + // find usable cell range + for (uint32_t s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + kv_cell & seq_meta = cells[seq_id]; + bool has_cell = false; + if (seq_meta.tail >= 0) { + kv_cell & cell = cells[seq_meta.tail]; + GGML_ASSERT(cell.has_seq_id(seq_id)); + // does this seq_id "own" the cell? + if (cell.seq_id.size() == 1) { has_cell = true; } + } + if (!has_cell) { + kv_cell & empty_cell = cells[next_empty_cell]; + GGML_ASSERT(empty_cell.is_empty()); + // copy old tail into the empty cell + if (seq_meta.tail >= 0) { + kv_cell & orig_cell = cells[seq_meta.tail]; + empty_cell.pos = orig_cell.pos; + empty_cell.src = orig_cell.src; + orig_cell.seq_id.erase(seq_id); + empty_cell.seq_id.insert(seq_id); // will be overwritten + } + seq_meta.tail = next_empty_cell; + // find next empty cell + if (s + 1 < n_seqs) { + next_empty_cell += 1; + for (uint32_t i = 0; i < size; ++i) { + if (next_empty_cell >= size) { next_empty_cell -= size; } + kv_cell & cell = cells[next_empty_cell]; + if (cell.is_empty()) { break; } + next_empty_cell += 1; + } + } + } + if (min > seq_meta.tail) { min = seq_meta.tail; } + if (max < seq_meta.tail) { max = seq_meta.tail; } + } + + // gather and re-order + for (uint32_t s = 0; s < n_seqs; ++s) { + int32_t dst_id = s + min; + int32_t src_id = cells[ubatch.seq_id[s][0]].tail; + if (dst_id != src_id) { + kv_cell & dst_cell = cells[dst_id]; + kv_cell & src_cell = cells[src_id]; + + std::swap(dst_cell.pos, src_cell.pos); + std::swap(dst_cell.src, src_cell.src); + std::swap(dst_cell.seq_id, src_cell.seq_id); + + // swap tails (assuming they NEVER overlap) + for (const llama_seq_id seq_id : src_cell.seq_id) { + cells[seq_id].tail = src_id; + } + for (const llama_seq_id seq_id : dst_cell.seq_id) { + cells[seq_id].tail = dst_id; + } + } + } + + // update the pos of the used seqs + for (uint32_t s = 0; s < n_seqs; ++s) { + const llama_pos last_pos = ubatch.pos[n_seq_tokens * s + n_seq_tokens - 1]; + int32_t cell_id = s + min; + kv_cell & cell = cells[cell_id]; + + if (cell.pos >= 0 && last_pos != cell.pos + (llama_pos) n_seq_tokens) { + // What should happen when the pos backtracks or skips a value? 
+ // Clearing the state mid-batch would require special-casing which isn't done. + LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d with %u new tokens\n", + __func__, last_pos, cell.pos, ubatch.seq_id[s][0], n_seq_tokens); + } + cell.pos = last_pos; + cell.seq_id.clear(); + for (int32_t j = 0; j < ubatch.n_seq_id[s]; ++j) { + const llama_seq_id seq_id = ubatch.seq_id[s][j]; + cell.seq_id.insert(seq_id); + cells[seq_id].tail = cell_id; + } + } + + // allow getting the range of used cells, from head to head + n + head = min; + n = max - min + 1; + used = std::count_if(cells.begin(), cells.end(), + [](const kv_cell & cell){ return !cell.is_empty(); }); + + // sanity check + return n >= n_seqs; +} + +int32_t llama_kv_cache_recurrent::get_n_tokens() const { + int32_t result = 0; + + for (uint32_t i = 0; i < size; i++) { + result += cells[i].seq_id.size(); + } + + return result; +} + +int32_t llama_kv_cache_recurrent::get_used_cells() const { + return used; +} + +llama_pos llama_kv_cache_recurrent::get_pos_max() const { + llama_pos pos_max = -1; + for (const auto & cell : cells) { + pos_max = std::max(pos_max, cell.pos); + } + + return pos_max; +} + +bool llama_kv_cache_recurrent::get_can_shift() const { + return false; +} + +int32_t llama_kv_cache_recurrent::s_copy(int i) const { + const uint32_t cell_id = i + head; + + ////////////////////////////////////////////// + // TODO: this should not mutate the KV cache ! + kv_cell & cell = const_cast(cells[cell_id]); + + // prevent out-of-bound sources + if (cell.src < 0 || (uint32_t) cell.src >= size) { + cell.src = cell_id; + } + + int32_t res = cell.src; + + // TODO: do not mutate the KV cache + // ensure copy only happens once + if (cell.src != (int32_t) cell_id) { + cell.src = cell_id; + } + + return res; +} + +float llama_kv_cache_recurrent::s_mask(int i) const { + const uint32_t cell_id = i + head; + + ////////////////////////////////////////////// + // TODO: this should not mutate the KV cache ! 
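// A small illustrative sketch (not part of this patch) of how the values produced by
// s_copy() and s_mask() are consumed: conceptually, s_copy[i] names the cell whose
// previous state slot i should start from, and s_mask[i] is 0.0f when that slot starts
// from a cleared state (roughly a get_rows followed by a multiply with the mask on the
// graph side). The function name is hypothetical and one float stands in for a whole
// state tensor; every s_copy[i] is assumed to be a valid index.
#include <cstdint>
#include <vector>

static std::vector<float> sketch_copy_mask_states(const std::vector<float> & states,
                                                  const std::vector<int32_t> & s_copy,
                                                  const std::vector<float> & s_mask) {
    std::vector<float> out(s_copy.size());
    for (size_t i = 0; i < s_copy.size(); ++i) {
        // gather the previous state for slot i, then zero it if the slot starts fresh
        out[i] = states[(size_t) s_copy[i]] * s_mask[i];
    }
    return out;
}
// The mutations flagged by the TODOs in s_copy()/s_mask() reset cell.src afterwards so
// that each copy or clear happens only once per cell.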
+ kv_cell & cell = const_cast(cells[cell_id]); + + float res = (float) (cell.src >= 0); + + // only clear once + if (cell.src < 0) { + cell.src = cell_id; + } + + return res; +} + +uint32_t llama_kv_cache_recurrent::cell_max() const { + for (uint32_t i = size; i > 0; --i) { + const kv_cell & cell = cells[i - 1]; + + if (cell.pos >= 0 && !cell.is_empty()) { + return i; + } + } + + return 0; +} + +size_t llama_kv_cache_recurrent::total_size() const { + size_t size = 0; + for (const auto & buf : bufs) { + size += ggml_backend_buffer_get_size(buf.get()); + } + + return size; +} + +size_t llama_kv_cache_recurrent::size_k_bytes() const { + size_t size_k_bytes = 0; + + for (const auto & k : k_l) { + size_k_bytes += ggml_nbytes(k); + } + + return size_k_bytes; +} + +size_t llama_kv_cache_recurrent::size_v_bytes() const { + size_t size_v_bytes = 0; + + for (const auto & v : v_l) { + size_v_bytes += ggml_nbytes(v); + } + + return size_v_bytes; +} + +void llama_kv_cache_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq_id) const { + std::vector> cell_ranges; // ranges, from inclusive, to exclusive + uint32_t cell_count = 0; + + // Count the number of cells with the specified seq_id + // Find all the ranges of cells with this seq id (or all, when -1) + uint32_t cell_range_begin = size; + for (uint32_t i = 0; i < size; ++i) { + const auto & cell = cells[i]; + if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) { + ++cell_count; + if (cell_range_begin == size) { + cell_range_begin = i; + } + } else { + if (cell_range_begin != size) { + cell_ranges.emplace_back(cell_range_begin, i); + cell_range_begin = size; + } + } + } + if (cell_range_begin != size) { + cell_ranges.emplace_back(cell_range_begin, size); + } + + // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count + uint32_t cell_count_check = 0; + for (const auto & range : cell_ranges) { + cell_count_check += range.second - range.first; + } + GGML_ASSERT(cell_count == cell_count_check); + + io.write(&cell_count, sizeof(cell_count)); + + state_write_meta(io, cell_ranges, seq_id); + state_write_data(io, cell_ranges); +} + +void llama_kv_cache_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq_id) { + uint32_t cell_count; + io.read_to(&cell_count, sizeof(cell_count)); + + bool res = true; + res = res && state_read_meta(io, cell_count, seq_id); + res = res && state_read_data(io, cell_count); + + if (!res) { + if (seq_id == -1) { + clear(); + } else { + seq_rm(seq_id, -1, -1); + } + throw std::runtime_error("failed to restore kv cache"); + } +} + +void llama_kv_cache_recurrent::state_write_meta(llama_io_write_i & io, const std::vector> & cell_ranges, llama_seq_id seq_id) const { + for (const auto & range : cell_ranges) { + for (uint32_t i = range.first; i < range.second; ++i) { + const auto & cell = cells[i]; + const llama_pos pos = cell.pos; + const uint32_t n_seq_id = seq_id == -1 ? 
cell.seq_id.size() : 0; + + io.write(&pos, sizeof(pos)); + io.write(&n_seq_id, sizeof(n_seq_id)); + + if (n_seq_id) { + for (auto seq_id : cell.seq_id) { + io.write(&seq_id, sizeof(seq_id)); + } + } + } + } +} + +void llama_kv_cache_recurrent::state_write_data(llama_io_write_i & io, const std::vector> & cell_ranges) const { + const uint32_t v_trans = 0; + const uint32_t n_layer = hparams.n_layer; + + io.write(&v_trans, sizeof(v_trans)); + io.write(&n_layer, sizeof(n_layer)); + + std::vector tmp_buf; + + // Iterate and write all the keys first, each row is a cell + // Get whole range at a time + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); + + // Write key type + const int32_t k_type_i = (int32_t)k_l[il]->type; + io.write(&k_type_i, sizeof(k_type_i)); + + // Write row size of key + const uint64_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa); + io.write(&k_size_row, sizeof(k_size_row)); + + // Read each range of cells of k_size length each into tmp_buf and write out + for (const auto & range : cell_ranges) { + const size_t range_size = range.second - range.first; + const size_t buf_size = range_size * k_size_row; + io.write_tensor(k_l[il], range.first * k_size_row, buf_size); + } + } + + if (!v_trans) { + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Write value type + const int32_t v_type_i = (int32_t)v_l[il]->type; + io.write(&v_type_i, sizeof(v_type_i)); + + // Write row size of value + const uint64_t v_size_row = ggml_row_size(v_l[il]->type, n_embd_v_gqa); + io.write(&v_size_row, sizeof(v_size_row)); + + // Read each range of cells of v_size length each into tmp_buf and write out + for (const auto & range : cell_ranges) { + const size_t range_size = range.second - range.first; + const size_t buf_size = range_size * v_size_row; + io.write_tensor(v_l[il], range.first * v_size_row, buf_size); + } + } + } else { + // When v is transposed, we also need the element size and get the element ranges from each row + const uint32_t kv_size = size; + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Write value type + const int32_t v_type_i = (int32_t)v_l[il]->type; + io.write(&v_type_i, sizeof(v_type_i)); + + // Write element size + const uint32_t v_size_el = ggml_type_size(v_l[il]->type); + io.write(&v_size_el, sizeof(v_size_el)); + + // Write GQA embedding size + io.write(&n_embd_v_gqa, sizeof(n_embd_v_gqa)); + + // For each row, we get the element values of each cell + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + // Read each range of cells of v_size_el length each into tmp_buf and write out + for (const auto & range : cell_ranges) { + const size_t range_size = range.second - range.first; + const size_t src_offset = (range.first + j * kv_size) * v_size_el; + const size_t buf_size = range_size * v_size_el; + io.write_tensor(v_l[il], src_offset, buf_size); + } + } + } + } +} + +bool llama_kv_cache_recurrent::state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id) { + if (dest_seq_id != -1) { + // single sequence + + seq_rm(dest_seq_id, -1, -1); + + llama_sbatch sbatch; + llama_ubatch batch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false); + + batch.n_tokens = cell_count; + batch.n_seq_tokens = cell_count; + batch.n_seqs = 1; + + for (uint32_t i = 0; i < cell_count; ++i) { + llama_pos pos; + uint32_t 
n_seq_id; + + io.read_to(&pos, sizeof(pos)); + io.read_to(&n_seq_id, sizeof(n_seq_id)); + + if (n_seq_id != 0) { + LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__); + return false; + } + + batch.pos[i] = pos; + } + batch.n_seq_id[0] = 1; + batch.seq_id[0] = &dest_seq_id; + if (!find_slot(batch)) { + LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__); + return false; + } + commit(); + + // DEBUG CHECK: kv.head should be our first cell, kv.head + cell_count - 1 should be our last cell (verify seq_id and pos values) + // Assume that this is one contiguous block of cells + GGML_ASSERT(head + cell_count <= size); + GGML_ASSERT(cells[head].pos == batch.pos[0]); + GGML_ASSERT(cells[head + cell_count - 1].pos == batch.pos[cell_count - 1]); + GGML_ASSERT(cells[head].has_seq_id(dest_seq_id)); + GGML_ASSERT(cells[head + cell_count - 1].has_seq_id(dest_seq_id)); + } else { + // whole KV cache restore + + if (cell_count > size) { + LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__); + return false; + } + + clear(); + + for (uint32_t i = 0; i < cell_count; ++i) { + kv_cell & cell = cells[i]; + + llama_pos pos; + uint32_t n_seq_id; + + io.read_to(&pos, sizeof(pos)); + io.read_to(&n_seq_id, sizeof(n_seq_id)); + + cell.pos = pos; + + for (uint32_t j = 0; j < n_seq_id; ++j) { + llama_seq_id seq_id; + io.read_to(&seq_id, sizeof(seq_id)); + + // TODO: llama_kv_cache_recurrent should have a notion of max sequences + //if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) { + if (seq_id < 0) { + //LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx)); + LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, inf)\n", __func__, seq_id); + return false; + } + + cell.seq_id.insert(seq_id); + + int32_t & tail = cells[seq_id].tail; + if (tail != -1) { + LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tail); + return false; + } + tail = i; + } + } + + head = 0; + used = cell_count; + } + + for (uint32_t i = 0; i < cell_count; ++i) { + uint32_t cell_id = head + i; + // make sure the recurrent states will keep their restored state + cells[cell_id].src = cell_id; + } + + return true; +} + +bool llama_kv_cache_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell_count) { + uint32_t v_trans; + uint32_t n_layer; + io.read_to(&v_trans, sizeof(v_trans)); + io.read_to(&n_layer, sizeof(n_layer)); + + if (n_layer != hparams.n_layer) { + LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer); + return false; + } + if (cell_count > size) { + LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, size); + return false; + } + if (false != (bool) v_trans) { LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__); return false; } @@ -1309,7 +2405,7 @@ void llama_kv_cache_view_update(llama_kv_cache_view * view, const llama_kv_cache view->cells_sequences = (llama_seq_id *)p; } - const std::vector & kv_cells = kvu->cells; + const std::vector & kv_cells = kvu->cells; llama_kv_cache_view_cell * c_curr = view->cells; llama_seq_id * cs_curr = view->cells_sequences; int32_t used_cells = 0; diff --git a/llama/llama.cpp/src/llama-kv-cache.h b/llama/llama.cpp/src/llama-kv-cache.h index 25cbcb56..928b9712 100644 --- a/llama/llama.cpp/src/llama-kv-cache.h +++ b/llama/llama.cpp/src/llama-kv-cache.h @@ -2,32 +2,72 @@ #include "llama.h" #include "llama-io.h" +#include 
"llama-graph.h" #include "llama-memory.h" #include "ggml-cpp.h" -#include #include #include struct llama_cparams; struct llama_hparams; struct llama_ubatch; +struct llama_sbatch; +struct llama_model; +struct llama_context; struct llama_kv_cache : public llama_memory_i { - using llama_memory_i::llama_memory_i; + virtual ~llama_kv_cache() = default; - virtual void restore() = 0; // call if batch processing fails - restores the cache state - virtual void commit() = 0; // call after successful batch processing - clears any pending state + // call if batch processing fails - restores the cache state + virtual void restore() = 0; - virtual int32_t get_n_tokens() const = 0; - virtual int32_t get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache + // call after successful batch processing - clears any pending state + virtual void commit() = 0; - virtual bool get_can_shift() const = 0; + // process any pending defrag/shift/etc. operations + // optionally call once before processing a new batch + virtual bool update(llama_context & lctx) = 0; + + // schedule a defrag if the fragmentation threshold is exceeded. otherwise, do nothing + virtual void defrag_sched(float thold) = 0; + + // simulate full cache, used for allocating worst-case compute buffers + virtual void set_full() = 0; + + // + // batch processing + // + + virtual llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) = 0; + + // different KV caches require different batch splitting strategies + virtual llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const = 0; + + // find an empty slot of size "n_tokens" in the cache + virtual bool find_slot(const llama_ubatch & batch) = 0; + + // getters + virtual int32_t get_n_tokens() const = 0; + virtual int32_t get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache + virtual llama_pos get_pos_max() const = 0; + virtual bool get_can_shift() const = 0; bool get_can_edit() const override { return get_can_shift(); } + + // + // state write/read + // + + virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const = 0; + virtual void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) = 0; }; +// +// llama_kv_cache_guard +// + struct llama_kv_cache_guard { llama_kv_cache_guard(llama_kv_cache * kv) : kv(kv) {} @@ -42,7 +82,7 @@ struct llama_kv_cache_guard { private: llama_kv_cache * kv; }; - + // block of KV slots to move when defragging struct llama_kv_defrag_move { uint32_t src; @@ -50,65 +90,50 @@ struct llama_kv_defrag_move { uint32_t len; }; -struct llama_kv_cell { - llama_pos pos = -1; - llama_pos delta = 0; - int32_t src = -1; // used by recurrent state models to copy states - int32_t tail = -1; +// +// llama_kv_cache_unified +// - std::set seq_id; - - bool has_seq_id(const llama_seq_id & id) const { - return seq_id.find(id) != seq_id.end(); - } - - bool is_empty() const { - return seq_id.empty(); - } - - bool is_same_seq(const llama_kv_cell & other) const { - return seq_id == other.seq_id; - } -}; - -// ring-buffer of cached KV data -// TODO: pimpl // TODO: add notion of max sequences class llama_kv_cache_unified : public llama_kv_cache { public: - // can be used to query data from the model if needed - struct callbacks { - std::function get_rope_factors; + struct kv_cell { + llama_pos pos = -1; + llama_pos delta = 0; + + std::set seq_id; + + bool has_seq_id(const llama_seq_id & id) const { + return seq_id.find(id) != seq_id.end(); + } + + bool is_empty() 
const { + return seq_id.empty(); + } + + bool is_same_seq(const kv_cell & other) const { + return seq_id == other.seq_id; + } }; + static uint32_t get_padding(const llama_cparams & cparams); + llama_kv_cache_unified( - const llama_hparams & hparams, - callbacks cbs); - - virtual ~llama_kv_cache_unified() = default; - - // TODO: become constructor - bool init( - const llama_model & model, // TODO: do not reference the model - const llama_cparams & cparams, + const llama_model & model, ggml_type type_k, ggml_type type_v, + bool v_trans, + bool offload, uint32_t kv_size, - bool offload); + uint32_t padding); - int32_t get_n_tokens() const override; - int32_t get_used_cells() const override; + ~llama_kv_cache_unified() = default; - size_t total_size() const; - - // TODO: better data structures to reduce the cost of this operation - llama_pos pos_max() const; + // + // llama_memory_i + // void clear() override; - void defrag() override; - - virtual void restore() override; - virtual void commit() override; bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override; void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override; @@ -118,63 +143,40 @@ public: llama_pos seq_pos_max(llama_seq_id seq_id) const override; - bool get_can_shift() const override; + // + // llama_kv_cache + // + + void restore() override; + void commit() override; + + bool update(llama_context & ctx) override; + + void defrag_sched(float thold) override; + + void set_full() override; + + llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override; + + llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override; - // find an empty slot of size "n_tokens" in the cache // updates the cache head // Note: On success, it's important that cache.head points // to the first cell of the slot. 
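    // A minimal sketch of the intended calling pattern (hypothetical caller,
    // not taken from the llama.cpp sources): find_slot() only reserves cells
    // and tracks them as pending ranges; commit() makes the reservation
    // permanent, restore() rolls it back if batch processing fails.
    //
    //     if (!kv->find_slot(ubatch)) {
    //         return false;                 // no contiguous slot available
    //     }
    //     // ... run the decode graph over cells [head, head + ubatch.n_tokens) ...
    //     ok ? kv->commit() : kv->restore();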
- bool find_slot(const llama_ubatch & batch); + bool find_slot(const llama_ubatch & batch) override; - // TODO: maybe not needed - uint32_t get_padding(const llama_cparams & cparams) const; + int32_t get_n_tokens() const override; + int32_t get_used_cells() const override; - // find how many cells are currently in use - uint32_t cell_max() const; + // TODO: better data structures to reduce the cost of this operation + llama_pos get_pos_max() const override; - size_t size_k_bytes() const; - size_t size_v_bytes() const; - - // defrag - - struct { - std::vector moves; - } defrag_info; - - // return true if cells have been moved - bool defrag_prepare(int32_t n_max_nodes); - - // commit/restore cache - - struct slot_range { - uint32_t c0 = 0; // note: these are cell indices, not sequence positions - uint32_t c1 = 0; - }; - - // pending cell updates that are not yet committed - struct { - std::vector ranges; - } pending; + bool get_can_shift() const override; // state write/load - void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const; - void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1); - - // members - - const llama_hparams & hparams; - - callbacks cbs; - - bool has_shift = false; - bool do_defrag = false; - - // TODO: remove this and implement llama_kv_cache_recurrent instead - bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token - - bool v_trans = true; // the value tensor is transposed - bool can_shift = false; + void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override; + void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override; // Note: The value of head isn't only used to optimize searching // for a free KV slot. llama_decode_impl also uses it, so it @@ -186,18 +188,214 @@ public: // computed before each graph build uint32_t n = 0; - std::vector cells; + std::vector cells; std::vector k_l; // per layer std::vector v_l; private: + const llama_model & model; + const llama_hparams & hparams; + + bool has_shift = false; + bool do_defrag = false; + + bool v_trans = true; // the value tensor is transposed + bool can_shift = false; + + // required padding + uint32_t padding = 1; + ggml_type type_k = GGML_TYPE_F16; ggml_type type_v = GGML_TYPE_F16; std::vector ctxs; std::vector bufs; + // defrag + struct { + std::vector moves; + } defrag_info; + + // return true if cells have been moved + bool defrag_prepare(int32_t n_max_nodes); + + // commit/restore cache + struct slot_range { + uint32_t c0 = 0; // note: these are cell indices, not sequence positions + uint32_t c1 = 0; + }; + + // pending cell updates that are not yet committed + struct { + std::vector ranges; + } pending; + + // find how many cells are currently in use + uint32_t cell_max() const; + + size_t total_size() const; + + size_t size_k_bytes() const; + size_t size_v_bytes() const; + + ggml_tensor * build_rope_shift( + const llama_cparams & cparams, + ggml_context * ctx, + ggml_tensor * cur, + ggml_tensor * shift, + ggml_tensor * factors, + float freq_base, + float freq_scale) const; + + llm_graph_result_ptr build_graph_shift( + const llama_cparams & cparams, + ggml_context * ctx, + ggml_cgraph * gf) const; + + llm_graph_result_ptr build_graph_defrag( + const llama_cparams & cparams, + ggml_context * ctx, + ggml_cgraph * gf, + const std::vector & moves) const; + + void state_write_meta(llama_io_write_i & io, const std::vector> & cell_ranges, llama_seq_id seq_id = -1) const; + void 
state_write_data(llama_io_write_i & io, const std::vector> & cell_ranges) const; + + bool state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1); + bool state_read_data(llama_io_read_i & io, uint32_t cell_count); +}; + +// +// llama_kv_cache_recurrent +// + +class llama_kv_cache_recurrent : public llama_kv_cache { +public: + struct kv_cell { + llama_pos pos = -1; + int32_t src = -1; // used to copy states + int32_t tail = -1; + + std::set seq_id; + + bool has_seq_id(const llama_seq_id & id) const { + return seq_id.find(id) != seq_id.end(); + } + + bool is_empty() const { + return seq_id.empty(); + } + + bool is_same_seq(const kv_cell & other) const { + return seq_id == other.seq_id; + } + }; + + llama_kv_cache_recurrent( + const llama_model & model, + ggml_type type_k, + ggml_type type_v, + bool offload, + uint32_t kv_size); + + ~llama_kv_cache_recurrent() = default; + + // + // llama_memory_i + // + + void clear() override; + + bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override; + void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override; + void seq_keep(llama_seq_id seq_id) override; + void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) override; + void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override; + + llama_pos seq_pos_max(llama_seq_id seq_id) const override; + + // + // llama_kv_cache + // + + void restore() override; + void commit() override; + + bool update(llama_context & lctx) override; + + void defrag_sched(float thold) override; + + void set_full() override; + + llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override; + + llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override; + + bool find_slot(const llama_ubatch & batch) override; + + int32_t get_n_tokens() const override; + int32_t get_used_cells() const override; + + // TODO: better data structures to reduce the cost of this operation + llama_pos get_pos_max() const override; + + bool get_can_shift() const override; + + // TODO: temporary methods - they are not really const as they do const_cast<>, fix this + int32_t s_copy(int i) const; + float s_mask(int i) const; + + // state write/load + + void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override; + void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override; + + // Note: The value of head isn't only used to optimize searching + // for a free KV slot. llama_decode_impl also uses it, so it + // cannot be freely changed after a slot has been allocated. + uint32_t head = 0; + uint32_t size = 0; + uint32_t used = 0; // used cells (i.e. 
at least one seq_id) + + // computed before each graph build + uint32_t n = 0; + + std::vector cells; + + std::vector k_l; // per layer + std::vector v_l; + +private: + //const llama_model & model; + const llama_hparams & hparams; + + // commit/restore cache + // TODO: rework for recurrent cache + struct slot_range { + uint32_t c0 = 0; // note: these are cell indices, not sequence positions + uint32_t c1 = 0; + }; + + // pending cell updates that are not yet committed + struct { + std::vector ranges; + } pending; + + ggml_type type_k = GGML_TYPE_F16; + ggml_type type_v = GGML_TYPE_F16; + + std::vector ctxs; + std::vector bufs; + + // find how many cells are currently in use + uint32_t cell_max() const; + + size_t total_size() const; + + size_t size_k_bytes() const; + size_t size_v_bytes() const; + void state_write_meta(llama_io_write_i & io, const std::vector> & cell_ranges, llama_seq_id seq_id = -1) const; void state_write_data(llama_io_write_i & io, const std::vector> & cell_ranges) const; @@ -205,11 +403,6 @@ private: bool state_read_data(llama_io_read_i & io, uint32_t cell_count); }; -// TODO: temporary reusing llama_kv_cache_unified -- implement recurrent cache and simplify llama_kv_cache_unified -//class llama_kv_cache_recurrent : public llama_kv_cache_unified { -//public: -// using llama_kv_cache_unified::llama_kv_cache_unified; -//}; // // kv cache view diff --git a/llama/llama.cpp/src/llama-memory.h b/llama/llama.cpp/src/llama-memory.h index dfa8c4e9..c7412d59 100644 --- a/llama/llama.cpp/src/llama-memory.h +++ b/llama/llama.cpp/src/llama-memory.h @@ -2,12 +2,22 @@ #include "llama.h" +struct llama_memory_params { + // kv cache + ggml_type type_k; + ggml_type type_v; + + // parameters for other types of memory + // ... +}; + // general concept of LLM memory // the KV cache is a type of LLM memory, but there can be other types class llama_memory_i { public: + virtual ~llama_memory_i() = default; + virtual void clear() = 0; - virtual void defrag() = 0; virtual bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) = 0; virtual void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) = 0; diff --git a/llama/llama.cpp/src/llama-model-loader.cpp b/llama/llama.cpp/src/llama-model-loader.cpp index 2e11507d..2acfd4a8 100644 --- a/llama/llama.cpp/src/llama-model-loader.cpp +++ b/llama/llama.cpp/src/llama-model-loader.cpp @@ -301,12 +301,12 @@ namespace GGUFMeta { GGUFMeta::GKV::get_kv(meta.get(), kid); switch (arr_info.gt) { - case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same::value)); break; - case GGUF_TYPE_INT32: GGML_ASSERT( - (std::is_same::value) || - (std::is_same::value)); break; + case GGUF_TYPE_UINT32: + case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same::value) || + (std::is_same::value)); break; + case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same::value)); break; default: - throw std::runtime_error(format("%s is not a float32, int32 array", key.c_str())); + throw std::runtime_error(format("%s is not a float32/uint32/int32 array", key.c_str())); } result.resize(arr_info.length); @@ -332,12 +332,12 @@ namespace GGUFMeta { GGUFMeta::GKV::get_kv(meta.get(), kid); switch (arr_info.gt) { - case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same::value)); break; - case GGUF_TYPE_INT32: GGML_ASSERT( - (std::is_same::value) || - (std::is_same::value)); break; + case GGUF_TYPE_UINT32: + case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same::value) || + (std::is_same::value)); break; + case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same::value)); break; default: - 
throw std::runtime_error(format("%s is not a float32, int32 array", key.c_str())); + throw std::runtime_error(format("%s is not a float32/uint32/int32 array", key.c_str())); } if (arr_info.length > N_MAX) { @@ -826,6 +826,10 @@ void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps mmaps_used.reserve(files.size()); for (const auto & file : files) { auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU)); + if (!reg) { + throw std::runtime_error(format("%s: no CPU backend found", __func__)); + } + auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa"); std::unique_ptr mapping = std::make_unique(file.get(), prefetch ? -1 : 0, is_numa_fn()); mmaps_used.emplace_back(mapping->size(), 0); diff --git a/llama/llama.cpp/src/llama-model-saver.cpp b/llama/llama.cpp/src/llama-model-saver.cpp new file mode 100644 index 00000000..a70b9892 --- /dev/null +++ b/llama/llama.cpp/src/llama-model-saver.cpp @@ -0,0 +1,281 @@ +#include "llama-model-saver.h" + +#include "gguf.h" + +#include "llama.h" +#include "llama-hparams.h" +#include "llama-model.h" +#include "llama-vocab.h" + +#include + +llama_model_saver::llama_model_saver(const struct llama_model & model) : model(model), llm_kv(model.arch) { + gguf_ctx = gguf_init_empty(); +} + +llama_model_saver::~llama_model_saver() { + gguf_free(gguf_ctx); +} + +void llama_model_saver::add_kv(const enum llm_kv key, const uint32_t value) { + gguf_set_val_u32(gguf_ctx, llm_kv(key).c_str(), value); +} + +void llama_model_saver::add_kv(const enum llm_kv key, const int32_t value) { + gguf_set_val_i32(gguf_ctx, llm_kv(key).c_str(), value); +} + +void llama_model_saver::add_kv(const enum llm_kv key, const float value) { + gguf_set_val_f32(gguf_ctx, llm_kv(key).c_str(), value); +} + +void llama_model_saver::add_kv(const enum llm_kv key, const bool value) { + gguf_set_val_bool(gguf_ctx, llm_kv(key).c_str(), value); +} + +void llama_model_saver::add_kv(const enum llm_kv key, const char * value) { + gguf_set_val_str(gguf_ctx, llm_kv(key).c_str(), value); +} + +[[noreturn]] +void llama_model_saver::add_kv(const enum llm_kv key, const char value) { + GGML_UNUSED(key); + GGML_UNUSED(value); + GGML_ABORT("fatal error"); // this should never be called, only needed to make the template below compile +} + +template +void llama_model_saver::add_kv(const enum llm_kv key, const Container & value, const bool per_layer) { + const size_t n_values = per_layer ? 
size_t(model.hparams.n_layer) : value.size(); + GGML_ASSERT(n_values <= value.size()); + + if (n_values == 0) { + return; + } + + if (per_layer) { + bool all_values_the_same = true; + for (size_t i = 1; i < n_values; ++i) { + if (value[i] != value[0]) { + all_values_the_same = false; + break; + } + } + if (all_values_the_same) { + add_kv(key, value[0]); + return; + } + } + + if (std::is_same::value) { + gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_UINT8, value.data(), n_values); + } else if (std::is_same::value) { + gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_INT8, value.data(), n_values); + } else if (std::is_same::value) { + gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_UINT32, value.data(), n_values); + } else if (std::is_same::value) { + gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_INT32, value.data(), n_values); + } else if (std::is_same::value) { + gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_FLOAT32, value.data(), n_values); + } else if (std::is_same::value) { + gguf_set_val_str(gguf_ctx, llm_kv(key).c_str(), reinterpret_cast(value.data())); + } else { + GGML_ABORT("fatal error"); + } +} + +void llama_model_saver::add_kv(const enum llm_kv key, const std::vector & value) { + std::vector tmp(value.size()); + for (size_t i = 0; i < value.size(); ++i) { + tmp[i] = value[i].c_str(); + } + gguf_set_arr_str(gguf_ctx, llm_kv(key).c_str(), tmp.data(), tmp.size()); +} + +void llama_model_saver::add_tensor(const struct ggml_tensor * tensor) { + if (!tensor) { + return; + } + if (gguf_find_tensor(gguf_ctx, tensor->name) >= 0) { + GGML_ASSERT(std::string(tensor->name) == "rope_freqs.weight"); // FIXME + return; + } + gguf_add_tensor(gguf_ctx, tensor); +} + +void llama_model_saver::add_kv_from_model() { + const llama_hparams & hparams = model.hparams; + const llama_vocab & vocab = model.vocab; + + const int32_t n_vocab = vocab.n_tokens(); + std::vector tokens(n_vocab); + std::vector scores(n_vocab); + std::vector token_types(n_vocab); + + for (int32_t id = 0; id < n_vocab; ++id) { + const llama_vocab::token_data & token_data = vocab.get_token_data(id); + + tokens[id] = token_data.text; + scores[id] = token_data.score; + + switch(token_data.attr) { + case LLAMA_TOKEN_ATTR_UNKNOWN: token_types[id] = LLAMA_TOKEN_TYPE_UNKNOWN; break; + case LLAMA_TOKEN_ATTR_UNUSED: token_types[id] = LLAMA_TOKEN_TYPE_UNUSED; break; + case LLAMA_TOKEN_ATTR_NORMAL: token_types[id] = LLAMA_TOKEN_TYPE_NORMAL; break; + case LLAMA_TOKEN_ATTR_CONTROL: token_types[id] = LLAMA_TOKEN_TYPE_CONTROL; break; + case LLAMA_TOKEN_ATTR_USER_DEFINED: token_types[id] = LLAMA_TOKEN_TYPE_USER_DEFINED; break; + case LLAMA_TOKEN_ATTR_BYTE: token_types[id] = LLAMA_TOKEN_TYPE_BYTE; break; + case LLAMA_TOKEN_ATTR_UNDEFINED: + default: token_types[id] = LLAMA_TOKEN_TYPE_UNDEFINED; break; + } + } + + // add_kv(LLM_KV_GENERAL_TYPE, ???); + add_kv(LLM_KV_GENERAL_ARCHITECTURE, model.arch_name()); + // add_kv(LLM_KV_GENERAL_QUANTIZATION_VERSION, ???); + // add_kv(LLM_KV_GENERAL_ALIGNMENT, ???); + add_kv(LLM_KV_GENERAL_NAME, model.name); + // add_kv(LLM_KV_GENERAL_AUTHOR, ???); + // add_kv(LLM_KV_GENERAL_VERSION, ???); + // add_kv(LLM_KV_GENERAL_URL, ???); + // add_kv(LLM_KV_GENERAL_DESCRIPTION, ???); + // add_kv(LLM_KV_GENERAL_LICENSE, ???); + // add_kv(LLM_KV_GENERAL_SOURCE_URL, ???); + // add_kv(LLM_KV_GENERAL_SOURCE_HF_REPO, ???); + + add_kv(LLM_KV_VOCAB_SIZE, vocab.n_tokens()); + add_kv(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train); + add_kv(LLM_KV_EMBEDDING_LENGTH, 
hparams.n_embd); + add_kv(LLM_KV_BLOCK_COUNT, hparams.n_layer); + add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead); + add_kv(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, true); + add_kv(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); + add_kv(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_exp); + add_kv(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res); + // add_kv(LLM_KV_TENSOR_DATA_LAYOUT, ???); + add_kv(LLM_KV_EXPERT_COUNT, hparams.n_expert); + add_kv(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used); + add_kv(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared); + add_kv(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale); + add_kv(LLM_KV_POOLING_TYPE, uint32_t(hparams.pooling_type)); + add_kv(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); + add_kv(LLM_KV_DECODER_START_TOKEN_ID, hparams.dec_start_token_id); + add_kv(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping); + add_kv(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping); + add_kv(LLM_KV_SWIN_NORM, hparams.swin_norm); + add_kv(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers); + add_kv(LLM_KV_TIME_MIX_EXTRA_DIM, hparams.time_mix_extra_dim); + add_kv(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim); + add_kv(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale); + add_kv(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale); + + add_kv(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, true); + add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, true); + add_kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias); + add_kv(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv); + add_kv(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k); + add_kv(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v); + add_kv(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + add_kv(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); + add_kv(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q); + add_kv(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv); + add_kv(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts); + add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); + add_kv(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale); + + const float rope_scaling_factor = hparams.rope_freq_scale_train == 1.0f ? 
0.0f : 1.0f/hparams.rope_freq_scale_train; + + add_kv(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot); + add_kv(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train); + // add_kv(LLM_KV_ROPE_SCALE_LINEAR, rope_scaling_factor); // old name + add_kv(LLM_KV_ROPE_SCALING_TYPE, llama_rope_scaling_type_name(hparams.rope_scaling_type_train)); + add_kv(LLM_KV_ROPE_SCALING_FACTOR, rope_scaling_factor); + add_kv(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor); + add_kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn); + add_kv(LLM_KV_ROPE_SCALING_FINETUNED, hparams.rope_finetuned); + add_kv(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul); + + // TODO: implement split file support + // add_kv(LLM_KV_SPLIT_NO, ???); + // add_kv(LLM_KV_SPLIT_COUNT, ???); + // add_kv(LLM_KV_SPLIT_TENSORS_COUNT, ???); + + add_kv(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner); + add_kv(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv); + add_kv(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state); + add_kv(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank); + add_kv(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms); + + add_kv(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size); + + add_kv(LLM_KV_TOKENIZER_MODEL, vocab.get_tokenizer_model()); + add_kv(LLM_KV_TOKENIZER_PRE, vocab.get_tokenizer_pre()); + add_kv(LLM_KV_TOKENIZER_LIST, tokens); + add_kv(LLM_KV_TOKENIZER_TOKEN_TYPE, token_types); + add_kv(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, vocab.n_token_types()); + add_kv(LLM_KV_TOKENIZER_SCORES, scores); + add_kv(LLM_KV_TOKENIZER_MERGES, vocab.get_bpe_merges()); + // FIXME llama_token is type i32 but when reading in a GGUF file u32 is expected, not an issue for writing though + add_kv(LLM_KV_TOKENIZER_BOS_ID, uint32_t(vocab.token_bos())); + add_kv(LLM_KV_TOKENIZER_EOS_ID, uint32_t(vocab.token_eos())); + add_kv(LLM_KV_TOKENIZER_EOT_ID, uint32_t(vocab.token_eot())); + add_kv(LLM_KV_TOKENIZER_EOM_ID, uint32_t(vocab.token_eom())); + add_kv(LLM_KV_TOKENIZER_UNK_ID, uint32_t(vocab.token_unk())); + add_kv(LLM_KV_TOKENIZER_SEP_ID, uint32_t(vocab.token_sep())); + add_kv(LLM_KV_TOKENIZER_PAD_ID, uint32_t(vocab.token_pad())); + // add_kv(LLM_KV_TOKENIZER_CLS_ID, uint32_t(vocab.token_bos())); // deprecated + // add_kv(LLM_KV_TOKENIZER_MASK_ID, ???); + add_kv(LLM_KV_TOKENIZER_ADD_BOS, vocab.get_add_bos()); + add_kv(LLM_KV_TOKENIZER_ADD_EOS, vocab.get_add_eos()); + add_kv(LLM_KV_TOKENIZER_ADD_PREFIX, vocab.get_add_space_prefix()); + add_kv(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, vocab.get_remove_extra_whitespaces()); + add_kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, vocab.get_precompiled_charsmap()); + // add_kv(LLM_KV_TOKENIZER_HF_JSON, ???); + // add_kv(LLM_KV_TOKENIZER_RWKV, ???); + add_kv(LLM_KV_TOKENIZER_FIM_PRE_ID, uint32_t(vocab.token_fim_pre())); + add_kv(LLM_KV_TOKENIZER_FIM_SUF_ID, uint32_t(vocab.token_fim_suf())); + add_kv(LLM_KV_TOKENIZER_FIM_MID_ID, uint32_t(vocab.token_fim_mid())); + add_kv(LLM_KV_TOKENIZER_FIM_PAD_ID, uint32_t(vocab.token_fim_pad())); + add_kv(LLM_KV_TOKENIZER_FIM_REP_ID, uint32_t(vocab.token_fim_rep())); + add_kv(LLM_KV_TOKENIZER_FIM_SEP_ID, uint32_t(vocab.token_fim_sep())); + + // TODO: implement LoRA support + // add_kv(LLM_KV_ADAPTER_TYPE, ???); + // add_kv(LLM_KV_ADAPTER_LORA_ALPHA, ???); + + // deprecated + // add_kv(LLM_KV_TOKENIZER_PREFIX_ID, ???); + // add_kv(LLM_KV_TOKENIZER_SUFFIX_ID, ???); + // add_kv(LLM_KV_TOKENIZER_MIDDLE_ID, ???); +} + +void llama_model_saver::add_tensors_from_model() { + if (std::string(model.output->name) != std::string(model.tok_embd->name)) { + add_tensor(model.tok_embd); // some 
models use the same tensor for tok_embd and output + } + add_tensor(model.type_embd); + add_tensor(model.pos_embd); + add_tensor(model.tok_norm); + add_tensor(model.tok_norm_b); + add_tensor(model.output_norm); + add_tensor(model.output_norm_b); + add_tensor(model.output); + add_tensor(model.output_b); + add_tensor(model.output_norm_enc); + add_tensor(model.cls); + add_tensor(model.cls_b); + add_tensor(model.cls_out); + add_tensor(model.cls_out_b); + + for (const struct llama_layer & layer : model.layers) { + for (size_t i = 0; i < sizeof(layer)/sizeof(struct ggml_tensor *); ++i) { + add_tensor(reinterpret_cast(&layer)[i]); + } + } +} + +void llama_model_saver::save(const std::string & path_model) { + gguf_write_to_file(gguf_ctx, path_model.c_str(), false); +} + diff --git a/llama/llama.cpp/src/llama-model-saver.h b/llama/llama.cpp/src/llama-model-saver.h new file mode 100644 index 00000000..a5a434c3 --- /dev/null +++ b/llama/llama.cpp/src/llama-model-saver.h @@ -0,0 +1,37 @@ +#pragma once + +#include "llama.h" +#include "llama-arch.h" + +#include + +struct llama_model_saver { + struct gguf_context * gguf_ctx = nullptr; + const struct llama_model & model; + const struct LLM_KV llm_kv; + + llama_model_saver(const struct llama_model & model); + ~llama_model_saver(); + + void add_kv(enum llm_kv key, uint32_t value); + void add_kv(enum llm_kv key, int32_t value); + void add_kv(enum llm_kv key, float value); + void add_kv(enum llm_kv key, bool value); + void add_kv(enum llm_kv key, const char * value); + + [[noreturn]] + void add_kv(enum llm_kv key, char value); // needed to make the template below compile + + template + void add_kv(enum llm_kv key, const Container & value, bool per_layer = false); + + void add_kv(enum llm_kv key, const std::vector & value); + + void add_tensor(const struct ggml_tensor * tensor); + + void add_kv_from_model(); + + void add_tensors_from_model(); + + void save(const std::string & path_model); +}; diff --git a/llama/llama.cpp/src/llama-model.cpp b/llama/llama.cpp/src/llama-model.cpp index 9d099f11..e8298f56 100644 --- a/llama/llama.cpp/src/llama-model.cpp +++ b/llama/llama.cpp/src/llama-model.cpp @@ -40,6 +40,7 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_335M: return "335M"; case LLM_TYPE_410M: return "410M"; case LLM_TYPE_450M: return "450M"; + case LLM_TYPE_475M: return "475M"; case LLM_TYPE_770M: return "770M"; case LLM_TYPE_780M: return "780M"; case LLM_TYPE_0_5B: return "0.5B"; @@ -79,6 +80,7 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_236B: return "236B"; case LLM_TYPE_290B: return "290B"; case LLM_TYPE_314B: return "314B"; + case LLM_TYPE_405B: return "405B"; case LLM_TYPE_671B: return "671B"; case LLM_TYPE_SMALL: return "0.1B"; case LLM_TYPE_MEDIUM: return "0.4B"; @@ -115,6 +117,10 @@ static const std::map LLAMA_ROPE_SCALING_ { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" }, }; +std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type) { + return LLAMA_ROPE_SCALING_TYPES.at(rope_scaling_type); +} + static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) { for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) { if (kv.second == name) { @@ -297,6 +303,10 @@ static buft_list_t make_cpu_buft_list(const std::vector & de // add extra buffer types, only if no GPU device is present // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094 auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); + if (cpu_dev == nullptr) { + throw 
std::runtime_error(format("%s: no CPU backend found", __func__)); + } + auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev); auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts"); @@ -595,6 +605,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { switch (hparams.n_layer) { case 32: type = LLM_TYPE_7B; break; case 80: type = LLM_TYPE_70B; break; + case 162: type = LLM_TYPE_405B; break; default: type = LLM_TYPE_UNKNOWN; } } break; @@ -721,7 +732,11 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0); if (hparams.n_layer == 12 && hparams.n_embd == 768) { - type = LLM_TYPE_137M; + if (arch == LLM_ARCH_NOMIC_BERT) { + type = LLM_TYPE_137M; + } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) { + type = LLM_TYPE_475M; + } } } break; case LLM_ARCH_BLOOM: @@ -782,6 +797,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { // fall through case LLM_ARCH_QWEN2: { + ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); switch (hparams.n_layer) { case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break; @@ -1505,6 +1521,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); + if (cpu_dev == nullptr) { + throw std::runtime_error(format("%s: no CPU backend found", __func__)); + } const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0); const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1); auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev { @@ -1672,8 +1691,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) { for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) { std::regex pattern(overrides->pattern); if (std::regex_search(tensor_name, pattern)) { - LLAMA_LOG_DEBUG("tensor %s buffer type overriden to %s\n", tensor_name.c_str(), ggml_backend_buft_name(overrides->buft)); buft = overrides->buft; + LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n", + tensor_name.c_str(), + ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type), + ggml_backend_buft_name(buft)); break; } } @@ -1690,6 +1712,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) { auto * buft_dev = ggml_backend_buft_get_device(buft); if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) { auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); + if (!cpu_dev) { + throw std::runtime_error("no CPU backend found"); + } buft = ggml_backend_dev_buffer_type(cpu_dev); } @@ -1917,7 +1942,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED); layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); - layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + if (n_ff > 0) { + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + } if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) { layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 
? TENSOR_DUPLICATED : 0)); @@ -1927,9 +1954,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0)); } - layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); - layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); - layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + if (n_ff > 0) { + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + } // optional MLP bias layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED); @@ -3573,7 +3602,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // output output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); - output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + // if output is NULL, init from the input tok embed + if (output == NULL) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); + } for (int i = 0; i < n_layer; ++i) { auto & layer = layers[i]; @@ -4206,6 +4239,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) { if (!dev) { // FIXME: workaround for CPU backend buft having a NULL device dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); + if (!dev) { + throw std::runtime_error(format("%s: no CPU backend found", __func__)); + } } ggml_backend_dev_props props; ggml_backend_dev_get_props(dev, &props); @@ -4335,7 +4371,7 @@ uint64_t llama_model::n_elements() const { } void llama_model::print_info() const { - const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train); + const std::string rope_scaling_type = llama_rope_scaling_type_name(hparams.rope_scaling_type_train); auto print_f = [](const std::function & f, uint32_t n) { bool is_var = false; @@ -4396,7 +4432,7 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn); LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type); LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type); - LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type); + LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str()); LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train); LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train); LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn); @@ -4543,6 +4579,19 @@ const ggml_tensor * llama_model::get_tensor(const char * name) const { return it->second; } +ggml_tensor * llama_model::get_rope_factors(uint32_t n_ctx_per_seq, int il) const { + // choose long/short freq factors based on the context size + if (layers[il].rope_freqs != nullptr) { + return layers[il].rope_freqs; + } + + if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) { + return layers[il].rope_long; + } + + return layers[il].rope_short; +} + struct llm_build_llama : public llm_graph_context { llm_build_llama(const llama_model & model, const 
llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; @@ -4583,7 +4632,7 @@ struct llm_build_llama : public llm_graph_context { // self-attention { // rope freq factors for llama3; may return nullptr for llama2 and other models - ggml_tensor * rope_factors = static_cast(memory)->cbs.get_rope_factors(n_ctx_per_seq, il); + ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il); // compute Q and K and RoPE them ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -4905,7 +4954,7 @@ struct llm_build_mllama: public llm_graph_context { // self attention layer // rope freq factors for llama3; may return nullptr for llama2 and other models - ggml_tensor * rope_factors = static_cast(memory)->cbs.get_rope_factors(n_ctx_per_seq, il); + ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il); // compute Q and K and RoPE them ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -5029,6 +5078,7 @@ struct llm_build_deci : public llm_graph_context { ggml_tensor * inpSA = inpL; const int64_t n_head_kv = hparams.n_head_kv(il); const int64_t n_head = hparams.n_head(il); + const int64_t n_ff = hparams.n_ff(il); if (n_head == 0) { // attention-free layer of Llama-3_1-Nemotron-51B @@ -5048,7 +5098,7 @@ struct llm_build_deci : public llm_graph_context { } else if (n_head > 0) { // self-attention // rope freq factors for llama3; may return nullptr for llama2 and other models - ggml_tensor * rope_factors = static_cast(memory)->cbs.get_rope_factors(n_ctx_per_seq, il); + ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il); // compute Q and K and RoPE them ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -5104,6 +5154,11 @@ struct llm_build_deci : public llm_graph_context { inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } + // FFN-free layer of Llama-3_1-Nemotron-Ultra-253B + if (n_ff == 0) { + continue; + } + // For Granite architecture if (hparams.f_residual_scale) { cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); @@ -7530,7 +7585,7 @@ struct llm_build_phi3 : public llm_graph_context { // self-attention { // rope freq factors for 128k context - ggml_tensor * rope_factors = static_cast(memory)->cbs.get_rope_factors(n_ctx_per_seq, il); + ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il); ggml_tensor* attn_norm_output = build_norm(inpL, model.layers[il].attn_norm, @@ -8282,7 +8337,7 @@ struct llm_build_minicpm3 : public llm_graph_context { for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; - ggml_tensor * rope_factors = static_cast(memory)->cbs.get_rope_factors(n_ctx_per_seq, il); + ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il); // norm cur = build_norm(inpL, @@ -9049,7 +9104,7 @@ struct llm_build_mamba : public llm_graph_context { ggml_tensor * state_mask, const llama_ubatch & ubatch, int il) const { - const llama_kv_cache_unified * kv_self = static_cast(memory); + const llama_kv_cache_recurrent * kv_self = static_cast(memory); const auto kv_head = kv_self->head; @@ -9350,7 +9405,7 @@ struct llm_build_cohere2 : public llm_graph_context { // self-attention { // rope freq factors for 128k context - ggml_tensor * rope_factors = static_cast(memory)->cbs.get_rope_factors(n_ctx_per_seq, il); + ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il); // compute Q and K and RoPE them ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -10288,7 +10343,7 @@ struct 
llm_build_deepseek : public llm_graph_context { // self-attention { // rope freq factors for llama3; may return nullptr for llama2 and other models - ggml_tensor * rope_factors = static_cast(memory)->cbs.get_rope_factors(n_ctx_per_seq, il); + ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il); // compute Q and K and RoPE them ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -11652,7 +11707,7 @@ struct llm_build_exaone : public llm_graph_context { // self-attention { // rope freq factors for llama3; may return nullptr for llama2 and other models - ggml_tensor * rope_factors = static_cast(memory)->cbs.get_rope_factors(n_ctx_per_seq, il); + ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il); // compute Q and K and RoPE them ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -11797,7 +11852,7 @@ struct llm_build_rwkv6_base : public llm_graph_context { ggml_tensor * state_mask, const llama_ubatch & ubatch, int il) const { - const llama_kv_cache_unified * kv_self = static_cast(memory); + const llama_kv_cache_recurrent * kv_self = static_cast(memory); const auto n_tokens = ubatch.n_tokens; const auto n_seqs = ubatch.n_seqs; @@ -12193,7 +12248,7 @@ struct llm_build_rwkv7_base : public llm_graph_context { ggml_tensor *& first_layer_value, const llama_ubatch & ubatch, int il) const { - const llama_kv_cache_unified * kv_self = static_cast(memory); + const llama_kv_cache_recurrent * kv_self = static_cast(memory); const auto n_tokens = ubatch.n_tokens; const auto n_seqs = ubatch.n_seqs; @@ -12741,7 +12796,7 @@ struct llm_build_solar : public llm_graph_context { // self-attention { // rope freq factors for llama3; may return nullptr for llama2 and other models - ggml_tensor * rope_factors = static_cast(memory)->cbs.get_rope_factors(n_ctx_per_seq, il); + ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il); // compute Q and K and RoPE them ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -13192,7 +13247,7 @@ struct llm_build_bailingmoe : public llm_graph_context { // self-attention { // rope freq factors for llama3; may return nullptr for llama2 and other models - ggml_tensor * rope_factors = static_cast(memory)->cbs.get_rope_factors(n_ctx_per_seq, il); + ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il); // compute Q and K and RoPE them ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -13312,36 +13367,46 @@ struct llm_build_bailingmoe : public llm_graph_context { } }; -llama_memory_i * llama_model::create_memory() const { +llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const { llama_memory_i * res; switch (arch) { + case LLM_ARCH_BERT: + case LLM_ARCH_JINA_BERT_V2: + case LLM_ARCH_NOMIC_BERT: + case LLM_ARCH_NOMIC_BERT_MOE: + { + res = nullptr; + } break; case LLM_ARCH_MAMBA: case LLM_ARCH_RWKV6: case LLM_ARCH_RWKV6QWEN2: case LLM_ARCH_RWKV7: case LLM_ARCH_ARWKV7: { - res = new llama_kv_cache_unified(hparams, { - /*.get_rope_factors =*/ nullptr - }); + res = new llama_kv_cache_recurrent( + *this, + GGML_TYPE_F32, + GGML_TYPE_F32, + cparams.offload_kqv, + std::max((uint32_t) 1, cparams.n_seq_max)); } break; default: { - res = new llama_kv_cache_unified(hparams, { - /*.get_rope_factors =*/ [this](uint32_t n_ctx_per_seq, int il) { - // choose long/short freq factors based on the context size - if (layers[il].rope_freqs != nullptr) { - return layers[il].rope_freqs; - } + const auto padding = 
llama_kv_cache_unified::get_padding(cparams); - if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) { - return layers[il].rope_long; - } + cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding); - return layers[il].rope_short; - } - }); + LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx); + + res = new llama_kv_cache_unified( + *this, + params.type_k, + params.type_v, + !cparams.flash_attn, + cparams.offload_kqv, + cparams.n_ctx, + padding); } } @@ -13732,8 +13797,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_DECI: case LLM_ARCH_BAICHUAN: case LLM_ARCH_STARCODER: - case LLM_ARCH_PLAMO: - case LLM_ARCH_ORION: case LLM_ARCH_INTERNLM2: case LLM_ARCH_MINICPM: case LLM_ARCH_XVERSE: @@ -13772,6 +13835,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_PHI2: case LLM_ARCH_PHI3: case LLM_ARCH_PHIMOE: + case LLM_ARCH_PLAMO: case LLM_ARCH_GEMMA: case LLM_ARCH_GEMMA2: case LLM_ARCH_GEMMA3: @@ -13779,6 +13843,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_OPENELM: case LLM_ARCH_GPTNEOX: case LLM_ARCH_CODESHELL: + case LLM_ARCH_ORION: case LLM_ARCH_NEMOTRON: case LLM_ARCH_EXAONE: case LLM_ARCH_MINICPM3: @@ -13851,6 +13916,14 @@ const char * llama_model_chat_template(const llama_model * model, const char * n : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE); const auto & it = model->gguf_kv.find(key); if (it == model->gguf_kv.end()) { + // one-off fix for very popular models (so we are not flooded with issues) + // do not extend this list unless absolutely necessary + // Mistral-Small-2503 does not have built-in chat template + llama_vocab_pre_type pre_type = model->vocab.get_pre_type(); + if (pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) { + return "mistral-v7-tekken"; + } + return nullptr; } diff --git a/llama/llama.cpp/src/llama-model.h b/llama/llama.cpp/src/llama-model.h index 6be91282..9281e629 100644 --- a/llama/llama.cpp/src/llama-model.h +++ b/llama/llama.cpp/src/llama-model.h @@ -37,6 +37,7 @@ enum llm_type { LLM_TYPE_335M, LLM_TYPE_410M, LLM_TYPE_450M, + LLM_TYPE_475M, LLM_TYPE_770M, LLM_TYPE_780M, LLM_TYPE_0_5B, @@ -78,6 +79,7 @@ enum llm_type { LLM_TYPE_236B, LLM_TYPE_290B, LLM_TYPE_314B, + LLM_TYPE_405B, LLM_TYPE_671B, LLM_TYPE_SMALL, LLM_TYPE_MEDIUM, @@ -97,6 +99,8 @@ enum llm_type { LLM_TYPE_235B_A22B, }; +std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type); + struct llama_layer_posnet { // resnet struct ggml_tensor * norm1 = nullptr; @@ -409,8 +413,11 @@ struct llama_model { const struct ggml_tensor * get_tensor(const char * name) const; + ggml_tensor * get_rope_factors(uint32_t n_ctx_per_seq, int il) const; + + // note: can mutate `cparams` // TODO: move this to new llm_arch_model_i interface - llama_memory_i * create_memory() const; // TODO: params + llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const; // TODO: move this to new llm_arch_model_i interface llm_graph_result_ptr build_graph( diff --git a/llama/llama.cpp/src/llama-quant.cpp b/llama/llama.cpp/src/llama-quant.cpp index 223e1f3f..56531980 100644 --- a/llama/llama.cpp/src/llama-quant.cpp +++ b/llama/llama.cpp/src/llama-quant.cpp @@ -519,7 +519,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: nthread = std::thread::hardware_concurrency(); } - // mmap consistently increases speed Linux, and also increases speed on Windows with + // mmap consistently increases speed on 
Linux, and also increases speed on Windows with // hot cache. It may cause a slowdown on macOS, possibly related to free memory. #if defined(__linux__) || defined(_WIN32) constexpr bool use_mmap = true; @@ -529,7 +529,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: llama_model_kv_override * kv_overrides = nullptr; if (params->kv_overrides) { - auto v = (std::vector*)params->kv_overrides; + auto * v = (std::vector*)params->kv_overrides; kv_overrides = v->data(); } diff --git a/llama/llama.cpp/src/llama-sampling.cpp b/llama/llama.cpp/src/llama-sampling.cpp index 75731053..15a10ca8 100644 --- a/llama/llama.cpp/src/llama-sampling.cpp +++ b/llama/llama.cpp/src/llama-sampling.cpp @@ -1750,23 +1750,35 @@ static const char * llama_sampler_top_n_sigma_name(const struct llama_sampler * static void llama_sampler_top_n_sigma_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { const auto * ctx = (llama_sampler_top_n_sigma *) smpl->ctx; + if (ctx->n <= 0.0f || cur_p->size <= 1) { + return; + } + // find max logit and calculate mean float max = cur_p->data[0].logit; float logits_sum = 0; + size_t valid_count = 0; for (size_t i = 0; i < cur_p->size; ++i) { - if (cur_p->data[i].logit > max) { - max = cur_p->data[i].logit; + // Only count non-negative infinity values + if (cur_p->data[i].logit != -INFINITY) { + if (cur_p->data[i].logit > max) { + max = cur_p->data[i].logit; + } + logits_sum += cur_p->data[i].logit; + valid_count++; } - logits_sum += cur_p->data[i].logit; } - float mean = logits_sum/cur_p->size; + float mean = valid_count > 0 ? logits_sum/valid_count : 0; // calculate standard deviation float acc = 0; for (size_t i = 0; i < cur_p->size; ++i) { - acc += pow(cur_p->data[i].logit - mean, 2); + // Skip -infinity in std calculation + if (cur_p->data[i].logit != -INFINITY) { + acc += pow(cur_p->data[i].logit - mean, 2); + } } - float std = sqrt(acc/cur_p->size); + float std = valid_count > 0 ? 
sqrt(acc/valid_count) : 0; //apply mask for (size_t i = 0; i < cur_p->size; ++i) { diff --git a/llama/llama.cpp/src/llama-vocab.cpp b/llama/llama.cpp/src/llama-vocab.cpp index d6515ff6..b098bb25 100644 --- a/llama/llama.cpp/src/llama-vocab.cpp +++ b/llama/llama.cpp/src/llama-vocab.cpp @@ -1,5 +1,7 @@ #include "llama-vocab.h" +#include "ggml.h" +#include "gguf.h" #include "llama-impl.h" #include "llama-model-loader.h" @@ -415,6 +417,13 @@ struct llm_tokenizer_bpe : llm_tokenizer { "'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+", }; break; + case LLAMA_VOCAB_PRE_TYPE_SEED_CODER: + regex_exprs = { + // original regex from tokenizer.json + // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\r\n]+|\\s*[\r\n]+|\\s+(?!\\S)|\\s+" + "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", + }; + break; default: // default regex for BPE tokenization pre-processing regex_exprs = { @@ -1227,6 +1236,9 @@ struct fragment_buffer_variant { struct llama_vocab::impl { uint32_t n_token_types = 0; // for BERT-style token types + std::string tokenizer_model; + std::string tokenizer_pre; + enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM; enum llama_vocab_pre_type pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT; @@ -1362,9 +1374,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { // determine vocab type { - std::string tokenizer_model; - std::string tokenizer_pre; - ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model); ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false); @@ -1459,7 +1468,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str()); if (precompiled_charsmap_keyidx != -1) { - size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx); + const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx); + GGML_ASSERT(pc_type == GGUF_TYPE_INT8 || pc_type == GGUF_TYPE_UINT8); + + const size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx); const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx); precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap); #ifdef IS_BIG_ENDIAN @@ -1625,6 +1637,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { tokenizer_pre == "bailingmoe") { pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE; clean_spaces = false; + } else if ( + tokenizer_pre == "seed-coder") { + pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER; + clean_spaces = false; } else { LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__); pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT; @@ -2770,6 +2786,14 @@ void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) { pimpl->load(ml, kv); } +std::string llama_vocab::get_tokenizer_model() const { + return pimpl->tokenizer_model; +} + +std::string llama_vocab::get_tokenizer_pre() const { + return pimpl->tokenizer_pre; +} + enum llama_vocab_type llama_vocab::get_type() const { return pimpl->type; } @@ -2992,6 +3016,20 @@ int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string return it->second; } +std::vector llama_vocab::get_bpe_merges() const { + std::vector result(pimpl->bpe_ranks.size()); + 
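+    // bpe_ranks maps each (left, right) token pair to its merge rank; the loop
+    // below inverts that map so that result[rank] == "left right", recovering
+    // the ordered merge list that llama_model_saver writes out via
+    // add_kv(LLM_KV_TOKENIZER_MERGES, ...).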
+ for (const auto & pair : pimpl->bpe_ranks) { + result[pair.second] = pair.first.first + " " + pair.first.second; + } + + return result; +} + +std::vector llama_vocab::get_precompiled_charsmap() const { + return pimpl->precompiled_charsmap; +} + int32_t llama_vocab::tokenize( const char * text, int32_t text_len, diff --git a/llama/llama.cpp/src/llama-vocab.h b/llama/llama.cpp/src/llama-vocab.h index 5ce35521..daa6cf30 100644 --- a/llama/llama.cpp/src/llama-vocab.h +++ b/llama/llama.cpp/src/llama-vocab.h @@ -21,6 +21,9 @@ struct llama_vocab { void load(llama_model_loader & ml, const LLM_KV & kv); + std::string get_tokenizer_model() const; + std::string get_tokenizer_pre() const; + enum llama_vocab_type get_type() const; enum llama_vocab_pre_type get_pre_type() const; @@ -80,6 +83,9 @@ struct llama_vocab { int max_token_len() const; int find_bpe_rank(const std::string & token_left, const std::string & token_right) const; + std::vector get_bpe_merges() const; + + std::vector get_precompiled_charsmap() const; int32_t tokenize( const char * text, diff --git a/llama/llama.cpp/src/llama.cpp b/llama/llama.cpp/src/llama.cpp index d5164720..9fdddf7b 100644 --- a/llama/llama.cpp/src/llama.cpp +++ b/llama/llama.cpp/src/llama.cpp @@ -4,6 +4,7 @@ #include "llama-mmap.h" #include "llama-vocab.h" #include "llama-model-loader.h" +#include "llama-model-saver.h" #include "llama-model.h" #include "ggml.h" @@ -253,6 +254,13 @@ struct llama_model * llama_model_load_from_splits( return llama_model_load_from_file_impl(splits.front(), splits, params); } +void llama_model_save_to_file(const struct llama_model * model, const char * path_model) { + llama_model_saver ms(*model); + ms.add_kv_from_model(); + ms.add_tensors_from_model(); + ms.save(path_model); +} + // // chat templates // @@ -338,3 +346,4 @@ const char * llama_print_system_info(void) { return s.c_str(); } + diff --git a/llama/llama.cpp/examples/llava/clip-impl.h b/llama/llama.cpp/tools/mtmd/clip-impl.h similarity index 90% rename from llama/llama.cpp/examples/llava/clip-impl.h rename to llama/llama.cpp/tools/mtmd/clip-impl.h index 66cb21ef..23036ba7 100644 --- a/llama/llama.cpp/examples/llava/clip-impl.h +++ b/llama/llama.cpp/tools/mtmd/clip-impl.h @@ -31,9 +31,7 @@ #define KEY_FEATURE_LAYER "clip.vision.feature_layer" #define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor" #define KEY_PROJ_TYPE "clip.projector_type" - -#define KEY_USE_GLU_MLP "clip.use_glu_mlp" // for qwen2.5vl -#define KEY_USE_RMS_NORM "clip.use_rms_norm" // for qwen2.5vl +#define KEY_SPATIAL_MERGE_SIZE "clip.vision.spatial_merge_size" #define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type" #define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints" @@ -55,12 +53,16 @@ #define TN_ATTN_Q "%s.blk.%d.attn_q.%s" #define TN_ATTN_V "%s.blk.%d.attn_v.%s" #define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s" +#define TN_ATTN_K_NORM "%s.blk.%d.attn_k_norm.%s" +#define TN_ATTN_Q_NORM "%s.blk.%d.attn_q_norm.%s" #define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s" #define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s" #define TN_FFN_UP "%s.blk.%d.ffn_up.%s" #define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s" -#define TN_LN_1 "%s.blk.%d.ln1.%s" -#define TN_LN_2 "%s.blk.%d.ln2.%s" +#define TN_LN_1 "%s.blk.%d.ln1.%s" // layer norm +#define TN_LN_2 "%s.blk.%d.ln2.%s" // layer norm +#define TN_LS_1 "%s.blk.%d.ls1.%s" // layer scale +#define TN_LS_2 "%s.blk.%d.ls2.%s" // layer scale #define TN_LN_PRE "%s.pre_ln.%s" #define TN_LN_POST "%s.post_ln.%s" #define TN_LLAVA_PROJ "mm.%d.%s" @@ -68,10 +70,14 @@ 
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s" #define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s" #define TN_IMAGE_NEWLINE "model.image_newline" +#define TN_MM_INP_NORM "mm.input_norm.weight" #define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3 #define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3 #define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3 +#define TN_MM_PATCH_MERGER "mm.patch_merger.weight" // mistral small 3.1 #define TN_TOK_IMG_BREAK "v.token_embd.img_break" // pixtral +#define TN_TOK_GLM_BOI "adapter.boi" // glm-edge (these embeddings are not in text model) +#define TN_TOK_GLM_EOI "adapter.eoi" // glm-edge (these embeddings are not in text model) // mimicpmv #define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k" @@ -88,6 +94,9 @@ #define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s" #define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s" +// align x to upper multiple of n +#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n)) + enum projector_type { PROJECTOR_TYPE_MLP, PROJECTOR_TYPE_MLP_NORM, @@ -100,6 +109,7 @@ enum projector_type { PROJECTOR_TYPE_IDEFICS3, PROJECTOR_TYPE_PIXTRAL, PROJECTOR_TYPE_QWEN25VL, + PROJECTOR_TYPE_INTERNVL, PROJECTOR_TYPE_UNKNOWN, }; @@ -114,6 +124,7 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_GEMMA3, "gemma3"}, { PROJECTOR_TYPE_IDEFICS3, "idefics3"}, { PROJECTOR_TYPE_PIXTRAL, "pixtral"}, + { PROJECTOR_TYPE_INTERNVL, "internvl"}, }; static projector_type clip_projector_type_from_string(const std::string & str) { @@ -228,6 +239,15 @@ struct clip_image_u8_batch { struct clip_image_f32_batch { std::vector entries; + + clip_image_f32_batch clone() const { + clip_image_f32_batch new_batch; + new_batch.entries.reserve(entries.size()); + for (const auto & entry : entries) { + new_batch.entries.emplace_back(new clip_image_f32(*entry)); + } + return new_batch; + } }; // diff --git a/llama/llama.cpp/examples/llava/clip.cpp b/llama/llama.cpp/tools/mtmd/clip.cpp similarity index 58% rename from llama/llama.cpp/examples/llava/clip.cpp rename to llama/llama.cpp/tools/mtmd/clip.cpp index b3218c78..cdd8ca44 100644 --- a/llama/llama.cpp/examples/llava/clip.cpp +++ b/llama/llama.cpp/tools/mtmd/clip.cpp @@ -29,6 +29,7 @@ #include #include #include +#include #if defined(_WIN32) #define WIN32_LEAN_AND_MEAN @@ -45,6 +46,17 @@ struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL}; +enum ffn_op_type { + FFN_GELU, + FFN_SILU, + FFN_GELU_QUICK, +}; + +enum norm_type { + NORM_TYPE_NORMAL, + NORM_TYPE_RMS, +}; + //#define CLIP_DEBUG_FUNCTIONS #ifdef CLIP_DEBUG_FUNCTIONS @@ -168,13 +180,19 @@ enum patch_merge_type { struct clip_hparams { int32_t image_size; int32_t patch_size; - int32_t hidden_size; - int32_t n_intermediate; + int32_t n_embd; + int32_t n_ff; int32_t projection_dim; int32_t n_head; int32_t n_layer; int32_t proj_scale_factor = 0; // idefics3 + // for models using dynamic image size, we need to have a smaller image size to warmup + // otherwise, user will get OOM everytime they load the model + int32_t warmup_image_size = 0; + + ffn_op_type ffn_op = FFN_GELU; + patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT; float eps = 1e-6; @@ -185,145 +203,148 @@ struct clip_hparams { std::unordered_set vision_feature_layer; int32_t attn_window_size = 0; int32_t n_wa_pattern = 0; + int32_t spatial_merge_size = 0; }; struct clip_layer { // attention - struct ggml_tensor * k_w = nullptr; - struct ggml_tensor * k_b = nullptr; - struct ggml_tensor * q_w = nullptr; - struct 
ggml_tensor * q_b = nullptr; - struct ggml_tensor * v_w = nullptr; - struct ggml_tensor * v_b = nullptr; + ggml_tensor * k_w = nullptr; + ggml_tensor * k_b = nullptr; + ggml_tensor * q_w = nullptr; + ggml_tensor * q_b = nullptr; + ggml_tensor * v_w = nullptr; + ggml_tensor * v_b = nullptr; - struct ggml_tensor * o_w = nullptr; - struct ggml_tensor * o_b = nullptr; + ggml_tensor * o_w = nullptr; + ggml_tensor * o_b = nullptr; + + ggml_tensor * k_norm = nullptr; + ggml_tensor * q_norm = nullptr; // layernorm 1 - struct ggml_tensor * ln_1_w = nullptr; - struct ggml_tensor * ln_1_b = nullptr; + ggml_tensor * ln_1_w = nullptr; + ggml_tensor * ln_1_b = nullptr; - // ff - struct ggml_tensor * ff_i_w = nullptr; // legacy naming - struct ggml_tensor * ff_i_b = nullptr; // legacy naming - struct ggml_tensor * ff_o_w = nullptr; // legacy naming - struct ggml_tensor * ff_o_b = nullptr; // legacy naming - - struct ggml_tensor * ff_up_w = nullptr; - struct ggml_tensor * ff_up_b = nullptr; - struct ggml_tensor * ff_gate_w = nullptr; - struct ggml_tensor * ff_gate_b = nullptr; - struct ggml_tensor * ff_down_w = nullptr; - struct ggml_tensor * ff_down_b = nullptr; - - struct ggml_tensor * ff_g_w = NULL; - struct ggml_tensor * ff_g_b = NULL; + ggml_tensor * ff_up_w = nullptr; + ggml_tensor * ff_up_b = nullptr; + ggml_tensor * ff_gate_w = nullptr; + ggml_tensor * ff_gate_b = nullptr; + ggml_tensor * ff_down_w = nullptr; + ggml_tensor * ff_down_b = nullptr; // layernorm 2 - struct ggml_tensor * ln_2_w = nullptr; - struct ggml_tensor * ln_2_b = nullptr; + ggml_tensor * ln_2_w = nullptr; + ggml_tensor * ln_2_b = nullptr; + + // layer scale (no bias) + ggml_tensor * ls_1_w = nullptr; + ggml_tensor * ls_2_w = nullptr; }; struct clip_vision_model { struct clip_hparams hparams; // embeddings - struct ggml_tensor * class_embedding = nullptr; - struct ggml_tensor * patch_embeddings_0 = nullptr; - struct ggml_tensor * patch_embeddings_1 = nullptr; // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL) - struct ggml_tensor * patch_bias = nullptr; - struct ggml_tensor * position_embeddings = nullptr; + ggml_tensor * class_embedding = nullptr; + ggml_tensor * patch_embeddings_0 = nullptr; + ggml_tensor * patch_embeddings_1 = nullptr; // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL) + ggml_tensor * patch_bias = nullptr; + ggml_tensor * position_embeddings = nullptr; - struct ggml_tensor * pre_ln_w = nullptr; - struct ggml_tensor * pre_ln_b = nullptr; + ggml_tensor * pre_ln_w = nullptr; + ggml_tensor * pre_ln_b = nullptr; std::vector layers; - struct ggml_tensor * post_ln_w; - struct ggml_tensor * post_ln_b; + ggml_tensor * post_ln_w; + ggml_tensor * post_ln_b; - struct ggml_tensor * projection; + ggml_tensor * projection; // LLaVA projection - struct ggml_tensor * mm_0_w = nullptr; - struct ggml_tensor * mm_0_b = nullptr; - struct ggml_tensor * mm_2_w = nullptr; - struct ggml_tensor * mm_2_b = nullptr; + ggml_tensor * mm_input_norm_w = nullptr; + ggml_tensor * mm_0_w = nullptr; + ggml_tensor * mm_0_b = nullptr; + ggml_tensor * mm_2_w = nullptr; + ggml_tensor * mm_2_b = nullptr; - struct ggml_tensor * image_newline = nullptr; + ggml_tensor * image_newline = nullptr; // Yi type models with mlp+normalization projection - struct ggml_tensor * mm_1_w = nullptr; // Yi type models have 0, 1, 3, 4 - struct ggml_tensor * mm_1_b = nullptr; - struct ggml_tensor * mm_3_w = nullptr; - struct ggml_tensor * mm_3_b = nullptr; - struct ggml_tensor * mm_4_w = nullptr; - struct 
ggml_tensor * mm_4_b = nullptr; + ggml_tensor * mm_1_w = nullptr; // Yi type models have 0, 1, 3, 4 + ggml_tensor * mm_1_b = nullptr; + ggml_tensor * mm_3_w = nullptr; + ggml_tensor * mm_3_b = nullptr; + ggml_tensor * mm_4_w = nullptr; + ggml_tensor * mm_4_b = nullptr; - //GLMV-Edge projection - struct ggml_tensor * mm_model_adapter_conv_w = nullptr; - struct ggml_tensor * mm_model_adapter_conv_b = nullptr; + // GLMV-Edge projection + ggml_tensor * mm_model_adapter_conv_w = nullptr; + ggml_tensor * mm_model_adapter_conv_b = nullptr; + ggml_tensor * mm_glm_tok_boi = nullptr; + ggml_tensor * mm_glm_tok_eoi = nullptr; // MobileVLM projection - struct ggml_tensor * mm_model_mlp_1_w = nullptr; - struct ggml_tensor * mm_model_mlp_1_b = nullptr; - struct ggml_tensor * mm_model_mlp_3_w = nullptr; - struct ggml_tensor * mm_model_mlp_3_b = nullptr; - struct ggml_tensor * mm_model_block_1_block_0_0_w = nullptr; - struct ggml_tensor * mm_model_block_1_block_0_1_w = nullptr; - struct ggml_tensor * mm_model_block_1_block_0_1_b = nullptr; - struct ggml_tensor * mm_model_block_1_block_1_fc1_w = nullptr; - struct ggml_tensor * mm_model_block_1_block_1_fc1_b = nullptr; - struct ggml_tensor * mm_model_block_1_block_1_fc2_w = nullptr; - struct ggml_tensor * mm_model_block_1_block_1_fc2_b = nullptr; - struct ggml_tensor * mm_model_block_1_block_2_0_w = nullptr; - struct ggml_tensor * mm_model_block_1_block_2_1_w = nullptr; - struct ggml_tensor * mm_model_block_1_block_2_1_b = nullptr; - struct ggml_tensor * mm_model_block_2_block_0_0_w = nullptr; - struct ggml_tensor * mm_model_block_2_block_0_1_w = nullptr; - struct ggml_tensor * mm_model_block_2_block_0_1_b = nullptr; - struct ggml_tensor * mm_model_block_2_block_1_fc1_w = nullptr; - struct ggml_tensor * mm_model_block_2_block_1_fc1_b = nullptr; - struct ggml_tensor * mm_model_block_2_block_1_fc2_w = nullptr; - struct ggml_tensor * mm_model_block_2_block_1_fc2_b = nullptr; - struct ggml_tensor * mm_model_block_2_block_2_0_w = nullptr; - struct ggml_tensor * mm_model_block_2_block_2_1_w = nullptr; - struct ggml_tensor * mm_model_block_2_block_2_1_b = nullptr; + ggml_tensor * mm_model_mlp_1_w = nullptr; + ggml_tensor * mm_model_mlp_1_b = nullptr; + ggml_tensor * mm_model_mlp_3_w = nullptr; + ggml_tensor * mm_model_mlp_3_b = nullptr; + ggml_tensor * mm_model_block_1_block_0_0_w = nullptr; + ggml_tensor * mm_model_block_1_block_0_1_w = nullptr; + ggml_tensor * mm_model_block_1_block_0_1_b = nullptr; + ggml_tensor * mm_model_block_1_block_1_fc1_w = nullptr; + ggml_tensor * mm_model_block_1_block_1_fc1_b = nullptr; + ggml_tensor * mm_model_block_1_block_1_fc2_w = nullptr; + ggml_tensor * mm_model_block_1_block_1_fc2_b = nullptr; + ggml_tensor * mm_model_block_1_block_2_0_w = nullptr; + ggml_tensor * mm_model_block_1_block_2_1_w = nullptr; + ggml_tensor * mm_model_block_1_block_2_1_b = nullptr; + ggml_tensor * mm_model_block_2_block_0_0_w = nullptr; + ggml_tensor * mm_model_block_2_block_0_1_w = nullptr; + ggml_tensor * mm_model_block_2_block_0_1_b = nullptr; + ggml_tensor * mm_model_block_2_block_1_fc1_w = nullptr; + ggml_tensor * mm_model_block_2_block_1_fc1_b = nullptr; + ggml_tensor * mm_model_block_2_block_1_fc2_w = nullptr; + ggml_tensor * mm_model_block_2_block_1_fc2_b = nullptr; + ggml_tensor * mm_model_block_2_block_2_0_w = nullptr; + ggml_tensor * mm_model_block_2_block_2_1_w = nullptr; + ggml_tensor * mm_model_block_2_block_2_1_b = nullptr; // MobileVLM_V2 projection - struct ggml_tensor * mm_model_mlp_0_w = nullptr; - struct ggml_tensor * 
mm_model_mlp_0_b = nullptr; - struct ggml_tensor * mm_model_mlp_2_w = nullptr; - struct ggml_tensor * mm_model_mlp_2_b = nullptr; - struct ggml_tensor * mm_model_peg_0_w = nullptr; - struct ggml_tensor * mm_model_peg_0_b = nullptr; + ggml_tensor * mm_model_mlp_0_w = nullptr; + ggml_tensor * mm_model_mlp_0_b = nullptr; + ggml_tensor * mm_model_mlp_2_w = nullptr; + ggml_tensor * mm_model_mlp_2_b = nullptr; + ggml_tensor * mm_model_peg_0_w = nullptr; + ggml_tensor * mm_model_peg_0_b = nullptr; // MINICPMV projection - struct ggml_tensor * mm_model_pos_embed_k = nullptr; - struct ggml_tensor * mm_model_query = nullptr; - struct ggml_tensor * mm_model_proj = nullptr; - struct ggml_tensor * mm_model_kv_proj = nullptr; - struct ggml_tensor * mm_model_attn_q_w = nullptr; - struct ggml_tensor * mm_model_attn_q_b = nullptr; - struct ggml_tensor * mm_model_attn_k_w = nullptr; - struct ggml_tensor * mm_model_attn_k_b = nullptr; - struct ggml_tensor * mm_model_attn_v_w = nullptr; - struct ggml_tensor * mm_model_attn_v_b = nullptr; - struct ggml_tensor * mm_model_attn_o_w = nullptr; - struct ggml_tensor * mm_model_attn_o_b = nullptr; - struct ggml_tensor * mm_model_ln_q_w = nullptr; - struct ggml_tensor * mm_model_ln_q_b = nullptr; - struct ggml_tensor * mm_model_ln_kv_w = nullptr; - struct ggml_tensor * mm_model_ln_kv_b = nullptr; - struct ggml_tensor * mm_model_ln_post_w = nullptr; - struct ggml_tensor * mm_model_ln_post_b = nullptr; + ggml_tensor * mm_model_pos_embed_k = nullptr; + ggml_tensor * mm_model_query = nullptr; + ggml_tensor * mm_model_proj = nullptr; + ggml_tensor * mm_model_kv_proj = nullptr; + ggml_tensor * mm_model_attn_q_w = nullptr; + ggml_tensor * mm_model_attn_q_b = nullptr; + ggml_tensor * mm_model_attn_k_w = nullptr; + ggml_tensor * mm_model_attn_k_b = nullptr; + ggml_tensor * mm_model_attn_v_w = nullptr; + ggml_tensor * mm_model_attn_v_b = nullptr; + ggml_tensor * mm_model_attn_o_w = nullptr; + ggml_tensor * mm_model_attn_o_b = nullptr; + ggml_tensor * mm_model_ln_q_w = nullptr; + ggml_tensor * mm_model_ln_q_b = nullptr; + ggml_tensor * mm_model_ln_kv_w = nullptr; + ggml_tensor * mm_model_ln_kv_b = nullptr; + ggml_tensor * mm_model_ln_post_w = nullptr; + ggml_tensor * mm_model_ln_post_b = nullptr; // gemma3 - struct ggml_tensor * mm_input_proj_w = nullptr; - struct ggml_tensor * mm_soft_emb_norm_w = nullptr; + ggml_tensor * mm_input_proj_w = nullptr; + ggml_tensor * mm_soft_emb_norm_w = nullptr; // pixtral - struct ggml_tensor * token_embd_img_break = nullptr; + ggml_tensor * token_embd_img_break = nullptr; + ggml_tensor * mm_patch_merger_w = nullptr; }; struct clip_ctx { @@ -333,11 +354,8 @@ struct clip_ctx { struct clip_vision_model vision_model; projector_type proj_type = PROJECTOR_TYPE_MLP; - int32_t max_feature_layer; // unused in newer models like gemma3 float image_mean[3]; float image_std[3]; - bool use_gelu = false; - bool use_silu = false; gguf_context_ptr ctx_gguf; ggml_context_ptr ctx_data; @@ -358,9 +376,12 @@ struct clip_ctx { clip_ctx(clip_context_params & ctx_params) { backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); - backend = ctx_params.use_gpu - ? ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr) - : nullptr; + if (!backend_cpu) { + throw std::runtime_error("failed to initialize CPU backend"); + } + backend = ctx_params.use_gpu + ? 
ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr) + : nullptr; if (backend) { LOG_INF("%s: CLIP using %s backend\n", __func__, ggml_backend_name(backend)); @@ -375,7 +396,7 @@ struct clip_ctx { backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu)); sched.reset( - ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false) + ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false, true) ); } @@ -387,1154 +408,362 @@ struct clip_ctx { } }; -static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_image_f32 & img) { - const auto & model = ctx->vision_model; - const auto & hparams = model.hparams; +struct clip_graph { + clip_ctx * ctx; + const clip_vision_model & model; + const clip_hparams & hparams; - int image_size_width = img.nx; - int image_size_height = img.ny; + // we only support single image per batch + const clip_image_f32 & img; - const int patch_size = hparams.patch_size; - const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); - const int hidden_size = hparams.hidden_size; - const int n_head = hparams.n_head; - const int d_head = hidden_size / n_head; - const int n_layer = hparams.n_layer; - const float eps = hparams.eps; + const int patch_size; + const int n_patches_x; + const int n_patches_y; + const int n_patches; + const int n_embd; + const int n_head; + const int d_head; + const int n_layer; + const float eps; + const float kq_scale; - struct ggml_init_params params = { - /*.mem_size =*/ ctx->buf_compute_meta.size(), - /*.mem_buffer =*/ ctx->buf_compute_meta.data(), - /*.no_alloc =*/ true, - }; + ggml_context_ptr ctx0_ptr; + ggml_context * ctx0; + ggml_cgraph * gf; - ggml_context_ptr ctx0_ptr(ggml_init(params)); - auto ctx0 = ctx0_ptr.get(); + clip_graph(clip_ctx * ctx, const clip_image_f32 & img) : + ctx(ctx), + model(ctx->vision_model), + hparams(model.hparams), + img(img), + patch_size(hparams.patch_size), + n_patches_x(img.nx / patch_size), + n_patches_y(img.ny / patch_size), + n_patches(n_patches_x * n_patches_y), + n_embd(hparams.n_embd), + n_head(hparams.n_head), + d_head(n_embd / n_head), + n_layer(hparams.n_layer), + eps(hparams.eps), + kq_scale(1.0f / sqrtf((float)d_head)) { + struct ggml_init_params params = { + /*.mem_size =*/ ctx->buf_compute_meta.size(), + /*.mem_buffer =*/ ctx->buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + ctx0_ptr.reset(ggml_init(params)); + ctx0 = ctx0_ptr.get(); + gf = ggml_new_graph(ctx0); + } - struct ggml_cgraph * gf = ggml_new_graph(ctx0); + ggml_cgraph * build_siglip() { + ggml_tensor * inp = build_inp(); + ggml_tensor * cur = build_vit( + inp, n_patches, + NORM_TYPE_NORMAL, + hparams.ffn_op, + model.position_embeddings, + nullptr); - // input raw - struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3); - ggml_set_name(inp_raw, "inp_raw"); - ggml_set_input(inp_raw); + if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { + const int batch_size = 1; + GGML_ASSERT(n_patches_x == n_patches_y); + const int patches_per_image = n_patches_x; + const int kernel_size = hparams.proj_scale_factor; - struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - inp = ggml_reshape_2d(ctx0, inp, num_patches, hidden_size); - inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp)); - inp = ggml_add(ctx0, inp, model.patch_bias); + cur = ggml_cont(ctx0, 
ggml_transpose(ctx0, cur)); + cur = ggml_reshape_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size); - // position embeddings - struct ggml_tensor * embeddings = ggml_add(ctx0, inp, model.position_embeddings); + // doing a pool2d to reduce the number of output tokens + cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0); + cur = ggml_reshape_3d(ctx0, cur, cur->ne[0] * cur->ne[0], n_embd, batch_size); + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); - // loop over layers - for (int il = 0; il < n_layer; il++) { - struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states + // apply norm before projection + cur = ggml_rms_norm(ctx0, cur, eps); + cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w); - // layernorm1 - { - cur = ggml_norm(ctx0, cur, eps); - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_1_w), model.layers[il].ln_1_b); + // apply projection + cur = ggml_mul_mat(ctx0, + ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)), + cur); + + } else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) { + // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578 + + const int scale_factor = model.hparams.proj_scale_factor; + const int n_embd = cur->ne[0]; + const int seq = cur->ne[1]; + const int bsz = 1; // batch size, always 1 for now since we don't support batching + const int height = std::sqrt(seq); + const int width = std::sqrt(seq); + GGML_ASSERT(scale_factor != 0); + cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height, bsz); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur), + n_embd * scale_factor * scale_factor, + height / scale_factor, + width / scale_factor, + bsz); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + cur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, cur), + n_embd * scale_factor * scale_factor, + seq / (scale_factor * scale_factor), + bsz); + + cur = ggml_mul_mat(ctx0, model.projection, cur); + } else { + GGML_ABORT("SigLIP: Unsupported projector type"); } - // self-attention - { + // build the graph + ggml_build_forward_expand(gf, cur); - struct ggml_tensor * Q = - ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b); + return gf; + } - Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_patches); - Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); + ggml_cgraph * build_pixtral() { + const int n_merge = hparams.spatial_merge_size; - struct ggml_tensor * K = - ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b); + // 2D input positions + ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); + ggml_set_name(pos_h, "pos_h"); + ggml_set_input(pos_h); - K = ggml_reshape_3d(ctx0, K, d_head, n_head, num_patches); - K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); + ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); + ggml_set_name(pos_w, "pos_w"); + ggml_set_input(pos_w); - struct ggml_tensor * V = - ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].v_w, cur), model.layers[il].v_b); + auto add_pos = [&](ggml_tensor * cur, const clip_layer &) { + return build_rope_2d(ctx0, cur, pos_h, pos_w, hparams.rope_theta); + }; - V = ggml_reshape_3d(ctx0, V, d_head, n_head, num_patches); - V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); + ggml_tensor * inp = build_inp(); + 
ggml_tensor * cur = build_vit( + inp, n_patches, + NORM_TYPE_RMS, + hparams.ffn_op, + nullptr, // no learned pos embd + add_pos); - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f); + // mistral small 3.1 patch merger + // ref: https://github.com/huggingface/transformers/blob/7a3e208892c06a5e278144eaf38c8599a42f53e7/src/transformers/models/mistral3/modeling_mistral3.py#L67 + if (model.mm_patch_merger_w) { + GGML_ASSERT(hparams.spatial_merge_size > 0); - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); - KQV = ggml_reshape_3d(ctx0, KQV, d_head, num_patches, n_head); - KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.mm_input_norm_w); - cur = ggml_cont_2d(ctx0, KQV, hidden_size, num_patches); + // reshape image tokens to 2D grid + cur = ggml_reshape_3d(ctx0, cur, n_embd, n_patches_x, n_patches_y); + cur = ggml_permute(ctx0, cur, 2, 0, 1, 3); // [x, y, n_embd] + cur = ggml_cont(ctx0, cur); + + // torch.nn.functional.unfold is just an im2col under the hood + // we just need a dummy kernel to make it work + ggml_tensor * kernel = ggml_view_3d(ctx0, cur, n_merge, n_merge, cur->ne[2], 0, 0, 0); + cur = ggml_im2col(ctx0, kernel, cur, n_merge, n_merge, 0, 0, 1, 1, true, inp->type); + + // project to n_embd + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]); + cur = ggml_mul_mat(ctx0, model.mm_patch_merger_w, cur); } - // attention output - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].o_w, cur), model.layers[il].o_b); - - // re-add the layer input, e.g., residual - cur = ggml_add(ctx0, cur, embeddings); - - embeddings = cur; // embeddings = residual, cur = hidden_states - - // layernorm2 + // LlavaMultiModalProjector (always using GELU activation) { - cur = ggml_norm(ctx0, cur, eps); - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_2_w), model.layers[il].ln_2_b); + cur = ggml_mul_mat(ctx0, model.mm_1_w, cur); + if (model.mm_1_b) { + cur = ggml_add(ctx0, cur, model.mm_1_b); + } + + cur = ggml_gelu(ctx0, cur); + cur = ggml_mul_mat(ctx0, model.mm_2_w, cur); + if (model.mm_2_b) { + cur = ggml_add(ctx0, cur, model.mm_2_b); + } } - cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur); - cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b); - - // siglip uses gelu - cur = ggml_gelu(ctx0, cur); - - cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur); - cur = ggml_add(ctx0, cur, model.layers[il].ff_o_b); - - // residual 2 - cur = ggml_add(ctx0, embeddings, cur); - - embeddings = cur; - } - - // post-layernorm - if (model.post_ln_w) { - embeddings = ggml_norm(ctx0, embeddings, eps); - ggml_set_name(embeddings, "post_ln"); - - embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b); - } - - if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { - const int batch_size = 1; - const int mm_tokens_per_image = 256; // default value for gemma3 - const int tokens_per_side = sqrt(mm_tokens_per_image); - const int patches_per_image = sqrt(num_patches); - const int kernel_size = patches_per_image / tokens_per_side; - - embeddings = ggml_cont(ctx0, ggml_transpose(ctx0, embeddings)); - embeddings = ggml_reshape_4d(ctx0, embeddings, patches_per_image, patches_per_image, hidden_size, batch_size); - - // doing a pool2d to reduce the number of output tokens to 256 - embeddings = ggml_pool_2d(ctx0, embeddings, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0); - embeddings = 
ggml_reshape_3d(ctx0, embeddings, embeddings->ne[0] * embeddings->ne[0], hidden_size, batch_size); - embeddings = ggml_cont(ctx0, ggml_transpose(ctx0, embeddings)); - - // apply norm before projection - embeddings = ggml_rms_norm(ctx0, embeddings, eps); - embeddings = ggml_mul(ctx0, embeddings, model.mm_soft_emb_norm_w); - - // apply projection - embeddings = ggml_mul_mat(ctx0, - ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)), - embeddings); - - } else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) { - // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578 - - ggml_tensor * cur = embeddings; - const int scale_factor = model.hparams.proj_scale_factor; - const int n_embd = cur->ne[0]; - const int seq = cur->ne[1]; - const int bsz = 1; // batch size, always 1 for now since we don't support batching - const int height = std::sqrt(seq); - const int width = std::sqrt(seq); - GGML_ASSERT(scale_factor != 0); - cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height, bsz); - cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); - cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur), - n_embd * scale_factor * scale_factor, - height / scale_factor, - width / scale_factor, - bsz); - cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); - cur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, cur), - n_embd * scale_factor * scale_factor, - seq / (scale_factor * scale_factor), - bsz); - - cur = ggml_mul_mat(ctx0, model.projection, cur); - embeddings = cur; - } else { - GGML_ABORT("SigLIP: Unsupported projector type"); - } - - // build the graph - ggml_build_forward_expand(gf, embeddings); - - return gf; -} - -// implementation of the 2D RoPE without adding a new op in ggml -// this is not efficient (use double the memory), but works on all backends -// TODO: there was a more efficient which relies on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors ; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065 -static ggml_tensor * build_rope_2d( - ggml_context * ctx0, - ggml_tensor * cur, - ggml_tensor * pos_h, - ggml_tensor * pos_w, - const float freq_base -) { - const int64_t n_dim = cur->ne[0]; - const int64_t n_head = cur->ne[1]; - const int64_t n_pos = cur->ne[2]; - - // for example, if we have cur tensor of shape (n_dim=8, n_head, n_pos) - // we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3 - // first half of cur will use 1e-0, 1e-2 (even) - // second half of cur will use 1e-1, 1e-3 (odd) - // the trick here is to rotate just half of n_dim, so inv_freq will automatically be even - // ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2) - // then for the second half, we use freq_scale to shift the inv_freq - // ^ why? 
replace (2i) with (2i+1) in the above equation - const float freq_scale_odd = std::pow(freq_base, (float)-2/n_dim); - - // first half - ggml_tensor * first; - { - first = ggml_view_3d(ctx0, cur, - n_dim/2, n_head, n_pos, - ggml_row_size(cur->type, n_dim), - ggml_row_size(cur->type, n_dim*n_head), - 0); - first = ggml_rope_ext( - ctx0, - first, - pos_h, // positions - nullptr, // freq factors - n_dim/2, // n_dims - 0, 0, freq_base, - 1.0f, 0.0f, 1.0f, 0.0f, 0.0f - ); - } - - // second half - ggml_tensor * second; - { - second = ggml_view_3d(ctx0, cur, - n_dim/2, n_head, n_pos, - ggml_row_size(cur->type, n_dim), - ggml_row_size(cur->type, n_dim*n_head), - n_dim/2 * ggml_element_size(cur)); - second = ggml_cont(ctx0, second); // copy, because ggml_rope don't play well with non-contiguous tensors - second = ggml_rope_ext( - ctx0, - second, - pos_w, // positions - nullptr, // freq factors - n_dim/2, // n_dims - 0, 0, freq_base, - freq_scale_odd, - 0.0f, 1.0f, 0.0f, 0.0f - ); - } - - cur = ggml_concat(ctx0, first, second, 0); - return cur; -} - -static ggml_cgraph * clip_image_build_graph_pixtral(clip_ctx * ctx, const clip_image_f32 & img) { - const auto & model = ctx->vision_model; - const auto & hparams = model.hparams; - - GGML_ASSERT(ctx->proj_type == PROJECTOR_TYPE_PIXTRAL); - - int image_size_width = img.nx; - int image_size_height = img.ny; - - const int patch_size = hparams.patch_size; - const int n_patches_x = image_size_width / patch_size; - const int n_patches_y = image_size_height / patch_size; - const int num_patches = n_patches_x * n_patches_y; - const int hidden_size = hparams.hidden_size; - const int n_head = hparams.n_head; - const int d_head = hidden_size / n_head; - const int n_layer = hparams.n_layer; - const float eps = hparams.eps; - - struct ggml_init_params params = { - /*.mem_size =*/ ctx->buf_compute_meta.size(), - /*.mem_buffer =*/ ctx->buf_compute_meta.data(), - /*.no_alloc =*/ true, - }; - - ggml_context_ptr ctx0_ptr(ggml_init(params)); - auto ctx0 = ctx0_ptr.get(); - - struct ggml_cgraph * gf = ggml_new_graph(ctx0); - - // input raw - struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3); - ggml_set_name(inp_raw, "inp_raw"); - ggml_set_input(inp_raw); - - // 2D input positions - struct ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches); - ggml_set_name(pos_h, "pos_h"); - ggml_set_input(pos_h); - struct ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches); - ggml_set_name(pos_w, "pos_w"); - ggml_set_input(pos_w); - - struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - inp = ggml_reshape_2d(ctx0, inp, num_patches, hidden_size); - inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp)); - - struct ggml_tensor * embeddings = inp; - - // pre-layer norm - embeddings = ggml_mul(ctx0, ggml_rms_norm(ctx0, embeddings, eps), model.pre_ln_w); - - // loop over layers - for (int il = 0; il < n_layer; il++) { - struct ggml_tensor * cur = embeddings; - - // pre-attention norm - cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.layers[il].ln_1_w); - - // self-attention + // arrangement of the [IMG_BREAK] token { - struct ggml_tensor * Q = ggml_mul_mat(ctx0, model.layers[il].q_w, cur); + // not efficient, but works + // the trick is to view the embeddings as a 3D tensor with shape [n_embd, n_patches_per_row, n_rows] + // and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row 
dimension + // after the concatenation, we have a tensor with shape [n_embd, n_patches_per_row + 1, n_rows] - Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_patches); - Q = build_rope_2d(ctx0, Q, pos_h, pos_w, hparams.rope_theta); - Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); + const int p_y = n_merge > 0 ? n_patches_y / n_merge : n_patches_y; + const int p_x = n_merge > 0 ? n_patches_x / n_merge : n_patches_x; + const int p_total = p_x * p_y; + const int n_embd_text = cur->ne[0]; + const int n_tokens_output = p_total + p_y - 1; // one [IMG_BREAK] per row, except the last row - struct ggml_tensor * K = ggml_mul_mat(ctx0, model.layers[il].k_w, cur); - - K = ggml_reshape_3d(ctx0, K, d_head, n_head, num_patches); - K = build_rope_2d(ctx0, K, pos_h, pos_w, hparams.rope_theta); - K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); - - struct ggml_tensor * V = ggml_mul_mat(ctx0, model.layers[il].v_w, cur); - - V = ggml_reshape_3d(ctx0, V, d_head, n_head, num_patches); - V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); - - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f); - - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); - KQV = ggml_reshape_3d(ctx0, KQV, d_head, num_patches, n_head); - KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - - cur = ggml_cont_2d(ctx0, KQV, hidden_size, num_patches); - - cur = ggml_mul_mat(ctx0, model.layers[il].o_w, cur); + ggml_tensor * tmp = ggml_reshape_3d(ctx0, cur, n_embd_text, p_x, p_y); + ggml_tensor * tok = ggml_new_tensor_3d(ctx0, tmp->type, n_embd_text, 1, p_y); + tok = ggml_scale(ctx0, tok, 0.0); // clear the tensor + tok = ggml_add(ctx0, tok, model.token_embd_img_break); + tmp = ggml_concat(ctx0, tmp, tok, 1); + cur = ggml_view_2d(ctx0, tmp, + n_embd_text, n_tokens_output, + ggml_row_size(tmp->type, n_embd_text), 0); } - // re-add the layer input, e.g., residual - cur = ggml_add(ctx0, cur, embeddings); + // build the graph + ggml_build_forward_expand(gf, cur); - embeddings = cur; // embeddings = residual, cur = hidden_states + return gf; + } - // pre-ffn norm - cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.layers[il].ln_2_w); + // Qwen2VL and Qwen2.5VL use M-RoPE + ggml_cgraph * build_qwen2vl() { + GGML_ASSERT(model.patch_bias == nullptr); + GGML_ASSERT(model.class_embedding == nullptr); - // feed-forward + const int batch_size = 1; + const bool use_window_attn = hparams.n_wa_pattern > 0; + const int n_wa_pattern = hparams.n_wa_pattern; + const int n_pos = n_patches; + const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position + + norm_type norm_t = ctx->proj_type == PROJECTOR_TYPE_QWEN25VL + ? 
NORM_TYPE_RMS // qwen 2.5 vl + : NORM_TYPE_NORMAL; // qwen 2 vl + + int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; + + ggml_tensor * inp_raw = build_inp_raw(); + ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + + GGML_ASSERT(img.nx % (patch_size * 2) == 0); + GGML_ASSERT(img.ny % (patch_size * 2) == 0); + + // second conv dimension { - ggml_tensor * gate_proj = ggml_mul_mat(ctx0, model.layers[il].ff_gate_w, cur); - ggml_tensor * up_proj = ggml_mul_mat(ctx0, model.layers[il].ff_up_w, cur); - gate_proj = ggml_silu(ctx0, gate_proj); // pixtral uses silu - cur = ggml_mul(ctx0, up_proj, gate_proj); - cur = ggml_mul_mat(ctx0, model.layers[il].ff_down_w, cur); + auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + inp = ggml_add(ctx0, inp, inp_1); + + inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b] + inp = ggml_reshape_4d( + ctx0, inp, + n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); + inp = ggml_reshape_4d( + ctx0, inp, + n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2)); + inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3)); + inp = ggml_reshape_3d( + ctx0, inp, + n_embd, n_patches_x * n_patches_y, batch_size); } - // residual 2 - cur = ggml_add(ctx0, embeddings, cur); + ggml_tensor * inpL = inp; + ggml_tensor * window_mask = nullptr; + ggml_tensor * window_idx = nullptr; + ggml_tensor * inv_window_idx = nullptr; - embeddings = cur; - } + ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); - // LlavaMultiModalProjector (with GELU activation) - { - embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_1_b); + // pre-layernorm + if (model.pre_ln_w) { + inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1); + } - embeddings = ggml_gelu(ctx0, embeddings); - embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); - } + if (use_window_attn) { + // handle window attention inputs + inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4); + ggml_set_name(inv_window_idx, "inv_window_idx"); + ggml_set_input(inv_window_idx); + // mask for window attention + window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos); + ggml_set_name(window_mask, "window_mask"); + ggml_set_input(window_mask); - // arrangement of the [IMG_BREAK] token - { - // not efficient, but works - // the trick is to view the embeddings as a 3D tensor with shape [hidden_size, n_patches_per_row, n_rows] - // and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension - // after the concatenation, we have a tensor with shape [hidden_size, n_patches_per_row + 1, n_rows] + // inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size] + GGML_ASSERT(batch_size == 1); + inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4); + inpL = ggml_get_rows(ctx0, inpL, inv_window_idx); + inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_patches_x * n_patches_y, batch_size); + } - const int n_embd_text = embeddings->ne[0]; - const int n_tokens_output = num_patches + n_patches_y - 1; // one [IMG_BREAK] per row, except the last row - - ggml_tensor * cur = ggml_reshape_3d(ctx0, embeddings, n_embd_text, n_patches_x, n_patches_y); 
- ggml_tensor * tok = ggml_new_tensor_3d(ctx0, embeddings->type, n_embd_text, 1, n_patches_y); - tok = ggml_scale(ctx0, tok, 0.0); // clear the tensor - tok = ggml_add(ctx0, tok, model.token_embd_img_break); - cur = ggml_concat(ctx0, cur, tok, 1); - embeddings = ggml_view_2d(ctx0, cur, - n_embd_text, n_tokens_output, - ggml_row_size(cur->type, n_embd_text), 0); - } - - // build the graph - ggml_build_forward_expand(gf, embeddings); - - return gf; -} - -static ggml_cgraph * clip_image_build_graph_qwen25vl(clip_ctx * ctx, const clip_image_f32_batch & imgs) { - const auto & model = ctx->vision_model; - const auto & hparams = model.hparams; - - const int image_size_width = imgs.entries[0]->nx; - const int image_size_height = imgs.entries[0]->ny; - - const bool use_window_attn = hparams.n_wa_pattern > 0; - - const int n_wa_pattern = hparams.n_wa_pattern; - const int patch_size = hparams.patch_size; - const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); - const int patches_w = image_size_width / patch_size; - const int patches_h = image_size_height / patch_size; - const int num_positions = num_patches + (model.class_embedding ? 1 : 0); - const int num_position_ids = num_positions * 4; // m-rope requires 4 dim per position - const int hidden_size = hparams.hidden_size; - const int n_head = hparams.n_head; - const int d_head = hidden_size / n_head; - const int n_layer = hparams.n_layer; - const float eps = hparams.eps; - - int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; - - const int batch_size = imgs.entries.size(); - GGML_ASSERT(batch_size == 1); - - struct ggml_init_params params = { - /*.mem_size =*/ ctx->buf_compute_meta.size(), - /*.mem_buffer =*/ ctx->buf_compute_meta.data(), - /*.no_alloc =*/ true, - }; - - ggml_context_ptr ctx0_ptr(ggml_init(params)); - auto ctx0 = ctx0_ptr.get(); - - struct ggml_cgraph * gf = ggml_new_graph(ctx0); - - struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3, batch_size); - ggml_set_name(inp_raw, "inp_raw"); - ggml_set_input(inp_raw); - - struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - - GGML_ASSERT(image_size_width % (patch_size * 2) == 0); - GGML_ASSERT(image_size_height % (patch_size * 2) == 0); - - auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - inp = ggml_add(ctx0, inp, inp_1); - - inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b] - inp = ggml_reshape_4d( - ctx0, inp, - hidden_size * 2, patches_w / 2, patches_h, batch_size); - inp = ggml_reshape_4d( - ctx0, inp, - hidden_size * 2, patches_w / 2, 2, batch_size * (patches_h / 2)); - inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3)); - inp = ggml_reshape_3d( - ctx0, inp, - hidden_size, patches_w * patches_h, batch_size); - - if (model.patch_bias) { - // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp)); - inp = ggml_add(ctx0, inp, model.patch_bias); - } - struct ggml_tensor * embeddings = inp; - struct ggml_tensor * window_mask = nullptr; - struct ggml_tensor * window_idx = nullptr; - struct ggml_tensor * inv_window_idx = nullptr; - - struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids); - ggml_set_name(positions, "positions"); - ggml_set_input(positions); - - // pre-layernorm - if (model.pre_ln_w) { - embeddings = ggml_rms_norm(ctx0, embeddings, eps); - 
ggml_set_name(embeddings, "pre_ln"); - - embeddings = ggml_mul(ctx0, embeddings, model.pre_ln_w); - } - - if (use_window_attn) { - // handle window attention inputs - inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions / 4); - ggml_set_name(inv_window_idx, "inv_window_idx"); - ggml_set_input(inv_window_idx); - // mask for window attention - window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, num_positions, num_positions); - ggml_set_name(window_mask, "window_mask"); - ggml_set_input(window_mask); - - // embeddings shape: [hidden_size, patches_w * patches_h, batch_size] - GGML_ASSERT(batch_size == 1); - embeddings = ggml_reshape_2d(ctx0, embeddings, hidden_size * 4, patches_w * patches_h * batch_size / 4); - embeddings = ggml_get_rows(ctx0, embeddings, inv_window_idx); - embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, patches_w * patches_h, batch_size); - } - - // loop over layers - for (int il = 0; il < n_layer; il++) { - struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states - - // rmsnorm1 - cur = ggml_rms_norm(ctx0, cur, eps); - cur = ggml_mul(ctx0, cur, model.layers[il].ln_1_w); - - // self-attention - { - - struct ggml_tensor * Q = - ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b); - - Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size); - Q = ggml_rope_multi( - ctx0, Q, positions, nullptr, - d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); - Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); - Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size); - - struct ggml_tensor * K = - ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b); - - K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); - K = ggml_rope_multi( - ctx0, K, positions, nullptr, - d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); - K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); - K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); - - struct ggml_tensor * V = - ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].v_w, cur), model.layers[il].v_b); - - V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size); - V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); - V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size); - - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + // loop over layers + for (int il = 0; il < n_layer; il++) { + auto & layer = model.layers[il]; const bool full_attn = use_window_attn ? 
(il + 1) % n_wa_pattern == 0 : true; - if (full_attn) { - KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f); - } else { - KQ = ggml_soft_max_ext(ctx0, KQ, window_mask, 1.0f / sqrtf((float)d_head), 0.0f); - } - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); - KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size); - KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states - cur = ggml_cont_3d(ctx0, KQV, hidden_size, num_positions, batch_size); - } + // layernorm1 + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); + cb(cur, "ln1", il); - // attention output - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].o_w, cur), model.layers[il].o_b); - - // re-add the layer input, e.g., residual - cur = ggml_add(ctx0, cur, embeddings); - - embeddings = cur; // embeddings = residual, cur = hidden_states - - // rms norm2 - cur = ggml_rms_norm(ctx0, cur, eps); - cur = ggml_mul(ctx0, cur, model.layers[il].ln_2_w); - - // mlp - // ffn_up - auto cur_up = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur); - cur_up = ggml_add(ctx0, cur_up, model.layers[il].ff_o_b); - - auto cur_gate = ggml_mul_mat(ctx0, model.layers[il].ff_g_w, cur); - cur_gate = ggml_add(ctx0, cur_gate, model.layers[il].ff_g_b); - // TODO : only 2 of these 3 are actually used, should we remove one of them? - if (ctx->use_gelu) { - cur_gate = ggml_gelu_inplace(ctx0, cur_gate); - } else if (ctx->use_silu) { - cur_gate = ggml_silu_inplace(ctx0, cur_gate); - } else { - cur_gate = ggml_gelu_quick_inplace(ctx0, cur_gate); - } - cur = ggml_mul(ctx0, cur_gate, cur_up); - - // ffn_down - cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur); - cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b); - - // residual 2 - cur = ggml_add(ctx0, embeddings, cur); - - embeddings = cur; - } - - // post-layernorm - if (model.post_ln_w) { - embeddings = ggml_rms_norm(ctx0, embeddings, eps); - ggml_set_name(embeddings, "post_ln"); - - embeddings = ggml_mul(ctx0, embeddings, model.post_ln_w); - } - - embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size); - - embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); - - // GELU activation - embeddings = ggml_gelu(ctx0, embeddings); - - // Second linear layer - embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_1_b); - - if (use_window_attn) { - window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions / 4); - ggml_set_name(window_idx, "window_idx"); - ggml_set_input(window_idx); - - // embeddings shape: [hidden_size, patches_w * patches_h, batch_size] - GGML_ASSERT(batch_size == 1); - embeddings = ggml_reshape_2d(ctx0, embeddings, hparams.projection_dim, patches_w * patches_h / 4); - embeddings = ggml_get_rows(ctx0, embeddings, window_idx); - embeddings = ggml_reshape_3d(ctx0, embeddings, hparams.projection_dim, patches_w * patches_h / 4, batch_size); - } - - // build the graph - ggml_build_forward_expand(gf, embeddings); - - return gf; -} - -static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_image_f32_batch & imgs, struct clip_image_size load_image_size, bool is_inf = false) { - const auto & model = ctx->vision_model; - const auto & hparams = model.hparams; - - const int image_size = hparams.image_size; - int image_size_width = image_size; - int image_size_height = image_size; - - if 
(ctx->proj_type == PROJECTOR_TYPE_MINICPMV) { - LOG_DBG("%s: %d %d\n", __func__, load_image_size.width, load_image_size.height); - image_size_width = load_image_size.width; - image_size_height = load_image_size.height; - if (is_inf) { - image_size_width = imgs.entries[0]->nx; - image_size_height = imgs.entries[0]->ny; - } - } - - else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) { - // use the image's native resolution when image is avaible - if (is_inf) { - // if (imgs->data->nx && imgs->data->ny) { - image_size_width = imgs.entries[0]->nx; - image_size_height = imgs.entries[0]->ny; - } - } - - const int patch_size = hparams.patch_size; - const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); - const int patches_w = image_size_width / patch_size; - const int patches_h = image_size_height / patch_size; - const int num_positions = num_patches + (model.class_embedding ? 1 : 0); - const int num_position_ids = ctx->proj_type == PROJECTOR_TYPE_QWEN2VL ? num_positions * 4 : num_positions; - const int hidden_size = hparams.hidden_size; - const int n_head = hparams.n_head; - const int d_head = hidden_size / n_head; - const float eps = hparams.eps; - int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; - - const int batch_size = imgs.entries.size(); - - if (ctx->has_llava_projector - || ctx->proj_type == PROJECTOR_TYPE_MINICPMV - || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) { - GGML_ASSERT(batch_size == 1); - } - - struct ggml_init_params params = { - /*.mem_size =*/ ctx->buf_compute_meta.size(), - /*.mem_buffer =*/ ctx->buf_compute_meta.data(), - /*.no_alloc =*/ true, - }; - - ggml_context_ptr ctx0_ptr(ggml_init(params)); - auto ctx0 = ctx0_ptr.get(); - - struct ggml_cgraph * gf = ggml_new_graph(ctx0); - - struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3, batch_size); - ggml_set_name(inp_raw, "inp_raw"); - ggml_set_input(inp_raw); - - struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - - if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) { - GGML_ASSERT(image_size_width % (patch_size * 2) == 0); - GGML_ASSERT(image_size_height % (patch_size * 2) == 0); - - auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - inp = ggml_add(ctx0, inp, inp_1); - inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b] - inp = ggml_reshape_4d( - ctx0, inp, - hidden_size * 2, patches_w / 2, patches_h, batch_size); - inp = ggml_reshape_4d( - ctx0, inp, - hidden_size * 2, patches_w / 2, 2, batch_size * (patches_h / 2)); - inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3)); - inp = ggml_reshape_3d( - ctx0, inp, - hidden_size, patches_w * patches_h, batch_size); - } - else { - inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size); - inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); - } - - if (model.patch_bias) { - // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp)); - inp = ggml_add(ctx0, inp, model.patch_bias); - } - struct ggml_tensor * embeddings = inp; - struct ggml_tensor * pos_embed = nullptr; - - // concat class_embeddings and patch_embeddings - if (model.class_embedding) { - embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); - embeddings = ggml_scale(ctx0, embeddings, 0.0f); // set to all zeros - embeddings = ggml_acc(ctx0, embeddings, 
model.class_embedding, - embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); - embeddings = ggml_acc(ctx0, embeddings, inp, - embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]); - } - - struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids); - ggml_set_name(positions, "positions"); - ggml_set_input(positions); - - if (ctx->proj_type != PROJECTOR_TYPE_QWEN2VL) { // qwen2vl does NOT use learned position embeddings - embeddings = - ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions)); - } - - if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) { - int pos_w = image_size_width/patch_size; - int pos_h = image_size_height/patch_size; - int n_output_dim = clip_n_mmproj_embd(ctx); - pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_output_dim, pos_w * pos_h, 1); - ggml_set_name(pos_embed, "pos_embed"); - ggml_set_input(pos_embed); - } - - // pre-layernorm - if (model.pre_ln_w) { - embeddings = ggml_norm(ctx0, embeddings, eps); - ggml_set_name(embeddings, "pre_ln"); - - embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b); - } - - std::vector embedding_stack; - const auto & vision_feature_layer = hparams.vision_feature_layer; - - // loop over layers - for (int il = 0; il < ctx->max_feature_layer; il++) { - struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states - - // If this is an embedding feature layer, save the output. - // NOTE: 0 index here refers to the input to the encoder. - if (vision_feature_layer.find(il) != vision_feature_layer.end()) { - embedding_stack.push_back(embeddings); - } - - //const size_t nb_q_w = model.layers[il].q_w->nb[0]; - - // layernorm1 - { - cur = ggml_norm(ctx0, cur, eps); - - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_1_w), - model.layers[il].ln_1_b); - } - - // self-attention - { - - struct ggml_tensor * Q = - ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b); - - Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size); - if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) { - Q = ggml_rope_multi( - ctx0, Q, positions, nullptr, - d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); - } - Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); - Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size); - - struct ggml_tensor * K = - ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b); - - K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); - if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) { - K = ggml_rope_multi( - ctx0, K, positions, nullptr, - d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); - } - K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); - K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); - - struct ggml_tensor * V = - ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].v_w, cur), model.layers[il].v_b); - - V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size); - V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); - V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size); - - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f); - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); - KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, 
batch_size); - KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - - cur = ggml_cont_3d(ctx0, KQV, hidden_size, num_positions, batch_size); - } - - // attention output - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].o_w, cur), model.layers[il].o_b); - - // re-add the layer input, e.g., residual - cur = ggml_add(ctx0, cur, embeddings); - - embeddings = cur; // embeddings = residual, cur = hidden_states - - // layernorm2 - { - cur = ggml_norm(ctx0, cur, eps); - - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_2_w), model.layers[il].ln_2_b); - } - - cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur); - cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b); - - if (ctx->use_gelu) { - cur = ggml_gelu_inplace(ctx0, cur); - } else if (ctx->use_silu) { - cur = ggml_silu_inplace(ctx0, cur); - } else { - cur = ggml_gelu_quick_inplace(ctx0, cur); - } - - cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur); - cur = ggml_add(ctx0, cur, model.layers[il].ff_o_b); - - // residual 2 - cur = ggml_add(ctx0, embeddings, cur); - - embeddings = cur; - } - - // post-layernorm - if (model.post_ln_w) { - embeddings = ggml_norm(ctx0, embeddings, eps); - ggml_set_name(embeddings, "post_ln"); - - embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b); - } - - // final layer is a vision feature layer - if (vision_feature_layer.find(ctx->max_feature_layer) != vision_feature_layer.end()) { - embedding_stack.push_back(embeddings); - } - - // If feature layers are explicitly set, stack them (if we have multiple) - if (!embedding_stack.empty()) { - embeddings = embedding_stack[0]; - for (size_t i = 1; i < embedding_stack.size(); i++) { - embeddings = ggml_concat(ctx0, embeddings, embedding_stack[i], 0); - } - } - - // llava projector - if (ctx->has_llava_projector) { - embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]); - - struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches); - ggml_set_name(patches, "patches"); - ggml_set_input(patches); - - // shape [1, 576, 1024] - // ne is whcn, ne = [1024, 576, 1, 1] - embeddings = ggml_get_rows(ctx0, embeddings, patches); - - // print_tensor_info(embeddings, "embeddings"); - - // llava projector - if (ctx->proj_type == PROJECTOR_TYPE_MLP) { - embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); - - embeddings = ggml_gelu(ctx0, embeddings); - if (model.mm_2_w) { - embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); - } - } - else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) { - embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); - // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false); - // First LayerNorm - embeddings = ggml_norm(ctx0, embeddings, eps); - embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w), - model.mm_1_b); - - // GELU activation - embeddings = ggml_gelu(ctx0, embeddings); - - // Second linear layer - embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_3_b); - - // Second LayerNorm - embeddings = ggml_norm(ctx0, embeddings, eps); - embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w), - model.mm_4_b); - } - else if (ctx->proj_type == PROJECTOR_TYPE_LDP) { - // MobileVLM projector - int n_patch = 24; - struct ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, 
model.mm_model_mlp_1_w, embeddings); - mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b); - mlp_1 = ggml_gelu(ctx0, mlp_1); - struct ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1); - mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b); - // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1] - - // block 1 - struct ggml_tensor * block_1 = nullptr; + // self-attention { - // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24] - mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3)); - mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]); - // stride = 1, padding = 1, bias is nullptr - block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1); + ggml_tensor * Qcur = ggml_add(ctx0, + ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b); + ggml_tensor * Kcur = ggml_add(ctx0, + ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b); + ggml_tensor * Vcur = ggml_add(ctx0, + ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b); - // layer norm - // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] - block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3)); - // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1] - block_1 = ggml_norm(ctx0, block_1, eps); - block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b); - block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); + Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches); + Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches); + Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches); - // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] - // hardswish - struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1); + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); - block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0); - // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] - // pointwise conv - block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]); - block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1); - block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b); - block_1 = ggml_relu(ctx0, block_1); - block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1); - block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b); - block_1 = ggml_hardsigmoid(ctx0, block_1); - // block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1] - block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); - block_1 = ggml_mul(ctx0, block_1_hw, block_1); + // apply M-RoPE + Qcur = ggml_rope_multi( + ctx0, Qcur, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); + Kcur = ggml_rope_multi( + ctx0, Kcur, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); - int w = block_1->ne[0], h = block_1->ne[1]; - block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]); - block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3)); + cb(Qcur, "Qcur_rope", il); + cb(Kcur, "Kcur_rope", il); - // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1] - block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, 
block_1); - block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]); + ggml_tensor * attn_mask = full_attn ? nullptr : window_mask; - // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1] - block_1 = ggml_norm(ctx0, block_1, eps); - block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b); - block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); - // block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] - // residual - block_1 = ggml_add(ctx0, mlp_3, block_1); + cur = build_attn(layer.o_w, layer.o_b, + Qcur, Kcur, Vcur, attn_mask, kq_scale, il); + cb(cur, "attn_out", il); } - // block_2 - { - // stride = 2 - block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1); + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, inpL); - // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] - // layer norm - block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3)); - // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] - block_1 = ggml_norm(ctx0, block_1, eps); - block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b); - block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); - // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] - // hardswish - struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1); + inpL = cur; // inpL = residual, cur = hidden_states - // not sure the parameters is right for globalAvgPooling - block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0); - // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] - // pointwise conv - block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]); - block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1); - block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b); - block_1 = ggml_relu(ctx0, block_1); - block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1); - block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b); - block_1 = ggml_hardsigmoid(ctx0, block_1); + cb(cur, "ffn_inp", il); - // block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] - block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); - block_1 = ggml_mul(ctx0, block_1_hw, block_1); + // layernorm2 + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); + cb(cur, "ffn_inp_normed", il); - int w = block_1->ne[0], h = block_1->ne[1]; - block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]); - block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3)); - // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1] - block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1); - block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]); + // ffn + cur = build_ffn(cur, + layer.ff_up_w, layer.ff_up_b, + layer.ff_gate_w, layer.ff_gate_b, + layer.ff_down_w, layer.ff_down_b, + hparams.ffn_op, il); + cb(cur, "ffn_out", il); - // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] - block_1 = ggml_norm(ctx0, block_1, eps); - block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), 
model.mm_model_block_2_block_2_1_b); - block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]); - // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1] - } - embeddings = block_1; - } - else if (ctx->proj_type == PROJECTOR_TYPE_LDPV2) - { - int n_patch = 24; - struct ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); - mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b); - mlp_0 = ggml_gelu(ctx0, mlp_0); - struct ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0); - mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b); - // mlp_2 ne = [2048, 576, 1, 1] - // // AVG Pool Layer 2*2, strides = 2 - mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 0, 2, 3)); - // mlp_2 ne = [576, 2048, 1, 1] - mlp_2 = ggml_reshape_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]); - // mlp_2 ne [24, 24, 2048, 1] - mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0); - // weight ne = [3, 3, 2048, 1] - struct ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1); - peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3)); - peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b); - mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3)); - peg_0 = ggml_add(ctx0, peg_0, mlp_2); - peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]); - embeddings = peg_0; - } - else { - GGML_ABORT("fatal error"); - } - } - // minicpmv projector - else if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) { - struct ggml_tensor * q = model.mm_model_query; - { // layernorm - q = ggml_norm(ctx0, q, eps); - q = ggml_add(ctx0, ggml_mul(ctx0, q, model.mm_model_ln_q_w), model.mm_model_ln_q_b); - } - struct ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings); - { // layernorm - v = ggml_norm(ctx0, v, eps); - v = ggml_add(ctx0, ggml_mul(ctx0, v, model.mm_model_ln_kv_w), model.mm_model_ln_kv_b); - } - struct ggml_tensor * k; - { // position - // q = ggml_add(ctx0, q, model.mm_model_pos_embed); - k = ggml_add(ctx0, v, pos_embed); + // residual 2 + cur = ggml_add(ctx0, inpL, cur); + cb(cur, "layer_out", il); + + inpL = cur; } - { // attention - int hidden_size = clip_n_mmproj_embd(ctx); - const int d_head = 128; - int n_head = hidden_size/d_head; - int num_query = 96; - if (ctx->minicpmv_version == 2) { - num_query = 96; - } - else if (ctx->minicpmv_version == 3) { - num_query = 64; - } - else if (ctx->minicpmv_version == 4) { - num_query = 64; - } - - struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b); - struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b); - struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b); - // permute - Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size); - Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); - Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size); - K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); - K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); - K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); - V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size); - V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); - V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * 
batch_size); - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f); - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); - KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size); - KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size); - - embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b); + // post-layernorm + if (model.post_ln_w) { + inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer); } - { // layernorm - embeddings = ggml_norm(ctx0, embeddings, eps); - embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_post_w), model.mm_model_ln_post_b); - } - embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings); - } - // glm projector - else if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) { - size_t gridsz = (size_t)sqrt(embeddings->ne[1]); - embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings,1,0,2,3)); - embeddings = ggml_reshape_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]); - embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1); - embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size); - embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3)); - embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b); - // GLU - { - embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); - embeddings = ggml_norm(ctx0, embeddings, eps); - embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b); - embeddings = ggml_gelu_inplace(ctx0, embeddings); - struct ggml_tensor * x = embeddings; - embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings); - x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x); - embeddings = ggml_silu_inplace(ctx0, embeddings); - embeddings = ggml_mul(ctx0, embeddings,x); - embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings); - } - } - - else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) { - embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size); + // multimodal projection + ggml_tensor * embeddings = inpL; + embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size); embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); @@ -1545,36 +774,971 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im // Second linear layer embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings); embeddings = ggml_add(ctx0, embeddings, model.mm_1_b); + + if (use_window_attn) { + window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4); + ggml_set_name(window_idx, "window_idx"); + ggml_set_input(window_idx); + + // embeddings shape: [n_embd, n_patches_x * n_patches_y, batch_size] + GGML_ASSERT(batch_size == 1); + embeddings = ggml_reshape_2d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4); + embeddings = ggml_get_rows(ctx0, embeddings, window_idx); + embeddings = ggml_reshape_3d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4, batch_size); + } + + // build the graph + ggml_build_forward_expand(gf, embeddings); + + return gf; } - // build the graph - ggml_build_forward_expand(gf, embeddings); + ggml_cgraph * 
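
Aside (illustrative sketch, not part of the patch): the Qwen2-VL projection above folds every four patch embeddings into one token before the two-layer MLP, reinterpreting the ViT output as [n_embd * 4, n_pos / 4]. The snippet below shows that reinterpretation on a plain buffer; it assumes the four merged patches are consecutive in memory, which is what the reshape relies on.

    // toy demonstration: merging 4 consecutive patch embeddings into one token
    #include <cstdio>
    #include <vector>

    int main() {
        const int n_embd = 4;              // toy embedding width
        const int n_pos  = 8;              // number of patches, multiple of 4

        std::vector<float> vit_out(n_pos * n_embd);
        for (int p = 0; p < n_pos; ++p)
            for (int e = 0; e < n_embd; ++e)
                vit_out[p * n_embd + e] = p + 0.01f * e;

        // the "merged" tokens are just a re-read of the same buffer:
        // n_pos / 4 tokens, each 4 * n_embd floats wide
        const int n_tok = n_pos / 4;
        for (int t = 0; t < n_tok; ++t) {
            std::printf("token %d:", t);
            for (int j = 0; j < 4 * n_embd; ++j)
                std::printf(" %.2f", vit_out[t * 4 * n_embd + j]);
            std::printf("\n");
        }
        return 0;
    }
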
build_minicpmv() { + const int batch_size = 1; - return gf; -} + GGML_ASSERT(model.class_embedding == nullptr); + const int n_pos = n_patches; + + // position embeddings for the projector (not for ViT) + int n_output_dim = clip_n_mmproj_embd(ctx); + ggml_tensor * pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_output_dim, n_pos, batch_size); + ggml_set_name(pos_embed, "pos_embed"); + ggml_set_input(pos_embed); + + // for selecting learned pos embd, used by ViT + struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions); + + ggml_tensor * inp = build_inp(); + ggml_tensor * embeddings = build_vit( + inp, n_patches, + NORM_TYPE_NORMAL, + hparams.ffn_op, + learned_pos_embd, + nullptr); + + // resampler projector (it is just another transformer) + + ggml_tensor * q = model.mm_model_query; + ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings); + + // norm + q = build_norm(q, model.mm_model_ln_q_w, model.mm_model_ln_q_b, NORM_TYPE_NORMAL, eps, -1); + v = build_norm(v, model.mm_model_ln_kv_w, model.mm_model_ln_kv_b, NORM_TYPE_NORMAL, eps, -1); + + // k = v + pos_embed + ggml_tensor * k = ggml_add(ctx0, v, pos_embed); + + // attention + { + int n_embd = clip_n_mmproj_embd(ctx); + const int d_head = 128; + int n_head = n_embd/d_head; + int num_query = 96; + if (ctx->minicpmv_version == 2) { + num_query = 96; + } else if (ctx->minicpmv_version == 3) { + num_query = 64; + } else if (ctx->minicpmv_version == 4) { + num_query = 64; + } + + ggml_tensor * Q = ggml_add(ctx0, + ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), + model.mm_model_attn_q_b); + ggml_tensor * K = ggml_add(ctx0, + ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), + model.mm_model_attn_k_b); + ggml_tensor * V = ggml_add(ctx0, + ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), + model.mm_model_attn_v_b); + + Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_query); + K = ggml_reshape_3d(ctx0, K, d_head, n_head, n_pos); + V = ggml_reshape_3d(ctx0, V, d_head, n_head, n_pos); + + cb(Q, "resampler_Q", -1); + cb(K, "resampler_K", -1); + cb(V, "resampler_V", -1); + + embeddings = build_attn( + model.mm_model_attn_o_w, + model.mm_model_attn_o_b, + Q, K, V, nullptr, kq_scale, -1); + cb(embeddings, "resampler_attn_out", -1); + } + // layernorm + embeddings = build_norm(embeddings, model.mm_model_ln_post_w, model.mm_model_ln_post_b, NORM_TYPE_NORMAL, eps, -1); + + // projection + embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings); + + // build the graph + ggml_build_forward_expand(gf, embeddings); + + return gf; + } + + ggml_cgraph * build_internvl() { + GGML_ASSERT(model.class_embedding != nullptr); + GGML_ASSERT(model.position_embeddings != nullptr); + + const int n_pos = n_patches + 1; + ggml_tensor * inp = build_inp(); + + // add CLS token + inp = ggml_concat(ctx0, inp, model.class_embedding, 1); + + // The larger models use a different ViT, which uses RMS norm instead of layer norm + // ref: https://github.com/ggml-org/llama.cpp/pull/13443#issuecomment-2869786188 + norm_type norm_t = (hparams.n_embd == 3200 && hparams.n_layer == 45) + ? 
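
Aside (illustrative sketch, not part of the patch): the resampler above replaces the variable number of ViT patch tokens with a fixed set of learned queries that cross-attend over the patch features (k = v + pos_embed), so the projector output length depends only on the model version, not on the image. The helper below just mirrors the constants used in build_minicpmv() and clip_n_output_tokens() in this diff.

    #include <stdexcept>

    // number of learned resampler queries per MiniCPM-V version
    static int minicpmv_num_query(int minicpmv_version) {
        switch (minicpmv_version) {
            case 2: return 96;
            case 3: return 64;
            case 4: return 64;
            default: throw std::invalid_argument("unknown minicpmv version");
        }
    }
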
NORM_TYPE_RMS // 6B ViT (Used by InternVL 2.5/3 - 26B, 38B, 78B) + : NORM_TYPE_NORMAL; // 300M ViT (Used by all smaller InternVL models) + + ggml_tensor * cur = build_vit( + inp, n_pos, + norm_t, + hparams.ffn_op, + model.position_embeddings, + nullptr); + + // remove CLS token + cur = ggml_view_2d(ctx0, cur, + n_embd, n_patches, + ggml_row_size(cur->type, n_embd), 0); + + // pixel shuffle + { + const int scale_factor = model.hparams.proj_scale_factor; + const int bsz = 1; // batch size, always 1 for now since we don't support batching + const int height = n_patches_y; + const int width = n_patches_x; + GGML_ASSERT(scale_factor > 0); + cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, height / scale_factor, width, bsz); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur), + n_embd * scale_factor * scale_factor, + height / scale_factor, + width / scale_factor, + bsz); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + // flatten to 2D + cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, cur), + n_embd * scale_factor * scale_factor, + cur->ne[1] * cur->ne[2]); + } + + // projector (always using GELU activation) + { + // projector LayerNorm uses pytorch's default eps = 1e-5 + // ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79 + cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1); + cur = ggml_mul_mat(ctx0, model.mm_1_w, cur); + cur = ggml_add(ctx0, cur, model.mm_1_b); + cur = ggml_gelu(ctx0, cur); + cur = ggml_mul_mat(ctx0, model.mm_3_w, cur); + cur = ggml_add(ctx0, cur, model.mm_3_b); + } + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; + } + + // this graph is used by llava, granite and glm + // due to having embedding_stack (used by granite), we cannot reuse build_vit + ggml_cgraph * build_llava() { + const int batch_size = 1; + const int n_pos = n_patches + (model.class_embedding ? 1 : 0); + + GGML_ASSERT(n_patches_x == n_patches_y && "only square images supported"); + + // Calculate the deepest feature layer based on hparams and projector type + int max_feature_layer = n_layer; + { + // Get the index of the second to last layer; this is the default for models that have a llava projector + int il_last = hparams.n_layer - 1; + int deepest_feature_layer = -1; + + if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) { + il_last += 1; + } + + // If we set explicit vision feature layers, only go up to the deepest one + // NOTE: only used by granite-vision models for now + for (const auto & feature_layer : hparams.vision_feature_layer) { + if (feature_layer > deepest_feature_layer) { + deepest_feature_layer = feature_layer; + } + } + max_feature_layer = deepest_feature_layer < 0 ? 
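
Aside (illustrative sketch, not part of the patch): the pixel-shuffle block in build_internvl() trades spatial resolution for channel width, turning an [n_embd, W, H] patch grid into [n_embd * s * s, W/s, H/s] and cutting the token count by s^2 before the GELU MLP projector. The standalone space-to-depth sketch below mirrors the effect of the reshape/permute sequence, not its exact intermediate memory layouts.

    #include <vector>

    // space-to-depth on a row-major [H][W][C] buffer -> [H/s][W/s][C*s*s]
    std::vector<float> pixel_shuffle(const std::vector<float> & x,
                                     int H, int W, int C, int s) {
        std::vector<float> out((H / s) * (W / s) * C * s * s);
        for (int h = 0; h < H / s; ++h)
            for (int w = 0; w < W / s; ++w)
                for (int dh = 0; dh < s; ++dh)
                    for (int dw = 0; dw < s; ++dw)
                        for (int c = 0; c < C; ++c) {
                            const int src = ((h * s + dh) * W + (w * s + dw)) * C + c;
                            const int dst = ((h * (W / s) + w) * s * s + (dh * s + dw)) * C + c;
                            out[dst] = x[src];
                        }
        return out;
    }
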
il_last : deepest_feature_layer; + } + + ggml_tensor * inp = build_inp(); + + // concat class_embeddings and patch_embeddings + if (model.class_embedding) { + inp = ggml_concat(ctx0, inp, model.class_embedding, 1); + } + + ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + inp = ggml_add(ctx0, inp, ggml_get_rows(ctx0, model.position_embeddings, positions)); + + ggml_tensor * inpL = inp; + + // pre-layernorm + if (model.pre_ln_w) { + inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, NORM_TYPE_NORMAL, eps, -1); + cb(inpL, "pre_ln", -1); + } + + std::vector embedding_stack; + const auto & vision_feature_layer = hparams.vision_feature_layer; + + // loop over layers + for (int il = 0; il < max_feature_layer; il++) { + auto & layer = model.layers[il]; + ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states + + // If this is an embedding feature layer, save the output. + // NOTE: 0 index here refers to the input to the encoder. + if (vision_feature_layer.find(il) != vision_feature_layer.end()) { + embedding_stack.push_back(cur); + } + + // layernorm1 + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il); + cb(cur, "layer_inp_normed", il); + + // self-attention + { + ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur); + if (layer.q_b) { + Qcur = ggml_add(ctx0, Qcur, layer.q_b); + } + + ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur); + if (layer.k_b) { + Kcur = ggml_add(ctx0, Kcur, layer.k_b); + } + + ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur); + if (layer.v_b) { + Vcur = ggml_add(ctx0, Vcur, layer.v_b); + } + + Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos); + Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos); + Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(layer.o_w, layer.o_b, + Qcur, Kcur, Vcur, nullptr, kq_scale, il); + cb(cur, "attn_out", il); + } + + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, inpL); + + inpL = cur; // inpL = residual, cur = hidden_states + + cb(cur, "ffn_inp", il); + + // layernorm2 + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il); + cb(cur, "ffn_inp_normed", il); + + // ffn + cur = build_ffn(cur, + layer.ff_up_w, layer.ff_up_b, + layer.ff_gate_w, layer.ff_gate_b, + layer.ff_down_w, layer.ff_down_b, + hparams.ffn_op, il); + + cb(cur, "ffn_out", il); + + // residual 2 + cur = ggml_add(ctx0, inpL, cur); + cb(cur, "layer_out", il); + + inpL = cur; + } + + // post-layernorm + if (model.post_ln_w) { + inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, NORM_TYPE_NORMAL, eps, -1); + } + + ggml_tensor * embeddings = inpL; + + // process vision feature layers (used by granite) + { + // final layer is a vision feature layer + if (vision_feature_layer.find(max_feature_layer) != vision_feature_layer.end()) { + embedding_stack.push_back(inpL); + } + + // If feature layers are explicitly set, stack them (if we have multiple) + if (!embedding_stack.empty()) { + embeddings = embedding_stack[0]; + for (size_t i = 1; i < embedding_stack.size(); i++) { + embeddings = ggml_concat(ctx0, embeddings, embedding_stack[i], 0); + } + } + } + + // llava projector (also used by granite) + if (ctx->has_llava_projector) { + embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]); + + ggml_tensor * patches = 
ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); + ggml_set_name(patches, "patches"); + ggml_set_input(patches); + + // shape [1, 576, 1024] + // ne is whcn, ne = [1024, 576, 1, 1] + embeddings = ggml_get_rows(ctx0, embeddings, patches); + + // print_tensor_info(embeddings, "embeddings"); + + // llava projector + if (ctx->proj_type == PROJECTOR_TYPE_MLP) { + embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); + + embeddings = ggml_gelu(ctx0, embeddings); + if (model.mm_2_w) { + embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); + } + } + else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) { + embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); + // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false); + // First LayerNorm + embeddings = ggml_norm(ctx0, embeddings, eps); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w), + model.mm_1_b); + + // GELU activation + embeddings = ggml_gelu(ctx0, embeddings); + + // Second linear layer + embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_3_b); + + // Second LayerNorm + embeddings = ggml_norm(ctx0, embeddings, eps); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w), + model.mm_4_b); + } + else if (ctx->proj_type == PROJECTOR_TYPE_LDP) { + // MobileVLM projector + int n_patch = 24; + ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings); + mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b); + mlp_1 = ggml_gelu(ctx0, mlp_1); + ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1); + mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b); + // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1] + + // block 1 + ggml_tensor * block_1 = nullptr; + { + // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24] + mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3)); + mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]); + // stride = 1, padding = 1, bias is nullptr + block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1); + + // layer norm + // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3)); + // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); + + // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] + // hardswish + ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1); + + block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0); + // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] + // pointwise conv + block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b); + block_1 = ggml_relu(ctx0, block_1); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1); + block_1 = ggml_add(ctx0, 
block_1, model.mm_model_block_1_block_1_fc2_b); + block_1 = ggml_hardsigmoid(ctx0, block_1); + // block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1] + block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); + block_1 = ggml_mul(ctx0, block_1_hw, block_1); + + int w = block_1->ne[0], h = block_1->ne[1]; + block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3)); + + // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1] + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1); + block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]); + + // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); + // block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] + // residual + block_1 = ggml_add(ctx0, mlp_3, block_1); + } + + // block_2 + { + // stride = 2 + block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1); + + // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] + // layer norm + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3)); + // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); + // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] + // hardswish + ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1); + + // not sure the parameters is right for globalAvgPooling + block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0); + // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] + // pointwise conv + block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b); + block_1 = ggml_relu(ctx0, block_1); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b); + block_1 = ggml_hardsigmoid(ctx0, block_1); + + // block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] + block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); + block_1 = ggml_mul(ctx0, block_1_hw, block_1); + + int w = block_1->ne[0], h = block_1->ne[1]; + block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3)); + // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1] + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1); + block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]); + + + // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, 
model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b); + block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]); + // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1] + } + embeddings = block_1; + } + else if (ctx->proj_type == PROJECTOR_TYPE_LDPV2) + { + int n_patch = 24; + ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); + mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b); + mlp_0 = ggml_gelu(ctx0, mlp_0); + ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0); + mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b); + // mlp_2 ne = [2048, 576, 1, 1] + // // AVG Pool Layer 2*2, strides = 2 + mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 0, 2, 3)); + // mlp_2 ne = [576, 2048, 1, 1] + mlp_2 = ggml_reshape_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]); + // mlp_2 ne [24, 24, 2048, 1] + mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0); + // weight ne = [3, 3, 2048, 1] + ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1); + peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3)); + peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b); + mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3)); + peg_0 = ggml_add(ctx0, peg_0, mlp_2); + peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]); + embeddings = peg_0; + } + else { + GGML_ABORT("fatal error"); + } + } + + // glm projector + else if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) { + size_t gridsz = (size_t)sqrt(embeddings->ne[1]); + embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings,1,0,2,3)); + embeddings = ggml_reshape_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]); + embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1); + embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size); + embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3)); + embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b); + // GLU + { + embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); + embeddings = ggml_norm(ctx0, embeddings, eps); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b); + embeddings = ggml_gelu_inplace(ctx0, embeddings); + ggml_tensor * x = embeddings; + embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings); + x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x); + embeddings = ggml_silu_inplace(ctx0, embeddings); + embeddings = ggml_mul(ctx0, embeddings,x); + embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings); + } + // arrangement of BOI/EOI token embeddings + // note: these embeddings are not present in text model, hence we cannot process them as text tokens + // see: https://huggingface.co/THUDM/glm-edge-v-2b/blob/main/siglip.py#L53 + { + embeddings = ggml_concat(ctx0, model.mm_glm_tok_boi, embeddings, 1); // BOI + embeddings = ggml_concat(ctx0, embeddings, model.mm_glm_tok_eoi, 1); // EOI + } + } + + else { + GGML_ABORT("llava: unknown projector type"); + } + + // build the graph + ggml_build_forward_expand(gf, embeddings); + + return gf; + } + +private: + // + // utility functions + // + + void cb(ggml_tensor * cur, const char * name, int il) const { + // TODO: implement this + GGML_UNUSED(cur); + GGML_UNUSED(name); + GGML_UNUSED(il); + } + + 
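
Aside (illustrative sketch, not part of the patch): each LDP (MobileVLM) block above ends with a squeeze-and-excitation style gate, i.e. a global average pool over the spatial grid, two pointwise linear layers with a ReLU in between, a hard-sigmoid, and a channel-wise rescale of the hardswish activations. The plain-C++ sketch below shows that gate; fc1/fc2 stand in for the mm_model_block_*_block_1_fc{1,2} weights and the biases are omitted for brevity.

    #include <algorithm>
    #include <vector>

    static float hardsigmoid(float v) { return std::min(std::max((v + 3.0f) / 6.0f, 0.0f), 1.0f); }

    // x: [C][HW] feature map (already hardswish-activated), returns the gated copy
    std::vector<std::vector<float>> se_gate(const std::vector<std::vector<float>> & x,
                                            const std::vector<std::vector<float>> & fc1,  // [C_mid][C]
                                            const std::vector<std::vector<float>> & fc2)  // [C][C_mid]
    {
        const size_t C = x.size(), HW = x[0].size(), C_mid = fc1.size();

        // squeeze: global average pool per channel
        std::vector<float> pooled(C, 0.0f);
        for (size_t c = 0; c < C; ++c) {
            for (float v : x[c]) pooled[c] += v;
            pooled[c] /= HW;
        }

        // excite: fc1 -> relu -> fc2 -> hardsigmoid
        std::vector<float> mid(C_mid, 0.0f), gate(C, 0.0f);
        for (size_t i = 0; i < C_mid; ++i) {
            for (size_t c = 0; c < C; ++c) mid[i] += fc1[i][c] * pooled[c];
            mid[i] = std::max(mid[i], 0.0f);
        }
        for (size_t c = 0; c < C; ++c) {
            for (size_t i = 0; i < C_mid; ++i) gate[c] += fc2[c][i] * mid[i];
            gate[c] = hardsigmoid(gate[c]);
        }

        // scale each channel by its gate
        auto y = x;
        for (size_t c = 0; c < C; ++c)
            for (size_t p = 0; p < HW; ++p) y[c][p] *= gate[c];
        return y;
    }
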
// build vision transformer (ViT) cgraph + // this function should cover most of the models + // if your model has specific features, you should probably duplicate this function + ggml_tensor * build_vit( + ggml_tensor * inp, + int64_t n_pos, + norm_type norm_t, + ffn_op_type ffn_t, + ggml_tensor * learned_pos_embd, + std::function add_pos + ) { + if (learned_pos_embd) { + inp = ggml_add(ctx0, inp, learned_pos_embd); + cb(inp, "pos_embed", -1); + } + + ggml_tensor * inpL = inp; + + // pre-layernorm + if (model.pre_ln_w) { + inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1); + cb(inpL, "pre_ln", -1); + } + + // loop over layers + for (int il = 0; il < n_layer; il++) { + auto & layer = model.layers[il]; + ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states + + // layernorm1 + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); + cb(cur, "layer_inp_normed", il); + + // self-attention + { + ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur); + if (layer.q_b) { + Qcur = ggml_add(ctx0, Qcur, layer.q_b); + } + + ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur); + if (layer.k_b) { + Kcur = ggml_add(ctx0, Kcur, layer.k_b); + } + + ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur); + if (layer.v_b) { + Vcur = ggml_add(ctx0, Vcur, layer.v_b); + } + + if (layer.q_norm) { + Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il); + cb(Qcur, "Qcur_norm", il); + } + + if (layer.k_norm) { + Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il); + cb(Kcur, "Kcur_norm", il); + } + + Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos); + Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos); + Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + if (add_pos) { + Qcur = add_pos(Qcur, layer); + Kcur = add_pos(Kcur, layer); + cb(Qcur, "Qcur_pos", il); + cb(Kcur, "Kcur_pos", il); + } + + cur = build_attn(layer.o_w, layer.o_b, + Qcur, Kcur, Vcur, nullptr, kq_scale, il); + cb(cur, "attn_out", il); + } + + if (layer.ls_1_w) { + cur = ggml_mul(ctx0, cur, layer.ls_1_w); + cb(cur, "attn_out_scaled", il); + } + + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, inpL); + + inpL = cur; // inpL = residual, cur = hidden_states + + cb(cur, "ffn_inp", il); + + // layernorm2 + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); + cb(cur, "ffn_inp_normed", il); + + // ffn + cur = build_ffn(cur, + layer.ff_up_w, layer.ff_up_b, + layer.ff_gate_w, layer.ff_gate_b, + layer.ff_down_w, layer.ff_down_b, + ffn_t, il); + + cb(cur, "ffn_out", il); + + if (layer.ls_2_w) { + cur = ggml_mul(ctx0, cur, layer.ls_2_w); + cb(cur, "ffn_out_scaled", il); + } + + // residual 2 + cur = ggml_add(ctx0, inpL, cur); + cb(cur, "layer_out", il); + + inpL = cur; + } + + // post-layernorm + if (model.post_ln_w) { + inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, -1); + } + return inpL; + } + + // build the input after conv2d (inp_raw --> patches) + // returns tensor with shape [n_embd, n_patches] + ggml_tensor * build_inp() { + ggml_tensor * inp_raw = build_inp_raw(); + ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + inp = ggml_reshape_2d(ctx0, inp, n_patches, n_embd); + inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp)); + if (model.patch_bias) { + inp = ggml_add(ctx0, inp, model.patch_bias); + cb(inp, "patch_bias", -1); + } + return inp; + } + + ggml_tensor * 
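
Aside (illustrative sketch, not part of the patch): build_vit() is the shared encoder skeleton, where every layer is a pre-norm transformer block: norm -> attention -> optional layer scale -> residual add, then norm -> FFN -> optional layer scale -> residual add. The wiring, with the tensor math abstracted behind callbacks:

    #include <functional>
    #include <vector>

    using vec = std::vector<float>;
    using op  = std::function<vec(const vec &)>;

    // one pre-norm ViT layer, mirroring the structure of build_vit()
    vec vit_layer(vec inpL, op norm1, op attn, op norm2, op ffn,
                  op ls1 = nullptr, op ls2 = nullptr) {
        vec cur = attn(norm1(inpL));                                  // layernorm1 -> self-attention
        if (ls1) cur = ls1(cur);                                      // optional layer scale (ls_1_w)
        for (size_t i = 0; i < cur.size(); ++i) cur[i] += inpL[i];    // residual 1
        vec res = cur;

        cur = ffn(norm2(cur));                                        // layernorm2 -> feed-forward
        if (ls2) cur = ls2(cur);                                      // optional layer scale (ls_2_w)
        for (size_t i = 0; i < cur.size(); ++i) cur[i] += res[i];     // residual 2
        return cur;
    }
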
build_inp_raw() { + ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx, img.ny, 3); + ggml_set_name(inp_raw, "inp_raw"); + ggml_set_input(inp_raw); + return inp_raw; + } + + ggml_tensor * build_norm( + ggml_tensor * cur, + ggml_tensor * mw, + ggml_tensor * mb, + norm_type type, + float norm_eps, + int il) const { + + cur = type == NORM_TYPE_RMS + ? ggml_rms_norm(ctx0, cur, norm_eps) + : ggml_norm(ctx0, cur, norm_eps); + + if (mw || mb) { + cb(cur, "norm", il); + } + + if (mw) { + cur = ggml_mul(ctx0, cur, mw); + if (mb) { + cb(cur, "norm_w", il); + } + } + + if (mb) { + cur = ggml_add(ctx0, cur, mb); + } + + return cur; + } + + ggml_tensor * build_ffn( + ggml_tensor * cur, + ggml_tensor * up, + ggml_tensor * up_b, + ggml_tensor * gate, + ggml_tensor * gate_b, + ggml_tensor * down, + ggml_tensor * down_b, + ffn_op_type type_op, + int il) const { + + ggml_tensor * tmp = up ? ggml_mul_mat(ctx0, up, cur) : cur; + cb(tmp, "ffn_up", il); + + if (up_b) { + tmp = ggml_add(ctx0, tmp, up_b); + cb(tmp, "ffn_up_b", il); + } + + if (gate) { + cur = ggml_mul_mat(ctx0, gate, cur); + cb(cur, "ffn_gate", il); + + if (gate_b) { + cur = ggml_add(ctx0, cur, gate_b); + cb(cur, "ffn_gate_b", il); + } + } else { + cur = tmp; + } + + switch (type_op) { + case FFN_SILU: + { + cur = ggml_silu(ctx0, cur); + cb(cur, "ffn_silu", il); + } break; + case FFN_GELU: + { + cur = ggml_gelu(ctx0, cur); + cb(cur, "ffn_gelu", il); + } break; + case FFN_GELU_QUICK: + { + cur = ggml_gelu_quick(ctx0, cur); + cb(cur, "ffn_relu", il); + } break; + } + + // we only support parallel ffn for now + if (gate) { + cur = ggml_mul(ctx0, cur, tmp); + cb(cur, "ffn_gate_par", il); + } + + if (down) { + cur = ggml_mul_mat(ctx0, down, cur); + } + + if (down_b) { + cb(cur, "ffn_down", il); + } + + if (down_b) { + cur = ggml_add(ctx0, cur, down_b); + } + + return cur; + } + + ggml_tensor * build_attn( + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * q_cur, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + ggml_tensor * kq_mask, + float kq_scale, + int il) const { + // these nodes are added to the graph together so that they are not reordered + // by doing so, the number of splits in the graph is reduced + ggml_build_forward_expand(gf, q_cur); + ggml_build_forward_expand(gf, k_cur); + ggml_build_forward_expand(gf, v_cur); + + ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); + //cb(q, "q", il); + + ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3); + //cb(k, "k", il); + + ggml_tensor * v = ggml_permute(ctx0, v_cur, 1, 2, 0, 3); + v = ggml_cont(ctx0, v); + //cb(k, "v", il); + + ggml_tensor * cur; + + // TODO @ngxson : support flash attention + { + const auto n_tokens = q->ne[1]; + const auto n_head = q->ne[2]; + // const auto n_kv = k->ne[1]; // for flash attention + + ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + // F32 may not needed for vision encoders? 
+ // ggml_mul_mat_set_prec(kq, GGML_PREC_F32); + + kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, 0.0f); + + ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); + cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + cur = ggml_cont_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens); + } + + cb(cur, "kqv_out", il); + + if (wo) { + cur = ggml_mul_mat(ctx0, wo, cur); + } + + if (wo_b) { + cur = ggml_add(ctx0, cur, wo_b); + } + + return cur; + } + + // implementation of the 2D RoPE without adding a new op in ggml + // this is not efficient (use double the memory), but works on all backends + // TODO: there was a more efficient which relies on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors ; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065 + static ggml_tensor * build_rope_2d( + ggml_context * ctx0, + ggml_tensor * cur, + ggml_tensor * pos_h, + ggml_tensor * pos_w, + const float freq_base + ) { + const int64_t n_dim = cur->ne[0]; + const int64_t n_head = cur->ne[1]; + const int64_t n_pos = cur->ne[2]; + + // for example, if we have cur tensor of shape (n_dim=8, n_head, n_pos) + // we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3 + // first half of cur will use 1e-0, 1e-2 (even) + // second half of cur will use 1e-1, 1e-3 (odd) + // the trick here is to rotate just half of n_dim, so inv_freq will automatically be even + // ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2) + // then for the second half, we use freq_scale to shift the inv_freq + // ^ why? replace (2i) with (2i+1) in the above equation + const float freq_scale_odd = std::pow(freq_base, (float)-2/n_dim); + + // first half + ggml_tensor * first; + { + first = ggml_view_3d(ctx0, cur, + n_dim/2, n_head, n_pos, + ggml_row_size(cur->type, n_dim), + ggml_row_size(cur->type, n_dim*n_head), + 0); + first = ggml_rope_ext( + ctx0, + first, + pos_h, // positions + nullptr, // freq factors + n_dim/2, // n_dims + 0, 0, freq_base, + 1.0f, 0.0f, 1.0f, 0.0f, 0.0f + ); + } + + // second half + ggml_tensor * second; + { + second = ggml_view_3d(ctx0, cur, + n_dim/2, n_head, n_pos, + ggml_row_size(cur->type, n_dim), + ggml_row_size(cur->type, n_dim*n_head), + n_dim/2 * ggml_element_size(cur)); + second = ggml_cont(ctx0, second); // copy, because ggml_rope don't play well with non-contiguous tensors + second = ggml_rope_ext( + ctx0, + second, + pos_w, // positions + nullptr, // freq factors + n_dim/2, // n_dims + 0, 0, freq_base, + freq_scale_odd, + 0.0f, 1.0f, 0.0f, 0.0f + ); + } + + cur = ggml_concat(ctx0, first, second, 0); + return cur; + } + +}; + +static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) { + GGML_ASSERT(imgs.entries.size() == 1 && "n_batch > 1 is not supported"); + clip_graph graph(ctx, *imgs.entries[0]); -static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs, struct clip_image_size load_image_size, bool is_inf = false) { ggml_cgraph * res; + switch (ctx->proj_type) { case PROJECTOR_TYPE_GEMMA3: case PROJECTOR_TYPE_IDEFICS3: { - GGML_ASSERT(imgs.entries.size() == 1); - res = clip_image_build_graph_siglip(ctx, *imgs.entries[0]); + res = graph.build_siglip(); } break; case PROJECTOR_TYPE_PIXTRAL: { - GGML_ASSERT(imgs.entries.size() == 1); - res = clip_image_build_graph_pixtral(ctx, *imgs.entries[0]); + res = graph.build_pixtral(); } break; + case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN25VL: { - res = 
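
Aside (restating the build_rope_2d() comment above as math, not part of the patch): for head dimension d (n_dim), standard RoPE uses inverse frequencies theta_i = base^(-2i/d) for i = 0..d/2-1. Applying RoPE with n_dims = d/2 to a d/2-wide slice uses base^(-2j/(d/2)) = base^(-4j/d), which is exactly the even-indexed frequencies, and pre-scaling by freq_scale_odd = base^(-2/d) shifts them to the odd-indexed ones. So the first half of each head is rotated by pos_h with the even frequencies and the second half by pos_w with the odd frequencies, covering the full frequency set at the doubled memory cost noted in the TODO.

    \theta_i = b^{-2i/d}, \qquad
    \theta'_j = b^{-2j/(d/2)} = b^{-4j/d} = \theta_{2j}, \qquad
    b^{-2/d}\,\theta'_j = \theta_{2j+1}, \qquad j = 0,\dots,\tfrac{d}{4}-1
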
clip_image_build_graph_qwen25vl(ctx, imgs); + res = graph.build_qwen2vl(); + } break; + case PROJECTOR_TYPE_MINICPMV: + { + res = graph.build_minicpmv(); + } break; + case PROJECTOR_TYPE_INTERNVL: + { + res = graph.build_internvl(); } break; default: { - // TODO: we should have one build_* function per model - res = clip_image_build_graph_legacy(ctx, imgs, load_image_size, is_inf); + res = graph.build_llava(); } break; } return res; @@ -1628,7 +1792,7 @@ struct clip_model_loader { const char * name = gguf_get_tensor_name(ctx_gguf.get(), i); const size_t offset = gguf_get_tensor_offset(ctx_gguf.get(), i); enum ggml_type type = gguf_get_tensor_type(ctx_gguf.get(), i); - struct ggml_tensor * cur = ggml_get_tensor(meta, name); + ggml_tensor * cur = ggml_get_tensor(meta, name); size_t tensor_size = ggml_nbytes(cur); model_size += tensor_size; LOG_DBG("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n", @@ -1639,6 +1803,7 @@ struct clip_model_loader { void load_hparams() { auto & hparams = ctx_clip.vision_model.hparams; + std::string log_ffn_op; // for logging // projector type std::string proj_type; @@ -1654,14 +1819,11 @@ struct clip_model_loader { // other hparams { - get_i32(KEY_MINICPMV_VERSION, ctx_clip.minicpmv_version, false); + get_i32(KEY_MINICPMV_VERSION, ctx_clip.minicpmv_version, false); // legacy - get_bool(KEY_USE_GELU, ctx_clip.use_gelu, false); - get_bool(KEY_USE_SILU, ctx_clip.use_silu, false); - - get_u32(KEY_N_EMBD, hparams.hidden_size); + get_u32(KEY_N_EMBD, hparams.n_embd); get_u32(KEY_N_HEAD, hparams.n_head); - get_u32(KEY_N_FF, hparams.n_intermediate); + get_u32(KEY_N_FF, hparams.n_ff); get_u32(KEY_N_BLOCK, hparams.n_layer); get_u32(KEY_PROJ_DIM, hparams.projection_dim); get_f32(KEY_LAYER_NORM_EPS, hparams.eps); @@ -1670,11 +1832,34 @@ struct clip_model_loader { get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false); get_arr_int(KEY_IMAGE_GRID_PINPOINTS, hparams.image_grid_pinpoints, false); + // default warmup value + hparams.warmup_image_size = hparams.image_size; + ctx_clip.has_llava_projector = ctx_clip.proj_type == PROJECTOR_TYPE_MLP || ctx_clip.proj_type == PROJECTOR_TYPE_MLP_NORM || ctx_clip.proj_type == PROJECTOR_TYPE_LDP || ctx_clip.proj_type == PROJECTOR_TYPE_LDPV2; + { + bool use_gelu = false; + bool use_silu = false; + get_bool(KEY_USE_GELU, use_gelu, false); + get_bool(KEY_USE_SILU, use_silu, false); + if (use_gelu && use_silu) { + throw std::runtime_error(string_format("%s: both use_gelu and use_silu are set to true\n", __func__)); + } + if (use_gelu) { + hparams.ffn_op = FFN_GELU; + log_ffn_op = "gelu"; + } else if (use_silu) { + hparams.ffn_op = FFN_SILU; + log_ffn_op = "silu"; + } else { + hparams.ffn_op = FFN_GELU_QUICK; + log_ffn_op = "gelu_quick"; + } + } + { std::string mm_patch_merge_type; get_string(KEY_MM_PATCH_MERGE_TYPE, mm_patch_merge_type, false); @@ -1708,30 +1893,6 @@ struct clip_model_loader { hparams.vision_feature_layer.insert(layer); } - // Calculate the deepest feature layer based on hparams and projector type - // NOTE: This is only used by build_graph_legacy() - { - // Get the index of the second to last layer; this is the default for models that have a llava projector - int n_layer = hparams.n_layer - 1; - int deepest_feature_layer = -1; - - if (ctx_clip.proj_type == PROJECTOR_TYPE_MINICPMV - || ctx_clip.proj_type == PROJECTOR_TYPE_GLM_EDGE - || ctx_clip.proj_type == PROJECTOR_TYPE_QWEN2VL - || ctx_clip.proj_type == 
PROJECTOR_TYPE_QWEN25VL) { - n_layer += 1; - } - - // If we set explicit vision feature layers, only go up to the deepest one - // NOTE: only used by granite-vision models for now - for (const auto & feature_layer : hparams.vision_feature_layer) { - if (feature_layer > deepest_feature_layer) { - deepest_feature_layer = feature_layer; - } - } - ctx_clip.max_feature_layer = deepest_feature_layer < 0 ? n_layer : deepest_feature_layer; - } - // model-specific params switch (ctx_clip.proj_type) { case PROJECTOR_TYPE_MINICPMV: @@ -1741,15 +1902,41 @@ struct clip_model_loader { } } break; case PROJECTOR_TYPE_IDEFICS3: + case PROJECTOR_TYPE_INTERNVL: { get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false); } break; case PROJECTOR_TYPE_PIXTRAL: { hparams.rope_theta = 10000.0f; + hparams.warmup_image_size = hparams.patch_size * 8; + get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size, false); + } break; + case PROJECTOR_TYPE_GEMMA3: + { + // default value (used by all model sizes in gemma 3 family) + // number of patches for each **side** is reduced by a factor of 4 + hparams.proj_scale_factor = 4; + // test model (tinygemma3) has a different value, we optionally read it + get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false); + } break; + case PROJECTOR_TYPE_QWEN2VL: + { + // max image size = sqrt(max_pixels) = 3584 + // ref: https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct/blob/main/preprocessor_config.json + // however, the model use unreasonable memory past 1024 size, we force it to 1024 otherwise it's unusable + // ref: https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct/discussions/10 + hparams.image_size = 1024; + hparams.warmup_image_size = hparams.patch_size * 8; } break; case PROJECTOR_TYPE_QWEN25VL: { + // max image size = sqrt(max_pixels) + // https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json + // however, the model use unreasonable memory past 1024 size, we force it to 1024 otherwise it's unusable + // ref: https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct/discussions/10 + hparams.image_size = 1024; + hparams.warmup_image_size = hparams.patch_size * 8; get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern); } break; default: @@ -1757,18 +1944,26 @@ struct clip_model_loader { } LOG_INF("%s: projector: %s\n", __func__, proj_type.c_str()); + LOG_INF("%s: n_embd: %d\n", __func__, hparams.n_embd); + LOG_INF("%s: n_head: %d\n", __func__, hparams.n_head); + LOG_INF("%s: n_ff: %d\n", __func__, hparams.n_ff); + LOG_INF("%s: n_layer: %d\n", __func__, hparams.n_layer); + LOG_INF("%s: projection_dim: %d\n", __func__, hparams.projection_dim); + LOG_INF("%s: image_size: %d\n", __func__, hparams.image_size); + LOG_INF("%s: patch_size: %d\n", __func__, hparams.patch_size); + LOG_INF("\n"); LOG_INF("%s: has_llava_proj: %d\n", __func__, ctx_clip.has_llava_projector); LOG_INF("%s: minicpmv_version: %d\n", __func__, ctx_clip.minicpmv_version); LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor); LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern); - LOG_INF("%s: use_silu: %d\n", __func__, ctx_clip.use_silu); - LOG_INF("%s: use_gelu: %d\n", __func__, ctx_clip.use_gelu); + LOG_INF("%s: ffn_op: %s\n", __func__, log_ffn_op.c_str()); LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0); LOG_INF("%s: metadata size: %.2f MiB\n", __func__, ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0); } } void load_tensors() { + auto & hparams = ctx_clip.vision_model.hparams; std::map tensor_offset; 
std::vector tensors_to_load; @@ -1791,14 +1986,14 @@ struct clip_model_loader { // helper function auto get_tensor = [&](const std::string & name, bool required = true) { - struct ggml_tensor * cur = ggml_get_tensor(ctx_meta.get(), name.c_str()); + ggml_tensor * cur = ggml_get_tensor(ctx_meta.get(), name.c_str()); if (!cur && required) { throw std::runtime_error(string_format("%s: unable to find tensor %s\n", __func__, name.c_str())); } if (cur) { tensors_to_load.push_back(cur); // add tensors to context - struct ggml_tensor * data_tensor = ggml_dup_tensor(ctx_clip.ctx_data.get(), cur); + ggml_tensor * data_tensor = ggml_dup_tensor(ctx_clip.ctx_data.get(), cur); ggml_set_name(data_tensor, cur->name); cur = data_tensor; } @@ -1822,15 +2017,20 @@ struct clip_model_loader { vision_model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, "v"), false); // layers - vision_model.layers.resize(vision_model.hparams.n_layer); - for (int il = 0; il < vision_model.hparams.n_layer; ++il) { + vision_model.layers.resize(hparams.n_layer); + for (int il = 0; il < hparams.n_layer; ++il) { auto & layer = vision_model.layers[il]; layer.k_w = get_tensor(string_format(TN_ATTN_K, "v", il, "weight")); layer.q_w = get_tensor(string_format(TN_ATTN_Q, "v", il, "weight")); layer.v_w = get_tensor(string_format(TN_ATTN_V, "v", il, "weight")); layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, "v", il, "weight")); + layer.k_norm = get_tensor(string_format(TN_ATTN_K_NORM, "v", il, "weight"), false); + layer.q_norm = get_tensor(string_format(TN_ATTN_Q_NORM, "v", il, "weight"), false); layer.ln_1_w = get_tensor(string_format(TN_LN_1, "v", il, "weight"), false); layer.ln_2_w = get_tensor(string_format(TN_LN_2, "v", il, "weight"), false); + layer.ls_1_w = get_tensor(string_format(TN_LS_1, "v", il, "weight"), false); // no bias + layer.ls_2_w = get_tensor(string_format(TN_LS_2, "v", il, "weight"), false); // no bias + layer.k_b = get_tensor(string_format(TN_ATTN_K, "v", il, "bias"), false); layer.q_b = get_tensor(string_format(TN_ATTN_Q, "v", il, "bias"), false); layer.v_b = get_tensor(string_format(TN_ATTN_V, "v", il, "bias"), false); @@ -1838,7 +2038,7 @@ struct clip_model_loader { layer.ln_1_b = get_tensor(string_format(TN_LN_1, "v", il, "bias"), false); layer.ln_2_b = get_tensor(string_format(TN_LN_2, "v", il, "bias"), false); - // new naming + // ffn layer.ff_up_w = get_tensor(string_format(TN_FFN_UP, "v", il, "weight")); layer.ff_up_b = get_tensor(string_format(TN_FFN_UP, "v", il, "bias"), false); layer.ff_gate_w = get_tensor(string_format(TN_FFN_GATE, "v", il, "weight"), false); @@ -1846,13 +2046,18 @@ struct clip_model_loader { layer.ff_down_w = get_tensor(string_format(TN_FFN_DOWN, "v", il, "weight")); layer.ff_down_b = get_tensor(string_format(TN_FFN_DOWN, "v", il, "bias"), false); - // legacy naming (the in and out is reversed! don't ask me why) - layer.ff_i_w = layer.ff_down_w; - layer.ff_o_w = layer.ff_up_w; - layer.ff_g_w = layer.ff_gate_w; - layer.ff_i_b = layer.ff_down_b; - layer.ff_o_b = layer.ff_up_b; - layer.ff_g_b = layer.ff_gate_b; + // some models already exported with legacy (incorrect) naming which is quite messy, let's fix it here + // note: Qwen model converted from the old surgery script has n_ff = 0, so we cannot use n_ff to check! 
+ if (layer.ff_up_w && layer.ff_down_w && layer.ff_down_w->ne[0] == hparams.n_embd) { + // swap up and down weights + ggml_tensor * tmp = layer.ff_up_w; + layer.ff_up_w = layer.ff_down_w; + layer.ff_down_w = tmp; + // swap up and down biases + tmp = layer.ff_up_b; + layer.ff_up_b = layer.ff_down_b; + layer.ff_down_b = tmp; + } } switch (ctx_clip.proj_type) { @@ -1943,12 +2148,14 @@ struct clip_model_loader { { vision_model.mm_model_adapter_conv_w = get_tensor(string_format(TN_GLM_ADAPER_CONV, "weight")); vision_model.mm_model_adapter_conv_b = get_tensor(string_format(TN_GLM_ADAPER_CONV, "bias")); - vision_model.mm_model_mlp_0_w = get_tensor(string_format(TN_GLM_ADAPTER_LINEAR,"weight")); - vision_model.mm_model_ln_q_w = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1,"weight")); - vision_model.mm_model_ln_q_b = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1,"bias")); - vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H,"weight")); - vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE,"weight")); - vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H,"weight")); + vision_model.mm_model_mlp_0_w = get_tensor(string_format(TN_GLM_ADAPTER_LINEAR, "weight")); + vision_model.mm_model_ln_q_w = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1, "weight")); + vision_model.mm_model_ln_q_b = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1, "bias")); + vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H, "weight")); + vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE, "weight")); + vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H, "weight")); + vision_model.mm_glm_tok_boi = get_tensor(string_format(TN_TOK_GLM_BOI, "weight")); + vision_model.mm_glm_tok_eoi = get_tensor(string_format(TN_TOK_GLM_EOI, "weight")); } break; case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN25VL: @@ -1970,11 +2177,23 @@ struct clip_model_loader { case PROJECTOR_TYPE_PIXTRAL: { vision_model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight")); - vision_model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias")); + vision_model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false); vision_model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); - vision_model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias")); + vision_model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false); // [IMG_BREAK] token embedding vision_model.token_embd_img_break = get_tensor(TN_TOK_IMG_BREAK); + // for mistral small 3.1 + vision_model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false); + vision_model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false); + } break; + case PROJECTOR_TYPE_INTERNVL: + { + vision_model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight")); + vision_model.mm_0_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "bias")); + vision_model.mm_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight")); + vision_model.mm_1_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "bias")); + vision_model.mm_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight")); + vision_model.mm_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias")); } break; default: GGML_ASSERT(false && "unknown projector type"); @@ -2016,7 +2235,7 @@ struct clip_model_loader { ctx_clip.buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(ctx_clip.ctx_data.get(), buft)); 
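
Aside (illustrative sketch, not part of the patch): the legacy-naming fixup near the top of this hunk can rely on shapes alone because the two FFN projections are transposes of each other. In ggml a mul_mat weight's ne[0] is its input width, so the up weight has ne[0] == n_embd and the down weight has ne[0] == n_ff; a tensor loaded under the "down" name that reports ne[0] == n_embd actually consumes the hidden state, meaning the export swapped up and down. toy_tensor below is a stand-in for ggml_tensor.

    // stand-in for ggml_tensor, keeping only the shape fields the check needs
    struct toy_tensor { long ne[2]; };   // ne[0] = input width, ne[1] = output width

    static bool ffn_weights_swapped(const toy_tensor & ff_down_w, long n_embd) {
        // a real "down" projection consumes n_ff, not n_embd
        return ff_down_w.ne[0] == n_embd;
    }
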
ggml_backend_buffer_set_usage(ctx_clip.buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS); for (auto & t : tensors_to_load) { - struct ggml_tensor * cur = ggml_get_tensor(ctx_clip.ctx_data.get(), t->name); + ggml_tensor * cur = ggml_get_tensor(ctx_clip.ctx_data.get(), t->name); const size_t offset = tensor_offset[t->name]; fin.seekg(offset, std::ios::beg); if (!fin) { @@ -2049,16 +2268,14 @@ struct clip_model_loader { // create a fake batch clip_image_f32_batch batch; clip_image_f32_ptr img(clip_image_f32_init()); - clip_image_size image_size; - image_size.width = ctx_clip.vision_model.hparams.image_size; - image_size.height = ctx_clip.vision_model.hparams.image_size; - img->nx = image_size.width; - img->ny = image_size.height; - img->buf.resize(image_size.width * image_size.height * 3); + img->nx = ctx_clip.vision_model.hparams.warmup_image_size; + img->ny = ctx_clip.vision_model.hparams.warmup_image_size; + img->buf.resize(img->nx * img->ny * 3); batch.entries.push_back(std::move(img)); - ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch, image_size, false); + ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch); ggml_backend_sched_reserve(ctx_clip.sched.get(), gf); + for (size_t i = 0; i < ctx_clip.backend_ptrs.size(); ++i) { ggml_backend_t backend = ctx_clip.backend_ptrs[i]; ggml_backend_buffer_type_t buft = ctx_clip.backend_buft[i]; @@ -2141,9 +2358,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity) { struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params) { g_logger_state.verbosity_thold = ctx_params.verbosity; - clip_ctx * ctx_clip = new clip_ctx(ctx_params); + clip_ctx * ctx_clip = nullptr; try { + ctx_clip = new clip_ctx(ctx_params); clip_model_loader loader(fname, *ctx_clip); loader.load_hparams(); loader.load_tensors(); @@ -2456,8 +2674,8 @@ struct image_manipulation { float target_width_f = static_cast(inp_size.width) * scale; float target_height_f = static_cast(inp_size.height) * scale; - int aligned_width = GGML_PAD((int)target_width_f, align_size); - int aligned_height = GGML_PAD((int)target_height_f, align_size); + int aligned_width = CLIP_ALIGN((int)target_width_f, align_size); + int aligned_height = CLIP_ALIGN((int)target_height_f, align_size); return {aligned_width, aligned_height}; } @@ -2555,7 +2773,7 @@ struct llava_uhd { // no pinpoints, dynamically calculate the grid size (e.g. 
minicpmv) - auto best_size = get_best_resize(original_size, slice_size, patch_size, has_slices); + auto best_size = get_best_resize(original_size, slice_size, patch_size, !has_slices); res.overview_size = best_size; if (!has_slices) { @@ -2776,10 +2994,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str } else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) { clip_image_u8 resized; - auto patch_size = clip_get_patch_size(ctx) * 2; - int nx = ceil((float)img->nx / patch_size) * patch_size; - int ny = ceil((float)img->ny / patch_size) * patch_size; - image_manipulation::bicubic_resize(*img, resized, nx, ny); + auto patch_size = params.patch_size * 2; + auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, patch_size, params.image_size); + image_manipulation::bicubic_resize(*img, resized, new_size.width, new_size.height); clip_image_f32_ptr img_f32(clip_image_f32_init()); // clip_image_f32_ptr res(clip_image_f32_init()); @@ -2790,7 +3007,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str } else if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE || ctx->proj_type == PROJECTOR_TYPE_GEMMA3 - || ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) { + || ctx->proj_type == PROJECTOR_TYPE_IDEFICS3 + || ctx->proj_type == PROJECTOR_TYPE_INTERNVL // TODO @ngxson : support dynamic resolution + ) { clip_image_u8 resized_image; int sz = params.image_size; image_manipulation::resize_and_pad_image(*img, resized_image, {sz, sz}); @@ -2887,7 +3106,7 @@ int32_t clip_get_patch_size(const struct clip_ctx * ctx) { } int32_t clip_get_hidden_size(const struct clip_ctx * ctx) { - return ctx->vision_model.hparams.hidden_size; + return ctx->vision_model.hparams.n_embd; } const char * clip_patch_merge_type(const struct clip_ctx * ctx) { @@ -2940,8 +3159,13 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size); - if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2 || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) { + if (ctx->proj_type == PROJECTOR_TYPE_LDP + || ctx->proj_type == PROJECTOR_TYPE_LDPV2 + || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) { n_patches /= 4; + if (ctx->vision_model.mm_glm_tok_boi) { + n_patches += 2; // for BOI and EOI token embeddings + } } else if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) { if (ctx->minicpmv_version == 2) { n_patches = 96; @@ -2961,12 +3185,16 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0); n_patches = x_patch * y_patch; } else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { - n_patches = 256; - } else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) { - n_patches /= ctx->vision_model.hparams.proj_scale_factor; + int n_per_side = params.image_size / params.patch_size; + int n_per_side_2d_pool = n_per_side / params.proj_scale_factor; + n_patches = n_per_side_2d_pool * n_per_side_2d_pool; + } else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3 || ctx->proj_type == PROJECTOR_TYPE_INTERNVL) { + // both W and H are divided by proj_scale_factor + n_patches /= (params.proj_scale_factor * params.proj_scale_factor); } else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) { - int n_patches_x = img->nx / params.patch_size; - int n_patches_y = img->ny / params.patch_size; + int n_merge = params.spatial_merge_size; + int 
n_patches_x = img->nx / params.patch_size / (n_merge > 0 ? n_merge : 1); + int n_patches_y = img->ny / params.patch_size / (n_merge > 0 ? n_merge : 1); n_patches = n_patches_y*n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row } @@ -3072,15 +3300,15 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima const clip_image_f32_batch & imgs = *imgs_c_ptr; int batch_size = imgs.entries.size(); - if (ctx->has_llava_projector - || ctx->proj_type == PROJECTOR_TYPE_MINICPMV - || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) { - GGML_ASSERT(batch_size == 1); + // TODO @ngxson : implement batch size > 1 as a loop + // we don't need true batching support because the cgraph will gonna be big anyway + if (batch_size != 1) { + return false; // only support batch size of 1 } // build the inference graph ggml_backend_sched_reset(ctx->sched.get()); - ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true); + ggml_cgraph * gf = clip_image_build_graph(ctx, imgs); ggml_backend_sched_alloc_graph(ctx->sched.get(), gf); // set inputs @@ -3092,14 +3320,14 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima const int patch_size = hparams.patch_size; const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); - const int num_positions = num_patches + (model.class_embedding ? 1 : 0); + const int n_pos = num_patches + (model.class_embedding ? 1 : 0); const int pos_w = ctx->load_image_size.width / patch_size; const int pos_h = ctx->load_image_size.height / patch_size; const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl auto get_inp_tensor = [&gf](const char * name) { - struct ggml_tensor * inp = ggml_graph_get_tensor(gf, name); + ggml_tensor * inp = ggml_graph_get_tensor(gf, name); if (inp == nullptr) { GGML_ABORT("Failed to get tensor %s", name); } @@ -3208,7 +3436,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima const int merge_ratio = 2; const int pw = image_size_width / patch_size; const int ph = image_size_height / patch_size; - std::vector positions(num_positions * 4); + std::vector positions(n_pos * 4); int ptr = 0; for (int y = 0; y < ph; y += merge_ratio) { for (int x = 0; x < pw; x += merge_ratio) { @@ -3285,7 +3513,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } const int mpow = merge_ratio * merge_ratio; - std::vector positions(num_positions * 4); + std::vector positions(n_pos * 4); int ptr = 0; for (int y = 0; y < iph; y += merge_ratio) { @@ -3311,14 +3539,14 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima { // set the 2D positions int n_patches_per_col = image_size_width / patch_size; - std::vector pos_data(num_positions); + std::vector pos_data(n_pos); // dimension H - for (int i = 0; i < num_positions; i++) { + for (int i = 0; i < n_pos; i++) { pos_data[i] = i / n_patches_per_col; } set_input_i32("pos_h", pos_data); // dimension W - for (int i = 0; i < num_positions; i++) { + for (int i = 0; i < n_pos; i++) { pos_data[i] = i % n_patches_per_col; } set_input_i32("pos_w", pos_data); @@ -3326,8 +3554,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima case PROJECTOR_TYPE_GLM_EDGE: { // llava and other models - std::vector positions(num_positions); - for (int i = 0; i < num_positions; i++) { + std::vector positions(n_pos); + for (int i = 0; i < n_pos; i++) { positions[i] = i; } 
set_input_i32("positions", positions); @@ -3338,8 +3566,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima case PROJECTOR_TYPE_LDPV2: { // llava and other models - std::vector positions(num_positions); - for (int i = 0; i < num_positions; i++) { + std::vector positions(n_pos); + for (int i = 0; i < n_pos; i++) { positions[i] = i; } set_input_i32("positions", positions); @@ -3356,6 +3584,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } break; case PROJECTOR_TYPE_GEMMA3: case PROJECTOR_TYPE_IDEFICS3: + case PROJECTOR_TYPE_INTERNVL: { // do nothing } break; @@ -3363,7 +3592,15 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima GGML_ABORT("Unknown projector type"); } - ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads); + // ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads); + ggml_backend_dev_t dev = ggml_backend_get_device(ctx->backend_cpu); + ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr; + if (reg) { + auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); + if (ggml_backend_set_n_threads_fn) { + ggml_backend_set_n_threads_fn(ctx->backend_cpu, n_threads); + } + } auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf); if (status != GGML_STATUS_SUCCESS) { @@ -3372,7 +3609,15 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } // the last node is the embedding tensor - struct ggml_tensor * embeddings = ggml_graph_node(gf, -1); + ggml_tensor * embeddings = ggml_graph_node(gf, -1); + + // sanity check (only support batch size of 1 for now) + const int n_tokens_out = embeddings->ne[1]; + const int expected_n_tokens_out = clip_n_output_tokens(ctx, imgs.entries[0].get()); + if (n_tokens_out != expected_n_tokens_out) { + LOG_ERR("%s: expected %d tokens, got %d\n", __func__, expected_n_tokens_out, n_tokens_out); + GGML_ABORT("Invalid number of output tokens"); + } // copy the embeddings to the location passed by the user ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); @@ -3403,7 +3648,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i for (int i = 0; i < n_tensors; ++i) { const char * name = gguf_get_tensor_name(ctx_src, i); - struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name); + ggml_tensor * cur = ggml_get_tensor(ctx_data, name); gguf_add_tensor(ctx_out, cur); } @@ -3424,7 +3669,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i for (int i = 0; i < n_tensors; ++i) { const std::string name = gguf_get_tensor_name(ctx_src, i); - struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name.c_str()); + ggml_tensor * cur = ggml_get_tensor(ctx_data, name.c_str()); enum ggml_type new_type; void * new_data; @@ -3523,7 +3768,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { return ctx->vision_model.mm_model_peg_0_b->ne[0]; case PROJECTOR_TYPE_MLP: case PROJECTOR_TYPE_PIXTRAL: - return ctx->vision_model.mm_2_b->ne[0]; + return ctx->vision_model.mm_2_w->ne[1]; case PROJECTOR_TYPE_MLP_NORM: return ctx->vision_model.mm_3_b->ne[0]; case PROJECTOR_TYPE_MINICPMV: @@ -3544,6 +3789,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { return ctx->vision_model.mm_input_proj_w->ne[0]; case PROJECTOR_TYPE_IDEFICS3: return ctx->vision_model.projection->ne[1]; + case PROJECTOR_TYPE_INTERNVL: + return 
ctx->vision_model.mm_3_w->ne[1]; default: GGML_ABORT("Unknown projector type"); } diff --git a/llama/llama.cpp/examples/llava/clip.h b/llama/llama.cpp/tools/mtmd/clip.h similarity index 96% rename from llama/llama.cpp/examples/llava/clip.h rename to llama/llama.cpp/tools/mtmd/clip.h index 0a53bd8e..0b0eb029 100644 --- a/llama/llama.cpp/examples/llava/clip.h +++ b/llama/llama.cpp/tools/mtmd/clip.h @@ -78,10 +78,10 @@ CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip); CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size); CLIP_API struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip); -CLIP_API struct clip_image_size * clip_image_size_init(); -CLIP_API struct clip_image_u8 * clip_image_u8_init (); -CLIP_API struct clip_image_f32 * clip_image_f32_init(); -CLIP_API struct clip_image_f32_batch * clip_image_f32_batch_init(); // only used by libllava +CLIP_API struct clip_image_size * clip_image_size_init(void); +CLIP_API struct clip_image_u8 * clip_image_u8_init (void); +CLIP_API struct clip_image_f32 * clip_image_f32_init(void); +CLIP_API struct clip_image_f32_batch * clip_image_f32_batch_init(void); // only used by libllava // nx, ny are the output image dimensions CLIP_API unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny); diff --git a/llama/llama.cpp/examples/llava/llava.cpp b/llama/llama.cpp/tools/mtmd/llava.cpp similarity index 99% rename from llama/llama.cpp/examples/llava/llava.cpp rename to llama/llama.cpp/tools/mtmd/llava.cpp index bab027b5..b0eb79bb 100644 --- a/llama/llama.cpp/examples/llava/llava.cpp +++ b/llama/llama.cpp/tools/mtmd/llava.cpp @@ -2,6 +2,7 @@ #include "llava.h" #include "llama.h" +#include "ggml-cpp.h" #include #include @@ -209,7 +210,11 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector struct ggml_tensor *flatten = ggml_view_2d(model.ctx, permuted_cont, clip_n_mmproj_embd(ctx_clip), num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, size_ele * clip_n_mmproj_embd(ctx_clip), 0); // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false); ggml_build_forward_expand(gf, flatten); - ggml_graph_compute_with_ctx(model.ctx, gf, 1); + + ggml_backend_ptr backend { ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr) }; + GGML_ASSERT(backend != nullptr && "failed to initialize CPU backend"); + ggml_backend_graph_compute(backend.get(), gf); + struct ggml_tensor* result = ggml_graph_node(gf, -1); memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context diff --git a/llama/llama.cpp/examples/llava/llava.h b/llama/llama.cpp/tools/mtmd/llava.h similarity index 100% rename from llama/llama.cpp/examples/llava/llava.h rename to llama/llama.cpp/tools/mtmd/llava.h diff --git a/llama/llama.cpp/examples/llava/llava.go b/llama/llama.cpp/tools/mtmd/mtmd.go similarity index 92% rename from llama/llama.cpp/examples/llava/llava.go rename to llama/llama.cpp/tools/mtmd/mtmd.go index 37b031cb..06479036 100644 --- a/llama/llama.cpp/examples/llava/llava.go +++ b/llama/llama.cpp/tools/mtmd/mtmd.go @@ -1,4 +1,4 @@ -package llava +package mtmd // #cgo CXXFLAGS: -std=c++11 // #cgo CPPFLAGS: -I${SRCDIR}/../../include -I${SRCDIR}/../../common diff --git a/llama/llama.go b/llama/llama.go index 063eb7c8..f0f2af82 100644 --- a/llama/llama.go +++ b/llama/llama.go @@ -6,7 +6,7 @@ package llama #cgo CXXFLAGS: -std=c++17 #cgo CPPFLAGS: 
-I${SRCDIR}/llama.cpp/include #cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/common -#cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/examples/llava +#cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/tools/mtmd #cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/src #cgo CPPFLAGS: -I${SRCDIR}/../ml/backend/ggml/ggml/include @@ -40,8 +40,8 @@ import ( "unsafe" _ "github.com/ollama/ollama/llama/llama.cpp/common" - _ "github.com/ollama/ollama/llama/llama.cpp/examples/llava" _ "github.com/ollama/ollama/llama/llama.cpp/src" + _ "github.com/ollama/ollama/llama/llama.cpp/tools/mtmd" ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src" ) diff --git a/llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch b/llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch index 44aa7095..edeeb4ff 100644 --- a/llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch +++ b/llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch @@ -24,7 +24,7 @@ problem. 9 files changed, 21 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp -index 273075f4..dd11f304 100644 +index b30b4cb3..0ce73a99 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -107,7 +107,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) { @@ -43,7 +43,7 @@ index 273075f4..dd11f304 100644 } static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { -@@ -1867,6 +1867,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { +@@ -1871,6 +1871,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_aligned_free(buffer->context, buffer->size); @@ -55,7 +55,7 @@ index 273075f4..dd11f304 100644 } static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { -@@ -1914,7 +1919,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = { +@@ -1918,7 +1923,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = { }; static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = { @@ -85,7 +85,7 @@ index e2617b06..242e50a7 100644 /** diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index 9fb2134f..04ce764e 100644 +index b4b85abc..cb0d8528 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -534,6 +534,7 @@ struct ggml_backend_cuda_buffer_context { @@ -96,7 +96,7 @@ index 9fb2134f..04ce764e 100644 } static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) { -@@ -789,6 +790,7 @@ struct ggml_backend_cuda_split_buffer_context { +@@ -790,6 +791,7 @@ struct ggml_backend_cuda_split_buffer_context { static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; delete ctx; @@ -104,7 +104,7 @@ index 9fb2134f..04ce764e 100644 } static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) { -@@ -1062,6 +1064,7 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_ +@@ -1067,6 +1069,7 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_ static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { 
CUDA_CHECK(cudaFreeHost(buffer->context)); @@ -125,10 +125,10 @@ index 50579227..2799a0a5 100644 static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) { diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m -index d92392ed..425524d0 100644 +index 576f9581..1b56f858 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m -@@ -5077,6 +5077,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) +@@ -5214,6 +5214,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) } free(ctx); @@ -149,10 +149,10 @@ index 05a2f4e6..392cc18d 100644 static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) { diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp -index 140a775f..e33c4ba0 100644 +index 4f0abb5a..de1ec184 100644 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp -@@ -477,6 +477,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) { +@@ -483,6 +483,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) { bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0); GGML_ASSERT(status); delete ctx; @@ -161,10 +161,10 @@ index 140a775f..e33c4ba0 100644 static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) { diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp -index 66b6f2cc..e3e6deae 100644 +index 0ea72994..ae3a3c33 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp -@@ -317,6 +317,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try { +@@ -320,6 +320,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try { ggml_sycl_set_device(ctx->device); delete ctx; @@ -172,7 +172,7 @@ index 66b6f2cc..e3e6deae 100644 } catch (sycl::exception const &exc) { std::cerr << exc.what() << "Exception caught at file:" << __FILE__ -@@ -762,6 +763,7 @@ struct ggml_backend_sycl_split_buffer_context { +@@ -765,6 +766,7 @@ struct ggml_backend_sycl_split_buffer_context { static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context; delete ctx; @@ -180,7 +180,7 @@ index 66b6f2cc..e3e6deae 100644 } static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) { -@@ -1096,6 +1098,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_ +@@ -1099,6 +1101,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_ static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_sycl_host_free(buffer->context); @@ -189,10 +189,10 @@ index 66b6f2cc..e3e6deae 100644 static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp -index c0bdb9e1..03d03064 100644 +index e2b357fd..68768029 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp -@@ -8660,6 +8660,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) { +@@ -8962,6 +8962,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_vk_buffer_context * ctx = 
(ggml_backend_vk_buffer_context *)buffer->context; ggml_vk_destroy_buffer(ctx->dev_buffer); delete ctx; @@ -200,7 +200,7 @@ index c0bdb9e1..03d03064 100644 } static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) { -@@ -8803,6 +8804,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe +@@ -9105,6 +9106,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()"); ggml_vk_host_free(vk_instance.devices[0], buffer->context); diff --git a/llama/patches/0002-pretokenizer.patch b/llama/patches/0002-pretokenizer.patch index ecdabe7e..07aa4b0e 100644 --- a/llama/patches/0002-pretokenizer.patch +++ b/llama/patches/0002-pretokenizer.patch @@ -10,10 +10,10 @@ logs instead of throwing an error 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp -index 50ded286..a9ee9f03 100644 +index 9389ca80..806c1b3d 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp -@@ -1491,16 +1491,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { +@@ -1503,16 +1503,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { if (type == LLAMA_VOCAB_TYPE_BPE) { add_space_prefix = false; clean_spaces = true; @@ -31,8 +31,8 @@ index 50ded286..a9ee9f03 100644 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT; } else if ( tokenizer_pre == "llama3" || -@@ -1635,7 +1626,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { - pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE; +@@ -1651,7 +1642,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { + pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER; clean_spaces = false; } else { - throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); diff --git a/llama/patches/0003-embeddings.patch b/llama/patches/0003-embeddings.patch index 022a83f4..80d6b55e 100644 --- a/llama/patches/0003-embeddings.patch +++ b/llama/patches/0003-embeddings.patch @@ -11,10 +11,10 @@ instead of forcing one or the error 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp -index 5a2eef9b..9c1fe93f 100644 +index 62246c10..dca22d8b 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp -@@ -1225,7 +1225,7 @@ int llama_context::decode(llama_batch & inp_batch) { +@@ -901,7 +901,7 @@ int llama_context::decode(llama_batch & inp_batch) { int64_t n_outputs_all = 0; // count outputs @@ -23,7 +23,7 @@ index 5a2eef9b..9c1fe93f 100644 for (uint32_t i = 0; i < n_tokens_all; ++i) { n_outputs_all += batch.logits[i] != 0; } -@@ -1337,7 +1337,7 @@ int llama_context::decode(llama_batch & inp_batch) { +@@ -982,7 +982,7 @@ int llama_context::decode(llama_batch & inp_batch) { // ggml_graph_dump_dot(gf, NULL, "llama.dot"); //} @@ -32,7 +32,7 @@ index 5a2eef9b..9c1fe93f 100644 auto * t_embd = cparams.embeddings ? 
res->get_embd() : nullptr; if (t_embd && res->get_embd_pooled()) { -@@ -1481,7 +1481,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) { +@@ -1151,7 +1151,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) { const auto n_embd = hparams.n_embd; // TODO: use a per-batch flag for logits presence instead diff --git a/llama/patches/0004-clip-unicode.patch b/llama/patches/0004-clip-unicode.patch index 35f54fd3..95710978 100644 --- a/llama/patches/0004-clip-unicode.patch +++ b/llama/patches/0004-clip-unicode.patch @@ -6,16 +6,16 @@ Subject: [PATCH] clip-unicode fixes loading vision models in llama.cpp on windows filesystems for paths that include wide characters --- - examples/llava/clip.cpp | 39 +++++++++++++++++++++++++++++++++++++++ + tools/mtmd/clip.cpp | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) -diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp -index ad3e7df1..b3218c78 100644 ---- a/examples/llava/clip.cpp -+++ b/examples/llava/clip.cpp -@@ -30,6 +30,19 @@ - #include +diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp +index 41ba45a7..cdd8ca44 100644 +--- a/tools/mtmd/clip.cpp ++++ b/tools/mtmd/clip.cpp +@@ -31,6 +31,19 @@ #include + #include +#if defined(_WIN32) +#define WIN32_LEAN_AND_MEAN @@ -32,8 +32,8 @@ index ad3e7df1..b3218c78 100644 + struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL}; - //#define CLIP_DEBUG_FUNCTIONS -@@ -1971,7 +1984,29 @@ struct clip_model_loader { + enum ffn_op_type { +@@ -2190,7 +2203,29 @@ struct clip_model_loader { { std::vector read_buf; @@ -63,7 +63,7 @@ index ad3e7df1..b3218c78 100644 if (!fin) { throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str())); } -@@ -1998,7 +2033,11 @@ struct clip_model_loader { +@@ -2217,7 +2252,11 @@ struct clip_model_loader { ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); } } diff --git a/llama/patches/0005-solar-pro.patch b/llama/patches/0005-solar-pro.patch index bf0fe310..c630f243 100644 --- a/llama/patches/0005-solar-pro.patch +++ b/llama/patches/0005-solar-pro.patch @@ -138,7 +138,7 @@ index 7ee6a5b7..48dce407 100644 }; diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp -index ea73a8a7..a012aeae 100644 +index 4cce5166..7f6617fa 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -439,6 +439,7 @@ namespace GGUFMeta { @@ -150,10 +150,10 @@ index ea73a8a7..a012aeae 100644 llama_model_loader::llama_model_loader( const std::string & fname, diff --git a/src/llama-model.cpp b/src/llama-model.cpp -index 822e2bb2..572378c9 100644 +index 3a4e72a3..831b68c0 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp -@@ -1386,6 +1386,21 @@ void llama_model::load_hparams(llama_model_loader & ml) { +@@ -1402,6 +1402,21 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; @@ -175,7 +175,7 @@ index 822e2bb2..572378c9 100644 case LLM_ARCH_WAVTOKENIZER_DEC: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); -@@ -3741,6 +3756,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) { +@@ -3774,6 +3789,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); @@ -210,7 +210,7 @@ index 822e2bb2..572378c9 100644 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, 
n_embd}, 0); layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); -@@ -12342,6 +12385,165 @@ struct llm_build_chameleon : public llm_graph_context { +@@ -12397,6 +12440,165 @@ struct llm_build_chameleon : public llm_graph_context { } }; @@ -376,7 +376,7 @@ index 822e2bb2..572378c9 100644 struct llm_build_wavtokenizer_dec : public llm_graph_context { llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { ggml_tensor * cur; -@@ -13092,6 +13294,10 @@ llm_graph_result_ptr llama_model::build_graph( +@@ -13157,6 +13359,10 @@ llm_graph_result_ptr llama_model::build_graph( { llm = std::make_unique(*this, params, gf); } break; @@ -387,7 +387,7 @@ index 822e2bb2..572378c9 100644 case LLM_ARCH_WAVTOKENIZER_DEC: { llm = std::make_unique(*this, params, gf); -@@ -13238,6 +13444,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { +@@ -13301,6 +13507,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_GRANITE: case LLM_ARCH_GRANITE_MOE: case LLM_ARCH_CHAMELEON: @@ -396,10 +396,10 @@ index 822e2bb2..572378c9 100644 return LLAMA_ROPE_TYPE_NORM; diff --git a/src/llama-model.h b/src/llama-model.h -index 95eca002..856e6042 100644 +index 6bdec263..43746c7d 100644 --- a/src/llama-model.h +++ b/src/llama-model.h -@@ -64,6 +64,7 @@ enum llm_type { +@@ -65,6 +65,7 @@ enum llm_type { LLM_TYPE_15B, LLM_TYPE_16B, LLM_TYPE_20B, @@ -407,7 +407,7 @@ index 95eca002..856e6042 100644 LLM_TYPE_27B, LLM_TYPE_30B, LLM_TYPE_32B, -@@ -311,6 +312,8 @@ struct llama_layer { +@@ -315,6 +316,8 @@ struct llama_layer { struct ggml_tensor * ffn_up_scale = nullptr; struct ggml_tensor * ffn_down_scale = nullptr; diff --git a/llama/patches/0006-add-mllama-support.patch b/llama/patches/0006-add-mllama-support.patch index 9283224f..05f85ec3 100644 --- a/llama/patches/0006-add-mllama-support.patch +++ b/llama/patches/0006-add-mllama-support.patch @@ -5,88 +5,27 @@ Subject: [PATCH] add mllama support adds support for the llama 3.2 vision architecture --- - examples/llava/llava.cpp | 5 +- - examples/llava/mtmd.cpp | 6 +- ggml/src/ggml-backend-reg.cpp | 6 +- include/llama.h | 6 + src/llama-arch.cpp | 44 +++++ src/llama-arch.h | 10 ++ src/llama-batch.cpp | 3 + - src/llama-context.cpp | 25 ++- + src/llama-context.cpp | 23 ++- src/llama-context.h | 1 + src/llama-cparams.h | 1 + src/llama-graph.cpp | 25 +++ src/llama-graph.h | 12 ++ src/llama-hparams.cpp | 4 + src/llama-hparams.h | 7 + - src/llama-kv-cache.cpp | 12 +- + src/llama-kv-cache.cpp | 14 +- src/llama-model-loader.cpp | 2 + - src/llama-model.cpp | 309 +++++++++++++++++++++++++++++++++- + src/llama-model.cpp | 311 +++++++++++++++++++++++++++++++++- src/llama-model.h | 12 ++ src/llama-quant.cpp | 4 +- - 19 files changed, 473 insertions(+), 21 deletions(-) + tools/mtmd/llava.cpp | 5 +- + tools/mtmd/mtmd-helper.cpp | 7 +- + 19 files changed, 475 insertions(+), 22 deletions(-) -diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp -index c00d16ae..bab027b5 100644 ---- a/examples/llava/llava.cpp -+++ b/examples/llava/llava.cpp -@@ -457,7 +457,7 @@ struct llava_embd_batch { - std::vector seq_ids; - std::vector logits; - llama_batch batch; -- llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { -+ llava_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { - pos .resize(n_tokens); - n_seq_id.resize(n_tokens); - seq_ids .resize(n_tokens + 1); -@@ 
-469,6 +469,7 @@ struct llava_embd_batch { - /*n_tokens =*/ n_tokens, - /*tokens =*/ nullptr, - /*embd =*/ embd, -+ /*n_embd =*/ n_embd, - /*pos =*/ pos.data(), - /*n_seq_id =*/ n_seq_id.data(), - /*seq_id =*/ seq_ids.data(), -@@ -492,7 +493,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_ - n_eval = n_batch; - } - float * embd = image_embed->embed+i*n_embd; -- llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0); -+ llava_embd_batch llava_batch = llava_embd_batch(embd, n_embd, n_eval, *n_past, 0); - if (llama_decode(ctx_llama, llava_batch.batch)) { - LOG_ERR("%s : failed to eval\n", __func__); - return false; -diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp -index 7081fd73..c14ac501 100644 ---- a/examples/llava/mtmd.cpp -+++ b/examples/llava/mtmd.cpp -@@ -476,7 +476,7 @@ struct decode_embd_batch { - std::vector seq_ids; - std::vector logits; - llama_batch batch; -- decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) { -+ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) { - pos .resize(n_tokens * n_pos_per_embd); - n_seq_id.resize(n_tokens); - seq_ids .resize(n_tokens + 1); -@@ -487,6 +487,7 @@ struct decode_embd_batch { - /*n_tokens =*/ n_tokens, - /*tokens =*/ nullptr, - /*embd =*/ embd, -+ /*n_embd =*/ n_embd, - /*pos =*/ pos.data(), - /*n_seq_id =*/ n_seq_id.data(), - /*seq_id =*/ seq_ids.data(), -@@ -610,7 +611,8 @@ int32_t mtmd_helper_eval(mtmd_context * ctx, - int32_t i_batch = 0; - int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch; - float * embd = mtmd_get_output_embd(ctx); -- decode_embd_batch batch_embd(embd, n_tokens, n_pos_per_embd, n_mmproj_embd); -+ int n_embd = llama_model_n_embd(llama_get_model(lctx)); -+ decode_embd_batch batch_embd(embd, n_embd, n_tokens, n_past, 0); - - const int nx = mtmd_image_tokens_get_nx(chunk.tokens_image.get()); - const int ny = mtmd_image_tokens_get_ny(chunk.tokens_image.get()); diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 405d8e31..82ae1b5b 100644 --- a/ggml/src/ggml-backend-reg.cpp @@ -105,10 +44,10 @@ index 405d8e31..82ae1b5b 100644 register_backend(ggml_backend_rpc_reg()); #endif diff --git a/include/llama.h b/include/llama.h -index 06c56395..f1628e88 100644 +index abedebdb..41beef21 100644 --- a/include/llama.h +++ b/include/llama.h -@@ -256,6 +256,7 @@ extern "C" { +@@ -258,6 +258,7 @@ extern "C" { llama_token * token; float * embd; @@ -116,15 +55,15 @@ index 06c56395..f1628e88 100644 llama_pos * pos; int32_t * n_seq_id; llama_seq_id ** seq_id; -@@ -358,6 +359,7 @@ extern "C" { - bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU +@@ -365,6 +366,7 @@ extern "C" { bool flash_attn; // whether to use flash attention [EXPERIMENTAL] bool no_perf; // whether to measure performance timings + bool op_offload; // whether to offload host tensor operations to device + bool cross_attn; // whether to use cross attention + }; - // Abort callback - // if it returns true, execution of llama_decode() will be aborted -@@ -459,6 +461,10 @@ extern "C" { + // model quantization parameters +@@ -464,6 +466,10 @@ extern "C" { struct llama_context_params params), "use llama_init_from_model instead"); @@ -247,10 +186,10 @@ index 525c1b7d..bc8a4f0b 100644 LLM_TENSOR_CONVNEXT_DW, 
LLM_TENSOR_CONVNEXT_NORM, diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp -index 01d5ca57..8682b0e6 100644 +index a88b2fe3..241b316e 100644 --- a/src/llama-batch.cpp +++ b/src/llama-batch.cpp -@@ -316,6 +316,7 @@ struct llama_batch llama_batch_get_one( +@@ -320,6 +320,7 @@ struct llama_batch llama_batch_get_one( /*n_tokens =*/ n_tokens, /*tokens =*/ tokens, /*embd =*/ nullptr, @@ -258,7 +197,7 @@ index 01d5ca57..8682b0e6 100644 /*pos =*/ nullptr, /*n_seq_id =*/ nullptr, /*seq_id =*/ nullptr, -@@ -328,6 +329,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_ +@@ -332,6 +333,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_ /*n_tokens =*/ 0, /*tokens =*/ nullptr, /*embd =*/ nullptr, @@ -266,7 +205,7 @@ index 01d5ca57..8682b0e6 100644 /*pos =*/ nullptr, /*n_seq_id =*/ nullptr, /*seq_id =*/ nullptr, -@@ -336,6 +338,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_ +@@ -340,6 +342,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_ if (embd) { batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd); @@ -275,10 +214,10 @@ index 01d5ca57..8682b0e6 100644 batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc); } diff --git a/src/llama-context.cpp b/src/llama-context.cpp -index 9c1fe93f..cd06ad91 100644 +index dca22d8b..c22687e4 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp -@@ -851,7 +851,7 @@ float * llama_context::get_logits_ith(int32_t i) { +@@ -514,7 +514,7 @@ float * llama_context::get_logits_ith(int32_t i) { throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs)); } @@ -287,7 +226,7 @@ index 9c1fe93f..cd06ad91 100644 } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what()); #ifndef NDEBUG -@@ -972,6 +972,10 @@ void llama_context::set_warmup(bool value) { +@@ -632,6 +632,10 @@ void llama_context::set_warmup(bool value) { cparams.warmup = value; } @@ -298,16 +237,16 @@ index 9c1fe93f..cd06ad91 100644 void llama_context::set_adapter_lora( llama_adapter_lora * adapter, float scale) { -@@ -1047,7 +1051,7 @@ int llama_context::encode(llama_batch & inp_batch) { +@@ -709,7 +713,7 @@ int llama_context::encode(llama_batch & inp_batch) { const int64_t n_embd = hparams.n_embd; -- sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true); -+ sbatch.from_batch(batch, batch.n_embd, /* simple_split */ true, /* logits_all */ true); +- llama_sbatch sbatch = llama_sbatch(batch, n_embd, /* simple_split */ true, /* logits_all */ true); ++ llama_sbatch sbatch = llama_sbatch(batch, batch.n_embd, /* simple_split */ true, /* logits_all */ true); const llama_ubatch ubatch = sbatch.split_simple(n_tokens); -@@ -1187,10 +1191,9 @@ int llama_context::decode(llama_batch & inp_batch) { +@@ -863,10 +867,9 @@ int llama_context::decode(llama_batch & inp_batch) { const llama_batch & batch = batch_allocr.batch; @@ -319,16 +258,16 @@ index 9c1fe93f..cd06ad91 100644 const int64_t n_tokens_all = batch.n_tokens; const int64_t n_embd = hparams.n_embd; -@@ -1238,7 +1241,7 @@ int llama_context::decode(llama_batch & inp_batch) { +@@ -1087,7 +1090,7 @@ int llama_context::decode(llama_batch & inp_batch) { + // make the outputs have the same order they had in the user-provided batch + // note: this is mostly relevant for recurrent models atm + if (!sorted_output) { +- const uint32_t n_vocab = model.vocab.n_tokens(); ++ 
const uint32_t n_vocab = model.hparams.n_vocab; + const uint32_t n_embd = model.hparams.n_embd; - const bool logits_all = n_outputs_all == n_tokens_all; - -- sbatch.from_batch(batch, n_embd, -+ sbatch.from_batch(batch, batch.n_embd, - /* simple_split */ !kv_self->recurrent, - /* logits_all */ logits_all); - -@@ -1472,12 +1475,11 @@ int llama_context::decode(llama_batch & inp_batch) { + GGML_ASSERT((size_t) n_outputs == out_ids.size()); +@@ -1142,12 +1145,11 @@ int llama_context::decode(llama_batch & inp_batch) { int32_t llama_context::output_reserve(int32_t n_outputs) { const auto & hparams = model.hparams; @@ -342,16 +281,7 @@ index 9c1fe93f..cd06ad91 100644 const auto n_embd = hparams.n_embd; // TODO: use a per-batch flag for logits presence instead -@@ -1545,7 +1547,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) { - void llama_context::output_reorder() { - auto & out_ids = sbatch.out_ids; - if (!out_ids.empty()) { -- const uint32_t n_vocab = model.vocab.n_tokens(); -+ const uint32_t n_vocab = model.hparams.n_vocab; - const uint32_t n_embd = model.hparams.n_embd; - - GGML_ASSERT((size_t) n_outputs == out_ids.size()); -@@ -2052,7 +2054,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) { +@@ -1682,7 +1684,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) { { LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__); @@ -360,15 +290,15 @@ index 9c1fe93f..cd06ad91 100644 io.write(&logits_size, sizeof(logits_size)); -@@ -2235,6 +2237,7 @@ llama_context_params llama_context_default_params() { - /*.offload_kqv =*/ true, +@@ -2091,6 +2093,7 @@ llama_context_params llama_context_default_params() { /*.flash_attn =*/ false, /*.no_perf =*/ true, + /*.op_offload =*/ true, + /*.cross_attn =*/ false, - /*.abort_callback =*/ nullptr, - /*.abort_callback_data =*/ nullptr, }; -@@ -2362,6 +2365,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) { + + return result; +@@ -2216,6 +2219,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) { ctx->set_warmup(warmup); } @@ -380,10 +310,10 @@ index 9c1fe93f..cd06ad91 100644 ctx->synchronize(); } diff --git a/src/llama-context.h b/src/llama-context.h -index 5457f077..a50c4afa 100644 +index c0ceacb1..c4ab242a 100644 --- a/src/llama-context.h +++ b/src/llama-context.h -@@ -65,6 +65,7 @@ struct llama_context { +@@ -71,6 +71,7 @@ struct llama_context { void set_embeddings (bool value); void set_causal_attn(bool value); void set_warmup(bool value); @@ -392,22 +322,22 @@ index 5457f077..a50c4afa 100644 void set_adapter_lora( llama_adapter_lora * adapter, diff --git a/src/llama-cparams.h b/src/llama-cparams.h -index 30e550f0..85ad91b9 100644 +index 246fa577..7a6156ce 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h -@@ -29,6 +29,7 @@ struct llama_cparams { - bool offload_kqv; - bool flash_attn; +@@ -31,6 +31,7 @@ struct llama_cparams { bool no_perf; -+ bool cross_attn; bool warmup; + bool op_offload; ++ bool cross_attn; enum llama_pooling_type pooling_type; + diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp -index fabb9ca2..b67216a4 100644 +index b0e3f635..f14869cf 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp -@@ -560,6 +560,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) { +@@ -532,6 +532,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) { } } @@ -420,7 +350,7 @@ index fabb9ca2..b67216a4 100644 // // llm_graph_context // -@@ -1532,6 +1538,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const { 
+@@ -1514,6 +1520,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const { return (llm_graph_input_attn_cross *) res->add_input(std::move(inp)); } @@ -447,10 +377,10 @@ index fabb9ca2..b67216a4 100644 llm_graph_input_attn_cross * inp, ggml_cgraph * gf, diff --git a/src/llama-graph.h b/src/llama-graph.h -index d0c8d321..0fe18150 100644 +index 832a8c09..5a322785 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h -@@ -86,6 +86,7 @@ public: +@@ -87,6 +87,7 @@ public: ggml_tensor * tokens = nullptr; // I32 [n_batch] ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch] @@ -458,7 +388,7 @@ index d0c8d321..0fe18150 100644 }; class llm_graph_input_pos : public llm_graph_input_i { -@@ -283,6 +284,16 @@ public: +@@ -284,6 +285,16 @@ public: const llama_cross * cross = nullptr; }; @@ -475,7 +405,7 @@ index d0c8d321..0fe18150 100644 // // llm_graph_result // -@@ -491,6 +502,7 @@ struct llm_graph_context { +@@ -495,6 +506,7 @@ struct llm_graph_context { ggml_tensor * build_inp_cls() const; ggml_tensor * build_inp_s_copy() const; ggml_tensor * build_inp_s_mask() const; @@ -535,11 +465,11 @@ index 48dce407..b6fc7e6d 100644 }; diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp -index 7c9d46d8..69f8d35a 100644 +index 3dcad65b..a7b0a7eb 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp -@@ -95,8 +95,16 @@ bool llama_kv_cache_unified::init( - return false; +@@ -100,8 +100,16 @@ llama_kv_cache_unified::llama_kv_cache_unified( + throw std::runtime_error("failed to create ggml context for kv cache"); } - ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size); @@ -557,8 +487,17 @@ index 7c9d46d8..69f8d35a 100644 ggml_format_name(k, "cache_k_l%d", i); ggml_format_name(v, "cache_v_l%d", i); k_l.push_back(k); +@@ -446,7 +454,7 @@ void llama_kv_cache_unified::set_full() { + llama_sbatch llama_kv_cache_unified::sbatch_init( + const llama_batch & batch, + bool logits_all) { +- return llama_sbatch(batch, hparams.n_embd, true, logits_all); ++ return llama_sbatch(batch, batch.n_embd, true, logits_all); + } + + llama_ubatch llama_kv_cache_unified::ubatch_next( diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp -index a012aeae..2e11507d 100644 +index 7f6617fa..2acfd4a8 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -315,6 +315,8 @@ namespace GGUFMeta { @@ -571,10 +510,10 @@ index a012aeae..2e11507d 100644 bool llama_model_loader::get_arr(const std::string & key, std::array & result, bool required) { const int kid = gguf_find_key(meta.get(), key.c_str()); diff --git a/src/llama-model.cpp b/src/llama-model.cpp -index 572378c9..9d099f11 100644 +index 831b68c0..e8298f56 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp -@@ -423,6 +423,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { +@@ -433,6 +433,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { // get general kv ml.get_key(LLM_KV_GENERAL_NAME, name, false); @@ -582,7 +521,7 @@ index 572378c9..9d099f11 100644 // everything past this point is not vocab-related if (hparams.vocab_only) { -@@ -434,6 +435,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { +@@ -444,6 +445,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer); ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false); ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false); @@ -590,7 +529,7 @@ index 572378c9..9d099f11 100644 if (arch == LLM_ARCH_WAVTOKENIZER_DEC) { 
ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features); -@@ -457,9 +459,11 @@ void llama_model::load_hparams(llama_model_loader & ml) { +@@ -467,9 +469,11 @@ void llama_model::load_hparams(llama_model_loader & ml) { std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0); std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0); std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0); @@ -602,7 +541,7 @@ index 572378c9..9d099f11 100644 // n_head_kv is optional, default to n_head hparams.n_head_kv_arr = hparams.n_head_arr; -@@ -512,7 +516,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { +@@ -522,7 +526,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false); @@ -611,7 +550,7 @@ index 572378c9..9d099f11 100644 if (hparams.n_rot != hparams.n_embd_head_k) { throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k)); } -@@ -575,6 +579,16 @@ void llama_model::load_hparams(llama_model_loader & ml) { +@@ -585,6 +589,16 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.use_kq_norm = false; } } break; @@ -628,7 +567,7 @@ index 572378c9..9d099f11 100644 case LLM_ARCH_DECI: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); -@@ -1562,7 +1576,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { +@@ -1581,7 +1595,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { const int64_t n_embd_head_v = hparams.n_embd_head_v; const int64_t n_ff = hparams.n_ff(); const int64_t n_embd_gqa = n_embd_v_gqa; @@ -637,7 +576,7 @@ index 572378c9..9d099f11 100644 const int64_t n_token_types = vocab.n_token_types(); const int64_t n_rot = hparams.n_rot; const int64_t n_expert = hparams.n_expert; -@@ -1815,6 +1829,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) { +@@ -1840,6 +1854,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } } } break; @@ -690,7 +629,7 @@ index 572378c9..9d099f11 100644 case LLM_ARCH_DECI: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); -@@ -4707,6 +4767,246 @@ struct llm_build_llama : public llm_graph_context { +@@ -4756,6 +4816,246 @@ struct llm_build_llama : public llm_graph_context { } }; @@ -832,7 +771,7 @@ index 572378c9..9d099f11 100644 + // self attention layer + + // rope freq factors for llama3; may return nullptr for llama2 and other models -+ ggml_tensor * rope_factors = static_cast(memory)->cbs.get_rope_factors(n_ctx_per_seq, il); ++ ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il); + + // compute Q and K and RoPE them + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -937,7 +876,16 @@ index 572378c9..9d099f11 100644 struct llm_build_deci : public llm_graph_context { llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; -@@ -13063,6 +13363,10 @@ llm_graph_result_ptr llama_model::build_graph( +@@ -12496,7 +12796,7 @@ struct llm_build_solar : public llm_graph_context { + // self-attention + { + // rope freq factors for llama3; may return nullptr for llama2 and other models +- ggml_tensor * rope_factors = static_cast(memory)->cbs.get_rope_factors(n_ctx_per_seq, il); ++ ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il); + + // compute Q and K and RoPE them + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); +@@ 
-13128,6 +13428,10 @@ llm_graph_result_ptr llama_model::build_graph( { llm = std::make_unique(*this, params, gf); } break; @@ -948,7 +896,7 @@ index 572378c9..9d099f11 100644 case LLM_ARCH_DECI: { llm = std::make_unique(*this, params, gf); -@@ -13424,6 +13728,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { +@@ -13489,6 +13793,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { // use what we call a normal RoPE, operating on pairs of consecutive head values case LLM_ARCH_LLAMA: case LLM_ARCH_LLAMA4: @@ -957,7 +905,7 @@ index 572378c9..9d099f11 100644 case LLM_ARCH_BAICHUAN: case LLM_ARCH_STARCODER: diff --git a/src/llama-model.h b/src/llama-model.h -index 856e6042..6be91282 100644 +index 43746c7d..9281e629 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -11,6 +11,7 @@ @@ -968,7 +916,7 @@ index 856e6042..6be91282 100644 struct llama_cparams; struct llama_ubatch; -@@ -73,6 +74,7 @@ enum llm_type { +@@ -74,6 +75,7 @@ enum llm_type { LLM_TYPE_40B, LLM_TYPE_65B, LLM_TYPE_70B, @@ -976,7 +924,7 @@ index 856e6042..6be91282 100644 LLM_TYPE_236B, LLM_TYPE_290B, LLM_TYPE_314B, -@@ -314,6 +316,16 @@ struct llama_layer { +@@ -318,6 +320,16 @@ struct llama_layer { struct ggml_tensor * bskcn_tv = nullptr; @@ -994,7 +942,7 @@ index 856e6042..6be91282 100644 struct llama_layer_convnext convnext; diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp -index 7dc54227..223e1f3f 100644 +index 820d5128..56531980 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -639,7 +639,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: @@ -1008,3 +956,72 @@ index 7dc54227..223e1f3f 100644 } size_t total_size_org = 0; +diff --git a/tools/mtmd/llava.cpp b/tools/mtmd/llava.cpp +index ebef8b3c..b0eb79bb 100644 +--- a/tools/mtmd/llava.cpp ++++ b/tools/mtmd/llava.cpp +@@ -462,7 +462,7 @@ struct llava_embd_batch { + std::vector seq_ids; + std::vector logits; + llama_batch batch; +- llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { ++ llava_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { + pos .resize(n_tokens); + n_seq_id.resize(n_tokens); + seq_ids .resize(n_tokens + 1); +@@ -474,6 +474,7 @@ struct llava_embd_batch { + /*n_tokens =*/ n_tokens, + /*tokens =*/ nullptr, + /*embd =*/ embd, ++ /*n_embd =*/ n_embd, + /*pos =*/ pos.data(), + /*n_seq_id =*/ n_seq_id.data(), + /*seq_id =*/ seq_ids.data(), +@@ -497,7 +498,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_ + n_eval = n_batch; + } + float * embd = image_embed->embed+i*n_embd; +- llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0); ++ llava_embd_batch llava_batch = llava_embd_batch(embd, n_embd, n_eval, *n_past, 0); + if (llama_decode(ctx_llama, llava_batch.batch)) { + LOG_ERR("%s : failed to eval\n", __func__); + return false; +diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp +index 7a328867..61ebdd43 100644 +--- a/tools/mtmd/mtmd-helper.cpp ++++ b/tools/mtmd/mtmd-helper.cpp +@@ -58,7 +58,7 @@ struct decode_embd_batch { + std::vector seq_ids; + std::vector logits; + llama_batch batch; +- decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) { ++ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) { + pos 
.resize(n_tokens * n_pos_per_embd); + n_seq_id.resize(n_tokens); + seq_ids .resize(n_tokens + 1); +@@ -69,6 +69,7 @@ struct decode_embd_batch { + /*n_tokens =*/ n_tokens, + /*tokens =*/ nullptr, + /*embd =*/ embd, ++ /*n_embd =*/ n_embd, + /*pos =*/ pos.data(), + /*n_seq_id =*/ n_seq_id.data(), + /*seq_id =*/ seq_ids.data(), +@@ -131,6 +132,7 @@ struct decode_embd_batch { + /*n_tokens =*/ n_tokens, + /*tokens =*/ nullptr, + /*embd =*/ batch.embd + offset * n_mmproj_embd, ++ /*n_embd =*/ batch.n_embd, + /*pos =*/ pos_ptr, + /*n_seq_id =*/ batch.n_seq_id + offset, + /*seq_id =*/ batch.seq_id + offset, +@@ -166,7 +168,8 @@ int32_t mtmd_helper_decode_image_chunk( + int32_t n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens); + int32_t i_batch = 0; + int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch; +- decode_embd_batch batch_embd(encoded_embd, n_tokens, n_pos_per_embd, n_mmproj_embd); ++ int n_embd = llama_model_n_embd(llama_get_model(lctx)); ++ decode_embd_batch batch_embd(encoded_embd, n_embd, n_tokens, n_past, seq_id); + + const int nx = mtmd_image_tokens_get_nx(image_tokens); + const int ny = mtmd_image_tokens_get_ny(image_tokens); diff --git a/llama/patches/0007-add-unpad-operator.patch b/llama/patches/0007-add-unpad-operator.patch index 50acfc63..fc45aeff 100644 --- a/llama/patches/0007-add-unpad-operator.patch +++ b/llama/patches/0007-add-unpad-operator.patch @@ -18,7 +18,7 @@ adds the unpad operator to GGML 10 files changed, 223 insertions(+), 2 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h -index 1b8603e7..53ef31b2 100644 +index e91dedf1..8dc107ba 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -489,6 +489,7 @@ extern "C" { @@ -29,7 +29,7 @@ index 1b8603e7..53ef31b2 100644 GGML_OP_ARANGE, GGML_OP_TIMESTEP_EMBEDDING, GGML_OP_ARGSORT, -@@ -1777,6 +1778,15 @@ extern "C" { +@@ -1781,6 +1782,15 @@ extern "C" { int p0, int p1); @@ -46,10 +46,10 @@ index 1b8603e7..53ef31b2 100644 // timesteps: [N,] // return: [N, dim] diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c -index 64405449..34624cca 100644 +index a30e67f2..835e6495 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c -@@ -1964,6 +1964,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm +@@ -1951,6 +1951,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_pad_reflect_1d(params, tensor); } break; @@ -60,7 +60,7 @@ index 64405449..34624cca 100644 case GGML_OP_ARANGE: { ggml_compute_forward_arange(params, tensor); -@@ -2287,6 +2291,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { +@@ -2274,6 +2278,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { case GGML_OP_UPSCALE: case GGML_OP_PAD: case GGML_OP_PAD_REFLECT_1D: @@ -69,10 +69,10 @@ index 64405449..34624cca 100644 case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_ARGSORT: diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp -index 7413192b..becdae07 100644 +index 955fec59..1868a10c 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp -@@ -6703,6 +6703,61 @@ void ggml_compute_forward_pad_reflect_1d( +@@ -6690,6 +6690,61 @@ void ggml_compute_forward_pad_reflect_1d( } } @@ -147,10 +147,10 @@ index dc081b9e..a7125555 100644 void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_argsort(const struct ggml_compute_params * params, 
struct ggml_tensor * dst); diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index 04ce764e..491acccb 100644 +index cb0d8528..6fe86674 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu -@@ -2223,6 +2223,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg +@@ -2238,6 +2238,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_PAD: ggml_cuda_op_pad(ctx, dst); break; @@ -160,7 +160,7 @@ index 04ce764e..491acccb 100644 case GGML_OP_ARANGE: ggml_cuda_op_arange(ctx, dst); break; -@@ -3197,6 +3200,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g +@@ -3212,6 +3215,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_UPSCALE: return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST; case GGML_OP_PAD: @@ -233,10 +233,10 @@ index 8fd386b0..e2ededc3 100644 void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m -index 425524d0..112abef6 100644 +index 1b56f858..7641247e 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m -@@ -341,6 +341,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte +@@ -347,6 +347,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte GGML_METAL_KERNEL_TYPE_UPSCALE_F32, GGML_METAL_KERNEL_TYPE_PAD_F32, GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, @@ -244,7 +244,7 @@ index 425524d0..112abef6 100644 GGML_METAL_KERNEL_TYPE_ARANGE_F32, GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, -@@ -1277,6 +1278,7 @@ @implementation GGMLMetalClass +@@ -1294,6 +1295,7 @@ @implementation GGMLMetalClass GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true); @@ -252,7 +252,7 @@ index 425524d0..112abef6 100644 GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true); -@@ -1647,6 +1649,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex +@@ -1655,6 +1657,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex case GGML_OP_POOL_2D: case GGML_OP_PAD: case GGML_OP_PAD_REFLECT_1D: @@ -260,7 +260,7 @@ index 425524d0..112abef6 100644 case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_ARGSORT: case GGML_OP_LEAKY_RELU: -@@ -4047,6 +4050,36 @@ static bool ggml_metal_encode_node( +@@ -4184,6 +4187,36 @@ static bool ggml_metal_encode_node( const int nth = MIN(1024, ne0); @@ -298,10 +298,10 @@ index 425524d0..112abef6 100644 } break; case GGML_OP_ARANGE: diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal -index 9f4147e9..6ceb3cef 100644 +index 9cfddf45..080a943b 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal -@@ -2975,6 +2975,51 @@ kernel void kernel_pad_reflect_1d_f32( +@@ -3121,6 +3121,51 @@ kernel void 
kernel_pad_reflect_1d_f32( } } @@ -354,7 +354,7 @@ index 9f4147e9..6ceb3cef 100644 device char * dst, constant ggml_metal_kargs_arange & args, diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c -index 7654ae17..3c57aff8 100644 +index 8a654624..6b034d35 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -923,6 +923,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { @@ -391,7 +391,7 @@ index 7654ae17..3c57aff8 100644 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); -@@ -4270,6 +4272,25 @@ struct ggml_tensor * ggml_pad_reflect_1d( +@@ -4274,6 +4276,25 @@ struct ggml_tensor * ggml_pad_reflect_1d( return result; } diff --git a/llama/patches/0008-fix-deepseek-deseret-regex.patch b/llama/patches/0008-fix-deepseek-deseret-regex.patch index 5b4753bf..ff4b5757 100644 --- a/llama/patches/0008-fix-deepseek-deseret-regex.patch +++ b/llama/patches/0008-fix-deepseek-deseret-regex.patch @@ -12,10 +12,10 @@ regex 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp -index a9ee9f03..1306864e 100644 +index 806c1b3d..10f34d33 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp -@@ -296,7 +296,7 @@ struct llm_tokenizer_bpe : llm_tokenizer { +@@ -298,7 +298,7 @@ struct llm_tokenizer_bpe : llm_tokenizer { case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM: regex_exprs = { "[\r\n]", diff --git a/llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch b/llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch index e4b2a408..81c17969 100644 --- a/llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch +++ b/llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch @@ -15,33 +15,102 @@ but this can leave a cache that still does not have adequate space even after defragmentation is triggered. Instead, we should do multiple batches of processing until everything is complete. 
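The batching approach described above amounts to slicing the prepared move list into graph-sized chunks and running one defrag graph per chunk. A minimal standalone sketch of that loop follows; it borrows the node budget used by the patch below (roughly 6 graph nodes per move per layer, with ~2 nodes per layer of headroom), and the per-chunk callback is a placeholder for the real scheduler reset / graph build / compute sequence, not the patched function itself.

// Illustrative sketch only -- mirrors the chunking idea in the patch below.
#include <algorithm>
#include <cstdint>
#include <functional>
#include <vector>

struct kv_defrag_move { uint32_t src, dst, len; };  // same fields as llama_kv_defrag_move

// Process the prepared moves in batches small enough for one compute graph.
void defrag_in_chunks(const std::vector<kv_defrag_move> & moves,
                      uint32_t n_layer, uint32_t n_max_nodes,
                      const std::function<void(const std::vector<kv_defrag_move> &)> & run_chunk) {
    if (n_max_nodes <= 2*n_layer) {
        return;  // not enough graph headroom to schedule any moves
    }
    // each move costs ~6 nodes per layer; keep ~2 nodes per layer as overhead
    const uint32_t max_moves = (n_max_nodes - 2*n_layer) / (6*n_layer);
    if (max_moves == 0) {
        return;
    }

    for (size_t i = 0; i < moves.size(); i += max_moves) {
        const size_t end = std::min(i + max_moves, moves.size());
        std::vector<kv_defrag_move> chunk(moves.begin() + i, moves.begin() + end);

        // placeholder for: reset the scheduler, build a graph containing only
        // the copies for this chunk, allocate it, set inputs, and compute it
        run_chunk(chunk);
    }
}

Each batch is a full scheduler round trip, so a heavily fragmented cache may take several graph executions to compact, but no single graph can exceed the node limit.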
--- - src/llama-context.cpp | 105 +++++++++++++---------------------------- - src/llama-context.h | 4 +- - src/llama-kv-cache.cpp | 39 +++------------ - src/llama-kv-cache.h | 9 +++- - 4 files changed, 51 insertions(+), 106 deletions(-) + src/llama-context.h | 1 + + src/llama-kv-cache.cpp | 107 ++++++++++++++--------------------------- + src/llama-kv-cache.h | 12 ++++- + 3 files changed, 47 insertions(+), 73 deletions(-) -diff --git a/src/llama-context.cpp b/src/llama-context.cpp -index cd06ad91..77177c5e 100644 ---- a/src/llama-context.cpp -+++ b/src/llama-context.cpp -@@ -583,13 +583,12 @@ llm_graph_result_ptr llama_context::build_kv_self_shift( +diff --git a/src/llama-context.h b/src/llama-context.h +index c4ab242a..9970dfc6 100644 +--- a/src/llama-context.h ++++ b/src/llama-context.h +@@ -5,6 +5,7 @@ + #include "llama-cparams.h" + #include "llama-graph.h" + #include "llama-adapter.h" ++#include "llama-kv-cache.h" - llm_graph_result_ptr llama_context::build_kv_self_defrag( - ggml_context * ctx0, -- ggml_cgraph * gf) const { -+ ggml_cgraph * gf, -+ const std::vector & moves) const { + #include "ggml-cpp.h" + #include "ggml-opt.h" +diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp +index a7b0a7eb..1a50c034 100644 +--- a/src/llama-kv-cache.cpp ++++ b/src/llama-kv-cache.cpp +@@ -372,8 +372,6 @@ void llama_kv_cache_unified::commit() { + } + + bool llama_kv_cache_unified::update(llama_context & lctx) { +- bool need_reserve = false; +- + auto * sched = lctx.get_sched(); + + if (has_shift) { +@@ -396,8 +394,6 @@ bool llama_kv_cache_unified::update(llama_context & lctx) { + res->set_inputs(nullptr); + + lctx.graph_compute(gf, false); +- +- need_reserve = true; + } + + { +@@ -411,27 +407,36 @@ bool llama_kv_cache_unified::update(llama_context & lctx) { + + if (do_defrag) { + LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__); ++ const uint32_t n_max_nodes = lctx.graph_max_nodes(); ++ const uint32_t max_moves = (n_max_nodes - 2*model.hparams.n_layer)/(6*model.hparams.n_layer); ++ if (!defrag_prepare(n_max_nodes)) { ++ LLAMA_LOG_ERROR("%s: failed to prepare defragmentation\n", __func__); ++ return false; ++ } ++ ++ for (std::size_t i = 0; i < defrag_info.moves.size(); i += max_moves) { ++ std::vector chunk; ++ auto end = std::min(i + max_moves, defrag_info.moves.size()); ++ chunk.assign(defrag_info.moves.begin() + i, defrag_info.moves.begin() + end); + +- if (defrag_prepare(lctx.graph_max_nodes())) { + ggml_backend_sched_reset(sched); + + auto * gf = lctx.graph_init(); + +- auto res = build_graph_defrag(lctx.get_cparams(), lctx.get_ctx_compute(), gf); ++ auto res = build_graph_defrag(lctx.get_cparams(), lctx.get_ctx_compute(), gf, chunk); + + ggml_backend_sched_alloc_graph(sched, gf); + + res->set_inputs(nullptr); + + lctx.graph_compute(gf, false); +- +- need_reserve = true; + } + + do_defrag = false; + } + +- return need_reserve; ++ // we never need to reserve a worst case graph ++ return false; + } + + void llama_kv_cache_unified::defrag_sched(float thold) { +@@ -715,11 +720,10 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift( + llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag( + const llama_cparams & cparams, + ggml_context * ctx, +- ggml_cgraph * gf) const { ++ ggml_cgraph * gf, ++ const std::vector & moves) const { auto res = std::make_unique(); - const auto & hparams = model.hparams; - -- const auto & ids = kv_self->defrag_info.ids; +- const auto & ids = defrag_info.ids; - #if 0 // CPU defrag // -@@ -661,32 +660,20 @@ llm_graph_result_ptr 
llama_context::build_kv_self_defrag( +@@ -791,32 +795,20 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag( ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size()); } #else @@ -63,188 +132,63 @@ index cd06ad91..77177c5e 100644 const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); - ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self->k_l[il], + ggml_tensor * view_k_src = ggml_view_2d(ctx, k_l[il], - n_embd_k_gqa, nm, + n_embd_k_gqa, move.len, - ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa), -- ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*i)); -+ ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*move.src)); + ggml_row_size(k_l[il]->type, n_embd_k_gqa), +- ggml_row_size(k_l[il]->type, n_embd_k_gqa*i)); ++ ggml_row_size(k_l[il]->type, n_embd_k_gqa*move.src)); - ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self->k_l[il], + ggml_tensor * view_k_dst = ggml_view_2d(ctx, k_l[il], - n_embd_k_gqa, nm, + n_embd_k_gqa, move.len, - ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa), -- ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*id)); -+ ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*move.dst)); + ggml_row_size(k_l[il]->type, n_embd_k_gqa), +- ggml_row_size(k_l[il]->type, n_embd_k_gqa*id)); ++ ggml_row_size(k_l[il]->type, n_embd_k_gqa*move.dst)); ggml_tensor * view_v_src; ggml_tensor * view_v_dst; -@@ -694,34 +681,30 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag( +@@ -824,31 +816,29 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag( if (cparams.flash_attn) { // NOTE: the V cache is not transposed when using flash attention - view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il], + view_v_src = ggml_view_2d(ctx, v_l[il], - n_embd_v_gqa, nm, + n_embd_v_gqa, move.len, - ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa), -- ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*i)); -+ ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*move.src)); + ggml_row_size(v_l[il]->type, n_embd_v_gqa), +- ggml_row_size(v_l[il]->type, n_embd_v_gqa*i)); ++ ggml_row_size(v_l[il]->type, n_embd_v_gqa*move.dst)); - view_v_dst = ggml_view_2d(ctx0, kv_self->v_l[il], + view_v_dst = ggml_view_2d(ctx, v_l[il], - n_embd_v_gqa, nm, -+ n_embd_v_gqa, move.len, - ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa), -- ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*id)); -+ ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*move.dst)); ++ move.len, n_embd_v_gqa, + ggml_row_size(v_l[il]->type, n_embd_v_gqa), +- ggml_row_size(v_l[il]->type, n_embd_v_gqa*id)); ++ ggml_row_size(v_l[il]->type, move.src)); } else { - view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il], + view_v_src = ggml_view_2d(ctx, v_l[il], - nm, n_embd_v_gqa, + move.len, n_embd_v_gqa, - ggml_row_size(kv_self->v_l[il]->type, kv_self->size), -- ggml_row_size(kv_self->v_l[il]->type, i)); -+ ggml_row_size(kv_self->v_l[il]->type, move.src)); + ggml_row_size(v_l[il]->type, size), +- ggml_row_size(v_l[il]->type, i)); ++ ggml_row_size(v_l[il]->type, move.src)); - view_v_dst = ggml_view_2d(ctx0, kv_self->v_l[il], + view_v_dst = ggml_view_2d(ctx, v_l[il], - nm, n_embd_v_gqa, + move.len, n_embd_v_gqa, - ggml_row_size(kv_self->v_l[il]->type, kv_self->size), -- ggml_row_size(kv_self->v_l[il]->type, id)); -+ ggml_row_size(kv_self->v_l[il]->type, move.dst)); + ggml_row_size(v_l[il]->type, size), +- ggml_row_size(v_l[il]->type, id)); ++ ggml_row_size(v_l[il]->type, move.dst)); } - ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, 
view_k_dst)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst)); + ggml_build_forward_expand(gf, ggml_cpy(ctx, view_k_src, view_k_dst)); + ggml_build_forward_expand(gf, ggml_cpy(ctx, view_v_src, view_v_dst)); } - - i += nm - 1; } -- -- //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes); - #endif - return res; -@@ -730,8 +713,6 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag( - void llama_context::kv_self_update() { - auto & kv = kv_self; - -- bool need_reserve = false; -- - if (kv->has_shift) { - if (!kv->get_can_shift()) { - GGML_ABORT("The current context does not support K-shift"); -@@ -752,8 +733,6 @@ void llama_context::kv_self_update() { - res->set_inputs(nullptr); - - graph_compute(gf, false); -- -- need_reserve = true; - } - - { -@@ -768,49 +747,28 @@ void llama_context::kv_self_update() { - // defragment the KV cache if needed - if (kv->do_defrag) { - LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__); -+ const uint32_t n_max_nodes = graph_max_nodes(); -+ const uint32_t max_moves = (n_max_nodes - 2*model.hparams.n_layer)/(6*model.hparams.n_layer); -+ if (!kv->defrag_prepare(n_max_nodes)) { -+ LLAMA_LOG_ERROR("%s: failed to prepare defragmentation\n", __func__); -+ return; -+ } - -- if (kv->defrag_prepare(graph_max_nodes())) { -- ggml_backend_sched_reset(sched.get()); -+ for (std::size_t i = 0; i < kv_self->defrag_info.moves.size(); i += max_moves) { -+ std::vector chunk; -+ auto end = std::min(i + max_moves, kv_self->defrag_info.moves.size()); -+ chunk.assign(kv_self->defrag_info.moves.begin() + i, kv_self->defrag_info.moves.begin() + end); - -+ ggml_backend_sched_reset(sched.get()); - auto * gf = graph_init(); -- -- auto res = build_kv_self_defrag(ctx_compute.get(), gf); -- -+ auto res = build_kv_self_defrag(ctx_compute.get(), gf, chunk); - ggml_backend_sched_alloc_graph(sched.get(), gf); -- - res->set_inputs(nullptr); -- - graph_compute(gf, false); -- -- need_reserve = true; - } - - kv->do_defrag = false; - } -- -- // reserve a worst case graph if needed -- if (need_reserve) { -- LLAMA_LOG_DEBUG("%s: reserving a worst case graph\n", __func__); -- -- // build worst-case graph -- uint32_t n_seqs = 1; // TODO: worst-case number of sequences -- uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); -- -- // simulate full KV cache -- kv_self->n = kv_self->size; -- -- llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph -- llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; -- -- auto * gf = graph_init(); -- graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT); -- -- // initialize scheduler with the worst-case graph -- ggml_backend_sched_reset(sched.get()); -- if (!ggml_backend_sched_reserve(sched.get(), gf)) { -- LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); -- } -- } - } - - enum llama_pooling_type llama_context::pooling_type() const { -@@ -1294,9 +1252,12 @@ int llama_context::decode(llama_batch & inp_batch) { - // find KV slot - { - if (!kv_self->find_slot(ubatch)) { -- LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens); -- -- return 1; -+ kv_self->defrag(); -+ kv_self_update(); -+ if (!kv_self->find_slot(ubatch)) { -+ LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens); -+ return 1; -+ } - } - - if (!kv_self->recurrent) { -diff 
--git a/src/llama-context.h b/src/llama-context.h -index a50c4afa..30f84bfd 100644 ---- a/src/llama-context.h -+++ b/src/llama-context.h -@@ -5,6 +5,7 @@ - #include "llama-cparams.h" - #include "llama-graph.h" - #include "llama-adapter.h" -+#include "llama-kv-cache.h" - - #include "ggml-cpp.h" - -@@ -179,7 +180,8 @@ private: - - llm_graph_result_ptr build_kv_self_defrag( - ggml_context * ctx0, -- ggml_cgraph * gf) const; -+ ggml_cgraph * gf, -+ const std::vector & moves) const; - - // TODO: read/write lora adapters and cvec - size_t state_write_data(llama_io_write_i & io); -diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp -index 69f8d35a..35a750d3 100644 ---- a/src/llama-kv-cache.cpp -+++ b/src/llama-kv-cache.cpp -@@ -781,17 +781,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { + //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes); +@@ -865,17 +855,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { assert(n_used <= n_kv); @@ -263,7 +207,7 @@ index 69f8d35a..35a750d3 100644 // determine which KV cells to move where // -@@ -799,10 +789,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { +@@ -883,10 +863,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { // // if ids[i] == i || ids[i] == n_kv, then cell i is not moved // @@ -275,7 +219,7 @@ index 69f8d35a..35a750d3 100644 for (uint32_t i0 = 0; i0 < n_used; ++i0) { const auto & cell0 = cells[i0]; -@@ -851,19 +838,11 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { +@@ -935,19 +912,11 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { // are we moving a continuous block of memory? bool cont = false; @@ -295,7 +239,7 @@ index 69f8d35a..35a750d3 100644 cont = false; continue; } -@@ -879,8 +858,10 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { +@@ -963,8 +932,10 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { head = n_used; if (!cont) { @@ -307,7 +251,7 @@ index 69f8d35a..35a750d3 100644 } nf++; -@@ -890,22 +871,16 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { +@@ -974,22 +945,16 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { } } @@ -325,37 +269,47 @@ index 69f8d35a..35a750d3 100644 return false; } -- LLAMA_LOG_DEBUG("(tmp log) KV defrag cell moves: %u\n", n_moves); +- LLAMA_LOG_DEBUG("%s: (tmp log) KV defrag cell moves: %u\n", __func__, n_moves); - -- LLAMA_LOG_DEBUG("expected gf nodes: %u\n", 6*n_moves*n_layer); +- LLAMA_LOG_DEBUG("%s: expected gf nodes: %u\n", __func__, 6*n_moves*n_layer); + // LLAMA_LOG_DEBUG("(tmp log) KV defrag cell moves: %u\n", n_moves); return true; } diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h -index 56c74035..25cbcb56 100644 +index bf3b4b6a..928b9712 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h -@@ -43,6 +43,13 @@ private: +@@ -82,6 +82,13 @@ struct llama_kv_cache_guard { + private: llama_kv_cache * kv; }; - ++ +// block of KV slots to move when defragging +struct llama_kv_defrag_move { + uint32_t src; + uint32_t dst; + uint32_t len; +}; -+ - struct llama_kv_cell { - llama_pos pos = -1; - llama_pos delta = 0; -@@ -131,7 +138,7 @@ public: - // defrag + // + // llama_kv_cache_unified +@@ -207,7 +214,7 @@ private: + + // defrag struct { - std::vector ids; + std::vector moves; } defrag_info; // return true if cells have been moved +@@ -249,7 +256,8 @@ private: + llm_graph_result_ptr build_graph_defrag( + const llama_cparams & cparams, + ggml_context * ctx, +- ggml_cgraph 
* gf) const; ++ ggml_cgraph * gf, ++ const std::vector & moves) const; + + void state_write_meta(llama_io_write_i & io, const std::vector> & cell_ranges, llama_seq_id seq_id = -1) const; + void state_write_data(llama_io_write_i & io, const std::vector> & cell_ranges) const; diff --git a/llama/patches/0012-add-phony-target-ggml-cpu-for-all-cpu-variants.patch b/llama/patches/0012-add-phony-target-ggml-cpu-for-all-cpu-variants.patch index 6de840a6..21c1fc42 100644 --- a/llama/patches/0012-add-phony-target-ggml-cpu-for-all-cpu-variants.patch +++ b/llama/patches/0012-add-phony-target-ggml-cpu-for-all-cpu-variants.patch @@ -8,7 +8,7 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants 1 file changed, 2 insertions(+) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt -index 43d9fc4f..4c0d3824 100644 +index ddea5ad3..45918bf6 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -279,6 +279,7 @@ function(ggml_add_cpu_backend_variant tag_name) diff --git a/llama/patches/0013-remove-amx.patch b/llama/patches/0013-remove-amx.patch index c2703237..296a3761 100644 --- a/llama/patches/0013-remove-amx.patch +++ b/llama/patches/0013-remove-amx.patch @@ -9,7 +9,7 @@ disable amx as it reduces performance on some systems 1 file changed, 4 deletions(-) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt -index 4c0d3824..79c26312 100644 +index 45918bf6..0beaed86 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -296,10 +296,6 @@ if (GGML_CPU_ALL_VARIANTS) diff --git a/llama/patches/0014-fix-string-arr-kv-loading.patch b/llama/patches/0014-fix-string-arr-kv-loading.patch index 5d94ca2c..07cb397b 100644 --- a/llama/patches/0014-fix-string-arr-kv-loading.patch +++ b/llama/patches/0014-fix-string-arr-kv-loading.patch @@ -53,15 +53,15 @@ index 381a9c7d..e45b453d 100644 } diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp -index 1306864e..d6515ff6 100644 +index 10f34d33..b098bb25 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp -@@ -1459,7 +1459,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { +@@ -1471,7 +1471,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { + const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx); + GGML_ASSERT(pc_type == GGUF_TYPE_INT8 || pc_type == GGUF_TYPE_UINT8); - const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str()); - if (precompiled_charsmap_keyidx != -1) { -- size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx); -+ size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx); +- const size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx); ++ const size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx); const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx); precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap); #ifdef IS_BIG_ENDIAN diff --git a/llama/patches/0015-ollama-debug-tensor.patch b/llama/patches/0015-ollama-debug-tensor.patch index 79d997c7..d8f9fc8a 100644 --- a/llama/patches/0015-ollama-debug-tensor.patch +++ b/llama/patches/0015-ollama-debug-tensor.patch @@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor 1 file changed, 6 insertions(+) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c -index 34624cca..59bd3c62 100644 +index 835e6495..3902894b 100644 --- 
a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -15,6 +15,8 @@ @@ -20,7 +20,7 @@ index 34624cca..59bd3c62 100644 #if defined(_MSC_VER) || defined(__MINGW32__) #include // using malloc.h with MSC/MINGW #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) -@@ -2859,6 +2861,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { +@@ -2846,6 +2848,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { ggml_compute_forward(¶ms, node); diff --git a/llama/patches/0016-add-ollama-vocab-for-grammar-support.patch b/llama/patches/0016-add-ollama-vocab-for-grammar-support.patch index 26a91ad9..ee81800e 100644 --- a/llama/patches/0016-add-ollama-vocab-for-grammar-support.patch +++ b/llama/patches/0016-add-ollama-vocab-for-grammar-support.patch @@ -184,7 +184,7 @@ index f8c291de..2a3a62db 100644 const char * grammar_root, bool lazy, diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp -index c0a5f934..75731053 100644 +index 804b11e0..15a10ca8 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1466,7 +1466,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) { diff --git a/llama/patches/0017-ggml-Don-t-assert-fail-when-tensor-data-changes-1322.patch b/llama/patches/0017-ggml-Don-t-assert-fail-when-tensor-data-changes-1322.patch deleted file mode 100644 index b3424c9e..00000000 --- a/llama/patches/0017-ggml-Don-t-assert-fail-when-tensor-data-changes-1322.patch +++ /dev/null @@ -1,38 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Jesse Gross -Date: Thu, 1 May 2025 13:46:10 -0700 -Subject: [PATCH] ggml: Don't assert fail when tensor data changes (#13222) - -The following scenario will cause an assertion failure in the graph -allocator: - - Build and allocate a graph containing a tensor with a non-NULL data - pointer - - Build and allocate a new graph where that data is NULL - -Result: -ggml-alloc.c:819: GGML_ASSERT(talloc->buffer_id >= 0) failed - -This happens during revalidation because we think that memory should -have been previously allocated based on the current graph but in -reality the previous graph was different. In this situation, we -should do a full reallocation pass. 
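The failure mode described above comes down to the allocator's revalidation step trusting a stale per-node allocation record. A compact sketch of the relaxed check, paraphrasing the hunk below: the struct is reduced to the two fields the check touches, the required size is passed in directly, and a false return is what the caller takes to mean "do a full reallocation pass" rather than asserting.

// Illustration only -- not the ggml-alloc.c implementation.
#include <cstddef>

// reduced stand-in for the cached per-node allocation record
struct tensor_alloc_view {
    int    buffer_id;  // < 0 means "no buffer assigned in the cached plan"
    size_t size_max;   // size reserved for this node when the plan was made
};

// Returns true if the cached allocation still fits the node; false tells the
// caller to rebuild the allocation plan instead of tripping an assert.
static bool node_allocation_still_valid(bool has_data, bool has_view_src,
                                        const tensor_alloc_view & talloc,
                                        size_t needed_size) {
    if (!has_data && !has_view_src) {
        if (talloc.buffer_id < 0) {
            // the node carried pre-set data when the previous graph was
            // allocated, but no longer does -> the cached plan is stale
            return false;
        }
        return talloc.size_max >= needed_size;
    }
    return true;  // pre-set data or a view: nothing for the allocator to place
}

In the scenario from the commit message, the second graph reaches the buffer_id < 0 branch, so the allocator falls back to a full reallocation pass instead of failing the assertion.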
---- - ggml/src/ggml-alloc.c | 5 ++++- - 1 file changed, 4 insertions(+), 1 deletion(-) - -diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c -index a3d3f690..5fd379f6 100644 ---- a/ggml/src/ggml-alloc.c -+++ b/ggml/src/ggml-alloc.c -@@ -816,7 +816,10 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * - static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) { - size_t node_size = 0; - if (!node->data && !node->view_src) { -- GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API -+ // If we previously had data but don't now then reallocate -+ if (talloc->buffer_id < 0) { -+ return false; -+ } - node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node); - } - return talloc->size_max >= node_size; diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index ff340561..e97795a6 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -406,6 +406,7 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, C.int(len(schedBackends)), C.size_t(maxGraphNodes), C._Bool(len(gpus) > 1 && slices.Contains(gpus, output.d)), + C._Bool(false), ), schedBackends: schedBackends, schedBufts: schedBufts, diff --git a/ml/backend/ggml/ggml/include/ggml-backend.h b/ml/backend/ggml/ggml/include/ggml-backend.h index 64671495..778927f6 100644 --- a/ml/backend/ggml/ggml/include/ggml-backend.h +++ b/ml/backend/ggml/ggml/include/ggml-backend.h @@ -38,7 +38,7 @@ extern "C" { GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size); GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft); - GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); + GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft); GGML_API ggml_backend_dev_t ggml_backend_buft_get_device (ggml_backend_buffer_type_t buft); @@ -59,7 +59,7 @@ extern "C" { GGML_API enum ggml_status ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer); GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer); - GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor); GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value); GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer); GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); @@ -248,7 +248,7 @@ extern "C" { // preferrably to run on the same backend as the buffer ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); - sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false); + sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false, true); // initialize buffers from a max size graph (optional) 
reserve_graph = build_graph(sched, max_batch_size); @@ -289,7 +289,7 @@ extern "C" { typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data); // Initialize a backend scheduler, backends with low index are given priority over backends with high index - GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel); + GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload); GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); // Initialize backend buffers from a measure graph diff --git a/ml/backend/ggml/ggml/include/ggml-cpp.h b/ml/backend/ggml/ggml/include/ggml-cpp.h index a12342c2..48aa7968 100644 --- a/ml/backend/ggml/ggml/include/ggml-cpp.h +++ b/ml/backend/ggml/ggml/include/ggml-cpp.h @@ -24,7 +24,7 @@ typedef std::unique_ptr gguf_context_ptr; struct ggml_gallocr_deleter { void operator()(ggml_gallocr_t galloc) { ggml_gallocr_free(galloc); } }; -typedef std::unique_ptr ggml_gallocr_ptr; +typedef std::unique_ptr ggml_gallocr_ptr; // ggml-backend diff --git a/ml/backend/ggml/ggml/include/ggml-opt.h b/ml/backend/ggml/ggml/include/ggml-opt.h index eb5eab9d..da0c24b4 100644 --- a/ml/backend/ggml/ggml/include/ggml-opt.h +++ b/ml/backend/ggml/ggml/include/ggml-opt.h @@ -37,13 +37,16 @@ extern "C" { // ====== Dataset ====== GGML_API ggml_opt_dataset_t ggml_opt_dataset_init( - int64_t ne_datapoint, // number of elements per datapoint - int64_t ne_label, // number of elements per label - int64_t ndata, // total number of datapoints/labels - int64_t ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied) + enum ggml_type type_data, // the type for the internal data tensor + enum ggml_type type_label, // the type for the internal labels tensor + int64_t ne_datapoint, // number of elements per datapoint + int64_t ne_label, // number of elements per label + int64_t ndata, // total number of datapoints/labels + int64_t ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied) GGML_API void ggml_opt_dataset_free(ggml_opt_dataset_t dataset); // get underlying tensors that store the data + GGML_API int64_t ggml_opt_dataset_ndata (ggml_opt_dataset_t dataset); GGML_API struct ggml_tensor * ggml_opt_dataset_data (ggml_opt_dataset_t dataset); // shape = [ne_datapoint, ndata] GGML_API struct ggml_tensor * ggml_opt_dataset_labels(ggml_opt_dataset_t dataset); // shape = [nd_label, ndata] @@ -56,13 +59,19 @@ extern "C" { struct ggml_tensor * data_batch, // shape = [ne_datapoint, ndata_batch] struct ggml_tensor * labels_batch, // shape = [ne_label, ndata_batch] int64_t ibatch); + GGML_API void ggml_opt_dataset_get_batch_host( + ggml_opt_dataset_t dataset, + void * data_batch, + size_t nb_data_batch, + void * labels_batch, + int64_t ibatch); // ====== Model / Context ====== enum ggml_opt_build_type { - GGML_OPT_BUILD_TYPE_FORWARD, - GGML_OPT_BUILD_TYPE_GRAD, - GGML_OPT_BUILD_TYPE_OPT, + GGML_OPT_BUILD_TYPE_FORWARD = 10, + GGML_OPT_BUILD_TYPE_GRAD = 20, + GGML_OPT_BUILD_TYPE_OPT = 30, }; // parameters that control which optimizer is used and how said optimizer tries to find the minimal loss @@ -81,20 +90,22 @@ extern "C" { // userdata can be used to pass arbitrary data typedef struct ggml_opt_optimizer_params 
(*ggml_opt_get_optimizer_params)(void * userdata); - // returns the default optimizer params (constant) + // returns the default optimizer params (constant, hard-coded values) // userdata is not used GGML_API struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata); + // casts userdata to ggml_opt_optimizer_params and returns it + GGML_API struct ggml_opt_optimizer_params ggml_opt_get_constant_optimizer_params(void * userdata); + // parameters for initializing a new optimization context struct ggml_opt_params { ggml_backend_sched_t backend_sched; // defines which backends are used to construct the compute graphs - struct ggml_context * ctx_compute; // created in user code, holds non-static tensors - - // the forward graph is defined by inputs and outputs - // those tensors and all tensors inbetween are not intended to be reusable between multiple optimization contexts - struct ggml_tensor * inputs; - struct ggml_tensor * outputs; + // by default the forward graph needs to be reconstructed for each eval + // if ctx_compute, inputs, and outputs are set the graphs are instead allocated statically + struct ggml_context * ctx_compute; + struct ggml_tensor * inputs; + struct ggml_tensor * outputs; enum ggml_opt_loss_type loss_type; enum ggml_opt_build_type build_type; @@ -107,12 +118,9 @@ extern "C" { // get parameters for an optimization context with defaults set where possible // parameters for which no sensible defaults exist are supplied as arguments to this function - GGML_API ggml_opt_params ggml_opt_default_params( - ggml_backend_sched_t backend_sched, - struct ggml_context * ctx_compute, - struct ggml_tensor * inputs, - struct ggml_tensor * outputs, - enum ggml_opt_loss_type loss_type); + GGML_API struct ggml_opt_params ggml_opt_default_params( + ggml_backend_sched_t backend_sched, + enum ggml_opt_loss_type loss_type); GGML_API ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params); GGML_API void ggml_opt_free(ggml_opt_context_t opt_ctx); @@ -121,6 +129,7 @@ extern "C" { GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer); // get underlying tensors that store data + // if not using static graphs these pointers become invalid with the next call to ggml_opt_alloc GGML_API struct ggml_tensor * ggml_opt_inputs( ggml_opt_context_t opt_ctx); // forward graph input tensor GGML_API struct ggml_tensor * ggml_opt_outputs( ggml_opt_context_t opt_ctx); // forward graph output tensor GGML_API struct ggml_tensor * ggml_opt_labels( ggml_opt_context_t opt_ctx); // labels to compare outputs against @@ -128,11 +137,12 @@ extern "C" { GGML_API struct ggml_tensor * ggml_opt_pred( ggml_opt_context_t opt_ctx); // predictions made by outputs GGML_API struct ggml_tensor * ggml_opt_ncorrect(ggml_opt_context_t opt_ctx); // number of matching predictions between outputs and labels + // get the gradient accumulator for a node from the forward graph GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node); // ====== Optimization Result ====== - GGML_API ggml_opt_result_t ggml_opt_result_init(); + GGML_API ggml_opt_result_t ggml_opt_result_init(void); GGML_API void ggml_opt_result_free(ggml_opt_result_t result); GGML_API void ggml_opt_result_reset(ggml_opt_result_t result); @@ -144,11 +154,20 @@ extern "C" { // ====== Computation ====== - // do forward pass, increment result if not NULL - GGML_API void ggml_opt_forward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result); + // if not using static graphs, this 
function must be called prior to ggml_opt_alloc + GGML_API void ggml_opt_prepare_alloc( + ggml_opt_context_t opt_ctx, + struct ggml_context * ctx_compute, + struct ggml_cgraph * gf, + struct ggml_tensor * inputs, + struct ggml_tensor * outputs); - // do forward pass, increment result if not NULL, do backward pass - GGML_API void ggml_opt_forward_backward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result); + // allocate the next graph for evaluation, either forward or forward + backward + // must be called exactly once prior to calling ggml_opt_eval + GGML_API void ggml_opt_alloc(ggml_opt_context_t opt_ctx, bool backward); + + // do forward pass, increment result if not NULL, do backward pass if allocated + GGML_API void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result); // ############################################################################ // ## The high-level functions start here. They do not depend on any private ## @@ -200,9 +219,9 @@ extern "C" { // fit model defined by inputs and outputs to dataset GGML_API void ggml_opt_fit( ggml_backend_sched_t backend_sched, // backend scheduler for constructing the compute graphs - ggml_context * ctx_compute, // context with temporarily allocated tensors to calculate the outputs - ggml_tensor * inputs, // input tensor with shape [ne_datapoint, ndata_batch] - ggml_tensor * outputs, // output tensor, must have shape [ne_label, ndata_batch] if labels are used + struct ggml_context * ctx_compute, // context with temporarily allocated tensors to calculate the outputs + struct ggml_tensor * inputs, // input tensor with shape [ne_datapoint, ndata_batch] + struct ggml_tensor * outputs, // output tensor, must have shape [ne_label, ndata_batch] if labels are used ggml_opt_dataset_t dataset, // dataset with data and optionally also labels enum ggml_opt_loss_type loss_type, // loss to minimize ggml_opt_get_optimizer_params get_opt_pars, // callback to get optimizer params, userdata is pointer to epoch (of type int64_t) diff --git a/ml/backend/ggml/ggml/include/ggml.h b/ml/backend/ggml/ggml/include/ggml.h index 53ef31b2..8dc107ba 100644 --- a/ml/backend/ggml/ggml/include/ggml.h +++ b/ml/backend/ggml/ggml/include/ggml.h @@ -674,11 +674,15 @@ extern "C" { GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor); GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars + // returns whether the tensor elements can be iterated over with a flattened index (no gaps, no permutation) GGML_API bool ggml_is_contiguous (const struct ggml_tensor * tensor); GGML_API bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous() GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1 GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2 + // returns whether the tensor elements are allocated as one contiguous block of memory (no gaps, but permutation ok) + GGML_API bool ggml_is_contiguously_allocated(const struct ggml_tensor * tensor); + // true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor); @@ -765,7 +769,7 @@ extern "C" { // Tensor flags GGML_API void ggml_set_input(struct ggml_tensor * tensor); GGML_API void ggml_set_output(struct ggml_tensor * tensor); - GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor); + GGML_API void ggml_set_param(struct 
ggml_tensor * tensor); GGML_API void ggml_set_loss(struct ggml_tensor * tensor); // @@ -935,7 +939,7 @@ extern "C" { GGML_API struct ggml_tensor * ggml_repeat_back( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b); + struct ggml_tensor * b); // sum up values that are adjacent in dims > 0 instead of repeated with same stride // concat a and b along dim // used in stable-diffusion @@ -2055,15 +2059,14 @@ extern "C" { GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); GGML_API void ggml_build_backward_expand( - struct ggml_context * ctx_static, // context for static gradients (loss + gradient accumulation) - struct ggml_context * ctx_compute, // context for gradient computation - struct ggml_cgraph * cgraph, - bool accumulate); // whether or not gradients should be accumulated, requires static allocation of tensors in ctx_static + struct ggml_context * ctx, // context for gradient computation + struct ggml_cgraph * cgraph, + struct ggml_tensor ** grad_accs); // graph allocation in a context GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads); - GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph); + GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph, bool force_grads); GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst); GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1 GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph); diff --git a/ml/backend/ggml/ggml/src/CMakeLists.txt b/ml/backend/ggml/ggml/src/CMakeLists.txt index 79c26312..0beaed86 100644 --- a/ml/backend/ggml/ggml/src/CMakeLists.txt +++ b/ml/backend/ggml/ggml/src/CMakeLists.txt @@ -214,7 +214,7 @@ add_library(ggml target_link_libraries(ggml PUBLIC ggml-base) if (CMAKE_SYSTEM_NAME MATCHES "Linux") - target_link_libraries(ggml PRIVATE dl stdc++fs) + target_link_libraries(ggml PRIVATE dl) endif() function(ggml_add_backend_library backend) diff --git a/ml/backend/ggml/ggml/src/ggml-backend.cpp b/ml/backend/ggml/ggml/src/ggml-backend.cpp index dd11f304..0ce73a99 100644 --- a/ml/backend/ggml/ggml/src/ggml-backend.cpp +++ b/ml/backend/ggml/ggml/src/ggml-backend.cpp @@ -56,7 +56,7 @@ size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) { return SIZE_MAX; } -size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) { +size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) { // get_alloc_size is optional, defaults to ggml_nbytes if (buft->iface.get_alloc_size) { size_t size = buft->iface.get_alloc_size(buft, tensor); @@ -151,7 +151,7 @@ size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) { return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer)); } -size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { +size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor) { return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor); } @@ -674,6 +674,8 @@ struct ggml_backend_sched { char * context_buffer; size_t 
context_buffer_size; + bool op_offload; + int debug; }; @@ -766,7 +768,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor); // check if a backend with higher prio wants to offload the op - if (src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) { + if (sched->op_offload && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) { for (int b = 0; b < src_backend_id; b++) { if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) { SET_CAUSE(tensor, "1.off"); @@ -1109,7 +1111,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg const int node_backend_id = tensor_backend_id(node); - assert(node_backend_id != -1); // all nodes should be assigned by now + assert(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback // check if we should start a new split based on the sources of the current node bool need_new_split = false; @@ -1452,7 +1454,8 @@ ggml_backend_sched_t ggml_backend_sched_new( ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, - bool parallel) { + bool parallel, + bool op_offload) { GGML_ASSERT(n_backends > 0); GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS); GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU); @@ -1497,6 +1500,7 @@ ggml_backend_sched_t ggml_backend_sched_new( } sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends); + sched->op_offload = op_offload; ggml_backend_sched_reset(sched); diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/CMakeLists.txt b/ml/backend/ggml/ggml/src/ggml-cpu/CMakeLists.txt index 9a3085be..bdaec288 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ml/backend/ggml/ggml/src/ggml-cpu/CMakeLists.txt @@ -428,6 +428,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name) ${KLEIDIAI_SRC}/kai/ukernels/ ${KLEIDIAI_SRC}/kai/ukernels/matmul/ ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/ + ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/ ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/) set(ARCH_FLAGS_TEMP "${ARCH_FLAGS}") @@ -438,17 +439,19 @@ function(ggml_add_cpu_backend_variant_impl tag_name) string(FIND "${ARCH_FLAGS_TEMP}" "+i8mm" I8MM_ENABLED) string(FIND "${ARCH_FLAGS_TEMP}" "+sme" SME_ENABLED) - set(PRIVATE_ARCH_FLAGS ${ARCH_FLAGS}) + set(PRIVATE_ARCH_FLAGS ${ARCH_FLAGS_TEMP}) - list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c) - list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c) - list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c) - list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c) + list(APPEND GGML_KLEIDIAI_SOURCES + ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c + ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c + ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c + 
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c) if (NOT DOTPROD_ENABLED MATCHES -1) - list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c) - list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.c) - list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c) + list(APPEND GGML_KLEIDIAI_SOURCES + ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c + ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.c + ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c) endif() if (NOT I8MM_ENABLED MATCHES -1) @@ -456,9 +459,13 @@ function(ggml_add_cpu_backend_variant_impl tag_name) endif() if (NOT SME_ENABLED MATCHES -1) - list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c) - list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c) - set(PRIVATE_ARCH_FLAGS "${PRIVATE_ARCH_FLAGS}+sve+sve2") + list(APPEND GGML_KLEIDIAI_SOURCES + ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c + ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c + ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c + ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c + ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c) + set(PRIVATE_ARCH_FLAGS "-fno-tree-vectorize;${PRIVATE_ARCH_FLAGS}+sve+sve2") endif() set_source_files_properties(${GGML_KLEIDIAI_SOURCES} PROPERTIES COMPILE_OPTIONS "${PRIVATE_ARCH_FLAGS}") diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp index 175cba32..8ff6d64a 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp @@ -72,8 +72,6 @@ static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wro #if defined(__GNUC__) #pragma GCC diagnostic ignored "-Woverlength-strings" -#elif defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data #endif #define UNUSED GGML_UNUSED diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-quants.c b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-quants.c index 91a81bdc..ccd0651e 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-quants.c +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-quants.c @@ -20,12 +20,6 @@ #define GROUP_MAX_EPS_IQ1_M 1e-7f #define GROUP_MAX_EPS_IQ1_S 1e-12f -#if defined(_MSC_VER) -// disable "possible loss of data" to avoid warnings for hundreds of casts -// we should just be careful :) -#pragma warning(disable: 
4244 4267) -#endif - #define UNUSED GGML_UNUSED // some compilers don't provide _mm256_set_m128i, e.g. gcc 7 @@ -6596,7 +6590,118 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi } *s = hsum_float_8(acc); +#elif defined(__VXE__) || defined(__VXE2__) + uint32_t aux[3]; + uint32_t utmp[4]; + const int32x4_t v_z = vec_splat_s32(0); + const uint8x16_t v_3m = vec_splat_u8(0x03); + + const uint8x16_t v_0c = vec_splat_u8(1); + const uint8x16_t v_1c = vec_sl(v_0c, 1); + const uint8x16_t v_2c = vec_sl(v_0c, 2); + const uint8x16_t v_3c = vec_sl(v_0c, 3); + + uint8x16_t q3h[4]; + uint8x16_t q3b[2]; + int8x16_t q3bytes[4]; + int8x16_t q8bytes[4]; + uint8x16_t qhbits[2]; + + float sum = 0; + + for (int i = 0; i < nb; ++i) { + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + + const uint8_t * restrict x0l = x[i].qs; + const uint8_t * restrict x0h = x[i].hmask; + const int8_t * restrict y0 = y[i].qs; + + qhbits[0] = vec_xl(0 , x0h); + qhbits[1] = vec_xl(16, x0h); + + int32_t isum = 0; + + memcpy(aux, x[i].scales, 12); + utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4); + utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4); + utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4); + utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4); + + int8_t * scale = (int8_t *)utmp; + for (int j = 0; j < 16; ++j) scale[j] -= 32; + + for (int j = 0; j < QK_K/128; ++j) { + int32x4_t isum0, isum1, isum2, isum3; + + q3b[0] = vec_xl(0 , x0l); + q3b[1] = vec_xl(16, x0l); + x0l += 32; + + q8bytes[0] = vec_xl(0 , y0); + q8bytes[1] = vec_xl(16 , y0); + q8bytes[2] = vec_xl(32 , y0); + q8bytes[3] = vec_xl(48 , y0); + q8bytes[4] = vec_xl(64 , y0); + q8bytes[5] = vec_xl(80 , y0); + q8bytes[6] = vec_xl(96 , y0); + q8bytes[7] = vec_xl(112, y0); + y0 += 128; + + q3h[0] = vec_sl(vec_andc(v_0c, qhbits[0]), 2); + q3h[1] = vec_sl(vec_andc(v_0c, qhbits[1]), 2); + q3h[2] = vec_sl(vec_andc(v_1c, qhbits[0]), 1); + q3h[3] = vec_sl(vec_andc(v_1c, qhbits[1]), 1); + + q3bytes[0] = vec_sub((int8x16_t)vec_and(q3b[0], v_3m), (int8x16_t)q3h[0]); + q3bytes[1] = vec_sub((int8x16_t)vec_and(q3b[1], v_3m), (int8x16_t)q3h[1]); + q3bytes[2] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 2), v_3m), (int8x16_t)q3h[2]); + q3bytes[3] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 2), v_3m), (int8x16_t)q3h[3]); + + isum0 = ggml_vec_dot(v_z, q3bytes[0], q8bytes[0]); + isum1 = ggml_vec_dot(v_z, q3bytes[1], q8bytes[1]); + isum2 = ggml_vec_dot(v_z, q3bytes[2], q8bytes[2]); + isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[3]); + + isum += (isum0[0] + isum0[1] + isum0[2] + isum0[3]) * scale[0]; + isum += (isum1[0] + isum1[1] + isum1[2] + isum1[3]) * scale[1]; + isum += (isum2[0] + isum2[1] + isum2[2] + isum2[3]) * scale[2]; + isum += (isum3[0] + isum3[1] + isum3[2] + isum3[3]) * scale[3]; + + scale += 4; + + q3h[0] = vec_andc(v_2c, qhbits[0]); + q3h[1] = vec_andc(v_2c, qhbits[1]); + q3h[2] = vec_sr(vec_andc(v_3c, qhbits[0]), 1); + q3h[3] = vec_sr(vec_andc(v_3c, qhbits[1]), 1); + + q3bytes[0] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 4), v_3m), (int8x16_t)q3h[0]); + q3bytes[1] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 4), v_3m), (int8x16_t)q3h[1]); + q3bytes[2] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 6), v_3m), (int8x16_t)q3h[2]); + q3bytes[3] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 6), v_3m), (int8x16_t)q3h[3]); + + isum0 = ggml_vec_dot(v_z, q3bytes[0], q8bytes[4]); + isum1 = ggml_vec_dot(v_z, q3bytes[1], q8bytes[5]); + isum2 = ggml_vec_dot(v_z, q3bytes[2], 
q8bytes[6]); + isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[7]); + + isum += (isum0[0] + isum0[1] + isum0[2] + isum0[3]) * scale[0]; + isum += (isum1[0] + isum1[1] + isum1[2] + isum1[3]) * scale[1]; + isum += (isum2[0] + isum2[1] + isum2[2] + isum2[3]) * scale[2]; + isum += (isum3[0] + isum3[1] + isum3[2] + isum3[3]) * scale[3]; + + scale += 4; + + if (j == 0) { + qhbits[0] = vec_sr(qhbits[0], 4); + qhbits[1] = vec_sr(qhbits[1], 4); + } + } + + sum += d * isum; + } + + *s = sum; #else // scalar version // This function is written like this so the compiler can manage to vectorize most of it diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c index 59bd3c62..3902894b 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c @@ -52,19 +52,6 @@ #include "llamafile/sgemm.h" #endif -#if defined(_MSC_VER) -// disable "possible loss of data" to avoid hundreds of casts -// we should just be careful :) -#pragma warning(disable: 4244 4267) - -// disable POSIX deprecation warnings -// these functions are never going away, anyway -#pragma warning(disable: 4996) - -// unreachable code because of multiple instances of code after GGML_ABORT -#pragma warning(disable: 4702) -#endif - // Note: once we move threading into a separate C++ file // will use std::hardware_destructive_interference_size instead of hardcoding it here // and we'll use C++ attribute syntax. diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.cpp b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.cpp index 4b688a67..e013e8b4 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -11,24 +11,26 @@ #include #ifdef GGML_USE_CPU_HBM -#include "ggml-cpu-hbm.h" +# include "ggml-cpu-hbm.h" #endif #ifdef GGML_USE_CPU_KLEIDIAI -#include "kleidiai/kleidiai.h" -#endif - -#if defined(__APPLE__) -#include -#include +# include "kleidiai/kleidiai.h" #endif #if defined(_WIN32) -#define WIN32_LEAN_AND_MEAN -#ifndef NOMINMAX - #define NOMINMAX +# define WIN32_LEAN_AND_MEAN +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include +#else +# include #endif -#include + +#if defined(__APPLE__) +# include +# include #endif // ggml-backend interface @@ -70,8 +72,10 @@ static ggml_backend_buffer_type_t * ggml_backend_cpu_device_get_extra_buffers_ty } static bool ggml_backend_cpu_is_extra_buffer_type(ggml_backend_buffer_type_t buft) { - for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) { - if (extra && extra == buft) return true; + for (auto * extra : ggml_backend_cpu_get_extra_buffers_type()) { + if (extra && extra == buft) { + return true; + } } return false; } @@ -330,9 +334,18 @@ static const char * ggml_backend_cpu_device_get_description(ggml_backend_dev_t d } static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { - // TODO - *free = 0; - *total = 0; +#ifdef _WIN32 + MEMORYSTATUSEX status; + status.dwLength = sizeof(status); + GlobalMemoryStatusEx(&status); + *total = status.ullTotalPhys; + *free = status.ullAvailPhys; +#else + long pages = sysconf(_SC_PHYS_PAGES); + long page_size = sysconf(_SC_PAGE_SIZE); + *total = pages * page_size; + *free = *total; +#endif GGML_UNUSED(dev); } diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ml/backend/ggml/ggml/src/ggml-cpu/llamafile/sgemm.cpp index f6374f78..1d46158f 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/llamafile/sgemm.cpp +++ 
b/ml/backend/ggml/ggml/src/ggml-cpu/llamafile/sgemm.cpp @@ -1054,6 +1054,493 @@ class tinyBLAS_Q0_AVX { } \ } \ +template +class tinyBLAS_BF16_PPC { + public: + tinyBLAS_BF16_PPC(int64_t k, + const TA *A, int64_t lda, + const TB *B, int64_t ldb, + TC *C, int64_t ldc, + int ith, int nth) + : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) { + } + + void matmul(int64_t m, int64_t n) { + mnpack(0, m, 0, n); + } + + private: + void vector_permute_store(vec_t *c, int numVec, unsigned char *vecOffset) { + vec_t t[8], s[8]; + vec_t swiz1 = {0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23}; + vec_t swiz2 = {8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31}; + vec_t swiz3 = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23}; + vec_t swiz4 = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31}; + + if (numVec == 2) { + t[0] = vec_perm(c[0], c[1], swiz1); + t[1] = vec_perm(c[2], c[3], swiz1); + s[0] = vec_perm(t[0], t[1], swiz3); + s[1] = vec_perm(t[0], t[1], swiz4); + vec_xst(s[0], 0, (vec_t*)vecOffset); + vec_xst(s[1], 0, (vec_t*)(vecOffset + 16)); + } else if (numVec == 4) { + t[0] = vec_perm(c[0], c[1], swiz1); + t[1] = vec_perm(c[0], c[1], swiz2); + t[2] = vec_perm(c[2], c[3], swiz1); + t[3] = vec_perm(c[2], c[3], swiz2); + s[0] = vec_perm(t[0], t[2], swiz3); + s[1] = vec_perm(t[0], t[2], swiz4); + s[2] = vec_perm(t[1], t[3], swiz3); + s[3] = vec_perm(t[1], t[3], swiz4); + for (int i = 0; i < 4; ++i) + vec_xst(s[i], 0, (vec_t*)(vecOffset + i * 16)); + } else if (numVec == 8) { + for (int i = 0; i < 4; i += 2) { + t[i+0] = vec_perm(c[i+0], c[i+1], swiz1); + t[i+1] = vec_perm(c[i+0], c[i+1], swiz2); + } + for (int i = 4; i < 8; i += 2) { + t[i+0] = vec_perm(c[i+0], c[i+1], swiz1); + t[i+1] = vec_perm(c[i+0], c[i+1], swiz2); + } + s[0] = vec_perm(t[0], t[2], swiz3); + s[1] = vec_perm(t[0], t[2], swiz4); + s[2] = vec_perm(t[1], t[3], swiz3); + s[3] = vec_perm(t[1], t[3], swiz4); + s[4] = vec_perm(t[4], t[6], swiz3); + s[5] = vec_perm(t[4], t[6], swiz4); + s[6] = vec_perm(t[5], t[7], swiz3); + s[7] = vec_perm(t[5], t[7], swiz4); + for (int i = 0; i < 8; ++i) + vec_xst(s[i], 0, (vec_t*)(vecOffset + i * 16)); + } + } + + void packNormal(const TA* a, int64_t lda, int rows, int cols, unsigned char* vec) { + int64_t i, j; + TA *aoffset = NULL; + unsigned char *vecOffset = NULL; + TA * aoffsets[8]; + vector unsigned char c_arr[8]; + aoffset = const_cast(a); + vecOffset = vec; + j = (rows >> 3); + if (j > 0) { + do { + if (cols == 4) { + aoffsets[0] = aoffset; + for (int it = 1; it < 4; ++it) + aoffsets[it] = aoffsets[it-1] + lda; + aoffset += 4 * lda; + for (int i = 0; i < 4; ++i) + c_arr[i] = vec_xl(0, (vector unsigned char*)aoffsets[i]); + vector_permute_store(c_arr, 4, vecOffset); + for (int i = 0; i<4; i++) + aoffsets[i] = aoffsets[i]+lda; + vecOffset +=64; + } + i = (cols >> 3); + if (i > 0) { + aoffsets[0] = aoffset; + for (int it = 1; it < 8; ++it) { + aoffsets[it] = aoffsets[it-1] + lda; + } + aoffset += 8 * lda; + do { + for (int it = 0; it < 8; ++it) + c_arr[it] = vec_xl(0, (vector unsigned char*)aoffsets[it]); + vector_permute_store(c_arr, 8, vecOffset); + for (int it = 0; it < 8; ++it) + aoffsets[it] = aoffsets[it] + 8*lda; + vecOffset += 128; + i--; + } while(i > 0); + } + j--; + } while(j > 0); + } + if (rows & 4) { + aoffsets[0] = aoffset; + for (int it = 1; it < 4; ++it) + aoffsets[it] = aoffsets[it-1] + lda; + aoffset += 4 * lda; + if (cols == 4) { + for (int it = 0; it < 4; ++it) + c_arr[it] = vec_xl(0, (vector unsigned 
char*)aoffsets[it]); + vector_permute_store(c_arr, 2, vecOffset); + for (int it = 0; it< 4; it++) + aoffsets[it] = aoffsets[it] + lda; + vecOffset += 32; + } + i = (cols >> 3); + if (i > 0) { + do { + for (int it = 0; it < 4; ++it) + c_arr[it] = vec_xl(0, (vector unsigned char*)aoffsets[it]); + vector_permute_store(c_arr, 4, vecOffset); + for (int it = 0; it< 4; it++) + aoffsets[it] = aoffsets[it] + 8*lda; + vecOffset += 64; + i--; + } while(i > 0); + } + } + if (rows & 3) { + aoffsets[0] = aoffset; + for (int it = 1; it < 4; ++it) + aoffsets[it] = aoffsets[it-1] + lda; + if (cols == 4) { + switch(rows) { + case 3: c_arr[2] = vec_xl(0, (vector unsigned char*)aoffsets[2]); + case 2: c_arr[1] = vec_xl(0, (vector unsigned char*)aoffsets[1]); + case 1: c_arr[0] = vec_xl(0, (vector unsigned char*)aoffsets[0]); + break; + } + vector_permute_store(c_arr, 2, vecOffset); + for (int it = 0; it< 4; it++) + aoffsets[it] = aoffsets[it] + lda; + vecOffset += 32; + } + i = (cols >> 3); + if (i > 0) { + do { + switch(rows) { + case 3: c_arr[2] = vec_xl(0, (vector unsigned char*)aoffsets[2]); + case 2: c_arr[1] = vec_xl(0, (vector unsigned char*)aoffsets[1]); + case 1: c_arr[0] = vec_xl(0, (vector unsigned char*)aoffsets[0]); + break; + } + vector_permute_store(c_arr, 4, vecOffset); + for (int it = 0; it <4; it++) + aoffsets[it] = aoffsets[it] + 8* lda; + vecOffset += 64; + i--; + } while(i > 0); + } + } + } + + void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) { + int64_t mc, nc, mp, np; + int m_rem = MIN(m - m0, 8); + int n_rem = MIN(n - n0, 8); + + if (m_rem >= 8 && n_rem >= 8) { + mc = 8; + nc = 8; + gemm<8,8>(m0, m, n0, n); + } else if (m_rem >= 4 && n_rem >= 8) { + mc = 4; + nc = 8; + gemm<4,8>(m0, m, n0, n); + } else if (m_rem >=8 && n_rem >=4){ + mc = 8; + nc = 4; + gemm<8,4>(m0, m, n0, n); + } else if ((m_rem < 4) && (n_rem >= 8)) { + nc = 8; + switch(m_rem) { + case 1: + mc = 1; + gemm_Mx8<1>(m0, m, n0, n); + break; + case 2: + mc = 2; + gemm_Mx8<2>(m0, m, n0, n); + break; + case 3: + mc = 3; + gemm_Mx8<3>(m0, m, n0, n); + break; + default: + return; + } + } else if (m_rem >= 4 && n_rem >= 4) { + mc = 4; + nc = 4; + gemm_small<4, 4>(m0, m, n0, n); + } else if ((m_rem > 4) && (n_rem < 4)) { + mc = 4; + switch(n_rem) { + case 1: + nc = 1; + gemm_small<4, 1>(m0, m, n0, n); + break; + case 2: + nc = 2; + gemm_small<4, 2>(m0, m, n0, n); + break; + case 3: + nc = 3; + gemm_small<4, 3>(m0, m, n0, n); + break; + + default: + return; + } + } else { + switch((m_rem << 4) | n_rem) { + case 0x43: + mc = 4; + nc = 3; + gemm_small<4, 3>(m0, m, n0, n); + break; + case 0x42: + mc = 4; + nc = 2; + gemm_small<4, 2>(m0, m, n0, n); + break; + case 0x41: + mc = 4; + nc = 1; + gemm_small<4, 1>(m0, m, n0, n); + break; + case 0x34: + mc = 3; + nc = 4; + gemm_small<3, 4>(m0, m, n0, n); + break; + case 0x33: + mc = 3; + nc = 3; + gemm_small<3, 3>(m0, m, n0, n); + break; + case 0x32: + mc = 3; + nc = 2; + gemm_small<3, 2>(m0, m, n0, n); + break; + case 0x31: + mc = 3; + nc = 1; + gemm_small<3, 1>(m0, m, n0, n); + break; + case 0x24: + mc = 2; + nc = 4; + gemm_small<2,4>(m0, m, n0, n); + break; + case 0x23: + mc = 2; + nc = 3; + gemm_small<2, 3>(m0, m, n0, n); + break; + case 0x22: + mc = 2; + nc = 2; + gemm_small<2, 2>(m0, m, n0, n); + break; + case 0x21: + mc = 2; + nc = 1; + gemm_small<2, 1>(m0, m, n0, n); + break; + case 0x14: + mc = 1; + nc = 4; + gemm_small<1, 4>(m0, m, n0, n); + break; + case 0x13: + mc = 1; + nc = 3; + gemm_small<1, 3>(m0, m, n0, n); + break; + case 0x12: + mc = 1; + nc = 2; + gemm_small<1, 
2>(m0, m, n0, n); + break; + case 0x11: + mc = 1; + nc = 1; + gemm_small<1, 1>(m0, m, n0, n); + break; + default: + return; + } + } + mp = m0 + (m - m0) / mc * mc; + np = n0 + (n - n0) / nc * nc; + mnpack(mp, m, n0, np); + mnpack(m0, m, np, n); + } + + void KERNEL_4x8(int64_t ii, int64_t jj) { + vec_t vec_A[4], vec_B[8] , vec_C[4]; + acc_t acc_0, acc_1; + __builtin_mma_xxsetaccz(&acc_0); + __builtin_mma_xxsetaccz(&acc_1); + for (int l = 0; l < k; l+=8) { + packNormal((A+(ii*lda)+l), lda, 4, 8, (uint8_t*)vec_A); + packNormal((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B); + for (int x = 0; x < 4; x++) { + __builtin_mma_xvbf16ger2pp(&acc_0, vec_A[x], vec_B[x]); + __builtin_mma_xvbf16ger2pp(&acc_1, vec_A[x], vec_B[x+4]); + } + } + SAVE_ACC(&acc_0, ii, jj); + SAVE_ACC(&acc_1, ii, jj+4); + } + + void KERNEL_8x4(int64_t ii, int64_t jj) { + vec_t vec_A[8], vec_B[4] , vec_C[4]; + acc_t acc_0, acc_1; + __builtin_mma_xxsetaccz(&acc_0); + __builtin_mma_xxsetaccz(&acc_1); + for (int l = 0; l < k; l+=8) { + packNormal((A+(ii*lda)+l), lda, 8, 8, (uint8_t*)vec_A); + packNormal((B+(jj*ldb)+l), ldb, 8, 4, (uint8_t*)vec_B); + for (int x = 0; x < 4; x++) { + __builtin_mma_xvbf16ger2pp(&acc_0, vec_A[x], vec_B[x]); + __builtin_mma_xvbf16ger2pp(&acc_1, vec_A[x+4], vec_B[x]); + } + } + SAVE_ACC(&acc_0, ii, jj); + SAVE_ACC(&acc_1, ii+4, jj); + } + + + void KERNEL_8x8(int64_t ii, int64_t jj) { + vec_t vec_A[8], vec_B[8], vec_C[4]; + acc_t acc_0, acc_1, acc_2, acc_3; + __builtin_mma_xxsetaccz(&acc_0); + __builtin_mma_xxsetaccz(&acc_1); + __builtin_mma_xxsetaccz(&acc_2); + __builtin_mma_xxsetaccz(&acc_3); + for (int l = 0; l < k; l+=8) { + packNormal(A+(ii*lda)+l, lda, 8, 8, (uint8_t*)vec_A); + packNormal(B+(jj*ldb)+l, ldb, 8, 8, (uint8_t*)vec_B); + for (int x = 0; x < 4; x++) { + __builtin_mma_xvbf16ger2pp(&acc_0, vec_A[x], vec_B[x]); + __builtin_mma_xvbf16ger2pp(&acc_1, (vec_t)vec_A[x], (vec_t)vec_B[x+4]); + __builtin_mma_xvbf16ger2pp(&acc_2, (vec_t)vec_A[x+4], (vec_t)vec_B[x]); + __builtin_mma_xvbf16ger2pp(&acc_3, (vec_t)vec_A[x+4], (vec_t)vec_B[x+4]); + } + } + + SAVE_ACC(&acc_0, ii, jj); + SAVE_ACC(&acc_1, ii, jj+4); + SAVE_ACC(&acc_2, ii+4, jj); + SAVE_ACC(&acc_3, ii+4, jj+4); + } + + template + void gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n) { + int64_t ytiles = (m - m0) / RM; + int64_t xtiles = (n - n0) / RN; + int64_t tiles = xtiles * ytiles; + int64_t duty = (tiles + nth - 1) / nth; + int64_t start = duty * ith; + int64_t end = start + duty; + if (end > tiles) + end = tiles; + for (int64_t job = start; job < end; ++job) { + int64_t ii = m0 + job / xtiles * RM; + int64_t jj = n0 + job % xtiles * RN; + vec_t vec_C[4]; + acc_t acc_0; + __builtin_mma_xxsetaccz(&acc_0); + vec_t vec_A[2], vec_B[2]; + for (int l=0; l + void gemm_Mx8(int64_t m0, int64_t m, int64_t n0, int64_t n) { + int RN = 8; + int64_t ytiles = (m - m0) / RM; + int64_t xtiles = (n - n0) / RN; + int64_t tiles = xtiles * ytiles; + int64_t duty = (tiles + nth - 1) / nth; + int64_t start = duty * ith; + int64_t end = start + duty; + if (end > tiles) + end = tiles; + for (int64_t job = start; job < end; ++job) { + int64_t ii = m0 + job / xtiles * RM; + int64_t jj = n0 + job % xtiles * RN; + vec_t vec_C[4]; + acc_t acc_0, acc_1; + __builtin_mma_xxsetaccz(&acc_0); + __builtin_mma_xxsetaccz(&acc_1); + vec_t vec_A[4], vec_B[8]; + for (int l=0; l + inline void kernel(int64_t ii, int64_t jj) { + if constexpr(RM == 4 && RN == 8) { + KERNEL_4x8(ii,jj); + } else if constexpr(RM == 8 && RN == 8) { + KERNEL_8x8(ii,jj); + } else if constexpr(RM == 8 
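// The gemm/gemm_small/gemm_Mx8 helpers above all divide the output matrix into
// RM x RN tiles and hand each thread (ith of nth) a contiguous range of flat
// tile indices. A minimal sketch of that static partitioning is shown below;
// the process_tile callback is a hypothetical stand-in for the MMA kernels and
// is not part of the actual code.
#include <cstdint>

template <typename F>
void for_each_assigned_tile(int64_t m0, int64_t m, int64_t n0, int64_t n,
                            int64_t RM, int64_t RN, int ith, int nth,
                            F process_tile) {
    const int64_t ytiles = (m - m0) / RM;           // whole tiles along the rows
    const int64_t xtiles = (n - n0) / RN;           // whole tiles along the columns
    const int64_t tiles  = xtiles * ytiles;
    const int64_t duty   = (tiles + nth - 1) / nth; // ceil(tiles / nth) tiles per thread
    const int64_t start  = duty * ith;
    const int64_t end    = start + duty > tiles ? tiles : start + duty;

    for (int64_t job = start; job < end; ++job) {
        const int64_t ii = m0 + job / xtiles * RM;  // top row of this tile
        const int64_t jj = n0 + job % xtiles * RN;  // left column of this tile
        process_tile(ii, jj);
    }
}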
&& RN == 4) { + KERNEL_8x4(ii,jj); + } else { + static_assert(false, "RN/RM values not supported"); + } + } + + template + NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) { + int64_t ytiles = (m - m0) / RM; + int64_t xtiles = (n - n0) / RN; + int64_t tiles = xtiles * ytiles; + int64_t duty = (tiles + nth - 1) / nth; + int64_t start = duty * ith; + int64_t end = start + duty; + if (end > tiles) + end = tiles; + for (int64_t job = start; job < end; ++job) { + int64_t ii = m0 + job / xtiles * RM; + int64_t jj = n0 + job % xtiles * RN; + kernel(ii, jj); + } + } + + const TA *const A; + const TB *const B; + TC *C; + const int64_t k; + const int64_t lda; + const int64_t ldb; + const int64_t ldc; + const int ith; + const int nth; +}; + template class tinyBLAS_Q0_PPC { public: @@ -2202,6 +2689,7 @@ class tinyBLAS_PPC { boffset = vec; j = (rows >> 3); if (j > 0) { + do { aoffset1 = aoffset; aoffset2 = aoffset1 + lda; @@ -2875,9 +3363,22 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64 (float *)C, ldc}; return tb.matmul(m, n); } +#elif defined(__MMA__) + if ((k % 8)) + return false; + if(Btype == GGML_TYPE_BF16) { + tinyBLAS_BF16_PPC tb{ k, + (const ggml_bf16_t *)A, lda, + (const ggml_bf16_t *)B, ldb, + (float *)C, ldc, + params->ith, params->nth}; + tb.matmul(m, n); + return true; + } #endif return false; } + case GGML_TYPE_F16: { #if defined(__AVX512F__) if (Btype == GGML_TYPE_F16) { diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp b/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp index becdae07..1868a10c 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp @@ -8,19 +8,6 @@ #include -#if defined(_MSC_VER) -// disable "possible loss of data" to avoid hundreds of casts -// we should just be careful :) -#pragma warning(disable: 4244 4267) - -// disable POSIX deprecation warnings -// these functions are never going away, anyway -#pragma warning(disable: 4996) - -// unreachable code because of multiple instances of code after GGML_ABORT -#pragma warning(disable: 4702) -#endif - // ggml_compute_forward_dup static void ggml_compute_forward_dup_same_cont( diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/vec.cpp b/ml/backend/ggml/ggml/src/ggml-cpu/vec.cpp index dfe2218e..02d40618 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/vec.cpp +++ b/ml/backend/ggml/ggml/src/ggml-cpu/vec.cpp @@ -2,12 +2,6 @@ #include -#if defined(_MSC_VER) -// disable "possible loss of data" to avoid hundreds of casts -// we should just be careful :) -#pragma warning(disable: 4244 4267) -#endif - // precomputed gelu table for f16 (128 KB) ggml_fp16_t ggml_table_gelu_f16[1 << 16]; diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/CMakeLists.txt b/ml/backend/ggml/ggml/src/ggml-cuda/CMakeLists.txt index 8623214c..c9ff4aa3 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/CMakeLists.txt +++ b/ml/backend/ggml/ggml/src/ggml-cuda/CMakeLists.txt @@ -12,12 +12,30 @@ if (CUDAToolkit_FOUND) # 61 == Pascal, __dp4a instruction (per-byte integer dot product) # 70 == V100, FP16 tensor cores # 75 == Turing, int8 tensor cores + # 80 == Ampere, asynchronous data loading, faster tensor core instructions + # 86 == RTX 3000, needs CUDA v11.1 + # 89 == RTX 4000, needs CUDA v11.8 + # + # XX-virtual == compile CUDA code as PTX, do JIT compilation to binary code on first run + # XX-real == compile CUDA code as device code for this specific architecture + # no suffix == compile as both PTX and device code + # + # The default behavior for a non-native is to build 
virtual architectures as needed to cover all features needed + # for best performance and to also build real architectures for the most commonly used GPUs. if (GGML_NATIVE AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.6" AND CMAKE_VERSION VERSION_GREATER_EQUAL "3.24") set(CMAKE_CUDA_ARCHITECTURES "native") elseif(GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16) - set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75;80") + if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.8") + set(CMAKE_CUDA_ARCHITECTURES "60-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real;89-real") + else() + set(CMAKE_CUDA_ARCHITECTURES "60-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real") + endif() else() - set(CMAKE_CUDA_ARCHITECTURES "50;61;70;75;80") + if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.8") + set(CMAKE_CUDA_ARCHITECTURES "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real;89-real") + else() + set(CMAKE_CUDA_ARCHITECTURES "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real") + endif() endif() endif() message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}") @@ -100,7 +118,7 @@ if (CUDAToolkit_FOUND) set(CUDA_CXX_FLAGS "") - set(CUDA_FLAGS -use_fast_math) + set(CUDA_FLAGS -use_fast_math -extended-lambda) if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8") # Options are: @@ -133,6 +151,7 @@ if (CUDAToolkit_FOUND) COMMAND ${NVCC_CMD} -Xcompiler "-dumpfullversion -dumpversion" OUTPUT_VARIABLE CUDA_CCVER ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE ) else() if (CUDA_CCFULLVER MATCHES Apple) @@ -143,7 +162,7 @@ if (CUDAToolkit_FOUND) string(REGEX REPLACE "^.* version ([0-9.]*).*$" "\\1" CUDA_CCVER ${CUDA_CCFULLVER}) endif() - message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}") + message(STATUS "CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}") ggml_get_flags(${CUDA_CCID} ${CUDA_CCVER}) list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} ${GF_CXX_FLAGS}) # This is passed to -Xcompiler later diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/acc.cu b/ml/backend/ggml/ggml/src/ggml-cuda/acc.cu index 96bfe1c9..e084607c 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/acc.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/acc.cu @@ -1,47 +1,61 @@ #include "acc.cuh" -static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne, - const int ne10, const int ne11, const int ne12, - const int nb1, const int nb2, int offset) { - const int i = blockDim.x * blockIdx.x + threadIdx.x; +static __global__ void acc_f32(const float * x, const float * y, float * dst, const int64_t ne, + const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13, + const int64_t s11, const int64_t s12, const int64_t s13, const int64_t offset) { + const int64_t i = blockDim.x * blockIdx.x + threadIdx.x; + if (i >= ne) { return; } - int src1_idx = i - offset; - int oz = src1_idx / nb2; - int oy = (src1_idx - (oz * nb2)) / nb1; - int ox = src1_idx % nb1; - if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) { - dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11]; - } else { - dst[i] = x[i]; + + int64_t src1_idx = i - offset; + + int64_t tmp = src1_idx; + const int64_t i13 = tmp / s13; + tmp -= i13 * s13; + const int64_t i12 = tmp / s12; + tmp -= i12 * s12; + const int64_t i11 = tmp / s11; + tmp -= i11 * s11; + const int64_t i10 = tmp; + + float val = x[i]; + if (src1_idx >= 0 && i10 < ne10 && i11 < ne11 && i12 < ne12 && i13 < ne13) { + val += y[((i13*ne12 + i12) * ne11 + i11) * ne10 + i10]; } + dst[i] = val; } -static void acc_f32_cuda(const float * x, const 
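// The rewritten acc_f32 kernel above no longer assumes 3D tensors: it maps the
// flat element offset (after subtracting the view offset) back to 4D coordinates
// using the destination view's element strides s11/s12/s13, then indexes src1
// densely. A small host-side sketch of that decomposition, assuming strides are
// given in elements rather than bytes; unflatten is a hypothetical name used
// only for illustration.
#include <cstdint>

struct idx4 {
    int64_t i10, i11, i12, i13;
};

static idx4 unflatten(int64_t idx, int64_t s11, int64_t s12, int64_t s13) {
    idx4 r;
    r.i13 = idx / s13; idx -= r.i13 * s13;
    r.i12 = idx / s12; idx -= r.i12 * s12;
    r.i11 = idx / s11; idx -= r.i11 * s11;
    r.i10 = idx;
    return r;
}
// Example: with s11 = 8, s12 = 32, s13 = 64, the flat offset 100 decomposes
// into i13 = 1, i12 = 1, i11 = 0, i10 = 4.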
float * y, float * dst, const int n_elements, - const int ne10, const int ne11, const int ne12, - const int nb1, const int nb2, const int offset, cudaStream_t stream) { - int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE; - acc_f32<<>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset); +static void acc_f32_cuda(const float * x, const float * y, float * dst, const int64_t n_elements, + const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13, + const int64_t s1, const int64_t s2, const int64_t s3, const int64_t offset, cudaStream_t stream) { + const int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE; + acc_f32<<>>(x, y, dst, n_elements, ne10, ne11, ne12, ne13, s1, s2, s3, offset); } void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; - const float * src0_d = (const float *)src0->data; - const float * src1_d = (const float *)src1->data; - float * dst_d = (float *)dst->data; + + const float * src0_d = (const float *) src0->data; + const float * src1_d = (const float *) src1->data; + float * dst_d = (float *) dst->data; + cudaStream_t stream = ctx.stream(); GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); - GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported - int nb1 = dst->op_params[0] / 4; // 4 bytes of float32 - int nb2 = dst->op_params[1] / 4; // 4 bytes of float32 - // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused - int offset = dst->op_params[3] / 4; // offset in bytes + GGML_ASSERT(ggml_is_contiguous(src1)); + GGML_ASSERT(dst->nb[0] == ggml_element_size(dst)); + GGML_ASSERT(ggml_is_contiguously_allocated(dst)); - acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, stream); + const int64_t s1 = dst->op_params[0] / sizeof(float); + const int64_t s2 = dst->op_params[1] / sizeof(float); + const int64_t s3 = dst->op_params[2] / sizeof(float); + const int64_t offset = dst->op_params[3] / sizeof(float); + + acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], s1, s2, s3, offset, stream); } diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh index 2ea014e6..64fb4ff4 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh @@ -130,10 +130,6 @@ static int ggml_cuda_highest_compiled_arch(const int arch) { #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - #define GGML_CUDA_MAX_STREAMS 8 [[noreturn]] @@ -300,6 +296,25 @@ static __device__ void no_device_code( #define NO_DEVICE_CODE //GGML_ABORT("NO_DEVICE_CODE not valid in host code.") #endif // __CUDA_ARCH__ +// The compiler is always able to unroll loops if they contain continue expressions. +// In such cases loop unrolling can still be achieved via recursion: +template +struct ggml_cuda_unroll { + template + __device__ void operator()(const Func & f, Args... args) const { + f(n - 1, args...); + ggml_cuda_unroll{}(f, args...); + } +}; + +template <> +struct ggml_cuda_unroll<1> { + template + __device__ void operator()(const Func & f, Args... 
args) const { + f(0, args...); + } +}; + template static __device__ __forceinline__ int warp_reduce_sum(int x) { #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/cp-async.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/cp-async.cuh index ecb65999..63d0c482 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/cp-async.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/cp-async.cuh @@ -2,6 +2,17 @@ #include "common.cuh" + +static __device__ __forceinline__ unsigned int ggml_cuda_cvta_generic_to_shared(void * generic_ptr) { +#ifdef CP_ASYNC_AVAILABLE + return __cvta_generic_to_shared(generic_ptr); +#else + GGML_UNUSED(generic_ptr); + NO_DEVICE_CODE; + return 0; +#endif // CP_ASYNC_AVAILABLE +} + // Copies data from global to shared memory, cg == cache global. // Both the src and dst pointers must be aligned to 16 bit. // Shared memory uses 32 bit addressing, the pointer is passed as unsigned int. diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu b/ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu index 2d46176e..d027271f 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu @@ -592,6 +592,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg dest_ptrs_d = ctx.cuda_graph->dest_ptrs_d; graph_cpynode_index = ctx.cuda_graph->graph_cpynode_index; } +#else + GGML_UNUSED(disable_indirection_for_this_node); #endif if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) { GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1)); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/fattn-common.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-common.cuh index 56121705..b7180d59 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/fattn-common.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-common.cuh @@ -516,7 +516,7 @@ constexpr __device__ dequantize_1_f32_t get_dequantize_1_f32(ggml_type type_V) { nullptr; } -template // D == head size +template // D == head size __launch_bounds__(D, 1) static __global__ void flash_attn_stream_k_fixup( float * __restrict__ dst, const float2 * __restrict__ dst_fixup, const int ne01, const int ne02, const int ne11) { @@ -665,13 +665,13 @@ static void on_no_fattn_vec_case(const int D) { fprintf(stderr, "Compile with GGML_CUDA_FA_ALL_QUANTS for all combinations of q4_0, q4_1, q5_0, q5_1, q8_0, and f16.\n"); GGML_ABORT("fatal error"); } else { - fprintf(stderr, "Unsupported KV type combination for head_size 256.\n"); + fprintf(stderr, "Unsupported KV type combination for head_size %d.\n", D); fprintf(stderr, "Only f16 is supported.\n"); GGML_ABORT("fatal error"); } } -template +template void launch_fattn( ggml_backend_cuda_context & ctx, ggml_tensor * dst, fattn_kernel_t fattn_kernel, const int nwarps, const size_t nbytes_shared, const int KQ_row_granularity, const bool need_f16_K, const bool need_f16_V, const bool stream_k, const int warp_size = WARP_SIZE @@ -691,7 +691,7 @@ void launch_fattn( GGML_ASSERT(!mask || mask->type == GGML_TYPE_F16); GGML_ASSERT(!mask || mask->ne[1] >= GGML_PAD(Q->ne[1], 16) && - "the Flash-Attention CUDA kernel requires the mask to be padded to 16 and at least n_queries big"); + "the Flash-Attention CUDA kernel requires the mask to be padded to 16 and at least n_queries big"); GGML_ASSERT(K->ne[1] % FATTN_KQ_STRIDE == 0 && "Incorrect KV cache padding."); @@ -719,6 +719,7 @@ void launch_fattn( size_t nb23 = V->nb[3]; if (need_f16_K && K->type != GGML_TYPE_F16) { + 
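// The ggml_cuda_unroll helper introduced in common.cuh above replaces loops
// whose bodies contain `continue` (which can defeat #pragma unroll) with
// compile-time recursion: ggml_cuda_unroll<n>{}(f, args...) invokes
// f(n-1, ...), f(n-2, ...), ..., f(0, ...). A minimal sketch of how a loop gets
// rewritten with it; copy_even is a hypothetical device function, not part of
// ggml, and the __device__ lambda annotation follows the style used in this
// patch, which may require the -extended-lambda flag added to the CUDA build
// flags earlier in the same patch.
__device__ void copy_even(float * dst, const float * src) {
    // Plain loop; the `continue` may prevent the compiler from fully unrolling it.
    #pragma unroll
    for (int n = 0; n < 4; ++n) {
        if (n % 2 != 0) {
            continue;
        }
        dst[n] = src[n];
    }
}

__device__ void copy_even_unrolled(float * dst, const float * src) {
    auto body = [&] __device__ (const int n) {
        if (n % 2 != 0) {
            return; // `continue` becomes an early return from the callable
        }
        dst[n] = src[n];
    };
    ggml_cuda_unroll<4>{}(body); // expands to body(3), body(2), body(1), body(0)
}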
GGML_ASSERT(ggml_is_contiguously_allocated(K)); K_f16.alloc(ggml_nelements(K)); to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(K->type); to_fp16(K_data, K_f16.ptr, ggml_nelements(K), main_stream); @@ -733,6 +734,7 @@ void launch_fattn( } if (need_f16_V && V->type != GGML_TYPE_F16) { + GGML_ASSERT(ggml_is_contiguously_allocated(V)); V_f16.alloc(ggml_nelements(V)); to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(V->type); to_fp16(V_data, V_f16.ptr, ggml_nelements(V), main_stream); @@ -752,10 +754,13 @@ void launch_fattn( const int ntiles_total = ntiles_x * (Q->ne[2] / ncols2) * Q->ne[3]; const dim3 block_dim(warp_size, nwarps, 1); + int max_blocks_per_sm = 1; // Max. number of active blocks limited by occupancy. + CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, fattn_kernel, block_dim.x * block_dim.y * block_dim.z, nbytes_shared)); + dim3 blocks_num; if (stream_k) { // For short contexts it can be faster to have the SMs work on whole tiles because this lets us skip the fixup. - const int max_blocks = 2*nsm; + const int max_blocks = max_blocks_per_sm*nsm; const int tiles_nwaves = (ntiles_total + max_blocks - 1) / max_blocks; const int tiles_efficiency_percent = 100 * ntiles_total / (max_blocks*tiles_nwaves); @@ -767,14 +772,11 @@ void launch_fattn( blocks_num.y = 1; blocks_num.z = 1; - dst_tmp_meta.alloc(blocks_num.x*ncols * (2*2 + D) * sizeof(float)); + dst_tmp_meta.alloc(blocks_num.x*ncols * (2*2 + DV) * sizeof(float)); } else { GGML_ASSERT(K->ne[1] % KQ_row_granularity == 0); const int ntiles_KQ = K->ne[1] / KQ_row_granularity; // Max. number of parallel blocks limited by tensor size. - int max_blocks_per_sm = 1; // Max. number of active blocks limited by occupancy. - CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, fattn_kernel, block_dim.x * block_dim.y * block_dim.z, nbytes_shared)); - // parallel_blocks should be at least large enough to achieve max. occupancy for a single wave: parallel_blocks = std::max((nsm * max_blocks_per_sm) / ntiles_total, 1); @@ -851,19 +853,19 @@ void launch_fattn( if (stream_k) { if (ntiles_total % blocks_num.x != 0) { // Fixup is only needed if the SMs work on fractional tiles. - const dim3 block_dim_combine(D, 1, 1); + const dim3 block_dim_combine(DV, 1, 1); const dim3 blocks_num_combine = {blocks_num.x, ncols1, ncols2}; - flash_attn_stream_k_fixup + flash_attn_stream_k_fixup <<>> ((float *) KQV->data, dst_tmp_meta.ptr, Q->ne[1], Q->ne[2], K->ne[1]); } } else if (parallel_blocks > 1) { - const dim3 block_dim_combine(D, 1, 1); + const dim3 block_dim_combine(DV, 1, 1); const dim3 blocks_num_combine(Q->ne[1], 1, blocks_num.z); const size_t nbytes_shared_combine = parallel_blocks*sizeof(float2); - flash_attn_combine_results + flash_attn_combine_results <<>> (dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data, parallel_blocks); } diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-mma-f16.cuh index 04804a15..491780ab 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/fattn-mma-f16.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-mma-f16.cuh @@ -13,104 +13,217 @@ typedef tile<16, 16, float> tile_C_KQ_16; typedef tile<16, 4, half2> tile_C_VKQ; typedef tile<16, 8, half2> tile_C_VKQ_16; -template +// Config options for specific head sizes. +// Should not affect results, only speed/register pressure/shared memory use. +// +// nbatch_fa: number of KV rows per softmax rescaling of KQ rowsums and VKQ accumulators. 
+// nwarps_max: maximum number of warps per CUDA block, up to 8 warps in total can run per SM (given enough shared memory). +// Q_in_reg: whether the Q values should be kept permanently in registers. +// nstages_target: targeted number of pipeline stages for cp_async (if available), 0 means synchronous data loading. +// nbatch_K2: number of K half2 values in direction of DKQ to load in parallel. +// nbatch_V2: number of V half2 values in direction of DV to load in parallel. +// nbatch_combine: number of VKQ half2 values in direction of DV to combine in parallel. + +template +struct fattn_mma_f16_config; + +template <> +struct fattn_mma_f16_config< 64, 64> { + static constexpr int nbatch_fa = 64; + static constexpr int nwarps_max = 4; + static constexpr bool Q_in_reg = true; + static constexpr int nstages_target = 2; + static constexpr int nbatch_K2 = 32; + static constexpr int nbatch_V2 = 32; + static constexpr int nbatch_combine = 32; +}; + +template <> +struct fattn_mma_f16_config< 80, 80> { + static constexpr int nbatch_fa = 64; + static constexpr int nwarps_max = 4; + static constexpr bool Q_in_reg = true; + static constexpr int nstages_target = 2; + static constexpr int nbatch_K2 = 40; + static constexpr int nbatch_V2 = 40; + static constexpr int nbatch_combine = 40; +}; + +template <> +struct fattn_mma_f16_config< 96, 96> { + static constexpr int nbatch_fa = 64; + static constexpr int nwarps_max = 4; + static constexpr bool Q_in_reg = true; + static constexpr int nstages_target = 2; + static constexpr int nbatch_K2 = 48; + static constexpr int nbatch_V2 = 48; + static constexpr int nbatch_combine = 48; +}; + +template <> +struct fattn_mma_f16_config<112, 112> { + static constexpr int nbatch_fa = 64; + static constexpr int nwarps_max = 4; + static constexpr bool Q_in_reg = true; + static constexpr int nstages_target = 2; + static constexpr int nbatch_K2 = 56; + static constexpr int nbatch_V2 = 56; + static constexpr int nbatch_combine = 56; +}; + +template <> +struct fattn_mma_f16_config<128, 128> { + static constexpr int nbatch_fa = 64; + static constexpr int nwarps_max = 4; + static constexpr bool Q_in_reg = true; + static constexpr int nstages_target = 2; + static constexpr int nbatch_K2 = 64; + static constexpr int nbatch_V2 = 64; + static constexpr int nbatch_combine = 64; +}; + +template <> +struct fattn_mma_f16_config<256, 256> { + static constexpr int nbatch_fa = 32; + static constexpr int nwarps_max = 4; + static constexpr bool Q_in_reg = true; + static constexpr int nstages_target = 2; + static constexpr int nbatch_K2 = 128; + static constexpr int nbatch_V2 = 128; + static constexpr int nbatch_combine = 128; +}; + +template <> +struct fattn_mma_f16_config<576, 512> { + static constexpr int nbatch_fa = 32; + static constexpr int nwarps_max = 8; + static constexpr bool Q_in_reg = false; + static constexpr int nstages_target = 1; + static constexpr int nbatch_K2 = 160; + static constexpr int nbatch_V2 = 128; + static constexpr int nbatch_combine = 128; +}; + +// ------------------------------------------------------------------------------------------------------------------ + +template static __device__ __forceinline__ void flash_attn_ext_f16_load_tile( - const half2 * const __restrict__ KV, half2 * const __restrict__ tile_KV, const int stride_KV) { - constexpr int D2_padded = D/2 + 4; // Size of D in half2, padded to avoid shared memory bank conflicts. 
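// The fattn_mma_f16_config<DKQ, DV> specializations above follow a common CUDA
// tuning pattern: the primary template is only declared, each supported pair of
// K/V head sizes gets a full specialization of compile-time constants, and the
// kernel reads them through a local typedef so that unsupported sizes fail to
// compile. A minimal sketch of the pattern with hypothetical names
// (my_config / my_kernel), not the actual ggml code:
template <int DKQ, int DV>
struct my_config; // declared but not defined: unsupported head sizes do not compile

template <>
struct my_config<128, 128> {
    static constexpr int nbatch_fa  = 64;
    static constexpr int nwarps_max = 4;
};

template <int DKQ, int DV>
__global__ void my_kernel(const float * x, float * y) {
    typedef my_config<DKQ, DV> c;          // all tuning knobs resolved at compile time
    __shared__ float tile[c::nbatch_fa];   // constexpr values can size shared memory
    if (threadIdx.x < c::nbatch_fa) {
        tile[threadIdx.x] = x[threadIdx.x];
        y[threadIdx.x]    = tile[threadIdx.x];
    }
}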
+ const half2 * const __restrict__ KV, half2 * const __restrict__ tile_KV, const int D2, const int stride_KV) { - // If cp.async is available, load up to the highest power of 2 in D asynchronously: -#ifdef CP_ASYNC_AVAILABLE - static_assert(D >= 64 && D < 512, "bad D"); - constexpr int k0_sync_start = D/2 < 64 ? 32 : (D/2 < 128 ? 64 : 128); - - const unsigned int tile_KV_32 = __cvta_generic_to_shared(tile_KV); - - constexpr int preload = 64; - constexpr int h2_per_chunk = 16/sizeof(half2); - constexpr int chunks_per_row = k0_sync_start / h2_per_chunk; - constexpr int stride_i = WARP_SIZE / chunks_per_row; -#pragma unroll - for (int i0 = 0; i0 < KQ_per_iter; i0 += nwarps*stride_i) { - const int i = i0 + threadIdx.y*stride_i + (chunks_per_row == WARP_SIZE ? 0 : threadIdx.x / chunks_per_row); - const int k = (chunks_per_row == WARP_SIZE ? threadIdx.x : threadIdx.x % chunks_per_row)*h2_per_chunk; - - cp_async_cg_16(tile_KV_32 + (i*D2_padded + k)*sizeof(half2), KV + i*stride_KV + k); - } -#else - constexpr int k0_sync_start = 0; -#endif // CP_ASYNC_AVAILABLE - static_assert(k0_sync_start % WARP_SIZE == 0, "bad k0_sync_start"); - - // If D is not a power of 2, the rest is loaded synchronously. // K/V data is loaded with decreasing granularity for D for better memory bandwidth. - static_assert(KQ_per_iter % (4*nwarps) == 0, "out of bounds"); -#pragma unroll - for (int stride_k : {WARP_SIZE, WARP_SIZE/2, WARP_SIZE/4}) { - const int k0_start = stride_k == WARP_SIZE ? k0_sync_start : D/2 - (D/2) % (2*stride_k); - const int k0_stop = D/2 - (D/2) % (1*stride_k); - const int stride_i = WARP_SIZE / stride_k; + // The minimum granularity with cp.async is 16 bytes, with synchronous data loading it's 4 bytes. - if (k0_start == k0_stop || k0_stop <= k0_sync_start) { - continue; - } + if (use_cp_async) { + constexpr int preload = 64; + constexpr int h2_per_chunk = 16/sizeof(half2); + const int chunks_per_row = D2 / h2_per_chunk; -#pragma unroll - for (int i0 = 0; i0 < KQ_per_iter; i0 += nwarps*stride_i) { - const int i = i0 + threadIdx.y*stride_i + (stride_k == WARP_SIZE ? 0 : threadIdx.x / stride_k); + const unsigned int tile_KV_32 = ggml_cuda_cvta_generic_to_shared(tile_KV); -#pragma unroll - for (int k0 = k0_start; k0 < k0_stop; k0 += stride_k) { - const int k = k0 + (stride_k == WARP_SIZE ? threadIdx.x : threadIdx.x % stride_k); + auto load = [&] __device__ (const int n) { + const int stride_k = WARP_SIZE >> n; + const int k0_start = stride_k == WARP_SIZE ? 0 : chunks_per_row - chunks_per_row % (2*stride_k); + const int k0_stop = chunks_per_row - chunks_per_row % (1*stride_k); + const int stride_i = WARP_SIZE / stride_k; - tile_KV[i*D2_padded + k] = KV[i*stride_KV + k]; + if (k0_start == k0_stop) { + return; } - } + +#pragma unroll + for (int i0 = 0; i0 < nbatch_fa; i0 += nwarps*stride_i) { + const int i = i0 + threadIdx.y*stride_i + (stride_k == WARP_SIZE ? 0 : threadIdx.x / stride_k); + + if (i0 + nwarps*stride_i > nbatch_fa && i >= nbatch_fa) { + break; + } + +#pragma unroll + for (int k0 = k0_start; k0 < k0_stop; k0 += stride_k) { + const int k = k0 + (stride_k == WARP_SIZE ? threadIdx.x : threadIdx.x % stride_k); + + cp_async_cg_16(tile_KV_32 + i*(stride_tile*sizeof(half2)) + k*16, KV + i*stride_KV + k*h2_per_chunk); + } + } + }; + ggml_cuda_unroll<5>{}(load); + } else { + static_assert(nbatch_fa % (4*nwarps) == 0, "out of bounds"); + auto load = [&] __device__ (const int n) { + const int stride_k = WARP_SIZE >> n; + const int k0_start = stride_k == WARP_SIZE ? 
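// The load helpers here copy each K/V row in up to three passes with column
// strides of 32, 16 and 8 lanes ("decreasing granularity"): the bulk of the row
// is copied with all 32 lanes on one row, and the leftover columns are copied
// with 16 or 8 lanes per row so that one warp spans 2 or 4 rows and no lane
// idles. A small host-side sketch of how the column ranges follow from D2 (the
// row length in half2 elements); the printout and the D2 = 40 value are purely
// illustrative.
#include <cstdio>
#include <initializer_list>

int main() {
    const int WARP = 32;
    const int D2   = 40;   // e.g. head size 80 -> 40 half2 elements per row
    for (int stride_k : {WARP, WARP / 2, WARP / 4}) {
        const int k0_start      = stride_k == WARP ? 0 : D2 - D2 % (2 * stride_k);
        const int k0_stop       = D2 - D2 % (1 * stride_k);
        const int rows_per_pass = WARP / stride_k;
        if (k0_start == k0_stop) {
            continue;   // nothing left for this granularity
        }
        std::printf("stride %2d: columns [%2d, %2d), %d row(s) per warp pass\n",
                    stride_k, k0_start, k0_stop, rows_per_pass);
    }
    // For D2 = 40 this prints: stride 32 -> [ 0, 32), 1 row per pass;
    //                          stride  8 -> [32, 40), 4 rows per pass.
}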
0 : D2 - D2 % (2*stride_k); + const int k0_stop = D2 - D2 % (1*stride_k); + const int stride_i = WARP_SIZE / stride_k; + + if (k0_start == k0_stop) { + return; + } + +#pragma unroll + for (int i0 = 0; i0 < nbatch_fa; i0 += nwarps*stride_i) { + const int i = i0 + threadIdx.y*stride_i + (stride_k == WARP_SIZE ? 0 : threadIdx.x / stride_k); + + if (i0 + nwarps*stride_i > nbatch_fa && i >= nbatch_fa) { + break; + } + +#pragma unroll + for (int k0 = k0_start; k0 < k0_stop; k0 += stride_k) { + const int k = k0 + (stride_k == WARP_SIZE ? threadIdx.x : threadIdx.x % stride_k); + + tile_KV[i*stride_tile + k] = KV[i*stride_KV + k]; + } + } + }; + ggml_cuda_unroll<3>{}(load); } } -template +template static __device__ __forceinline__ void flash_attn_ext_f16_load_mask( const half2 * const __restrict__ mask_h2, half2 * const __restrict__ tile_mask, const int stride_mask) { - static_assert(KQ_per_iter == 2*WARP_SIZE || KQ_per_iter == WARP_SIZE, "bad KQ_per_iter"); -#ifdef CP_ASYNC_AVAILABLE - constexpr int preload = KQ_per_iter * sizeof(half); - constexpr int cols_per_warp = 8*WARP_SIZE/KQ_per_iter; - constexpr int stride_j = nwarps * cols_per_warp; + static_assert(nbatch_fa == 2*WARP_SIZE || WARP_SIZE % nbatch_fa == 0, "bad KQ_per_iter"); - const unsigned int tile_mask_32 = __cvta_generic_to_shared(tile_mask); + if (use_cp_async) { + constexpr int preload = nbatch_fa >= 32 ? nbatch_fa * sizeof(half) : 64; + constexpr int cols_per_warp = 8*WARP_SIZE/nbatch_fa; + constexpr int stride_j = nwarps * cols_per_warp; + + const unsigned int tile_mask_32 = ggml_cuda_cvta_generic_to_shared(tile_mask); +#pragma unroll + for (int j0 = 0; j0 < ncols1; j0 += stride_j) { + const int j = j0 + threadIdx.y*cols_per_warp + + (nbatch_fa == 2*WARP_SIZE ? threadIdx.x / (WARP_SIZE/4) : threadIdx.x / (WARP_SIZE/cols_per_warp)); + + if (j0 + stride_j > ncols1 && j >= ncols1) { + break; + } + + const int i = 4 * (threadIdx.x % (nbatch_fa/8)); + + cp_async_cg_16(tile_mask_32 + j*(nbatch_fa*sizeof(half) + 16) + i*sizeof(half2), mask_h2 + j*stride_mask + i); + } + return; + } + + constexpr int cols_per_warp = 2*WARP_SIZE/nbatch_fa; + constexpr int stride_j = nwarps * cols_per_warp; #pragma unroll for (int j0 = 0; j0 < ncols1; j0 += stride_j) { - const int j = j0 + threadIdx.y*cols_per_warp + - (KQ_per_iter == 2*WARP_SIZE ? threadIdx.x / (WARP_SIZE/4) : threadIdx.x / (WARP_SIZE/8)); + const int j = j0 + threadIdx.y*cols_per_warp + (nbatch_fa == 2*WARP_SIZE ? 0 : threadIdx.x / (WARP_SIZE/cols_per_warp)); if (j0 + stride_j > ncols1 && j >= ncols1) { break; } - const int i = 4 * (KQ_per_iter == 2*WARP_SIZE ? threadIdx.x % (WARP_SIZE/4) : threadIdx.x % (WARP_SIZE/8)); + const int i = nbatch_fa == 2*WARP_SIZE ? threadIdx.x : threadIdx.x % (WARP_SIZE/cols_per_warp); - cp_async_cg_16(tile_mask_32 + j*(KQ_per_iter*sizeof(half) + 16) + i*sizeof(half2), mask_h2 + j*stride_mask + i); + tile_mask[j*(nbatch_fa/2 + 4) + i] = mask_h2[j*stride_mask + i]; } -#else - constexpr int cols_per_warp = 2*WARP_SIZE/KQ_per_iter; - constexpr int stride_j = nwarps * cols_per_warp; -#pragma unroll - for (int j0 = 0; j0 < ncols1; j0 += stride_j) { - const int j = j0 + threadIdx.y*cols_per_warp + (KQ_per_iter == 2*WARP_SIZE ? 0 : threadIdx.x / (WARP_SIZE/2)); - - if (j0 + stride_j > ncols1 && j >= ncols1) { - break; - } - - const int i = KQ_per_iter == 2*WARP_SIZE ? 
threadIdx.x : threadIdx.x % (WARP_SIZE/2); - - tile_mask[j*(KQ_per_iter/2 + 4) + i] = mask_h2[j*stride_mask + i]; - } -#endif // CP_ASYNC_AVAILABLE } -template +template static __device__ __forceinline__ void flash_attn_ext_f16_iter( const float2 * const __restrict__ Q_f2, const half2 * const __restrict__ K_h2, @@ -123,9 +236,11 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( const float logit_softcap, const int ne01, const int ne02, - const int stride_KV, + const int stride_K, + const int stride_V, const int stride_mask, const int jt, + half2 * const __restrict__ tile_Q, half2 * const __restrict__ tile_K, half2 * const __restrict__ tile_V, half2 * const __restrict__ tile_mask, @@ -135,59 +250,107 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( float * const __restrict__ KQ_rowsum, const int kb0) { #ifdef NEW_MMA_AVAILABLE + typedef fattn_mma_f16_config c; + +#ifdef CP_ASYNC_AVAILABLE + constexpr int nstages = c::nstages_target; +#else + constexpr int nstages = 0; +#endif // CP_ASYNC_AVAILABLE + constexpr int cols_per_warp = ntiles * tile_B::I; constexpr int cols_per_thread = ntiles == 1 ? 2 : ntiles; constexpr int np = nwarps * (cols_per_warp/ncols2) / ncols1; // Number of parallel CUDA warps per Q column. - constexpr int D2_padded = D/2 + 4; // Size of D in half2, padded to avoid shared memory bank conflicts. - const int k_VKQ_0 = kb0 * KQ_per_iter; - tile_C_KQ KQ_C[KQ_per_iter/(np*tile_C_KQ::I) * ntiles]; + constexpr int stride_tile_Q = DKQ/2 + 4; + constexpr int stride_tile_K = c::nbatch_K2 + 4; + constexpr int stride_tile_V = c::nbatch_V2 + 4; + + const int k_VKQ_0 = kb0 * c::nbatch_fa; + tile_C_KQ KQ_C[c::nbatch_fa/(np*tile_C_KQ::I) * ntiles]; // Use wide variants of tiles if ntiles >= 2. tile_B_16 * Q_B_16 = (tile_B_16 *) Q_B; tile_C_VKQ_16 * VKQ_C_16 = (tile_C_VKQ_16 *) VKQ_C; tile_C_KQ_16 * KQ_C_16 = (tile_C_KQ_16 *) KQ_C; -#ifdef CP_ASYNC_AVAILABLE - cp_async_wait_all(); - __syncthreads(); - flash_attn_ext_f16_load_tile(V_h2 + k_VKQ_0*stride_KV, tile_V, stride_KV); -#else - if (ncols2 > 1 || mask_h2) { - flash_attn_ext_f16_load_mask(mask_h2 + k_VKQ_0/2, tile_mask, stride_mask); - } - flash_attn_ext_f16_load_tile(K_h2 + k_VKQ_0*stride_KV, tile_K, stride_KV); - __syncthreads(); -#endif // CP_ASYNC_AVAILABLE - - // Calculate tile of KQ: -#pragma unroll - for (int i_KQ_00 = 0; i_KQ_00 < KQ_per_iter; i_KQ_00 += np*tile_A::I) { - const int i_KQ_0 = i_KQ_00 + (threadIdx.y % np)*tile_A::I; -#pragma unroll - for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += tile_A::J) { - tile_A K_A; - load_ldmatrix(K_A, tile_K + i_KQ_0*D2_padded + k_KQ_0, D2_padded); - if (ntiles == 1) { - mma(KQ_C[i_KQ_00/(np*tile_A::I)], K_A, Q_B[k_KQ_0/tile_A::J]); - } else { -#pragma unroll - for (int t = 0; t < ntiles/2; ++t) { - // Wide version of KQ_C is column-major => swap A and B. - mma(KQ_C_16[i_KQ_00/(np*tile_A::I) * ntiles/2 + t], Q_B_16[k_KQ_0/tile_A::J * ntiles/2 + t], K_A); - } - } + if constexpr (nstages > 1) { + static_assert(c::nbatch_K2 == DKQ/2, "batching not implemented for multi stage loading"); + constexpr bool use_cp_async = true; + cp_async_wait_all(); + __syncthreads(); + flash_attn_ext_f16_load_tile + (V_h2 + k_VKQ_0*stride_V, tile_V, c::nbatch_V2, stride_V); + } else { + constexpr bool use_cp_async = nstages == 1; + if (ncols2 > 1 || mask_h2) { + flash_attn_ext_f16_load_mask(mask_h2 + k_VKQ_0/2, tile_mask, stride_mask); } } -#ifndef CP_ASYNC_AVAILABLE - __syncthreads(); // Only needed if tile_K == tile_V. 
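// In the rewritten iteration above, nstages selects how K/V tiles reach shared
// memory: 0 means plain synchronous loads, 1 means single-stage cp.async loads,
// and 2 means the next K tile is prefetched while the current one is consumed
// (double buffering). A minimal sketch of the 2-stage idea with ordinary copies
// standing in for cp.async; the names and the reduction are illustrative only.
constexpr int TILE_ELEMS = 256;

__device__ void load_tile_sketch(float * dst, const float * src) {
    for (int i = threadIdx.x; i < TILE_ELEMS; i += blockDim.x) {
        dst[i] = src[i]; // with cp.async this copy would be asynchronous
    }
}

__device__ float pipelined_sum_sketch(const float * global, int ntiles) {
    __shared__ float smem[2][TILE_ELEMS];   // double buffer
    float acc = 0.0f;

    load_tile_sketch(smem[0], global);      // prefetch the first tile
    for (int t = 0; t < ntiles; ++t) {
        __syncthreads();                    // tile t is now fully visible to all threads
        if (t + 1 < ntiles) {
            // Start filling the other buffer while tile t is being consumed.
            load_tile_sketch(smem[(t + 1) % 2], global + (t + 1) * TILE_ELEMS);
        }
        for (int i = 0; i < TILE_ELEMS; ++i) {
            acc += smem[t % 2][i];
        }
    }
    return acc;
}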
-#endif // CP_ASYNC_AVAILABLE +#pragma unroll + for (int k0_start = 0; k0_start < DKQ/2; k0_start += c::nbatch_K2) { + const int k0_stop = k0_start + c::nbatch_K2 < DKQ/2 ? k0_start + c::nbatch_K2 : DKQ/2; + const int k0_diff = k0_stop - k0_start; + + if (nstages <= 1) { + constexpr bool use_cp_async = nstages == 1; + flash_attn_ext_f16_load_tile + (K_h2 + k_VKQ_0*stride_K + k0_start, tile_K, k0_diff, stride_K); + if (use_cp_async) { + cp_async_wait_all(); + } + __syncthreads(); + } + + // Calculate tile of KQ: + if constexpr (c::Q_in_reg) { +#pragma unroll + for (int i_KQ_00 = 0; i_KQ_00 < c::nbatch_fa; i_KQ_00 += np*tile_A::I) { + const int i_KQ_0 = i_KQ_00 + (threadIdx.y % np)*tile_A::I; +#pragma unroll + for (int k_KQ_0 = k0_start; k_KQ_0 < k0_stop; k_KQ_0 += tile_A::J) { + tile_A K_A; + load_ldmatrix(K_A, tile_K + i_KQ_0*stride_tile_K + (k_KQ_0 - k0_start), stride_tile_K); + if (ntiles == 1) { + mma(KQ_C[i_KQ_00/(np*tile_A::I)], K_A, Q_B[k_KQ_0/tile_A::J]); + } else { +#pragma unroll + for (int t = 0; t < ntiles/2; ++t) { + // Wide version of KQ_C is column-major => swap A and B. + mma(KQ_C_16[i_KQ_00/(np*tile_A::I) * ntiles/2 + t], Q_B_16[k_KQ_0/tile_A::J * ntiles/2 + t], K_A); + } + } + } + } + } else { + static_assert(ntiles == 2, "ntiles != 2 not implemented"); +#pragma unroll + for (int k_KQ_0 = k0_start; k_KQ_0 < k0_stop; k_KQ_0 += tile_A::J) { + load_ldmatrix(Q_B_16[0], tile_Q + (threadIdx.y / np)*(tile_B_16::I*stride_tile_Q) + k_KQ_0, stride_tile_Q); + +#pragma unroll + for (int i_KQ_00 = 0; i_KQ_00 < c::nbatch_fa; i_KQ_00 += np*tile_A::I) { + const int i_KQ_0 = i_KQ_00 + (threadIdx.y % np)*tile_A::I; + + tile_A K_A; + load_ldmatrix(K_A, tile_K + i_KQ_0*stride_tile_K + (k_KQ_0 - k0_start), stride_tile_K); + + // Wide version of KQ_C is column-major => swap A and B. + mma(KQ_C_16[i_KQ_00/(np*tile_A::I)], Q_B_16[0], K_A); + } + } + } + + if (nstages <= 1) { + __syncthreads(); // Only needed if tile_K == tile_V. + } + } if (use_logit_softcap) { - static_assert(KQ_per_iter % (np*tile_C_KQ::I) == 0, "bad loop size"); + static_assert(c::nbatch_fa % (np*tile_C_KQ::I) == 0, "bad loop size"); #pragma unroll - for (int i = 0; i < KQ_per_iter/(np*tile_C_KQ::I) * ntiles; ++i) { + for (int i = 0; i < c::nbatch_fa/(np*tile_C_KQ::I) * ntiles; ++i) { #pragma unroll for (int l = 0; l < tile_C_KQ::ne; ++l) { KQ_C[i].x[l] = logit_softcap*tanhf(KQ_C[i].x[l]); @@ -205,7 +368,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( if (ntiles == 1) { if (ncols2 > 1 || mask_h2) { #pragma unroll - for (int i00 = 0; i00 < KQ_per_iter; i00 += np*tile_C_KQ::I) { + for (int i00 = 0; i00 < c::nbatch_fa; i00 += np*tile_C_KQ::I) { const int i0 = i00 + (threadIdx.y % np)*tile_C_KQ::I; #pragma unroll for (int l = 0; l < tile_C_KQ::ne; ++l) { @@ -213,16 +376,16 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( const int j = ((threadIdx.y / np)*tile_C_KQ::J + tile_C_KQ::get_j(l)) / ncols2; KQ_C[i00/(np*tile_C_KQ::I)].x[l] += slope * - __half2float(((const half *) tile_mask)[j*(KQ_per_iter + 8) + i]); + __half2float(((const half *) tile_mask)[j*(c::nbatch_fa + 8) + i]); } } } // Calculate softmax for each KQ column using the current max. value. // The divisor is stored in KQ_rowsum and will be applied at the end. 
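// The KQ_max / KQ_rowsum bookkeeping above is the standard "online softmax": a
// running maximum and a running row sum are kept per query column, previously
// accumulated values are rescaled by exp(old_max - new_max) whenever the maximum
// grows, and the division by the row sum is deferred to the very end. A scalar
// sketch of the same idea for one query row with scalar values (illustrative
// only, not the kernel code):
#include <cmath>
#include <vector>

inline float online_softmax_dot(const std::vector<float> & scores,
                                const std::vector<float> & values) {
    float running_max = -INFINITY;
    float rowsum      = 0.0f;   // softmax denominator, applied once at the end
    float acc         = 0.0f;   // running sum of exp(score - max) * value

    for (size_t i = 0; i < scores.size(); ++i) {
        const float new_max = std::fmax(running_max, scores[i]);
        const float scale   = std::exp(running_max - new_max); // rescale existing sums
        const float p       = std::exp(scores[i] - new_max);
        rowsum      = rowsum * scale + p;
        acc         = acc    * scale + p * values[i];
        running_max = new_max;
    }
    return acc / rowsum;        // the deferred division
}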
- static_assert(KQ_per_iter % (np*tile_C_KQ::I) == 0, "bad loop size"); + static_assert(c::nbatch_fa % (np*tile_C_KQ::I) == 0, "bad loop size"); #pragma unroll - for (int k = 0; k < KQ_per_iter/(np*tile_C_KQ::I); ++k) { + for (int k = 0; k < c::nbatch_fa/(np*tile_C_KQ::I); ++k) { #pragma unroll for (int l = 0; l < tile_C_KQ::ne; ++l) { KQ_max_new[l % 2] = fmaxf(KQ_max_new[l % 2], KQ_C[k].x[l]); @@ -238,10 +401,9 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( } } - static_assert(KQ_per_iter % (np*tile_C_KQ::I) == 0, "bad loop size"); - + static_assert(c::nbatch_fa % (np*tile_C_KQ::I) == 0, "bad loop size"); #pragma unroll - for (int k = 0; k < KQ_per_iter/(np*tile_C_KQ::I); ++k) { + for (int k = 0; k < c::nbatch_fa/(np*tile_C_KQ::I); ++k) { #pragma unroll for (int l = 0; l < tile_C_KQ::ne; ++l) { KQ_C[k].x[l] = expf(KQ_C[k].x[l] - KQ_max_new[l % 2]); @@ -252,7 +414,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( } else { // ntiles > 1 if (ncols2 > 1 || mask_h2) { #pragma unroll - for (int i00 = 0; i00 < KQ_per_iter; i00 += np*tile_C_KQ_16::J) { + for (int i00 = 0; i00 < c::nbatch_fa; i00 += np*tile_C_KQ_16::J) { const int i0 = i00 + (threadIdx.y % np)*tile_C_KQ_16::J; #pragma unroll for (int t = 0; t < ntiles/2; ++t) { @@ -261,7 +423,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( const int i = (i0 + tile_C_KQ_16::get_j(l0)) / 2; const int j = ((threadIdx.y / np)*cols_per_warp + t*tile_C_KQ_16::I + tile_C_KQ_16::get_i(l0)) / ncols2; - const float2 tmp = __half22float2(tile_mask[j*(KQ_per_iter/2 + 4) + i]); + const float2 tmp = __half22float2(tile_mask[j*(c::nbatch_fa/2 + 4) + i]); const int KQ_index = i00/(np*tile_C_KQ_16::J) * ntiles/2 + t; KQ_C_16[KQ_index].x[l0 + 0] += slope*tmp.x; KQ_C_16[KQ_index].x[l0 + 1] += slope*tmp.y; @@ -272,9 +434,9 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( // Calculate softmax for each KQ column using the current max. value. // The divisor is stored in KQ_rowsum and will be applied at the end. 
- static_assert(KQ_per_iter % (np*tile_C_KQ::I) == 0, "bad loop size"); + static_assert(c::nbatch_fa % (np*tile_C_KQ::I) == 0, "bad loop size"); #pragma unroll - for (int k = 0; k < KQ_per_iter/(np*tile_C_KQ_16::J); ++k) { + for (int k = 0; k < c::nbatch_fa/(np*tile_C_KQ_16::J); ++k) { #pragma unroll for (int t = 0; t < ntiles/2; ++t) { #pragma unroll @@ -294,9 +456,9 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( } } - static_assert(KQ_per_iter % (np*tile_C_KQ_16::J) == 0, "bad loop size"); + static_assert(c::nbatch_fa % (np*tile_C_KQ_16::J) == 0, "bad loop size"); #pragma unroll - for (int k = 0; k < KQ_per_iter/(np*tile_C_KQ_16::J); ++k) { + for (int k = 0; k < c::nbatch_fa/(np*tile_C_KQ_16::J); ++k) { #pragma unroll for (int t = 0; t < ntiles/2; ++t) { #pragma unroll @@ -325,7 +487,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( if (ntiles == 1) { const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale[0], KQ_max_scale[1]); #pragma unroll - for (int i = 0; i < D/tile_C_VKQ::I; ++i) { + for (int i = 0; i < DV/tile_C_VKQ::I; ++i) { #pragma unroll for (int l = 0; l < tile_C_VKQ::ne; ++l) { VKQ_C[i].x[l] *= KQ_max_scale_h2; @@ -336,7 +498,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( for (int col = 0; col < cols_per_thread; ++col) { const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale[col], KQ_max_scale[col]); #pragma unroll - for (int i = 0; i < D/tile_C_VKQ_16::J; ++i) { + for (int i = 0; i < DV/tile_C_VKQ_16::J; ++i) { #pragma unroll for (int l0 = 0; l0 < tile_C_VKQ_16::ne; l0 += 2) { VKQ_C_16[i*ntiles/2 + col/2].x[l0 + col % 2] *= KQ_max_scale_h2; @@ -347,16 +509,16 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( } // Convert KQ C tiles into B tiles for VKQ calculation: - tile_B B[KQ_per_iter/(np*2*tile_B::J) * ntiles]; + tile_B B[c::nbatch_fa/(np*2*tile_B::J) * ntiles]; tile_B_16 * B_16 = (tile_B_16 *) B; - static_assert(KQ_per_iter % (np*2*tile_B::J) == 0, "bad loop size"); + static_assert(c::nbatch_fa % (np*2*tile_B::J) == 0, "bad loop size"); if (ntiles == 1) { #pragma unroll - for (int k = 0; k < KQ_per_iter/(np*2*tile_B::J); ++k) { + for (int k = 0; k < c::nbatch_fa/(np*2*tile_B::J); ++k) { B[k] = get_transposed(get_half2(KQ_C[k])); } } else { - for (int k = 0; k < KQ_per_iter/(np*2*tile_B_16::J); ++k) { + for (int k = 0; k < c::nbatch_fa/(np*2*tile_B_16::J); ++k) { #pragma unroll for (int t = 0; t < ntiles/2; ++t) { B_16[k*ntiles/2 + t] = get_half2(KQ_C_16[k*ntiles/2 + t]); @@ -364,52 +526,67 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( } } -#ifdef CP_ASYNC_AVAILABLE - // Preload K tile for next iteration: - cp_async_wait_all(); - __syncthreads(); - if (!last_iter) { - if (ncols2 > 1 || mask_h2) { - flash_attn_ext_f16_load_mask(mask_h2 + (k_VKQ_0 + KQ_per_iter)/2, tile_mask, stride_mask); + if (nstages > 1) { + // Preload K tile for next iteration: + constexpr bool use_cp_async = true; + cp_async_wait_all(); + __syncthreads(); + if (!last_iter) { + if (ncols2 > 1 || mask_h2) { + flash_attn_ext_f16_load_mask + (mask_h2 + (k_VKQ_0 + c::nbatch_fa)/2, tile_mask, stride_mask); + } + flash_attn_ext_f16_load_tile + (K_h2 + (k_VKQ_0 + c::nbatch_fa)*stride_K, tile_K, c::nbatch_K2, stride_K); } - flash_attn_ext_f16_load_tile(K_h2 + (k_VKQ_0 + KQ_per_iter)*stride_KV, tile_K, stride_KV); } -#else - flash_attn_ext_f16_load_tile(V_h2 + k_VKQ_0*stride_KV, tile_V, stride_KV); - __syncthreads(); -#endif // CP_ASYNC_AVAILABLE - // Calculate VKQ tile: #pragma unroll - for (int i_VKQ_0 = 0; 
i_VKQ_0 < D; i_VKQ_0 += tile_C_VKQ::I) { - static_assert((KQ_per_iter/2) % (np*tile_A::J) == 0, "bad loop size"); -#pragma unroll - for (int k00 = 0; k00 < KQ_per_iter/2; k00 += np*tile_A::J) { - const int k0 = k00 + (threadIdx.y % np)*tile_A::J; + for (int i0_start = 0; i0_start < DV; i0_start += 2*c::nbatch_V2) { + const int i0_stop = i0_start + 2*c::nbatch_V2 < DV ? i0_start + 2*c::nbatch_V2 : DV; + const int i0_diff = i0_stop - i0_start; - tile_A A; - load_ldmatrix_trans(A, tile_V + 2*k0*D2_padded + i_VKQ_0/2, D2_padded); - if (ntiles == 1) { - mma(VKQ_C[i_VKQ_0/tile_C_VKQ::I], A, B[k00/(np*tile_A::J)]); - } else { + if (nstages <= 1) { + constexpr bool use_cp_async = nstages == 1; + flash_attn_ext_f16_load_tile + (V_h2 + k_VKQ_0*stride_V + i0_start/2, tile_V, i0_diff/2, stride_V); + if (use_cp_async) { + cp_async_wait_all(); + } + __syncthreads(); + } + + // Calculate VKQ tile: #pragma unroll - for (int t = 0; t < ntiles/2; ++t) { - // Wide version of VKQ_C is column-major => swap A and B. - mma(VKQ_C_16[i_VKQ_0/tile_C_VKQ::I * ntiles/2 + t], B_16[k00/(np*tile_A::J) * ntiles/2 + t], A); + for (int i_VKQ_0 = i0_start; i_VKQ_0 < i0_stop; i_VKQ_0 += tile_C_VKQ::I) { + static_assert((c::nbatch_fa/2) % (np*tile_A::J) == 0, "bad loop size"); +#pragma unroll + for (int k00 = 0; k00 < c::nbatch_fa/2; k00 += np*tile_A::J) { + const int k0 = k00 + (threadIdx.y % np)*tile_A::J; + + tile_A A; + load_ldmatrix_trans(A, tile_V + 2*k0*stride_tile_V + (i_VKQ_0 - i0_start)/2, stride_tile_V); + if (ntiles == 1) { + mma(VKQ_C[i_VKQ_0/tile_C_VKQ::I], A, B[k00/(np*tile_A::J)]); + } else { +#pragma unroll + for (int t = 0; t < ntiles/2; ++t) { + // Wide version of VKQ_C is column-major => swap A and B. + mma(VKQ_C_16[i_VKQ_0/tile_C_VKQ::I * ntiles/2 + t], B_16[k00/(np*tile_A::J) * ntiles/2 + t], A); + } } } } + + if (nstages <= 1) { + __syncthreads(); // Only needed if tile_K == tile_V. + } } - -#ifndef CP_ASYNC_AVAILABLE - __syncthreads(); // Only needed if tile_K == tile_V. -#endif // CP_ASYNC_AVAILABLE - #else GGML_UNUSED(Q_f2); GGML_UNUSED(K_h2); GGML_UNUSED(V_h2); GGML_UNUSED(mask_h2); GGML_UNUSED(dstk); GGML_UNUSED(dstk_fixup); GGML_UNUSED(scale); GGML_UNUSED(slope); GGML_UNUSED(logit_softcap); - GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(stride_KV); + GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(stride_K); GGML_UNUSED(stride_V); GGML_UNUSED(stride_mask); GGML_UNUSED(jt); GGML_UNUSED(tile_K); GGML_UNUSED(stride_mask); GGML_UNUSED(jt); GGML_UNUSED(tile_K); GGML_UNUSED(tile_V); GGML_UNUSED(tile_mask); GGML_UNUSED(Q_B); @@ -419,7 +596,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( #endif // NEW_MMA_AVAILABLE } -template +template static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( const float2 * const __restrict__ Q_f2, const half2 * const __restrict__ K_h2, @@ -434,7 +611,8 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( const int ne02, const int stride_Q1, const int stride_Q2, - const int stride_KV, + const int stride_K, + const int stride_V, const int stride_mask, const int jt, const int kb0_start, @@ -442,6 +620,14 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( #ifdef NEW_MMA_AVAILABLE //In this kernel Q, K, V are matrices while i, j, k are matrix indices. 
+ typedef fattn_mma_f16_config c; + +#ifdef CP_ASYNC_AVAILABLE + constexpr int nstages = c::nstages_target; +#else + constexpr int nstages = 0; +#endif // CP_ASYNC_AVAILABLE + constexpr int ncols = ncols1 * ncols2; constexpr int cols_per_warp = ntiles * tile_B::I; constexpr int cols_per_thread = ntiles == 1 ? 2 : ntiles; @@ -449,22 +635,19 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( static_assert(nwarps * (cols_per_warp/ncols2) % ncols1 == 0, "bad nwarps"); - static_assert(D % nwarps == 0, "bad D"); - static_assert(KQ_per_iter % nwarps == 0, "bad KQ_per_iter"); + constexpr int stride_tile_Q = DKQ/2 + 4; + constexpr int stride_tile_K = c::nbatch_K2 + 4; + constexpr int stride_tile_V = c::nbatch_V2 + 4; - constexpr int D2_padded = D/2 + 4; // Size of D in half2, padded to avoid shared memory bank conflicts. + constexpr int stride_tile_KV_max = stride_tile_K > stride_tile_V ? stride_tile_K : stride_tile_V; - // Temporary shared buffer for loading K/V data with KQ_per_iter*D logical elements: - extern __shared__ half2 tile_K[]; -#ifdef CP_ASYNC_AVAILABLE - half2 * tile_V = tile_K + KQ_per_iter*D2_padded; -#else - half2 * tile_V = tile_K; -#endif // CP_ASYNC_AVAILABLE - half2 * tile_mask = tile_V + KQ_per_iter*D2_padded; + extern __shared__ half2 tile_Q[]; + half2 * tile_K = c::Q_in_reg ? tile_Q : tile_Q + ncols * stride_tile_Q; + half2 * tile_V = nstages > 1 ? tile_K + c::nbatch_fa * stride_tile_K : tile_K; + half2 * tile_mask = nstages > 1 ? tile_V + c::nbatch_fa * stride_tile_V : tile_V + c::nbatch_fa * stride_tile_KV_max; - tile_B Q_B[D/(2*tile_B::J) * ntiles]; - tile_C_VKQ VKQ_C[D/tile_C_VKQ::I * ntiles]; + tile_B Q_B[(c::Q_in_reg ? DKQ/(2*tile_B::J) : 1) * ntiles]; + tile_C_VKQ VKQ_C[DV/tile_C_VKQ::I * ntiles]; tile_B_16 * Q_B_16 = (tile_B_16 *) Q_B; tile_C_VKQ_16 * VKQ_C_16 = (tile_C_VKQ_16 *) VKQ_C; @@ -476,13 +659,14 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( KQ_max[col] = -FLT_MAX/2.0f; } - // Temporarily load Q data into tile_K, will be loaded into registers afterwards. + // Load Q data into tile_Q, either temporarily or permanently. + // Q in registers is faster, but register pressure is the biggest bottleneck. // The loading is done with decreasing granularity for D for better memory bandwidth. const half2 scale_h2 = make_half2(scale, scale); #pragma unroll for (int stride_k : {WARP_SIZE, WARP_SIZE/2, WARP_SIZE/4}) { - const int k0_start = stride_k == WARP_SIZE ? 0 : D/2 - (D/2) % (2*stride_k); - const int k0_stop = D/2 - (D/2) % (1*stride_k); + const int k0_start = stride_k == WARP_SIZE ? 0 : DKQ/2 - (DKQ/2) % (2*stride_k); + const int k0_stop = DKQ/2 - (DKQ/2) % (1*stride_k); const int stride_jc = WARP_SIZE / stride_k; if (k0_start == k0_stop) { @@ -506,14 +690,14 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( const int k = k0 + (stride_k == WARP_SIZE ? threadIdx.x : threadIdx.x % stride_k); const float2 tmp = Q_f2[(jt*ncols1 + j)*stride_Q1 + c*stride_Q2 + k]; - tile_K[jc*D2_padded + k] = scale_h2 * make_half2(tmp.x, tmp.y); + tile_Q[jc*stride_tile_Q + k] = scale_h2 * make_half2(tmp.x, tmp.y); } } else { #pragma unroll for (int k0 = k0_start; k0 < k0_stop; k0 += stride_k) { const int k = k0 + (stride_k == WARP_SIZE ? 
threadIdx.x : threadIdx.x % stride_k); - tile_K[jc*D2_padded + k] = make_half2(0.0f, 0.0f); + tile_Q[jc*stride_tile_Q + k] = make_half2(0.0f, 0.0f); } } } @@ -521,18 +705,18 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( __syncthreads(); - { + if (c::Q_in_reg) { const int j0 = (threadIdx.y / np) * cols_per_warp; #pragma unroll - for (int k0 = 0; k0 < D/2; k0 += tile_B::J) { + for (int k0 = 0; k0 < DKQ/2; k0 += tile_B::J) { if (ntiles == 1) { - load_ldmatrix(Q_B[k0/tile_B::J], tile_K + j0*D2_padded + k0, D2_padded); + load_ldmatrix(Q_B[k0/tile_B::J], tile_Q + j0*stride_tile_Q + k0, stride_tile_Q); } else { #pragma unroll for (int t = 0; t < ntiles/2; ++t) { load_ldmatrix(Q_B_16[k0/tile_B_16::J * ntiles/2 + t], - tile_K + (j0 + t*tile_B_16::I)*D2_padded + k0, D2_padded); + tile_Q + (j0 + t*tile_B_16::I)*stride_tile_Q + k0, stride_tile_Q); } } } @@ -540,35 +724,37 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( __syncthreads(); - // Preload mask and K data for first iteration when using cp_async: -#ifdef CP_ASYNC_AVAILABLE - if (ncols2 > 1 || mask_h2) { - flash_attn_ext_f16_load_mask(mask_h2 + kb0_start*KQ_per_iter/2, tile_mask, stride_mask); + // Preload mask and K data for first iteration when using cp_async with multiple stages: + if constexpr (nstages > 1) { + static_assert(c::nbatch_K2 == DKQ/2, "batching not implemented for multi-stage pipeline"); + constexpr bool use_cp_async = true; + if (ncols2 > 1 || mask_h2) { + flash_attn_ext_f16_load_mask + (mask_h2 + kb0_start*c::nbatch_fa/2, tile_mask, stride_mask); + } + flash_attn_ext_f16_load_tile + (K_h2 + kb0_start*c::nbatch_fa*stride_K, tile_K, c::nbatch_K2, stride_K); } - flash_attn_ext_f16_load_tile(K_h2 + kb0_start*KQ_per_iter*stride_KV, tile_K, stride_KV); -#endif // CP_ASYNC_AVAILABLE // Iterate over ne11 == previous tokens: for (int kb0 = kb0_start; kb0 < kb0_stop-1; ++kb0) { constexpr bool last_iter = false; - flash_attn_ext_f16_iter + flash_attn_ext_f16_iter (Q_f2, K_h2, V_h2, mask_h2, dstk, dstk_fixup, scale, slope, logit_softcap, - ne01, ne02, stride_KV, stride_mask, jt, tile_K, tile_V, tile_mask, Q_B, VKQ_C, KQ_max, KQ_rowsum, kb0); + ne01, ne02, stride_K, stride_V, stride_mask, jt, tile_Q, tile_K, tile_V, tile_mask, Q_B, VKQ_C, KQ_max, KQ_rowsum, kb0); } { // kb0_start is always < kb0_stop so the last iter can be executed unconditionally. constexpr bool last_iter = true; - flash_attn_ext_f16_iter + flash_attn_ext_f16_iter (Q_f2, K_h2, V_h2, mask_h2, dstk, dstk_fixup, scale, slope, logit_softcap, - ne01, ne02, stride_KV, stride_mask, jt, tile_K, tile_V, tile_mask, Q_B, VKQ_C, KQ_max, KQ_rowsum, kb0_stop-1); + ne01, ne02, stride_K, stride_V, stride_mask, jt, tile_Q, tile_K, tile_V, tile_mask, Q_B, VKQ_C, KQ_max, KQ_rowsum, kb0_stop-1); } - // With cp_async there is no __syncthreads at the end of the iter, + // With multi-stage loading there is no __syncthreads at the end of the iter, // there can be a race condition on shared memory access for combining/writing back results. -#ifdef CP_ASYNC_AVAILABLE - if (nwarps*cols_per_warp > KQ_per_iter) { + if (nstages > 1 && nwarps*cols_per_warp > c::nbatch_fa) { __syncthreads(); } -#endif // CP_ASYNC_AVAILABLE // Finally, sum up partial KQ rowsums. // The partial sums are spread across 8/4 threads each, does not need full reduce. @@ -584,38 +770,13 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( } } - // Write VKQ accumulators to shared memory in column-major format. 
- // It's faster to do small writes to shared memory, then large write to VRAM than to do small writes to VRAM. - // Also for np > 1 the combination is done via these values in shared memory. - if (ntiles == 1) { - const int jc_cwd = threadIdx.y*tile_B::I + tile_B::get_i(-1); // jc combine write data -#pragma unroll - for (int k0 = 0; k0 < D/2; k0 += tile_B::J) { - const tile_B B = get_transposed(VKQ_C[k0/tile_B::J]); // Conversion of C to B matrix puts it in column-major format. + // Combine VKQ accumulator values if np > 1. + // It's also faster to do small writes to shared memory, then large write to VRAM than to do small writes to VRAM. + // So also write VKQ accumulators to shared memory in column-major format if np == 1. -#pragma unroll - for (int l = 0; l < tile_B::ne; ++l) { - const int k = k0 + tile_B::get_j(l); - - tile_K[jc_cwd*D2_padded + k] = B.x[l]; - } - } - } else { -#pragma unroll - for (int t = 0; t < ntiles/2; ++t) { - const int j0 = threadIdx.y*cols_per_warp + t*tile_C_VKQ_16::I; -#pragma unroll - for (int k0 = 0; k0 < D/2; k0 += tile_C_VKQ_16::J) { -#pragma unroll - for (int l = 0; l < tile_C_VKQ_16::ne; ++l) { - const int j = j0 + tile_C_VKQ_16::get_i(l); - const int k = k0 + tile_C_VKQ_16::get_j(l); - - tile_K[j*D2_padded + k] = VKQ_C_16[k0/tile_C_VKQ_16::J * ntiles/2 + t].x[l]; - } - } - } - } + constexpr int nbatch_combine = c::Q_in_reg ? DV/2 : DV/4; + constexpr int tile_stride = nbatch_combine + 4; + static_assert((DV/2) % nbatch_combine == 0, "bad nbatch_combine"); if constexpr (ntiles == 1) { const int jc_cwmo = (threadIdx.x % (2*tile_C_VKQ::J)) / tile_C_VKQ::J; // jc combine write meta offset @@ -624,7 +785,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( if (((!needs_fixup && !is_fixup) || np > 1) && threadIdx.x < 2*tile_C_VKQ::J) { // Use the 16 bytes of padding in each row to store the meta data: KQ max, KQ rowsum, KQ max scale. - ((float2 *) tile_K)[jc_cwm*(D2_padded/2) + D/4] = KQ_cmr; + ((float2 *) tile_Q)[jc_cwm*(tile_stride/2) + nbatch_combine/2] = KQ_cmr; } __syncthreads(); @@ -649,7 +810,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( if (((!needs_fixup && !is_fixup) || np > 1) && (ntiles == 4 || threadIdx.x % 4 < cols_per_thread)) { // Use the 16 bytes of padding in each row to store the meta data: KQ max, KQ rowsum, KQ max scale. - ((float2 *) tile_K)[jc_cwm*(D2_padded/2) + D/4] = KQ_cmr; + ((float2 *) tile_Q)[jc_cwm*(tile_stride/2) + nbatch_combine/2] = KQ_cmr; } __syncthreads(); @@ -676,11 +837,11 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( constexpr int nmeta = np*cols_per_warp >= WARP_SIZE ? np*cols_per_warp/WARP_SIZE : 1; const int jc_meta = threadIdx.y*cols_per_warp + (np*cols_per_warp < WARP_SIZE ? threadIdx.x % (np*cols_per_warp) : threadIdx.x); - float2 * const meta_ptr = ((float2 *) tile_K) + jc_meta*(D2_padded/2) + D/4; + float2 * const meta_ptr = ((float2 *) tile_Q) + jc_meta*(tile_stride/2) + nbatch_combine/2; float2 meta[nmeta]; #pragma unroll for (int imeta = 0; imeta < nmeta; ++imeta) { - meta[imeta] = meta_ptr[imeta * WARP_SIZE * D2_padded/2]; + meta[imeta] = meta_ptr[imeta * WARP_SIZE * tile_stride/2]; } float KQ_cmn = meta[0].x; // KQ combine max new, max between all parallel warps. 
@@ -690,10 +851,9 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( } #pragma unroll for (int offset = np*cols_per_warp/2; offset >= cols_per_warp; offset >>= 1) { - if (offset >= WARP_SIZE) { - continue; + if (offset < WARP_SIZE) { + KQ_cmn = fmaxf(KQ_cmn, __shfl_xor_sync(0xFFFFFFFF, KQ_cmn, offset, WARP_SIZE)); } - KQ_cmn = fmaxf(KQ_cmn, __shfl_xor_sync(0xFFFFFFFF, KQ_cmn, offset, WARP_SIZE)); } float KQ_cms[nmeta]; // KQ combine max scale per warp. @@ -709,18 +869,19 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( } #pragma unroll for (int offset = np*cols_per_warp/2; offset >= cols_per_warp; offset >>= 1) { - if (offset >= WARP_SIZE) { - continue; + if (offset < WARP_SIZE) { + KQ_crs += __shfl_xor_sync(0xFFFFFFFF, KQ_crs, offset, WARP_SIZE); } - KQ_crs += __shfl_xor_sync(0xFFFFFFFF, KQ_crs, offset, WARP_SIZE); } + __syncthreads(); + // Write back combined meta data: #pragma unroll for (int imeta = 0; imeta < nmeta; ++imeta) { if (np*cols_per_warp >= WARP_SIZE || threadIdx.x < np*cols_per_warp) { // Combined KQ max scale + rowsum. - meta_ptr[imeta * WARP_SIZE * D2_padded/2] = make_float2(KQ_cms[imeta], KQ_crs); + meta_ptr[imeta * WARP_SIZE * tile_stride/2] = make_float2(KQ_cms[imeta], KQ_crs); } } @@ -734,90 +895,125 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( float2 * dstk_fixup_meta = dstk_fixup + (gridDim.x + blockIdx.x)*ncols; dstk_fixup_meta[(threadIdx.y/np)*cols_per_warp + threadIdx.x] = make_float2(KQ_cmn, KQ_crs); } - } - - if (np > 1) { + } else if (np > 1) { + // Warps with threadIdx.y % np == 0 execute a __syncthreads() in the if branch. + // Therefore, all other warps also need to execute a __syncthreads(). + // Otherwise the points at which warps synchronize with each other would become misaligned. __syncthreads(); } - if (np == 1 || threadIdx.y % np == 0) { - // The first 2*2*gridDim.x*ncols floats in dstk_fixup are for storing max. values and row sums. - // The values after that are for the partial results of the individual blocks. - float2 * dstk_fixup_data = dstk_fixup + gridDim.x*(2*ncols) + blockIdx.x*(ncols*(D/2)); +#pragma unroll + for (int k00 = 0; k00 < DV/2; k00 += nbatch_combine) { + if (ntiles == 1) { + const int jc_cwd = threadIdx.y*tile_B::I + tile_B::get_i(-1); // jc combine write data +#pragma unroll + for (int k0 = 0; k0 < nbatch_combine; k0 += tile_B::J) { + const tile_B B = get_transposed(VKQ_C[(k00 + k0)/tile_B::J]); // Conversion of C to B matrix puts it in column-major format. #pragma unroll - for (int stride_k : {WARP_SIZE, WARP_SIZE/2, WARP_SIZE/4}) { - const int k0_start = stride_k == WARP_SIZE ? 0 : D/2 - (D/2) % (2*stride_k); - const int k0_stop = D/2 - (D/2) % (1*stride_k); - const int stride_jc = WARP_SIZE / stride_k; + for (int l = 0; l < tile_B::ne; ++l) { + const int k = k0 + tile_B::get_j(l); - if (k0_start == k0_stop) { - continue; + tile_Q[jc_cwd*tile_stride + k] = B.x[l]; + } } - + } else { #pragma unroll - for (int jc0_dst = 0; jc0_dst < ncols; jc0_dst += (nwarps/np)*stride_jc) { - const int jc_dst = jc0_dst + (threadIdx.y/np)*stride_jc + (stride_k == WARP_SIZE ? 
0 : threadIdx.x / stride_k); - - if (jc0_dst + (nwarps/np)*stride_jc > ncols && jc_dst >= ncols) { - break; - } - - const int jc_tile_K = (jc_dst/cols_per_warp)*(np*cols_per_warp) + jc_dst % cols_per_warp; - - const int j_dst = jc_dst / ncols2; - const int c_dst = jc_dst % ncols2; - - if (!is_fixup && jt*ncols1 + j_dst >= ne01) { - continue; - } - - const float * meta_j = (const float *) tile_K + jc_tile_K*D2_padded + D/2; + for (int t = 0; t < ntiles/2; ++t) { + const int j0 = threadIdx.y*cols_per_warp + t*tile_C_VKQ_16::I; #pragma unroll - for (int k0 = k0_start; k0 < k0_stop; k0 += stride_k) { - const int k = k0 + (stride_k == WARP_SIZE ? threadIdx.x : threadIdx.x % stride_k); - - float2 dstk_val = make_float2(0.0f, 0.0f); + for (int k0 = 0; k0 < nbatch_combine; k0 += tile_C_VKQ_16::J) { #pragma unroll - for (int ip = 0; ip < np; ++ip) { - const float KQ_crs = np == 1 ? 1.0f : meta_j[ip*cols_per_warp * D2_padded + 0]; - const float2 dstk_val_add = __half22float2(tile_K[(jc_tile_K + ip*cols_per_warp) * D2_padded + k]); - dstk_val.x += dstk_val_add.x*KQ_crs; - dstk_val.y += dstk_val_add.y*KQ_crs; - } + for (int l = 0; l < tile_C_VKQ_16::ne; ++l) { + const int j = j0 + tile_C_VKQ_16::get_i(l); + const int k = k0 + tile_C_VKQ_16::get_j(l); - if (!needs_fixup && !is_fixup) { - const float KQ_rowsum_j = meta_j[1]; - dstk_val.x /= KQ_rowsum_j; - dstk_val.y /= KQ_rowsum_j; - } - - if (is_fixup) { - dstk_fixup_data[jc_dst*(D/2) + k] = dstk_val; - } else { - dstk[((jt*ncols1 + j_dst)*ne02 + c_dst)*(D/2) + k] = dstk_val; + tile_Q[j*tile_stride + k] = VKQ_C_16[(k00 + k0)/tile_C_VKQ_16::J * ntiles/2 + t].x[l]; } } } } - } - if (np > 1) { __syncthreads(); + + if (np == 1 || threadIdx.y % np == 0) { + // The first 2*2*gridDim.x*ncols floats in dstk_fixup are for storing max. values and row sums. + // The values after that are for the partial results of the individual blocks. + float2 * dstk_fixup_data = dstk_fixup + gridDim.x*(2*ncols) + blockIdx.x*(ncols*(DV/2)); + +#pragma unroll + for (int stride_k : {WARP_SIZE, WARP_SIZE/2, WARP_SIZE/4}) { + const int k0_start = stride_k == WARP_SIZE ? 0 : nbatch_combine - nbatch_combine % (2*stride_k); + const int k0_stop = nbatch_combine - nbatch_combine % (1*stride_k); + const int stride_jc = WARP_SIZE / stride_k; + + if (k0_start == k0_stop) { + continue; + } + +#pragma unroll + for (int jc0_dst = 0; jc0_dst < ncols; jc0_dst += (nwarps/np)*stride_jc) { + const int jc_dst = jc0_dst + (threadIdx.y/np)*stride_jc + (stride_k == WARP_SIZE ? 0 : threadIdx.x / stride_k); + + if (jc0_dst + (nwarps/np)*stride_jc > ncols && jc_dst >= ncols) { + break; + } + + const int jc_tile_K = (jc_dst/cols_per_warp)*(np*cols_per_warp) + jc_dst % cols_per_warp; + + const int j_dst = jc_dst / ncols2; + const int c_dst = jc_dst % ncols2; + + if (!is_fixup && jt*ncols1 + j_dst >= ne01) { + continue; + } + + const float * meta_j = (const float *) tile_Q + jc_tile_K*tile_stride + nbatch_combine; +#pragma unroll + for (int k0 = k0_start; k0 < k0_stop; k0 += stride_k) { + const int k = k0 + (stride_k == WARP_SIZE ? threadIdx.x : threadIdx.x % stride_k); + + float2 dstk_val = make_float2(0.0f, 0.0f); +#pragma unroll + for (int ip = 0; ip < np; ++ip) { + const float KQ_crs = np == 1 ? 
1.0f : meta_j[ip*cols_per_warp * tile_stride + 0]; + const float2 dstk_val_add = __half22float2(tile_Q[(jc_tile_K + ip*cols_per_warp) * tile_stride + k]); + dstk_val.x += dstk_val_add.x*KQ_crs; + dstk_val.y += dstk_val_add.y*KQ_crs; + } + + if (!needs_fixup && !is_fixup) { + const float KQ_rowsum_j = meta_j[1]; + dstk_val.x /= KQ_rowsum_j; + dstk_val.y /= KQ_rowsum_j; + } + + if (is_fixup) { + dstk_fixup_data[jc_dst*(DV/2) + k00 + k] = dstk_val; + } else { + dstk[((jt*ncols1 + j_dst)*ne02 + c_dst)*(DV/2) + k00 + k] = dstk_val; + } + } + } + } + } + if (np > 1) { + __syncthreads(); + } } #else GGML_UNUSED(Q_f2); GGML_UNUSED(K_h2); GGML_UNUSED(V_h2); GGML_UNUSED(mask_h2); GGML_UNUSED(dstk); GGML_UNUSED(dstk_fixup); GGML_UNUSED(scale); GGML_UNUSED(slope); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(stride_Q1); - GGML_UNUSED(stride_Q2); GGML_UNUSED(stride_KV); GGML_UNUSED(stride_mask); + GGML_UNUSED(stride_Q2); GGML_UNUSED(stride_K); GGML_UNUSED(stride_V); GGML_UNUSED(stride_mask); GGML_UNUSED(jt); GGML_UNUSED(kb0_start); GGML_UNUSED(kb0_stop); NO_DEVICE_CODE; #endif // NEW_MMA_AVAILABLE } -template -__launch_bounds__(nwarps*WARP_SIZE, 2) +template +__launch_bounds__(nwarps*WARP_SIZE, 1) static __global__ void flash_attn_ext_f16( const char * __restrict__ Q, const char * __restrict__ K, @@ -857,24 +1053,27 @@ static __global__ void flash_attn_ext_f16( #if defined(FLASH_ATTN_AVAILABLE) && defined(NEW_MMA_AVAILABLE) // Skip unused kernel variants for faster compilation: - if (use_logit_softcap && !(D == 128 || D == 256)) { + if (use_logit_softcap && !(DKQ == 128 || DKQ == 256)) { NO_DEVICE_CODE; return; } - static_assert(FATTN_KQ_STRIDE % KQ_per_iter == 0, "bad KQ_per_iter"); + typedef fattn_mma_f16_config c; + + static_assert(FATTN_KQ_STRIDE % fattn_mma_f16_config::nbatch_fa == 0, "bad nbatch_fa"); const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix. const int stride_Q1 = nb01 / sizeof(float2); const int stride_Q2 = nb02 / sizeof(float2); - const int stride_KV = nb11 / sizeof(half2); + const int stride_K = nb11 / sizeof(half2); + const int stride_V = nb21 / sizeof(half2); const int stride_mask = nb31 / sizeof(half2); const int iter_k = ne11 / FATTN_KQ_STRIDE; const int iter_j = (ne01 + (ncols1 - 1)) / ncols1; - constexpr int kb_niter = FATTN_KQ_STRIDE / KQ_per_iter; // Number of kernel iterations per assigned KQ slice. + constexpr int kb_niter = FATTN_KQ_STRIDE / c::nbatch_fa; // Number of kernel iterations per assigned KQ slice. // kbc == k block continuous, current index in continuous ijk space. int kbc = (blockIdx.x + 0)*iter_k*iter_j*(ne02/ncols2) / gridDim.x; @@ -893,9 +1092,9 @@ static __global__ void flash_attn_ext_f16( const float2 * Q_f2 = (const float2 *) (Q + nb02* channel*ncols2); const half2 * K_h2 = (const half2 *) (K + nb12*(channel*ncols2 / gqa_ratio)); - const half2 * V_h2 = (const half2 *) (V + nb12*(channel*ncols2 / gqa_ratio)); // K and V have same shape + const half2 * V_h2 = (const half2 *) (V + nb22*(channel*ncols2 / gqa_ratio)); const half2 * mask_h2 = ncols2 > 1 || mask ? (const half2 *) mask + (nb31/sizeof(half2))*jt*ncols1 : nullptr; - float2 * dstk = ((float2 *) dst) + channel*(ncols2 * D/2); + float2 * dstk = ((float2 *) dst) + channel*(ncols2 * DV/2); const float slope = ncols2 == 1 ? 
get_alibi_slope(max_bias, channel, n_head_log2, m0, m1) : 1.0f; @@ -905,14 +1104,14 @@ static __global__ void flash_attn_ext_f16( constexpr bool is_fixup = false; // All but (potentially) the last iterations write their data to dst rather than the fixup buffer. if (kb0_start == 0) { constexpr bool needs_fixup = false; // CUDA block is working on an entire tile. - flash_attn_ext_f16_process_tile + flash_attn_ext_f16_process_tile (Q_f2, K_h2, V_h2, mask_h2, dstk, dst_meta, scale, slope, logit_softcap, - ne01, ne02, stride_Q1, stride_Q2, stride_KV, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel); + ne01, ne02, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel); } else { constexpr bool needs_fixup = true; // CUDA block is working on the beginning of a tile. - flash_attn_ext_f16_process_tile + flash_attn_ext_f16_process_tile (Q_f2, K_h2, V_h2, mask_h2, dstk, dst_meta, scale, slope, logit_softcap, - ne01, ne02, stride_Q1, stride_Q2, stride_KV, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel); + ne01, ne02, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel); } kbc += iter_k; @@ -931,9 +1130,9 @@ static __global__ void flash_attn_ext_f16( const float2 * Q_f2 = (const float2 *) (Q + nb02* channel*ncols2); const half2 * K_h2 = (const half2 *) (K + nb12*(channel*ncols2 / gqa_ratio)); - const half2 * V_h2 = (const half2 *) (V + nb12*(channel*ncols2 / gqa_ratio)); // K and V have same shape + const half2 * V_h2 = (const half2 *) (V + nb22*(channel*ncols2 / gqa_ratio)); // K and V have same shape const half2 * mask_h2 = ncols2 > 1 || mask ? (const half2 *) mask + (nb31/sizeof(half2))*jt*ncols1 : nullptr; - float2 * dstk = ((float2 *) dst) + channel*(ncols2 * D/2); + float2 * dstk = ((float2 *) dst) + channel*(ncols2 * DV/2); const float slope = ncols2 == 1 ? get_alibi_slope(max_bias, channel, n_head_log2, m0, m1) : 1.0f; @@ -942,9 +1141,9 @@ static __global__ void flash_attn_ext_f16( constexpr bool is_fixup = true; // Last index writes its data to fixup buffer to avoid data races with other blocks. constexpr bool needs_fixup = false; - flash_attn_ext_f16_process_tile + flash_attn_ext_f16_process_tile (Q_f2, K_h2, V_h2, mask_h2, dstk, dst_meta, scale, slope, logit_softcap, - ne01, ne02, stride_Q1, stride_Q2, stride_KV, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel); + ne01, ne02, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel); #else GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask); GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale); @@ -960,28 +1159,42 @@ static __global__ void flash_attn_ext_f16( #endif // defined(FLASH_ATTN_AVAILABLE) && defined(NEW_MMA_AVAILABLE) } -template +template void ggml_cuda_flash_attn_ext_mma_f16_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - constexpr int ncols = ncols1 * ncols2; - constexpr int KQ_per_iter = D <= 128 && ncols1 <= 64 ? 64 : 32; - constexpr int nwarps = (KQ_per_iter == 32 && ncols <= 16) ? 2 : 4; - constexpr int ntiles = ncols <= 8 ? 1 : (ncols <= 64 ? 2 : 4); - constexpr int cols_per_warp = ntiles * tile_B::I; + const ggml_tensor * KQV = dst; + const int id = ggml_cuda_get_device(); + const int cc = ggml_cuda_info().devices[id].cc; - static_assert(D % tile_B::J == 0, "bad D"); + typedef fattn_mma_f16_config c; + + constexpr int nbatch_K2 = c::nbatch_K2 < 1 ? DKQ/2 : c::nbatch_K2; + constexpr int nbatch_V2 = c::nbatch_V2 < 1 ? 
DV /2 : c::nbatch_V2; + constexpr int nbatch_combine = c::nbatch_combine < 1 ? DV /2 : c::nbatch_combine; + + const int nstages = cp_async_available(cc) ? c::nstages_target : 0; + + constexpr int ncols = ncols1 * ncols2; + constexpr int ntiles = ncols <= 8 ? 1 : 2; // Number of tiles per warp. + constexpr int cols_per_warp = ntiles * tile_B::I; + constexpr int nwarps_max_x = ncols / cols_per_warp; + constexpr int nwarps_max_y = c::nbatch_fa / tile_A::I; + constexpr int nwarps = nwarps_max_x*nwarps_max_y <= c::nwarps_max ? nwarps_max_x*nwarps_max_y : c::nwarps_max; + + static_assert(DKQ % tile_B::J == 0, "bad DKQ"); + static_assert(DV % tile_A::J == 0, "bad DV"); static_assert(ncols % cols_per_warp == 0, "bad ncols"); - const ggml_tensor * KQV = dst; - const int id = ggml_cuda_get_device(); - const int cc = ggml_cuda_info().devices[id].cc; + const size_t nbytes_shared_KV_1stage = c::nbatch_fa * std::max(c::nbatch_K2 + 4, c::nbatch_V2 + 4) * sizeof(half2); + const size_t nbytes_shared_KV_2stage = c::nbatch_fa * (c::nbatch_K2 + 4 + c::nbatch_V2 + 4) * sizeof(half2); + const size_t nbytes_shared_Q = ncols * (DKQ/2 + 4) * sizeof(half2); + const size_t nbytes_shared_mask = ncols1 * (c::nbatch_fa/2 + 4) * sizeof(half2); + const size_t nbytes_shared_combine = nwarps*cols_per_warp * (nbatch_combine + 4) * sizeof(half2); - const int KQ_shared_rows = cp_async_available(cc) ? 2*KQ_per_iter : KQ_per_iter; + const size_t nbytes_shared_KV = nstages <= 1 ? nbytes_shared_KV_1stage : nbytes_shared_KV_2stage; - const size_t nbytes_shared_KV = KQ_shared_rows * (D + 8) * sizeof(half); - const size_t nbytes_shared_mask = ncols1 * (KQ_per_iter + 8) * sizeof(half); - const size_t nbytes_shared_combine = nwarps*cols_per_warp * (D + 8) * sizeof(half); - - const size_t nbytes_shared_total = std::max(nbytes_shared_KV + nbytes_shared_mask, nbytes_shared_combine); + const size_t nbytes_shared_total = std::max(nbytes_shared_combine, c::Q_in_reg ? 
+ std::max(nbytes_shared_Q, nbytes_shared_KV + nbytes_shared_mask) : + nbytes_shared_Q + nbytes_shared_KV + nbytes_shared_mask); float logit_softcap; memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float)); @@ -989,59 +1202,73 @@ void ggml_cuda_flash_attn_ext_mma_f16_case(ggml_backend_cuda_context & ctx, ggml fattn_kernel_t fattn_kernel; if (logit_softcap == 0.0f) { constexpr bool use_logit_softcap = false; - fattn_kernel = flash_attn_ext_f16; + fattn_kernel = flash_attn_ext_f16; + +#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA) + static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false}; + if (!shared_memory_limit_raised[id]) { + CUDA_CHECK(cudaFuncSetAttribute(fattn_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared_total)); + shared_memory_limit_raised[id] = true; + } +#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA) } else { constexpr bool use_logit_softcap = true; - fattn_kernel = flash_attn_ext_f16; + fattn_kernel = flash_attn_ext_f16; + +#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA) + static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false}; + if (!shared_memory_limit_raised[id]) { + CUDA_CHECK(cudaFuncSetAttribute(fattn_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared_total)); + shared_memory_limit_raised[id] = true; + } +#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA) } - launch_fattn + launch_fattn (ctx, dst, fattn_kernel, nwarps, nbytes_shared_total, FATTN_KQ_STRIDE, true, true, true); } -#define DECL_FATTN_MMA_F16_CASE(D, ncols1, ncols2) \ - template void ggml_cuda_flash_attn_ext_mma_f16_case \ - (ggml_backend_cuda_context & ctx, ggml_tensor * dst) \ +#define DECL_FATTN_MMA_F16_CASE(DKQ, DV, ncols1, ncols2) \ + template void ggml_cuda_flash_attn_ext_mma_f16_case \ + (ggml_backend_cuda_context & ctx, ggml_tensor * dst) \ -#define DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(D, ncols) \ - extern DECL_FATTN_MMA_F16_CASE(D, (ncols)/1, 1); \ - extern DECL_FATTN_MMA_F16_CASE(D, (ncols)/2, 2); \ - extern DECL_FATTN_MMA_F16_CASE(D, (ncols)/4, 4); \ - extern DECL_FATTN_MMA_F16_CASE(D, (ncols)/8, 8); \ +#define DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(DKQ, DV, ncols) \ + extern DECL_FATTN_MMA_F16_CASE(DKQ, DV, (ncols)/ 1, 1); \ + extern DECL_FATTN_MMA_F16_CASE(DKQ, DV, (ncols)/ 2, 2); \ + extern DECL_FATTN_MMA_F16_CASE(DKQ, DV, (ncols)/ 4, 4); \ + extern DECL_FATTN_MMA_F16_CASE(DKQ, DV, (ncols)/ 8, 8); \ + extern DECL_FATTN_MMA_F16_CASE(DKQ, DV, (ncols)/16, 16); \ -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 8) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 8) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 8) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 8) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 8) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 8) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 64, 8) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 80, 8) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 96, 8) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 112, 8) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 128, 8) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 256, 8) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 16) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 16) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 16) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 16) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 16) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 16) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 64, 16) 
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 80, 16) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 96, 16) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 112, 16) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 128, 16) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 256, 16) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 32) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 32) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 32) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 32) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 32) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 32) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 64, 32) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 80, 32) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 96, 32) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 112, 32) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 128, 32) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 256, 32) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 64) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 64) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 64) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 64) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 64) -DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 64) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 64, 64) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 80, 64) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 96, 64) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 112, 64) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 128, 64) +DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 256, 64) -// Kernels with ncols == 128 are only 4% faster due to register pressure. -// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 128) -// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 128) -// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 128) -// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 128) -// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 128) -// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 128) // Needs too much shared memory. 
+// The number of viable configurations for Deepseek is very limited: +extern DECL_FATTN_MMA_F16_CASE(576, 512, 1, 16); +extern DECL_FATTN_MMA_F16_CASE(576, 512, 2, 16); +extern DECL_FATTN_MMA_F16_CASE(576, 512, 4, 16); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile-f16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile-f16.cu index e0039e17..9283560d 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile-f16.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile-f16.cu @@ -307,7 +307,7 @@ void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * constexpr int nwarps = 8; constexpr size_t nbytes_shared = 0; fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16; - launch_fattn + launch_fattn (ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F16, true, true, false); } break; case 128: { @@ -315,7 +315,7 @@ void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * constexpr int nwarps = 8; constexpr size_t nbytes_shared = 0; fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16; - launch_fattn + launch_fattn (ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F16, true, true, false); } break; default: { diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile-f32.cu b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile-f32.cu index fcb6f848..32673adb 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile-f32.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile-f32.cu @@ -318,7 +318,7 @@ void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * constexpr int nwarps = 8; constexpr size_t nbytes_shared = 0; fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32; - launch_fattn + launch_fattn (ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F32, true, true, false); } break; case 128: { @@ -326,7 +326,7 @@ void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * constexpr int nwarps = 8; constexpr size_t nbytes_shared = 0; fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32; - launch_fattn + launch_fattn (ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F32, true, true, false); } break; default: { diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/fattn-vec-f16.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-vec-f16.cuh index e17d2d0e..d96e3921 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/fattn-vec-f16.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-vec-f16.cuh @@ -168,6 +168,7 @@ static __global__ void flash_attn_vec_ext_f16( for (int j = 0; j < ncols; ++j) { KQ[j*D + tid] = -HALF_MAX_HALF; } + __syncthreads(); half2 VKQ[ncols] = {{0.0f, 0.0f}}; @@ -315,7 +316,7 @@ void ggml_cuda_flash_attn_ext_vec_f16_case_impl(ggml_backend_cuda_context & ctx, constexpr bool need_f16_K = D != 128; constexpr bool need_f16_V = D != 128 && D != 64; constexpr size_t nbytes_shared = 0; - launch_fattn(ctx, dst, fattn_kernel, nwarps, nbytes_shared, D, need_f16_K, need_f16_V, false); + launch_fattn(ctx, dst, fattn_kernel, nwarps, nbytes_shared, D, need_f16_K, need_f16_V, false); } template diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/fattn-vec-f32.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-vec-f32.cuh index d42ddca4..7064675d 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/fattn-vec-f32.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-vec-f32.cuh @@ -310,7 +310,7 @@ void ggml_cuda_flash_attn_ext_vec_f32_case_impl(ggml_backend_cuda_context & ctx, constexpr bool need_f16_K = D != 128; constexpr bool need_f16_V = D != 128 && D 
!= 64; constexpr size_t nbytes_shared = 0; - launch_fattn(ctx, dst, fattn_kernel, nwarps, nbytes_shared, D, need_f16_K, need_f16_V, false); + launch_fattn(ctx, dst, fattn_kernel, nwarps, nbytes_shared, D, need_f16_K, need_f16_V, false); } template diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/fattn-wmma-f16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-wmma-f16.cu index bc21b27a..c5668adb 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/fattn-wmma-f16.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-wmma-f16.cu @@ -490,7 +490,7 @@ void ggml_cuda_flash_attn_ext_wmma_f16_case(ggml_backend_cuda_context & ctx, ggm fattn_kernel = flash_attn_ext_f16< D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), KQ_acc_t, use_logit_softcap>; } - launch_fattn(ctx, dst, fattn_kernel, nwarps, 0, FATTN_KQ_STRIDE, true, true, false, warp_size); + launch_fattn(ctx, dst, fattn_kernel, nwarps, 0, FATTN_KQ_STRIDE, true, true, false, warp_size); } void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/fattn.cu b/ml/backend/ggml/ggml/src/ggml-cuda/fattn.cu index 7a2d1e45..9c5c803d 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/fattn.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/fattn.cu @@ -8,58 +8,32 @@ #include "fattn-wmma-f16.cuh" #include "fattn.cuh" -template +template static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * Q = dst->src[0]; - if (Q->ne[1] <= 8/ncols2) { - ggml_cuda_flash_attn_ext_mma_f16_case(ctx, dst); - return; + if constexpr (ncols2 <= 8) { + if (Q->ne[1] <= 8/ncols2) { + ggml_cuda_flash_attn_ext_mma_f16_case(ctx, dst); + return; + } } if (Q->ne[1] <= 16/ncols2) { - ggml_cuda_flash_attn_ext_mma_f16_case(ctx, dst); + ggml_cuda_flash_attn_ext_mma_f16_case(ctx, dst); return; } if (Q->ne[1] <= 32/ncols2) { - ggml_cuda_flash_attn_ext_mma_f16_case(ctx, dst); + ggml_cuda_flash_attn_ext_mma_f16_case(ctx, dst); return; } - ggml_cuda_flash_attn_ext_mma_f16_case(ctx, dst); + ggml_cuda_flash_attn_ext_mma_f16_case(ctx, dst); } -template -static void ggml_cuda_flash_attn_ext_mma_f16_switch_hs(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - const ggml_tensor * Q = dst->src[0]; - - switch (Q->ne[0]) { - case 64: - ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1< 64, ncols2>(ctx, dst); - break; - case 80: - ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1< 80, ncols2>(ctx, dst); - break; - case 96: - ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1< 96, ncols2>(ctx, dst); - break; - case 112: - ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<112, ncols2>(ctx, dst); - break; - case 128: - ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<128, ncols2>(ctx, dst); - break; - case 256: - ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<256, ncols2>(ctx, dst); - break; - default: - GGML_ABORT("fatal error"); - break; - } -} - -static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { +template +static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * KQV = dst; const ggml_tensor * Q = dst->src[0]; const ggml_tensor * K = dst->src[1]; @@ -68,27 +42,79 @@ static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, gg float max_bias = 0.0f; memcpy(&max_bias, (const float *) KQV->op_params + 1, sizeof(float)); - const float use_gqa_opt = mask && max_bias == 0.0f; + const bool use_gqa_opt = mask && max_bias == 0.0f; 
GGML_ASSERT(Q->ne[2] % K->ne[2] == 0); const int gqa_ratio = Q->ne[2] / K->ne[2]; if (use_gqa_opt && gqa_ratio % 8 == 0) { - ggml_cuda_flash_attn_ext_mma_f16_switch_hs<8>(ctx, dst); + ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ctx, dst); return; } - if (use_gqa_opt && gqa_ratio == 4) { - ggml_cuda_flash_attn_ext_mma_f16_switch_hs<4>(ctx, dst); + if (use_gqa_opt && gqa_ratio % 4 == 0) { + ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ctx, dst); return; } - if (use_gqa_opt && gqa_ratio == 2) { - ggml_cuda_flash_attn_ext_mma_f16_switch_hs<2>(ctx, dst); + if (use_gqa_opt && gqa_ratio % 2 == 0) { + ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ctx, dst); return; } - ggml_cuda_flash_attn_ext_mma_f16_switch_hs<1>(ctx, dst); + ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ctx, dst); +} + +static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * KQV = dst; + const ggml_tensor * Q = dst->src[0]; + const ggml_tensor * K = dst->src[1]; + const ggml_tensor * V = dst->src[2]; + const ggml_tensor * mask = dst->src[3]; + + switch (Q->ne[0]) { + case 64: + GGML_ASSERT(V->ne[0] == 64); + ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2< 64, 64>(ctx, dst); + break; + case 80: + GGML_ASSERT(V->ne[0] == 80); + ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2< 80, 80>(ctx, dst); + break; + case 96: + GGML_ASSERT(V->ne[0] == 96); + ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2< 96, 96>(ctx, dst); + break; + case 112: + GGML_ASSERT(V->ne[0] == 112); + ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2<112, 112>(ctx, dst); + break; + case 128: + GGML_ASSERT(V->ne[0] == 128); + ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2<128, 128>(ctx, dst); + break; + case 256: + GGML_ASSERT(V->ne[0] == 256); + ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2<256, 256>(ctx, dst); + break; + case 576: { + // For Deepseek, go straight to the ncols1 switch to avoid compiling unnecessary kernels. 
+ GGML_ASSERT(V->ne[0] == 512); + float max_bias = 0.0f; + memcpy(&max_bias, (const float *) KQV->op_params + 1, sizeof(float)); + + const bool use_gqa_opt = mask && max_bias == 0.0f; + GGML_ASSERT(use_gqa_opt); + + GGML_ASSERT(Q->ne[2] % K->ne[2] == 0); + const int gqa_ratio = Q->ne[2] / K->ne[2]; + GGML_ASSERT(gqa_ratio % 16 == 0); + ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 16>(ctx, dst); + } break; + default: + GGML_ABORT("fatal error"); + break; + } } #define FATTN_VEC_F16_CASE(D, type_K, type_V) \ @@ -299,7 +325,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst const bool gqa_opt_applies = ((Q->ne[2] / K->ne[2]) % 2 == 0) && mask; // The mma-based kernels have GQA-specific optimizations const bool mma_needs_data_conversion = K->type != GGML_TYPE_F16 || V->type != GGML_TYPE_F16; const bool mma_faster_for_bs1 = new_mma_available(cc) && gqa_opt_applies && cc < GGML_CUDA_CC_ADA_LOVELACE && !mma_needs_data_conversion; - const bool can_use_vector_kernel = Q->ne[0] % (2*warp_size) == 0; + const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % (2*warp_size) == 0; if (Q->ne[1] == 1 && can_use_vector_kernel && !mma_faster_for_bs1) { if (prec == GGML_PREC_DEFAULT) { ggml_cuda_flash_attn_ext_vec_f16(ctx, dst); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/getrows.cu b/ml/backend/ggml/ggml/src/ggml-cuda/getrows.cu index ea8bf691..963e4d03 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/getrows.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/getrows.cu @@ -10,10 +10,11 @@ static __global__ void k_get_rows( /*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03, const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) { - const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2; - const int i10 = blockDim.y*blockIdx.y + threadIdx.y; - const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12; - const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12; + // The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher. + const int i00 = (blockIdx.y * blockDim.x + threadIdx.x)*2; + const int i10 = blockIdx.x; + const int i11 = blockIdx.z / ne12; + const int i12 = blockIdx.z % ne12; if (i00 >= ne00) { return; @@ -46,10 +47,11 @@ static __global__ void k_get_rows_float( /*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03, const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) { - const int i00 = blockIdx.x*blockDim.x + threadIdx.x; - const int i10 = blockDim.y*blockIdx.y + threadIdx.y; - const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12; - const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12; + // The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher. 
+ const int i00 = blockIdx.y * blockDim.x + threadIdx.x; + const int i10 = blockIdx.x; + const int i11 = blockIdx.z / ne12; + const int i12 = blockIdx.z % ne12; if (i00 >= ne00) { return; @@ -94,8 +96,8 @@ static void get_rows_cuda_q( const size_t nb1, const size_t nb2, const size_t nb3, cudaStream_t stream) { const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1); - const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE); - const dim3 block_nums(block_num_x, ne10, ne11*ne12); + const int block_num_y = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE); + const dim3 block_nums(ne10, block_num_y, ne11*ne12); // strides in elements // const size_t s0 = nb0 / sizeof(dst_t); @@ -127,8 +129,8 @@ static void get_rows_cuda_float( const size_t nb1, const size_t nb2, const size_t nb3, cudaStream_t stream) { const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1); - const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE; - const dim3 block_nums(block_num_x, ne10, ne11*ne12); + const int block_num_y = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE; + const dim3 block_nums(ne10, block_num_y, ne11*ne12); // strides in elements // const size_t s0 = nb0 / sizeof(dst_t); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu index 491acccb..6fe86674 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu @@ -556,8 +556,8 @@ static enum ggml_status ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer if (ggml_is_quantized(tensor->type) && tensor->view_src == nullptr && ggml_backend_buffer_get_usage(buffer) != GGML_BACKEND_BUFFER_USAGE_COMPUTE) { // initialize padding to 0 to avoid possible NaN values - size_t original_size = ggml_nbytes(tensor); - size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor); + const size_t original_size = ggml_nbytes(tensor); + const size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor); if (padded_size > original_size) { ggml_cuda_set_device(ctx->device); @@ -680,6 +680,7 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t if (ggml_is_quantized(tensor->type)) { if (ne0 % MATRIX_ROW_PADDING != 0) { + GGML_ASSERT(tensor->nb[0] == ggml_element_size(tensor)); size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); } } @@ -802,6 +803,7 @@ static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buff static enum ggml_status ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported + GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors"); ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context; @@ -853,6 +855,7 @@ static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buff // split tensors must always be set in their entirety at once GGML_ASSERT(offset == 0); GGML_ASSERT(size == ggml_nbytes(tensor)); + GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors"); ggml_backend_cuda_split_buffer_type_context * buft_ctx = 
(ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context; @@ -891,6 +894,7 @@ static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buff // split tensors must always be set in their entirety at once GGML_ASSERT(offset == 0); GGML_ASSERT(size == ggml_nbytes(tensor)); + GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors"); ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context; @@ -972,6 +976,7 @@ static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buf static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context; + GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors"); size_t total_size = 0; @@ -1534,6 +1539,8 @@ static void ggml_cuda_op_mul_mat( // If src0 is on a temporary compute buffer (partial offloading) there may be some padding that needs to be cleared: if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_is_quantized(src0->type) && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) { + GGML_ASSERT(ggml_is_contiguously_allocated(src0)); + GGML_ASSERT(!src0->view_src); const size_t nbytes_data = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00); const size_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING); CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data, 0, nbytes_padding, stream)); @@ -1905,13 +1912,19 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft); + // If src0 is a temporary compute buffer it may have some padding that needs to be cleared for mul_mat_vec_q or mul_mat_q. + // But if src0 is also a view of another tensor then this cannot be done safely because it may overwrite valid tensor data. + // Therefore, in such cases use cuBLAS. 
+ const bool bad_padding_clear = ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE + && ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) && src0->view_src; + bool use_mul_mat_vec = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && src0->ne[0] % 2 == 0 && src1->ne[1] == 1; - bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) + bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE; - bool use_mul_mat_q = ggml_is_quantized(src0->type) + bool use_mul_mat_q = ggml_is_quantized(src0->type) && !bad_padding_clear && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32; bool any_gpus_with_slow_fp16 = false; @@ -2065,9 +2078,11 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * } ggml_tensor src0_slice = *src0; - src0_slice.ne[2] = 1; - src0_slice.nb[3] = src0_slice.nb[2]; - src0_slice.data = (char *) src0->data + i02*nb02; + src0_slice.ne[2] = 1; + src0_slice.nb[3] = src0_slice.nb[2]; + src0_slice.op = GGML_OP_VIEW; + src0_slice.view_src = dst->src[0]; // non-const pointer to src0 + src0_slice.data = (char *) src0->data + i02*nb02; ggml_tensor src1_slice; memset(&src1_slice, 0, sizeof(src1_slice)); @@ -3213,16 +3228,16 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g return false; #endif // FLASH_ATTN_AVAILABLE if (op->src[1]->ne[0] != op->src[2]->ne[0]) { - // different head sizes of K and V are not supported yet - return false; + const int cc = ggml_cuda_info().devices[dev_ctx->device].cc; + if (!new_mma_available(cc) || cc < GGML_CUDA_CC_AMPERE) { + return false; + } + const int gqa_ratio = op->src[0]->ne[2] / op->src[1]->ne[2]; + return op->src[1]->ne[0] == 576 && op->src[2]->ne[0] == 512 && op->src[3] && gqa_ratio % 16 == 0; } if (op->src[0]->ne[0] == 192) { return false; } - if (op->src[0]->ne[0] == 576) { - // DeepSeek MLA - return false; - } if (op->src[0]->ne[3] != 1) { return false; } diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cu b/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cu index f397a7e0..e1cf843d 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cu @@ -89,6 +89,17 @@ void ggml_cuda_mul_mat_q( const float * src1_d = (const float *) src1->data; float * dst_d = (float *) dst->data; + // If src0 is a temporary compute buffer, clear any potential padding. 
+ if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) { + const size_t size_data = ggml_nbytes(src0); + const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0); + if (size_alloc > size_data) { + GGML_ASSERT(ggml_is_contiguously_allocated(src0)); + GGML_ASSERT(!src0->view_src); + CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream)); + } + } + const int64_t ne10_padded = GGML_PAD(ne10, MATRIX_ROW_PADDING); const int64_t s01 = src0->nb[1] / ts_src0; @@ -118,7 +129,7 @@ void ggml_cuda_mul_mat_q( const mmq_args args = { src0_d, src0->type, (const int *) src1_q8_1.ptr, nullptr, nullptr, dst_d, - ne00, ne01, ne1, s01, s1, + ne00, ne01, ne1, s01, ne11, s1, ne02, ne12, s02, s12, s2, ne03, ne13, s03, s13, s3, use_stream_k}; @@ -202,7 +213,7 @@ void ggml_cuda_mul_mat_q( // Note that ne02 is used instead of ne12 because the number of y channels determines the z dimension of the CUDA grid. const mmq_args args = { src0_d, src0->type, (const int *) src1_q8_1.ptr, ids_dst_dev, expert_bounds_dev, dst_d, - ne00, ne01, ne_get_rows, s01, s1, + ne00, ne01, ne_get_rows, s01, ne_get_rows, s1, ne02, ne02, s02, s12, s2, ne03, ne13, s03, s13, s3, use_stream_k}; @@ -241,7 +252,7 @@ void ggml_cuda_op_mul_mat_q( ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && src1_ncols == ne11; const mmq_args args = { src0_dd_i, src0->type, (const int *) src1_ddq_i, nullptr, nullptr, dst_dd_i, - ne00, row_diff, src1_ncols, stride01, nrows_dst, + ne00, row_diff, src1_ncols, stride01, ne11, nrows_dst, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, use_stream_k}; diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cuh index 8c93e832..80baf459 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cuh @@ -2522,7 +2522,7 @@ template static __device__ __forceinline__ void mul_mat_q_process_tile( const char * __restrict__ x, const int offset_x, const int * __restrict__ y, const int * __restrict__ ids_dst, float * __restrict__ dst, float * __restrict__ tmp_fixup, - const int nrows_x, const int ncols_y, const int stride_row_x, const int stride_col_dst, + const int stride_row_x, const int ncols_y, const int stride_col_dst, const int tile_x_max_i, const int tile_y_max_j, const int kb0_start, const int kb0_stop) { constexpr int qk = ggml_cuda_type_traits::qk; @@ -2606,7 +2606,7 @@ template static __global__ void mul_mat_q( const char * __restrict__ x, const int * __restrict__ y, const int32_t * __restrict__ ids_dst, const int32_t * __restrict__ expert_bounds, float * __restrict__ dst, float * __restrict__ tmp_fixup, - const int ncols_x, const int nrows_x, const int ncols_y, const int stride_row_x, const int stride_col_dst, + const int ncols_x, const int nrows_x, const int ncols_dst, const int stride_row_x, const int ncols_y, const int stride_col_dst, const int channel_ratio, const int nchannels_y, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, const int sample_ratio, const int nsamples_y, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) { @@ -2619,8 +2619,8 @@ static __global__ void mul_mat_q( constexpr int qk = ggml_cuda_type_traits::qk; constexpr int mmq_y = get_mmq_y_device(); - const int ntx = (ncols_y + mmq_x - 1) / mmq_x; // Number of tiles x - const int nty = (nrows_x + mmq_y - 1) / mmq_y; // Number of tiles y + const int ntx = (ncols_dst + mmq_x - 1) / mmq_x; // Number of tiles x 
+ const int nty = (nrows_x + mmq_y - 1) / mmq_y; // Number of tiles y // Initialize the ids for writing back data with just the index. // For regular matrix multiplications this is never changed. @@ -2636,6 +2636,7 @@ static __global__ void mul_mat_q( ids_dst_shared[j] = j; } + __syncthreads(); // On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead: #if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA @@ -2647,8 +2648,8 @@ static __global__ void mul_mat_q( // Defaults for regular matrix multiplication: int col_low = 0; - int col_high = ncols_y; - int col_diff = ncols_y; + int col_high = ncols_dst; + int col_diff = ncols_dst; int offset_y = wt*stride_sample_y + zt*stride_channel_y; int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst; @@ -2664,6 +2665,7 @@ static __global__ void mul_mat_q( return; } + // __syncthreads(); // There is no previous tile that could cause a race condition. #pragma unroll for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) { const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x; @@ -2674,6 +2676,7 @@ static __global__ void mul_mat_q( ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j]; } + __syncthreads(); } offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int)); @@ -2686,7 +2689,7 @@ static __global__ void mul_mat_q( constexpr bool fixup = false; mul_mat_q_process_tile - (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, ncols_y, stride_row_x, stride_col_dst, + (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst, tile_x_max_i, tile_y_max_j, 0, ncols_x/qk); return; } @@ -2717,8 +2720,8 @@ static __global__ void mul_mat_q( // Defaults for regular matrix multiplication: int col_low = 0; - int col_high = ncols_y; - int col_diff = ncols_y; + int col_high = ncols_dst; + int col_diff = ncols_dst; int offset_y = wt*stride_sample_y + zt*stride_channel_y; int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst; @@ -2740,6 +2743,7 @@ static __global__ void mul_mat_q( continue; } + __syncthreads(); #pragma unroll for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) { const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x; @@ -2750,6 +2754,7 @@ static __global__ void mul_mat_q( ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j]; } + __syncthreads(); } offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int)); @@ -2762,7 +2767,7 @@ static __global__ void mul_mat_q( constexpr bool fixup = false; // All but (potentially) the last iterations write their data to dst rather than the fixup buffer. 
mul_mat_q_process_tile - (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, ncols_y, stride_row_x, stride_col_dst, + (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst, tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop); kbc += blocks_per_ne00; @@ -2787,8 +2792,8 @@ static __global__ void mul_mat_q( // Defaults for regular matrix multiplication: int col_low = 0; - int col_high = ncols_y; - int col_diff = ncols_y; + int col_high = ncols_dst; + int col_diff = ncols_dst; int offset_y = wt*stride_sample_y + zt*stride_channel_y; int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst; @@ -2805,6 +2810,7 @@ static __global__ void mul_mat_q( } // The memory layout for the fixup buffer is always contiguous, therefore reset ids: + __syncthreads(); #pragma unroll for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) { const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x; @@ -2815,6 +2821,7 @@ static __global__ void mul_mat_q( ids_dst_shared[j] = j; } + __syncthreads(); } offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int)); @@ -2827,7 +2834,7 @@ static __global__ void mul_mat_q( constexpr bool fixup = true; // Last index writes its data to fixup buffer to avoid data races with other blocks. mul_mat_q_process_tile - (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, ncols_y, stride_row_x, stride_col_dst, + (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst, tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop); } @@ -2835,7 +2842,7 @@ static __global__ void mul_mat_q( template static __global__ void mul_mat_q_stream_k_fixup( const int32_t * ids_dst, const int32_t * expert_bounds, float * __restrict__ dst, const float * __restrict__ tmp_last_tile, - const int ncols_x, const int nrows_x, const int ncols_y, const int stride_col_dst, + const int ncols_x, const int nrows_x, const int ncols_dst, const int stride_col_dst, const int nchannels_y, const int stride_channel_dst, const int nsamples_y, const int stride_sample_dst) { constexpr int mmq_y = get_mmq_y_device(); constexpr int qk = ggml_cuda_type_traits::qk; @@ -2844,8 +2851,8 @@ static __global__ void mul_mat_q_stream_k_fixup( float sum[mmq_x*mmq_y / (nwarps*WARP_SIZE)] = {0.0f}; - const int ntx = (ncols_y + mmq_x - 1) / mmq_x; - const int nty = (nrows_x + mmq_y - 1) / mmq_y; + const int ntx = (ncols_dst + mmq_x - 1) / mmq_x; + const int nty = (nrows_x + mmq_y - 1) / mmq_y; const int bidx0 = blockIdx.x; @@ -2918,8 +2925,8 @@ static __global__ void mul_mat_q_stream_k_fixup( const int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst + it*mmq_y; dst += offset_dst; - const int i_max = nrows_x - it*mmq_y - 1; - const int j_max = ncols_y - jt*mmq_x - 1; + const int i_max = nrows_x - it*mmq_y - 1; + const int j_max = ncols_dst - jt*mmq_x - 1; #pragma unroll for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { @@ -2951,6 +2958,7 @@ static __global__ void mul_mat_q_stream_k_fixup( for (int j = threadIdx.y*WARP_SIZE + threadIdx.x; j < mmq_x; j += nwarps*WARP_SIZE) { ids_dst_shared[j] = ids_dst[col_low + j]; } + __syncthreads(); const int offset_dst = it*mmq_y; dst += offset_dst; @@ -2981,7 +2989,7 @@ static __global__ void mul_mat_q_stream_k_fixup( struct mmq_args { const char * x; ggml_type type_x; const int * y; const int32_t * ids_dst; const int32_t * expert_bounds; float * dst; - int64_t ncols_x; int64_t 
nrows_x; int64_t ncols_y; int64_t stride_row_x; int64_t nrows_dst; + int64_t ncols_x; int64_t nrows_x; int64_t ncols_dst; int64_t stride_row_x; int64_t ncols_y; int64_t nrows_dst; int64_t nchannels_x; int64_t nchannels_y; int64_t stride_channel_x; int64_t stride_channel_y; int64_t stride_channel_dst; int64_t nsamples_x; int64_t nsamples_y; int64_t stride_sample_x; int64_t stride_sample_y; int64_t stride_sample_dst; bool use_stream_k; @@ -3017,8 +3025,8 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a } #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA) - const int nty = (args.nrows_x + mmq_y - 1) / mmq_y; - const int ntx = (args.ncols_y + mmq_x - 1) / mmq_x; + const int nty = (args.nrows_x + mmq_y - 1) / mmq_y; + const int ntx = (args.ncols_dst + mmq_x - 1) / mmq_x; const int ntzw = args.nchannels_y * args.nsamples_y; const dim3 block_nums_xy_tiling(nty, ntx, ntzw); @@ -3032,14 +3040,14 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a constexpr bool need_check = false; mul_mat_q<<>> (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr, - args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst, + args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst, channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst, sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst); } else { constexpr bool need_check = true; mul_mat_q<<>> (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr, - args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst, + args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst, channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst, sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst); } @@ -3060,7 +3068,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a mul_mat_q<<>> (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, - args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst, + args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst, channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst, sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst); @@ -3069,14 +3077,14 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a } mul_mat_q_stream_k_fixup<<>> - (args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_y, + (args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_dst, args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst); } else { constexpr bool need_check = true; mul_mat_q<<>> (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, - args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst, + args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst, channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst, sample_ratio, args.nsamples_y, args.stride_sample_x, 
args.stride_sample_y, args.stride_sample_dst); @@ -3085,7 +3093,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a } mul_mat_q_stream_k_fixup<<>> - (args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_y, + (args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_dst, args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst); } } diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cu b/ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cu index 132c466f..dc7adf50 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cu @@ -513,6 +513,17 @@ void ggml_cuda_mul_mat_vec_q( const int32_t * ids_d = ids ? (const int32_t *) ids->data : nullptr; float * dst_d = (float *) dst->data; + // If src0 is a temporary compute buffer, clear any potential padding. + if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) { + const size_t size_data = ggml_nbytes(src0); + const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0); + if (size_alloc > size_data) { + GGML_ASSERT(ggml_is_contiguously_allocated(src0)); + GGML_ASSERT(!src0->view_src); + CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream)); + } + } + const int64_t ne10_padded = GGML_PAD(ne10, MATRIX_ROW_PADDING); ggml_cuda_pool_alloc src1_q8_1(ctx.pool(), ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1); { diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/quantize.cu b/ml/backend/ggml/ggml/src/ggml-cuda/quantize.cu index 931a45ad..cb931814 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/quantize.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/quantize.cu @@ -163,6 +163,7 @@ void quantize_mmq_q8_1_cuda( const float * x, const int32_t * ids, void * vy, const ggml_type type_src0, const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03, const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) { + GGML_ASSERT(ne00 % 4 == 0); GGML_ASSERT(ne0 % (4*QK8_1) == 0); const int64_t block_num_x = (ne0 + 4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ - 1) / (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/sum.cu b/ml/backend/ggml/ggml/src/ggml-cuda/sum.cu index f9589080..eb3d7cdb 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/sum.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/sum.cu @@ -31,7 +31,7 @@ void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); - GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguously_allocated(src0)); const float * src0_d = (const float *) src0->data; float * dst_d = (float *) dst->data; diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu new file mode 100644 index 00000000..fb26abeb --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
+ +#include "../fattn-mma-f16.cuh" + +DECL_FATTN_MMA_F16_CASE(576, 512, 1, 16); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu index 80108615..dc168290 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu @@ -2,9 +2,9 @@ #include "../fattn-mma-f16.cuh" -DECL_FATTN_MMA_F16_CASE(64, 1, 8); -DECL_FATTN_MMA_F16_CASE(80, 1, 8); -DECL_FATTN_MMA_F16_CASE(96, 1, 8); -DECL_FATTN_MMA_F16_CASE(112, 1, 8); -DECL_FATTN_MMA_F16_CASE(128, 1, 8); -DECL_FATTN_MMA_F16_CASE(256, 1, 8); +DECL_FATTN_MMA_F16_CASE(64, 64, 1, 8); +DECL_FATTN_MMA_F16_CASE(80, 80, 1, 8); +DECL_FATTN_MMA_F16_CASE(96, 96, 1, 8); +DECL_FATTN_MMA_F16_CASE(112, 112, 1, 8); +DECL_FATTN_MMA_F16_CASE(128, 128, 1, 8); +DECL_FATTN_MMA_F16_CASE(256, 256, 1, 8); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_1.cu index 66161c0a..9d3cfd8e 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_1.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_1.cu @@ -2,9 +2,9 @@ #include "../fattn-mma-f16.cuh" -DECL_FATTN_MMA_F16_CASE(64, 16, 1); -DECL_FATTN_MMA_F16_CASE(80, 16, 1); -DECL_FATTN_MMA_F16_CASE(96, 16, 1); -DECL_FATTN_MMA_F16_CASE(112, 16, 1); -DECL_FATTN_MMA_F16_CASE(128, 16, 1); -DECL_FATTN_MMA_F16_CASE(256, 16, 1); +DECL_FATTN_MMA_F16_CASE(64, 64, 16, 1); +DECL_FATTN_MMA_F16_CASE(80, 80, 16, 1); +DECL_FATTN_MMA_F16_CASE(96, 96, 16, 1); +DECL_FATTN_MMA_F16_CASE(112, 112, 16, 1); +DECL_FATTN_MMA_F16_CASE(128, 128, 16, 1); +DECL_FATTN_MMA_F16_CASE(256, 256, 16, 1); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_2.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_2.cu index ee88c72a..2e1883af 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_2.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_2.cu @@ -2,9 +2,9 @@ #include "../fattn-mma-f16.cuh" -DECL_FATTN_MMA_F16_CASE(64, 16, 2); -DECL_FATTN_MMA_F16_CASE(80, 16, 2); -DECL_FATTN_MMA_F16_CASE(96, 16, 2); -DECL_FATTN_MMA_F16_CASE(112, 16, 2); -DECL_FATTN_MMA_F16_CASE(128, 16, 2); -DECL_FATTN_MMA_F16_CASE(256, 16, 2); +DECL_FATTN_MMA_F16_CASE(64, 64, 16, 2); +DECL_FATTN_MMA_F16_CASE(80, 80, 16, 2); +DECL_FATTN_MMA_F16_CASE(96, 96, 16, 2); +DECL_FATTN_MMA_F16_CASE(112, 112, 16, 2); +DECL_FATTN_MMA_F16_CASE(128, 128, 16, 2); +DECL_FATTN_MMA_F16_CASE(256, 256, 16, 2); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu index d888a5a4..2074e954 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu @@ -2,9 +2,9 @@ #include "../fattn-mma-f16.cuh" -DECL_FATTN_MMA_F16_CASE(64, 16, 4); 
-DECL_FATTN_MMA_F16_CASE(80, 16, 4); -DECL_FATTN_MMA_F16_CASE(96, 16, 4); -DECL_FATTN_MMA_F16_CASE(112, 16, 4); -DECL_FATTN_MMA_F16_CASE(128, 16, 4); -DECL_FATTN_MMA_F16_CASE(256, 16, 4); +DECL_FATTN_MMA_F16_CASE(64, 64, 16, 4); +DECL_FATTN_MMA_F16_CASE(80, 80, 16, 4); +DECL_FATTN_MMA_F16_CASE(96, 96, 16, 4); +DECL_FATTN_MMA_F16_CASE(112, 112, 16, 4); +DECL_FATTN_MMA_F16_CASE(128, 128, 16, 4); +DECL_FATTN_MMA_F16_CASE(256, 256, 16, 4); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu new file mode 100644 index 00000000..f011a208 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-mma-f16.cuh" + +DECL_FATTN_MMA_F16_CASE(576, 512, 2, 16); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu index d93a2d08..24c64cf0 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu @@ -2,9 +2,9 @@ #include "../fattn-mma-f16.cuh" -DECL_FATTN_MMA_F16_CASE(64, 2, 4); -DECL_FATTN_MMA_F16_CASE(80, 2, 4); -DECL_FATTN_MMA_F16_CASE(96, 2, 4); -DECL_FATTN_MMA_F16_CASE(112, 2, 4); -DECL_FATTN_MMA_F16_CASE(128, 2, 4); -DECL_FATTN_MMA_F16_CASE(256, 2, 4); +DECL_FATTN_MMA_F16_CASE(64, 64, 2, 4); +DECL_FATTN_MMA_F16_CASE(80, 80, 2, 4); +DECL_FATTN_MMA_F16_CASE(96, 96, 2, 4); +DECL_FATTN_MMA_F16_CASE(112, 112, 2, 4); +DECL_FATTN_MMA_F16_CASE(128, 128, 2, 4); +DECL_FATTN_MMA_F16_CASE(256, 256, 2, 4); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu index 617464c9..163b1d93 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu @@ -2,9 +2,9 @@ #include "../fattn-mma-f16.cuh" -DECL_FATTN_MMA_F16_CASE(64, 2, 8); -DECL_FATTN_MMA_F16_CASE(80, 2, 8); -DECL_FATTN_MMA_F16_CASE(96, 2, 8); -DECL_FATTN_MMA_F16_CASE(112, 2, 8); -DECL_FATTN_MMA_F16_CASE(128, 2, 8); -DECL_FATTN_MMA_F16_CASE(256, 2, 8); +DECL_FATTN_MMA_F16_CASE(64, 64, 2, 8); +DECL_FATTN_MMA_F16_CASE(80, 80, 2, 8); +DECL_FATTN_MMA_F16_CASE(96, 96, 2, 8); +DECL_FATTN_MMA_F16_CASE(112, 112, 2, 8); +DECL_FATTN_MMA_F16_CASE(128, 128, 2, 8); +DECL_FATTN_MMA_F16_CASE(256, 256, 2, 8); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_1.cu index 970d2b68..0543532e 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_1.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_1.cu @@ -2,9 +2,9 @@ #include "../fattn-mma-f16.cuh" -DECL_FATTN_MMA_F16_CASE(64, 32, 1); -DECL_FATTN_MMA_F16_CASE(80, 32, 1); -DECL_FATTN_MMA_F16_CASE(96, 32, 1); 
-DECL_FATTN_MMA_F16_CASE(112, 32, 1); -DECL_FATTN_MMA_F16_CASE(128, 32, 1); -DECL_FATTN_MMA_F16_CASE(256, 32, 1); +DECL_FATTN_MMA_F16_CASE(64, 64, 32, 1); +DECL_FATTN_MMA_F16_CASE(80, 80, 32, 1); +DECL_FATTN_MMA_F16_CASE(96, 96, 32, 1); +DECL_FATTN_MMA_F16_CASE(112, 112, 32, 1); +DECL_FATTN_MMA_F16_CASE(128, 128, 32, 1); +DECL_FATTN_MMA_F16_CASE(256, 256, 32, 1); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_2.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_2.cu index 65cd377c..407b6cf4 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_2.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_2.cu @@ -2,9 +2,9 @@ #include "../fattn-mma-f16.cuh" -DECL_FATTN_MMA_F16_CASE(64, 32, 2); -DECL_FATTN_MMA_F16_CASE(80, 32, 2); -DECL_FATTN_MMA_F16_CASE(96, 32, 2); -DECL_FATTN_MMA_F16_CASE(112, 32, 2); -DECL_FATTN_MMA_F16_CASE(128, 32, 2); -DECL_FATTN_MMA_F16_CASE(256, 32, 2); +DECL_FATTN_MMA_F16_CASE(64, 64, 32, 2); +DECL_FATTN_MMA_F16_CASE(80, 80, 32, 2); +DECL_FATTN_MMA_F16_CASE(96, 96, 32, 2); +DECL_FATTN_MMA_F16_CASE(112, 112, 32, 2); +DECL_FATTN_MMA_F16_CASE(128, 128, 32, 2); +DECL_FATTN_MMA_F16_CASE(256, 256, 32, 2); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu new file mode 100644 index 00000000..f5fd0e23 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
+ +#include "../fattn-mma-f16.cuh" + +DECL_FATTN_MMA_F16_CASE(576, 512, 4, 16); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_2.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_2.cu index f4a8bf34..5e466850 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_2.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_2.cu @@ -2,9 +2,9 @@ #include "../fattn-mma-f16.cuh" -DECL_FATTN_MMA_F16_CASE(64, 4, 2); -DECL_FATTN_MMA_F16_CASE(80, 4, 2); -DECL_FATTN_MMA_F16_CASE(96, 4, 2); -DECL_FATTN_MMA_F16_CASE(112, 4, 2); -DECL_FATTN_MMA_F16_CASE(128, 4, 2); -DECL_FATTN_MMA_F16_CASE(256, 4, 2); +DECL_FATTN_MMA_F16_CASE(64, 64, 4, 2); +DECL_FATTN_MMA_F16_CASE(80, 80, 4, 2); +DECL_FATTN_MMA_F16_CASE(96, 96, 4, 2); +DECL_FATTN_MMA_F16_CASE(112, 112, 4, 2); +DECL_FATTN_MMA_F16_CASE(128, 128, 4, 2); +DECL_FATTN_MMA_F16_CASE(256, 256, 4, 2); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu index de191a8a..1ada657f 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu @@ -2,9 +2,9 @@ #include "../fattn-mma-f16.cuh" -DECL_FATTN_MMA_F16_CASE(64, 4, 4); -DECL_FATTN_MMA_F16_CASE(80, 4, 4); -DECL_FATTN_MMA_F16_CASE(96, 4, 4); -DECL_FATTN_MMA_F16_CASE(112, 4, 4); -DECL_FATTN_MMA_F16_CASE(128, 4, 4); -DECL_FATTN_MMA_F16_CASE(256, 4, 4); +DECL_FATTN_MMA_F16_CASE(64, 64, 4, 4); +DECL_FATTN_MMA_F16_CASE(80, 80, 4, 4); +DECL_FATTN_MMA_F16_CASE(96, 96, 4, 4); +DECL_FATTN_MMA_F16_CASE(112, 112, 4, 4); +DECL_FATTN_MMA_F16_CASE(128, 128, 4, 4); +DECL_FATTN_MMA_F16_CASE(256, 256, 4, 4); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu index e8cb0e1b..bad296b4 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu @@ -2,9 +2,9 @@ #include "../fattn-mma-f16.cuh" -DECL_FATTN_MMA_F16_CASE(64, 4, 8); -DECL_FATTN_MMA_F16_CASE(80, 4, 8); -DECL_FATTN_MMA_F16_CASE(96, 4, 8); -DECL_FATTN_MMA_F16_CASE(112, 4, 8); -DECL_FATTN_MMA_F16_CASE(128, 4, 8); -DECL_FATTN_MMA_F16_CASE(256, 4, 8); +DECL_FATTN_MMA_F16_CASE(64, 64, 4, 8); +DECL_FATTN_MMA_F16_CASE(80, 80, 4, 8); +DECL_FATTN_MMA_F16_CASE(96, 96, 4, 8); +DECL_FATTN_MMA_F16_CASE(112, 112, 4, 8); +DECL_FATTN_MMA_F16_CASE(128, 128, 4, 8); +DECL_FATTN_MMA_F16_CASE(256, 256, 4, 8); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_64-ncols2_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_64-ncols2_1.cu index a532e962..0d7a9c72 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_64-ncols2_1.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_64-ncols2_1.cu @@ -2,9 +2,9 @@ #include "../fattn-mma-f16.cuh" -DECL_FATTN_MMA_F16_CASE(64, 64, 1); -DECL_FATTN_MMA_F16_CASE(80, 64, 1); 
-DECL_FATTN_MMA_F16_CASE(96, 64, 1); -DECL_FATTN_MMA_F16_CASE(112, 64, 1); -DECL_FATTN_MMA_F16_CASE(128, 64, 1); -DECL_FATTN_MMA_F16_CASE(256, 64, 1); +DECL_FATTN_MMA_F16_CASE(64, 64, 64, 1); +DECL_FATTN_MMA_F16_CASE(80, 80, 64, 1); +DECL_FATTN_MMA_F16_CASE(96, 96, 64, 1); +DECL_FATTN_MMA_F16_CASE(112, 112, 64, 1); +DECL_FATTN_MMA_F16_CASE(128, 128, 64, 1); +DECL_FATTN_MMA_F16_CASE(256, 256, 64, 1); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_1.cu index bf25181a..9d5a9976 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_1.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_1.cu @@ -2,9 +2,9 @@ #include "../fattn-mma-f16.cuh" -DECL_FATTN_MMA_F16_CASE(64, 8, 1); -DECL_FATTN_MMA_F16_CASE(80, 8, 1); -DECL_FATTN_MMA_F16_CASE(96, 8, 1); -DECL_FATTN_MMA_F16_CASE(112, 8, 1); -DECL_FATTN_MMA_F16_CASE(128, 8, 1); -DECL_FATTN_MMA_F16_CASE(256, 8, 1); +DECL_FATTN_MMA_F16_CASE(64, 64, 8, 1); +DECL_FATTN_MMA_F16_CASE(80, 80, 8, 1); +DECL_FATTN_MMA_F16_CASE(96, 96, 8, 1); +DECL_FATTN_MMA_F16_CASE(112, 112, 8, 1); +DECL_FATTN_MMA_F16_CASE(128, 128, 8, 1); +DECL_FATTN_MMA_F16_CASE(256, 256, 8, 1); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_2.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_2.cu index 378c132e..a6e6f093 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_2.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_2.cu @@ -2,9 +2,9 @@ #include "../fattn-mma-f16.cuh" -DECL_FATTN_MMA_F16_CASE(64, 8, 2); -DECL_FATTN_MMA_F16_CASE(80, 8, 2); -DECL_FATTN_MMA_F16_CASE(96, 8, 2); -DECL_FATTN_MMA_F16_CASE(112, 8, 2); -DECL_FATTN_MMA_F16_CASE(128, 8, 2); -DECL_FATTN_MMA_F16_CASE(256, 8, 2); +DECL_FATTN_MMA_F16_CASE(64, 64, 8, 2); +DECL_FATTN_MMA_F16_CASE(80, 80, 8, 2); +DECL_FATTN_MMA_F16_CASE(96, 96, 8, 2); +DECL_FATTN_MMA_F16_CASE(112, 112, 8, 2); +DECL_FATTN_MMA_F16_CASE(128, 128, 8, 2); +DECL_FATTN_MMA_F16_CASE(256, 256, 8, 2); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu index 372641be..86d4ffae 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu @@ -2,9 +2,9 @@ #include "../fattn-mma-f16.cuh" -DECL_FATTN_MMA_F16_CASE(64, 8, 4); -DECL_FATTN_MMA_F16_CASE(80, 8, 4); -DECL_FATTN_MMA_F16_CASE(96, 8, 4); -DECL_FATTN_MMA_F16_CASE(112, 8, 4); -DECL_FATTN_MMA_F16_CASE(128, 8, 4); -DECL_FATTN_MMA_F16_CASE(256, 8, 4); +DECL_FATTN_MMA_F16_CASE(64, 64, 8, 4); +DECL_FATTN_MMA_F16_CASE(80, 80, 8, 4); +DECL_FATTN_MMA_F16_CASE(96, 96, 8, 4); +DECL_FATTN_MMA_F16_CASE(112, 112, 8, 4); +DECL_FATTN_MMA_F16_CASE(128, 128, 8, 4); +DECL_FATTN_MMA_F16_CASE(256, 256, 8, 4); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu index 9ff5968b..680a13ca 100644 --- 
a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu @@ -2,9 +2,9 @@ #include "../fattn-mma-f16.cuh" -DECL_FATTN_MMA_F16_CASE(64, 8, 8); -DECL_FATTN_MMA_F16_CASE(80, 8, 8); -DECL_FATTN_MMA_F16_CASE(96, 8, 8); -DECL_FATTN_MMA_F16_CASE(112, 8, 8); -DECL_FATTN_MMA_F16_CASE(128, 8, 8); -DECL_FATTN_MMA_F16_CASE(256, 8, 8); +DECL_FATTN_MMA_F16_CASE(64, 64, 8, 8); +DECL_FATTN_MMA_F16_CASE(80, 80, 8, 8); +DECL_FATTN_MMA_F16_CASE(96, 96, 8, 8); +DECL_FATTN_MMA_F16_CASE(112, 112, 8, 8); +DECL_FATTN_MMA_F16_CASE(128, 128, 8, 8); +DECL_FATTN_MMA_F16_CASE(256, 256, 8, 8); diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal index 223dc180..56fdb3cd 100644 --- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal @@ -2071,6 +2071,10 @@ typedef struct { float attn_factor; float beta_fast; float beta_slow; + int32_t sect_0; + int32_t sect_1; + int32_t sect_2; + int32_t sect_3; } ggml_metal_kargs_rope; typedef struct { @@ -2163,21 +2167,42 @@ typedef struct { } ggml_metal_kargs_mul_mv_ext; typedef struct { - int32_t nei0; - int32_t nei1; - uint64_t nbi1; + int32_t ne10; + int32_t ne11; // n_expert_used (bcast) + uint64_t nb11; + uint64_t nb12; + int32_t neh11; // n_tokens + uint64_t nbh11; + int32_t ne20; // n_expert_used + uint64_t nb21; +} ggml_metal_kargs_mul_mm_id_map0; + +typedef struct { + int32_t ne20; // n_expert_used + int32_t neh0; + int32_t neh1; + uint64_t nbh1; + uint64_t nbh2; + int32_t ne0; + uint64_t nb1; + uint64_t nb2; +} ggml_metal_kargs_mul_mm_id_map1; + +typedef struct { int32_t ne00; int32_t ne02; uint64_t nb01; uint64_t nb02; - int32_t ne11; - int32_t ne12; - int32_t ne13; - uint64_t nb10; - uint64_t nb11; - uint64_t nb12; - int32_t ne0; - int32_t ne1; + uint64_t nb03; + int32_t neh12; + uint64_t nbh10; + uint64_t nbh11; + uint64_t nbh12; + uint64_t nbh13; + int32_t neh0; + int32_t neh1; + int16_t r2; + int16_t r3; } ggml_metal_kargs_mul_mm_id; typedef struct { @@ -5166,8 +5191,148 @@ kernel void kernel_rope_neox( } } +template +kernel void kernel_rope_multi( + constant ggml_metal_kargs_rope & args, + device const char * src0, + device const char * src1, + device const char * src2, + device char * dst, + ushort tiitg[[thread_index_in_threadgroup]], + ushort3 tptg [[threads_per_threadgroup]], + uint3 tgpig[[threadgroup_position_in_grid]]) { + const int i3 = tgpig[2]; + const int i2 = tgpig[1]; + const int i1 = tgpig[0]; + + float corr_dims[2]; + rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims); + + device const int32_t * pos = (device const int32_t *) src1; + + const float inv_ndims = -1.f/args.n_dims; + + float cos_theta; + float sin_theta; + + for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) { + if (i0 < args.n_dims) { + const int ic = i0/2; + + // mrope theta calculations + // note: the rest is the same as kernel_rope_neox + const int sect_dims = args.sect_0 + args.sect_1 + args.sect_2 + args.sect_3; + const int sec_w01 = args.sect_0 + args.sect_1; // end of section 1 + const int sec_w012 = args.sect_0 + args.sect_1 + args.sect_2; // end of section 2 + const int sector = ic % sect_dims; + + float theta_base; + if (sector < args.sect_0) { + theta_base = (float) pos[i2]; + } else if (sector < sec_w01) { + theta_base = 
(float) pos[i2 + args.ne02]; + } else if (sector < sec_w012) { + theta_base = (float) pos[i2 + args.ne02 * 2]; + } else { + theta_base = (float) pos[i2 + args.ne02 * 3]; + } + // end of mrope + + const float theta = theta_base * pow(args.freq_base, inv_ndims*i0); + + const float freq_factor = src2 != src0 ? ((device const float *) src2)[ic] : 1.0f; + + rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta); + + device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00); + device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + ic*args.nb0); + + const float x0 = src[0]; + const float x1 = src[args.n_dims/2]; + + dst_data[0] = x0*cos_theta - x1*sin_theta; + dst_data[args.n_dims/2] = x0*sin_theta + x1*cos_theta; + } else { + device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00); + device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); + + dst_data[0] = src[0]; + dst_data[1] = src[1]; + } + } +} + +template +kernel void kernel_rope_vision( + constant ggml_metal_kargs_rope & args, + device const char * src0, + device const char * src1, + device const char * src2, + device char * dst, + ushort tiitg[[thread_index_in_threadgroup]], + ushort3 tptg [[threads_per_threadgroup]], + uint3 tgpig[[threadgroup_position_in_grid]]) { + const int i3 = tgpig[2]; + const int i2 = tgpig[1]; + const int i1 = tgpig[0]; + + float corr_dims[2]; + rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims); + + device const int32_t * pos = (device const int32_t *) src1; + + const float inv_ndims = -1.f/args.n_dims; + + float cos_theta; + float sin_theta; + + for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) { + if (i0 < 2*args.n_dims) { // different from kernel_rope_multi + const int ic = i0/2; + + // mrope theta calculations (only support 2 dimensions) + const int sect_dims = args.sect_0 + args.sect_1; + const int sector = ic % sect_dims; + + float p; + float theta_base; + if (sector < args.sect_1) { + p = (float) sector; + theta_base = (float) pos[i2]; + } else { + p = (float) sector - args.sect_0; + theta_base = (float) pos[i2 + args.ne02]; + } + + const float theta = theta_base * pow(args.freq_base, 2.0f * inv_ndims * p); + // end of mrope + + const float freq_factor = src2 != src0 ? 
((device const float *) src2)[ic] : 1.0f; + + rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta); + + device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00); + device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + ic*args.nb0); + + const float x0 = src[0]; + const float x1 = src[args.n_dims]; // different from kernel_rope_multi + + dst_data[0] = x0*cos_theta - x1*sin_theta; + dst_data[args.n_dims] = x0*sin_theta + x1*cos_theta; // different from kernel_rope_multi + } else { + device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00); + device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); + + dst_data[0] = src[0]; + dst_data[1] = src[1]; + } + } +} + typedef decltype(kernel_rope_norm) kernel_rope_norm_t; typedef decltype(kernel_rope_neox) kernel_rope_neox_t; +typedef decltype(kernel_rope_multi) kernel_rope_multi_t; +typedef decltype(kernel_rope_vision) kernel_rope_vision_t; template [[host_name("kernel_rope_norm_f32")]] kernel kernel_rope_norm_t kernel_rope_norm; template [[host_name("kernel_rope_norm_f16")]] kernel kernel_rope_norm_t kernel_rope_norm; @@ -5175,6 +5340,12 @@ template [[host_name("kernel_rope_norm_f16")]] kernel kernel_rope_norm_t kernel_ template [[host_name("kernel_rope_neox_f32")]] kernel kernel_rope_neox_t kernel_rope_neox; template [[host_name("kernel_rope_neox_f16")]] kernel kernel_rope_neox_t kernel_rope_neox; +template [[host_name("kernel_rope_multi_f32")]] kernel kernel_rope_multi_t kernel_rope_multi; +template [[host_name("kernel_rope_multi_f16")]] kernel kernel_rope_multi_t kernel_rope_multi; + +template [[host_name("kernel_rope_vision_f32")]] kernel kernel_rope_vision_t kernel_rope_vision; +template [[host_name("kernel_rope_vision_f16")]] kernel kernel_rope_vision_t kernel_rope_vision; + typedef void (im2col_t)( device const float * x, device char * dst, @@ -8834,127 +9005,219 @@ kernel void kernel_mul_mm( } } -// same as kernel_mul_mm_impl, but src1 and dst are accessed via indices stored in rowids -// TODO: this kernel needs to be reimplemented from scratch for better performance -template -void kernel_mul_mm_id_impl( - int32_t ne00, - int32_t ne02, - uint64_t nb01, - uint64_t nb02, - int32_t ne11, - int32_t ne12, - uint64_t nb10, - uint64_t nb11, - uint64_t nb12, - int32_t ne0, - int32_t ne1, - int64_t ne0ne1, - device const char * src0, - device const char * src1, - threadgroup ushort2 * rowids, - device char * dst, - threadgroup char * shmem, +template +kernel void kernel_mul_mm_id_map0( + constant ggml_metal_kargs_mul_mm_id_map0 & args, + device const char * src1, + device const char * src2, + device char * hsrc1, + device char * htpe, + device char * hids, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort3 tpitg[[thread_position_in_threadgroup]], + ushort3 ntg[[threads_per_threadgroup]]) { + const int ide = tgpig[0]; // expert id + + int n_all = 0; + + device int32_t * ids_i32 = (device int32_t *) (hids); + + for (int i21 = 0; i21 < args.neh11; i21++) { // n_tokens + device const int32_t * src2_i32 = (device const int32_t *) (src2 + i21*args.nb21); + + for (int i20 = 0; i20 < args.ne20; i20++) { // n_expert_used + if (src2_i32[i20] != ide) { + continue; + } + + device const float4 * src1_f32x4 = (device const float4 *) ( src1 + i21*args.nb12 + (i20%args.ne11)*args.nb11); + device T4 * hsrc1_f32x4 = (device 
T4 *) (hsrc1 + (ide*args.neh11 + n_all)*args.nbh11); + + for (int64_t i00 = tpitg.x; i00 < args.ne10/4; i00 += ntg.x) { + hsrc1_f32x4[i00] = (T4) (src1_f32x4[i00]); + } + + if (tpitg.x == 0) { + ids_i32[i21*args.ne20 + i20] = ide*args.neh11 + n_all; + } + + ++n_all; + } + } + + if (tpitg.x == 0) { + device int32_t * tpe_i32 = (device int32_t *) (htpe); + tpe_i32[ide] = n_all; + } +} + +typedef decltype(kernel_mul_mm_id_map0) kernel_mul_mm_id_map0_t; + +template [[host_name("kernel_mul_mm_id_map0_f16")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0; + +template +kernel void kernel_mul_mm_id_map1( + constant ggml_metal_kargs_mul_mm_id_map1 & args, + device const char * hdst, + device const char * hids, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort3 tpitg[[thread_position_in_threadgroup]], + ushort3 ntg[[threads_per_threadgroup]]) { + const int i20 = tgpig[0]; // used expert + const int i21 = tgpig[1]; // token + + device const int32_t * ids_i32 = (device const int32_t *) (hids); + device float4 * dst_f32x4 = (device float4 *) (dst + i20*args.nb1 + i21*args.nb2); + + const int id = ids_i32[i21*args.ne20 + i20]; + + const int ide = id / args.neh1; + const int idt = id % args.neh1; + + device const float4 * hdst_f32x4 = (device const float4 *) (hdst + idt*args.nbh1 + ide*args.nbh2); + + for (int64_t i0 = tpitg.x; i0 < args.neh0/4; i0 += ntg.x) { + dst_f32x4[i0] = hdst_f32x4[i0]; + } +} + +typedef decltype(kernel_mul_mm_id_map1) kernel_mul_mm_id_map1_t; + +template [[host_name("kernel_mul_mm_id_map1_f32")]] kernel kernel_mul_mm_id_map1_t kernel_mul_mm_id_map1; + +template +kernel void kernel_mul_mm_id( + constant ggml_metal_kargs_mul_mm_id & args, + device const char * src0, + device const char * src1, + device const char * tpe, + device char * dst, + threadgroup char * shmem [[threadgroup(0)]], uint3 tgpig[[threadgroup_position_in_grid]], ushort tiitg[[thread_index_in_threadgroup]], ushort sgitg[[simdgroup_index_in_threadgroup]]) { - threadgroup half * sa = (threadgroup half *)(shmem); - threadgroup float * sb = (threadgroup float *)(shmem + 4096); + threadgroup T * sa = (threadgroup T *)(shmem); + threadgroup half * sb = (threadgroup half *)(shmem + 4096); const int r0 = tgpig.y; const int r1 = tgpig.x; + const int im = tgpig.z; - if (r1*BLOCK_SIZE_N >= ne1) return; + device const int32_t * tpe_i32 = (device const int32_t *) (tpe); + + const int neh1 = tpe_i32[im]; + + if (r1*BLOCK_SIZE_N >= neh1) { + return; + } // if this block is of 64x32 shape or smaller - short n_rows = (ne0 - r0 * BLOCK_SIZE_M < BLOCK_SIZE_M) ? (ne0 - r0 * BLOCK_SIZE_M) : BLOCK_SIZE_M; - short n_cols = (ne1 - r1 * BLOCK_SIZE_N < BLOCK_SIZE_N) ? (ne1 - r1 * BLOCK_SIZE_N) : BLOCK_SIZE_N; + const short n_rows = (args.neh0 - r0*BLOCK_SIZE_M < BLOCK_SIZE_M) ? (args.neh0 - r0*BLOCK_SIZE_M) : BLOCK_SIZE_M; + const short n_cols = ( neh1 - r1*BLOCK_SIZE_N < BLOCK_SIZE_N) ? ( neh1 - r1*BLOCK_SIZE_N) : BLOCK_SIZE_N; // a thread shouldn't load data outside of the matrix - short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1; - short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1; + const short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1; + const short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? 
((short)tiitg/THREAD_PER_COL) : n_cols - 1; - simdgroup_half8x8 ma[4]; - simdgroup_float8x8 mb[2]; + simdgroup_T8x8 ma[4]; + simdgroup_half8x8 mb[2]; simdgroup_float8x8 mc[8]; - for (int i = 0; i < 8; i++){ + + for (short i = 0; i < 8; i++){ mc[i] = make_filled_simdgroup_matrix(0.f); } + short il = (tiitg % THREAD_PER_ROW); - ushort offset1 = il/nl; + const int i12 = im%args.neh12; + const int i13 = im/args.neh12; - threadgroup const auto & id = rowids[r1 * BLOCK_SIZE_N + thread_col]; + const uint64_t offset0 = (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03; + const short offset1 = il/nl; - device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01) + offset1; - device const float * y = (device const float *)(src1 - + nb12 * id[1] - + nb11 * (id[0] % ne11) - + nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL))); + device const block_q * x = (device const block_q *)(src0 + + args.nb01*(r0*BLOCK_SIZE_M + thread_row) + offset0) + offset1; - for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) { + device const half * y = (device const half *)(src1 + + args.nbh13*i13 + + args.nbh12*i12 + + args.nbh11*(r1*BLOCK_SIZE_N + thread_col) + + args.nbh10*(BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL))); + + for (int loop_k = 0; loop_k < args.ne00; loop_k += BLOCK_SIZE_K) { // load data and store to threadgroup memory - half4x4 temp_a; + T4x4 temp_a; dequantize_func(x, il, temp_a); + threadgroup_barrier(mem_flags::mem_threadgroup); - for (int i = 0; i < 16; i++) { - *(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \ - + (tiitg % THREAD_PER_ROW) * 16 + (i / 8) * 8) \ - + (tiitg / THREAD_PER_ROW) % 8 + (i & 7) * 8) = temp_a[i/4][i%4]; + #pragma unroll(16) + for (short i = 0; i < 16; i++) { + *(sa + SG_MAT_SIZE * ((tiitg/THREAD_PER_ROW/8) \ + + (tiitg%THREAD_PER_ROW)*16 + (i/8)*8) \ + + (tiitg/THREAD_PER_ROW)%8 + (i&7)*8) = temp_a[i/4][i%4]; } - *(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) = *((device float2x4 *)y); + *(threadgroup half2x4 *)(sb + 32*8*(tiitg%THREAD_PER_COL) + 8*(tiitg/THREAD_PER_COL)) = *((device half2x4 *) y); il = (il + 2 < nl) ? il + 2 : il % 2; - x = (il < 2) ? x + (2+nl-1)/nl : x; + x = (il < 2) ? 
x + (2 + nl - 1)/nl : x; y += BLOCK_SIZE_K; threadgroup_barrier(mem_flags::mem_threadgroup); // load matrices from threadgroup memory and conduct outer products - threadgroup half * lsma = (sa + THREAD_MAT_M * SG_MAT_SIZE * (sgitg % 2)); - threadgroup float * lsmb = (sb + THREAD_MAT_N * SG_MAT_SIZE * (sgitg / 2)); + threadgroup const T * lsma = (sa + THREAD_MAT_M*SG_MAT_SIZE*(sgitg%2)); + threadgroup const half * lsmb = (sb + THREAD_MAT_N*SG_MAT_SIZE*(sgitg/2)); - #pragma unroll(BLOCK_SIZE_K/8) - for (int ik = 0; ik < BLOCK_SIZE_K / 8; ik++) { + #pragma unroll(4) + for (short ik = 0; ik < BLOCK_SIZE_K/8; ik++) { #pragma unroll(4) - for (int i = 0; i < 4; i++) { + for (short i = 0; i < 4; i++) { simdgroup_load(ma[i], lsma + SG_MAT_SIZE * i); } + simdgroup_barrier(mem_flags::mem_none); + #pragma unroll(2) - for (int i = 0; i < 2; i++) { + for (short i = 0; i < 2; i++) { simdgroup_load(mb[i], lsmb + SG_MAT_SIZE * i); } - lsma += BLOCK_SIZE_M / SG_MAT_ROW * SG_MAT_SIZE; - lsmb += BLOCK_SIZE_N / SG_MAT_ROW * SG_MAT_SIZE; - #pragma unroll(8) - for (int i = 0; i < 8; i++){ + for (short i = 0; i < 8; i++){ simdgroup_multiply_accumulate(mc[i], mb[i/4], ma[i%4], mc[i]); } + + lsma += (BLOCK_SIZE_M/SG_MAT_ROW)*SG_MAT_SIZE; + lsmb += (BLOCK_SIZE_N/SG_MAT_ROW)*SG_MAT_SIZE; } } - { + if ((r0 + 1) * BLOCK_SIZE_M <= args.neh0 && (r1 + 1) * BLOCK_SIZE_N <= neh1) { + device float * C = (device float *) dst + + (BLOCK_SIZE_M * r0 + 32*(sgitg & 1)) + \ + (BLOCK_SIZE_N * r1 + 16*(sgitg >> 1)) * args.neh0 + im*args.neh1*args.neh0; + + for (short i = 0; i < 8; i++) { + simdgroup_store(mc[i], C + 8 * (i%4) + 8 * args.neh0 * (i/4), args.neh0); + } + } else { + // block is smaller than 64x32, we should avoid writing data outside of the matrix threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup float * temp_str = ((threadgroup float *) shmem) \ - + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M; - for (int i = 0; i < 8; i++) { - simdgroup_store(mc[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M); + + 32*(sgitg&1) + (16*(sgitg >> 1))*BLOCK_SIZE_M; + for (short i = 0; i < 8; i++) { + simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*BLOCK_SIZE_M*(i/4), BLOCK_SIZE_M); } threadgroup_barrier(mem_flags::mem_threadgroup); if (sgitg == 0) { for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) { - threadgroup const auto & jid = rowids[r1 * BLOCK_SIZE_N + j]; - int64_t joff = jid[0]*ne0 + jid[1]*ne0ne1; - - device float * D = (device float *) dst + (r0*BLOCK_SIZE_M) + joff; + device float * D = (device float *) dst + (r0*BLOCK_SIZE_M) + (r1*BLOCK_SIZE_N + j)*args.neh0 + im*args.neh1*args.neh0; device float4 * D4 = (device float4 *) D; threadgroup float * C = temp_str + (j*BLOCK_SIZE_M); @@ -8974,66 +9237,6 @@ void kernel_mul_mm_id_impl( } } -template -kernel void kernel_mul_mm_id( - constant ggml_metal_kargs_mul_mm_id & args, - device const char * src0s, - device const char * src1, - device char * dst, - device const char * ids, - threadgroup char * shmem [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiitg[[thread_index_in_threadgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]]) { - - const int32_t i02 = tgpig.z; - - tgpig.z = 0; - - device const char * src0 = src0s + i02*args.nb02; - - // row indices - threadgroup ushort2 * rowids = (threadgroup ushort2 *)(shmem + 8192); - - // TODO: parallelize this loop - int32_t _ne1 = 0; - for (ushort ii1 = 0; ii1 < args.nei1; ii1++) { - for (ushort ii0 = 0; ii0 < args.nei0; ii0++) { - int32_t id = ((device int32_t *) (ids + 
ii1*args.nbi1))[ii0]; - if (id == i02) { - if (tiitg == 0) { - rowids[_ne1] = ushort2(ii0, ii1); - } - _ne1++; - } - } - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - - kernel_mul_mm_id_impl( - args.ne00, - args.ne02, - args.nb01, - args.nb02, - args.ne11, - args.ne12, - args.nb10, - args.nb11, - args.nb12, - args.ne0, - _ne1, - (int64_t)args.ne0*args.ne1, - src0, - src1, - rowids, - dst, - shmem, - tgpig, - tiitg, - sgitg); -} - #define QK_NL 16 // @@ -9074,63 +9277,64 @@ template [[host_name("kernel_get_rows_iq4_xs")]] kernel get_rows_q_t kernel_get // matrix-matrix multiplication // -typedef decltype(kernel_mul_mm) mat_mm_t; +typedef decltype(kernel_mul_mm) mul_mm_t; -template [[host_name("kernel_mul_mm_f32_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_f16_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_f32_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_f16_f32")]] kernel mul_mm_t kernel_mul_mm; #if defined(GGML_METAL_USE_BF16) -template [[host_name("kernel_mul_mm_bf16_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_bf16_f32")]] kernel mul_mm_t kernel_mul_mm; #endif -template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q5_0_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q5_1_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q2_K_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q4_K_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q5_K_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq3_s_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq2_s_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq1_m_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q5_0_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q5_1_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q2_K_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q4_K_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q5_K_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel 
mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq3_s_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq2_s_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq1_m_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mul_mm_t kernel_mul_mm; // // indirect matrix-matrix multiplication // -typedef decltype(kernel_mul_mm_id) mat_mm_id_t; +typedef decltype(kernel_mul_mm_id) mul_mm_id; -template [[host_name("kernel_mul_mm_id_f32_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_f16_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_f32_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_f16_f16")]] kernel mul_mm_id kernel_mul_mm_id; #if defined(GGML_METAL_USE_BF16) -template [[host_name("kernel_mul_mm_id_bf16_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_bf16_f16")]] kernel mul_mm_id kernel_mul_mm_id; #endif -template [[host_name("kernel_mul_mm_id_q4_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q4_1_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q5_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q5_1_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q8_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q2_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q3_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q4_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q5_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q6_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq3_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq2_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq1_m_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q4_0_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q4_1_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q5_0_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q5_1_f16")]] kernel mul_mm_id 
kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q8_0_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q2_K_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q3_K_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q4_K_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q5_K_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q6_K_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq2_xxs_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq2_xs_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq3_xxs_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq3_s_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq2_s_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq1_s_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq1_m_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq4_nl_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq4_xs_f16")]] kernel mul_mm_id kernel_mul_mm_id; + // // matrix-vector multiplication diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-impl.h b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-impl.h index 8721b272..17eab976 100644 --- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-impl.h +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-impl.h @@ -207,6 +207,10 @@ typedef struct { float attn_factor; float beta_fast; float beta_slow; + int32_t sect_0; + int32_t sect_1; + int32_t sect_2; + int32_t sect_3; } ggml_metal_kargs_rope; typedef struct { @@ -299,21 +303,42 @@ typedef struct { } ggml_metal_kargs_mul_mv_ext; typedef struct { - int32_t nei0; - int32_t nei1; - uint64_t nbi1; + int32_t ne10; + int32_t ne11; // n_expert_used (bcast) + uint64_t nb11; + uint64_t nb12; + int32_t neh11; // n_tokens + uint64_t nbh11; + int32_t ne20; // n_expert_used + uint64_t nb21; +} ggml_metal_kargs_mul_mm_id_map0; + +typedef struct { + int32_t ne20; // n_expert_used + int32_t neh0; + int32_t neh1; + uint64_t nbh1; + uint64_t nbh2; + int32_t ne0; + uint64_t nb1; + uint64_t nb2; +} ggml_metal_kargs_mul_mm_id_map1; + +typedef struct { int32_t ne00; int32_t ne02; uint64_t nb01; uint64_t nb02; - int32_t ne11; - int32_t ne12; - int32_t ne13; - uint64_t nb10; - uint64_t nb11; - uint64_t nb12; - int32_t ne0; - int32_t ne1; + uint64_t nb03; + int32_t neh12; + uint64_t nbh10; + uint64_t nbh11; + uint64_t nbh12; + uint64_t nbh13; + int32_t neh0; + int32_t neh1; + int16_t r2; + int16_t r3; } ggml_metal_kargs_mul_mm_id; typedef struct { diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m index 112abef6..7641247e 100644 --- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m @@ -306,30 +306,36 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32, - 
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP1_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F16, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F16, GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32, GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16, + GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32, + GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F16, + GGML_METAL_KERNEL_TYPE_ROPE_VISION_F32, + GGML_METAL_KERNEL_TYPE_ROPE_VISION_F16, GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32, GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16, GGML_METAL_KERNEL_TYPE_IM2COL_F16, @@ -651,7 +657,8 @@ static void ggml_metal_mem_pool_reset(struct ggml_metal_mem_pool * mem_pool) { } if (mem_pool->heaps_to_remove.count > 0) { - for (NSUInteger i = 0; i < [mem_pool->heaps_to_remove count]; i++) { + // remove in reverse order + for (NSUInteger i = [mem_pool->heaps_to_remove count] - 1; ; --i) { NSUInteger index = [[mem_pool->heaps_to_remove objectAtIndex:i] intValue]; ggml_metal_heap_ptr * ptr = [mem_pool->heaps objectAtIndex:index]; @@ -660,6 +667,10 @@ static void ggml_metal_mem_pool_reset(struct ggml_metal_mem_pool * mem_pool) { [mem_pool->heaps removeObjectAtIndex:index]; [ptr release]; + + if (i == 0) { + break; + } } [mem_pool->heaps_to_remove removeAllObjects]; @@ -673,7 +684,7 @@ static void ggml_metal_mem_pool_clear(struct ggml_metal_mem_pool * mem_pool) { } static id ggml_metal_mem_pool_alloc(struct ggml_metal_mem_pool * mem_pool, size_t size) { - const size_t alignment = 32; + const size_t alignment = 256; const size_t size_aligned = GGML_PAD(size, alignment); @@ -1243,30 +1254,36 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32, mul_mm_iq1_m_f32, has_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, mul_mm_iq4_nl_f32, has_simdgroup_mm); 
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32, mul_mm_iq4_xs_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, mul_mm_id_f32_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32, mul_mm_id_f16_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F32, mul_mm_id_bf16_f32, has_simdgroup_mm && use_bfloat); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32, mul_mm_id_q4_0_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32, mul_mm_id_q4_1_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32, mul_mm_id_q5_0_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32, mul_mm_id_q5_1_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32, mul_mm_id_q8_0_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32, mul_mm_id_q2_K_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32, mul_mm_id_q3_K_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32, mul_mm_id_q4_K_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32, mul_mm_id_q5_K_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32, mul_mm_id_q6_K_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32, mul_mm_id_iq2_xxs_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, mul_mm_id_iq2_xs_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, mul_mm_id_iq3_xxs_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32, mul_mm_id_iq3_s_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32, mul_mm_id_iq2_s_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, mul_mm_id_iq1_s_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32, mul_mm_id_iq1_m_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, mul_mm_id_iq4_nl_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32, mul_mm_id_iq4_xs_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16, mul_mm_id_map0_f16, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP1_F32, mul_mm_id_map1_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F16, mul_mm_id_f32_f16, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F16, mul_mm_id_f16_f16, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F16, mul_mm_id_bf16_f16, has_simdgroup_mm && use_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F16, mul_mm_id_q4_0_f16, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F16, mul_mm_id_q4_1_f16, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F16, mul_mm_id_q5_0_f16, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F16, mul_mm_id_q5_1_f16, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F16, mul_mm_id_q8_0_f16, has_simdgroup_mm); + 
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F16, mul_mm_id_q2_K_f16, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F16, mul_mm_id_q3_K_f16, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F16, mul_mm_id_q4_K_f16, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F16, mul_mm_id_q5_K_f16, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F16, mul_mm_id_q6_K_f16, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F16, mul_mm_id_iq2_xxs_f16, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F16, mul_mm_id_iq2_xs_f16, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F16, mul_mm_id_iq3_xxs_f16, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F16, mul_mm_id_iq3_s_f16, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F16, mul_mm_id_iq2_s_f16, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F16, mul_mm_id_iq1_s_f16, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F16, mul_mm_id_iq1_m_f16, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F16, mul_mm_id_iq4_nl_f16, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F16, mul_mm_id_iq4_xs_f16, has_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32, rope_norm_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16, rope_norm_f16, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32, rope_multi_f32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F16, rope_multi_f16, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_VISION_F32, rope_vision_f32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_VISION_F16, rope_vision_f16, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32, rope_neox_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16, rope_neox_f16, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F16, im2col_f16, true); @@ -1630,16 +1647,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex case GGML_OP_NORM: return has_simdgroup_reduction && (op->ne[0] % 4 == 0 && ggml_is_contiguous_1(op->src[0])); case GGML_OP_ROPE: - { - const int mode = ((const int32_t *) op->op_params)[2]; - if (mode & GGML_ROPE_TYPE_MROPE) { - return false; - } - if (mode & GGML_ROPE_TYPE_VISION) { - return false; - } - return true; - } + return true; case GGML_OP_IM2COL: return op->src[0]->type == GGML_TYPE_F16; case GGML_OP_POOL_1D: @@ -3002,7 +3010,7 @@ static bool ggml_metal_encode_node( [encoder setBuffer:id_dst offset:offs_dst atIndex:3]; [encoder setThreadgroupMemoryLength:8192 atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne01 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake((ne11 + 31)/32, (ne01 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; } else { id pipeline = nil; @@ -3222,8 +3230,6 @@ static bool ggml_metal_encode_node( } break; case GGML_OP_MUL_MAT_ID: { - const int n_as = src0->ne[2]; - // src2 = ids const enum ggml_type src2t = src2->type; GGML_UNUSED(src2t); @@ -3237,24 +3243,21 @@ static bool ggml_metal_encode_node( GGML_ASSERT(ne03 == 1); 
GGML_ASSERT(ne13 == 1); + const uint32_t r2 = 1; + const uint32_t r3 = 1; + // find the break-even point where the matrix-matrix kernel becomes more efficient compared // to the matrix-vector kernel // ne20 = n_used_experts - // ne21 = n_rows - const int dst_rows = ne20*ne21; - const int dst_rows_min = n_as; - const int dst_rows_max = (device.maxThreadgroupMemoryLength/2 - 8192)/4; - - // max size of the rowids array in the kernel shared buffer - //GGML_ASSERT(dst_rows <= dst_rows_max); + // ne21 = n_rows (batch size) + const int ne21_mm_id_min = 32; // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel if ([device supportsFamily:MTLGPUFamilyApple7] && ne00 % 32 == 0 && ne00 >= 64 && - //ne01 / ne02 >= 512 && // NOTE: this is based on Mixtral shapes, might need adjustments - dst_rows > dst_rows_min && - dst_rows <= dst_rows_max) { + (ne21 >= ne21_mm_id_min)) { + GGML_ASSERT(ne00 % 4 == 0); // some Metal matrix data types require aligned pointers // ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5) @@ -3265,62 +3268,169 @@ static bool ggml_metal_encode_node( default: break; } - id pipeline = nil; + const int64_t neh10 = ne10; // n_embd + const int64_t neh11 = ne21; // n_tokens + const int64_t neh12 = ne02; // n_expert - switch (src0->type) { - case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32 ].pipeline; break; - case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32 ].pipeline; break; - case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F32 ].pipeline; break; - case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32 ].pipeline; break; - case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32 ].pipeline; break; - case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32 ].pipeline; break; - case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32 ].pipeline; break; - case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32 ].pipeline; break; - case GGML_TYPE_Q2_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32 ].pipeline; break; - case GGML_TYPE_Q3_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32 ].pipeline; break; - case GGML_TYPE_Q4_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32 ].pipeline; break; - case GGML_TYPE_Q5_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32 ].pipeline; break; - case GGML_TYPE_Q6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32 ].pipeline; break; - case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32].pipeline; break; - case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break; - case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32].pipeline; break; - case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32 ].pipeline; break; - case GGML_TYPE_IQ2_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32 ].pipeline; break; - case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32 ].pipeline; break; - case GGML_TYPE_IQ1_M: pipeline = 
ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32 ].pipeline; break; - case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32 ].pipeline; break; - case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32 ].pipeline; break; - default: GGML_ABORT("MUL_MAT_ID not implemented"); + const uint64_t nbh10 = ggml_type_size(GGML_TYPE_F16); + const uint64_t nbh11 = nbh10*neh10; + const uint64_t nbh12 = nbh11*neh11; + const uint64_t nbh13 = nbh12*neh12; + + const size_t s_src1 = ggml_type_size(GGML_TYPE_F16)*neh10*neh11*neh12; + id h_src1 = ggml_metal_mem_pool_alloc(mem_pool, s_src1); + if (!h_src1) { + GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_src1); + return false; } - ggml_metal_kargs_mul_mm_id args = { - /*.nei0 =*/ ne20, - /*.nei1 =*/ ne21, - /*.nbi1 =*/ nb21, - /*.ne00 =*/ ne00, - /*.ne02 =*/ ne02, - /*.nb01 =*/ nb01, - /*.nb02 =*/ nb02, - /*.ne11 =*/ ne11, - /*.ne12 =*/ ne12, - /*.ne13 =*/ ne13, - /*.nb10 =*/ nb10, - /*.nb11 =*/ nb11, - /*.nb12 =*/ nb12, - /*.ne0 =*/ ne0, - /*.ne1 =*/ ne1, - }; + const int64_t neh0 = ne0; + const int64_t neh1 = ne21; + const int64_t neh2 = ne02; - [encoder setComputePipelineState:pipeline]; - [encoder setBytes:&args length:sizeof(args) atIndex:0]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:3]; - [encoder setBuffer:id_src2 offset:offs_src2 atIndex:4]; + const uint64_t nbh0 = ggml_type_size(GGML_TYPE_F32); + const uint64_t nbh1 = nbh0*neh0; + const uint64_t nbh2 = nbh1*neh1; + //const uint64_t nbh3 = nbh2*neh2; - [encoder setThreadgroupMemoryLength:GGML_PAD(8192 + dst_rows*4/*sizeof(ushort2)*/, 16) atIndex:0]; + const size_t s_dst = ggml_type_size(GGML_TYPE_F32)*neh0*neh1*neh2; + id h_dst = ggml_metal_mem_pool_alloc(mem_pool, s_dst); + if (!h_dst) { + GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_dst); + return false; + } - [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 31)/32, (ne01 + 63)/64, n_as) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; + // tokens per expert + const size_t s_tpe = ggml_type_size(GGML_TYPE_I32)*ne02; + id h_tpe = ggml_metal_mem_pool_alloc(mem_pool, s_tpe); + if (!h_tpe) { + GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_tpe); + return false; + } + + // id map + // [n_expert_used, n_tokens] + const size_t s_ids = ggml_type_size(GGML_TYPE_I32)*ne20*ne21; + id h_ids = ggml_metal_mem_pool_alloc(mem_pool, s_ids); + if (!h_ids) { + GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_ids); + return false; + } + + { + const int nth = MIN(1024, ne10/4); + + ggml_metal_kargs_mul_mm_id_map0 args = { + ne10, + ne11, // n_expert_used (bcast) + nb11, + nb12, + neh11, // n_tokens + nbh11, + ne20, // n_expert_used + nb21, + }; + + id pipeline = nil; + + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBytes:&args length:sizeof(args) atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_src2 offset:offs_src2 atIndex:2]; + [encoder setBuffer: h_src1 offset:0 atIndex:3]; + [encoder setBuffer: h_tpe offset:0 atIndex:4]; + [encoder setBuffer: h_ids offset:0 atIndex:5]; + + [encoder dispatchThreadgroups:MTLSizeMake(ne02, 1, 1) 
threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } + + { + id pipeline = nil; + + switch (src0->type) { + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F16 ].pipeline; break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F16 ].pipeline; break; + case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F16 ].pipeline; break; + case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F16 ].pipeline; break; + case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F16 ].pipeline; break; + case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F16 ].pipeline; break; + case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F16 ].pipeline; break; + case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F16 ].pipeline; break; + case GGML_TYPE_Q2_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F16 ].pipeline; break; + case GGML_TYPE_Q3_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F16 ].pipeline; break; + case GGML_TYPE_Q4_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F16 ].pipeline; break; + case GGML_TYPE_Q5_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F16 ].pipeline; break; + case GGML_TYPE_Q6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F16 ].pipeline; break; + case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F16].pipeline; break; + case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F16 ].pipeline; break; + case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F16].pipeline; break; + case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F16 ].pipeline; break; + case GGML_TYPE_IQ2_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F16 ].pipeline; break; + case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F16 ].pipeline; break; + case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F16 ].pipeline; break; + case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F16 ].pipeline; break; + case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F16 ].pipeline; break; + default: GGML_ABORT("MUL_MAT_ID not implemented"); + } + + ggml_metal_kargs_mul_mm_id args = { + /*.ne00 =*/ ne00, + /*.ne02 =*/ ne02, + /*.nb01 =*/ nb01, + /*.nb02 =*/ nb02, + /*.nb03 =*/ nb03, + /*.neh12 =*/ neh12, + /*.nbh10 =*/ nbh10, + /*.nbh11 =*/ nbh11, + /*.nbh12 =*/ nbh12, + /*.nbh13 =*/ nbh13, + /*.neh0 =*/ neh0, + /*.neh1 =*/ neh1, + /*.r2 =*/ r2, + /*.r3 =*/ r3, + }; + + [encoder setComputePipelineState:pipeline]; + [encoder setBytes:&args length:sizeof(args) atIndex:0]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; + [encoder setBuffer: h_src1 offset:0 atIndex:2]; + [encoder setBuffer: h_tpe offset:0 atIndex:3]; + [encoder setBuffer: h_dst offset:0 atIndex:4]; + + [encoder setThreadgroupMemoryLength:8192 atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 31)/32, (ne01 + 63)/64, ne02) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; + } + + { + GGML_ASSERT(ne0 % 4 == 0); + + const int nth = MIN(1024, ne0/4); + + ggml_metal_kargs_mul_mm_id_map1 args = { + ne20, // n_expert_used + neh0, + neh1, + 
nbh1, + nbh2, + ne0, + nb1, + nb2, + }; + + id pipeline = nil; + + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP1_F32].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBytes:&args length:sizeof(args) atIndex:0]; + [encoder setBuffer: h_dst offset:0 atIndex:1]; + [encoder setBuffer: h_ids offset:0 atIndex:2]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:3]; + + [encoder dispatchThreadgroups:MTLSizeMake(ne20, ne21, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } } else { id pipeline = nil; @@ -3514,7 +3624,7 @@ static bool ggml_metal_encode_node( [encoder setBuffer:id_src2 offset:offs_src2 atIndex:4]; const int64_t _ne1 = 1; - const int64_t ne123 = dst_rows; + const int64_t ne123 = ne20*ne21; if (smem > 0) { [encoder setThreadgroupMemoryLength:smem atIndex:0]; @@ -3718,6 +3828,7 @@ static bool ggml_metal_encode_node( } break; case GGML_OP_ROPE: { + // make sure we have one or more position id(ne10) per token(ne02) GGML_ASSERT(ne10 % ne02 == 0); GGML_ASSERT(ne10 >= ne02); @@ -3744,20 +3855,42 @@ static bool ggml_metal_encode_node( memcpy(&beta_fast, (const int32_t *) dst->op_params + 9, sizeof(float)); memcpy(&beta_slow, (const int32_t *) dst->op_params + 10, sizeof(float)); - const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; + const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; + const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE; + const bool is_vision = mode == GGML_ROPE_TYPE_VISION; + + // mrope + const int sect_0 = ((const int32_t *) dst->op_params)[11]; + const int sect_1 = ((const int32_t *) dst->op_params)[12]; + const int sect_2 = ((const int32_t *) dst->op_params)[13]; + const int sect_3 = ((const int32_t *) dst->op_params)[14]; id pipeline = nil; - if (!is_neox) { + if (is_neox) { switch (src0->type) { - case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break; - case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break; + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break; + default: GGML_ABORT("fatal error"); + }; + } else if (is_mrope && !is_vision) { + GGML_ASSERT(ne10*4 >= ne02); // need at least 4 pos per token + switch (src0->type) { + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32].pipeline; break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F16].pipeline; break; + default: GGML_ABORT("fatal error"); + }; + } else if (is_vision) { + GGML_ASSERT(ne10*4 >= ne02); // need at least 4 pos per token + switch (src0->type) { + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_VISION_F32].pipeline; break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_VISION_F16].pipeline; break; default: GGML_ABORT("fatal error"); }; } else { switch (src0->type) { - case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break; - case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break; + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break; default: GGML_ABORT("fatal error"); }; } @@ -3788,6 +3921,10 @@ static bool ggml_metal_encode_node( /*.attn_factor =*/ attn_factor, /*.beta_fast =*/ 
beta_fast, /*.beta_slow =*/ beta_slow, + /* sect_0 =*/ sect_0, + /* sect_1 =*/ sect_1, + /* sect_2 =*/ sect_2, + /* sect_3 =*/ sect_3, }; [encoder setComputePipelineState:pipeline]; diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal index 6ceb3cef..080a943b 100644 --- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal @@ -2713,8 +2713,148 @@ kernel void kernel_rope_neox( } } +template +kernel void kernel_rope_multi( + constant ggml_metal_kargs_rope & args, + device const char * src0, + device const char * src1, + device const char * src2, + device char * dst, + ushort tiitg[[thread_index_in_threadgroup]], + ushort3 tptg [[threads_per_threadgroup]], + uint3 tgpig[[threadgroup_position_in_grid]]) { + const int i3 = tgpig[2]; + const int i2 = tgpig[1]; + const int i1 = tgpig[0]; + + float corr_dims[2]; + rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims); + + device const int32_t * pos = (device const int32_t *) src1; + + const float inv_ndims = -1.f/args.n_dims; + + float cos_theta; + float sin_theta; + + for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) { + if (i0 < args.n_dims) { + const int ic = i0/2; + + // mrope theta calculations + // note: the rest is the same as kernel_rope_neox + const int sect_dims = args.sect_0 + args.sect_1 + args.sect_2 + args.sect_3; + const int sec_w01 = args.sect_0 + args.sect_1; // end of section 1 + const int sec_w012 = args.sect_0 + args.sect_1 + args.sect_2; // end of section 2 + const int sector = ic % sect_dims; + + float theta_base; + if (sector < args.sect_0) { + theta_base = (float) pos[i2]; + } else if (sector < sec_w01) { + theta_base = (float) pos[i2 + args.ne02]; + } else if (sector < sec_w012) { + theta_base = (float) pos[i2 + args.ne02 * 2]; + } else { + theta_base = (float) pos[i2 + args.ne02 * 3]; + } + // end of mrope + + const float theta = theta_base * pow(args.freq_base, inv_ndims*i0); + + const float freq_factor = src2 != src0 ? 
((device const float *) src2)[ic] : 1.0f; + + rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta); + + device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00); + device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + ic*args.nb0); + + const float x0 = src[0]; + const float x1 = src[args.n_dims/2]; + + dst_data[0] = x0*cos_theta - x1*sin_theta; + dst_data[args.n_dims/2] = x0*sin_theta + x1*cos_theta; + } else { + device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00); + device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); + + dst_data[0] = src[0]; + dst_data[1] = src[1]; + } + } +} + +template +kernel void kernel_rope_vision( + constant ggml_metal_kargs_rope & args, + device const char * src0, + device const char * src1, + device const char * src2, + device char * dst, + ushort tiitg[[thread_index_in_threadgroup]], + ushort3 tptg [[threads_per_threadgroup]], + uint3 tgpig[[threadgroup_position_in_grid]]) { + const int i3 = tgpig[2]; + const int i2 = tgpig[1]; + const int i1 = tgpig[0]; + + float corr_dims[2]; + rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims); + + device const int32_t * pos = (device const int32_t *) src1; + + const float inv_ndims = -1.f/args.n_dims; + + float cos_theta; + float sin_theta; + + for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) { + if (i0 < 2*args.n_dims) { // different from kernel_rope_multi + const int ic = i0/2; + + // mrope theta calculations (only support 2 dimensions) + const int sect_dims = args.sect_0 + args.sect_1; + const int sector = ic % sect_dims; + + float p; + float theta_base; + if (sector < args.sect_1) { + p = (float) sector; + theta_base = (float) pos[i2]; + } else { + p = (float) sector - args.sect_0; + theta_base = (float) pos[i2 + args.ne02]; + } + + const float theta = theta_base * pow(args.freq_base, 2.0f * inv_ndims * p); + // end of mrope + + const float freq_factor = src2 != src0 ? 
((device const float *) src2)[ic] : 1.0f; + + rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta); + + device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00); + device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + ic*args.nb0); + + const float x0 = src[0]; + const float x1 = src[args.n_dims]; // different from kernel_rope_multi + + dst_data[0] = x0*cos_theta - x1*sin_theta; + dst_data[args.n_dims] = x0*sin_theta + x1*cos_theta; // different from kernel_rope_multi + } else { + device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00); + device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); + + dst_data[0] = src[0]; + dst_data[1] = src[1]; + } + } +} + typedef decltype(kernel_rope_norm) kernel_rope_norm_t; typedef decltype(kernel_rope_neox) kernel_rope_neox_t; +typedef decltype(kernel_rope_multi) kernel_rope_multi_t; +typedef decltype(kernel_rope_vision) kernel_rope_vision_t; template [[host_name("kernel_rope_norm_f32")]] kernel kernel_rope_norm_t kernel_rope_norm; template [[host_name("kernel_rope_norm_f16")]] kernel kernel_rope_norm_t kernel_rope_norm; @@ -2722,6 +2862,12 @@ template [[host_name("kernel_rope_norm_f16")]] kernel kernel_rope_norm_t kernel_ template [[host_name("kernel_rope_neox_f32")]] kernel kernel_rope_neox_t kernel_rope_neox; template [[host_name("kernel_rope_neox_f16")]] kernel kernel_rope_neox_t kernel_rope_neox; +template [[host_name("kernel_rope_multi_f32")]] kernel kernel_rope_multi_t kernel_rope_multi; +template [[host_name("kernel_rope_multi_f16")]] kernel kernel_rope_multi_t kernel_rope_multi; + +template [[host_name("kernel_rope_vision_f32")]] kernel kernel_rope_vision_t kernel_rope_vision; +template [[host_name("kernel_rope_vision_f16")]] kernel kernel_rope_vision_t kernel_rope_vision; + typedef void (im2col_t)( device const float * x, device char * dst, @@ -6381,127 +6527,219 @@ kernel void kernel_mul_mm( } } -// same as kernel_mul_mm_impl, but src1 and dst are accessed via indices stored in rowids -// TODO: this kernel needs to be reimplemented from scratch for better performance -template -void kernel_mul_mm_id_impl( - int32_t ne00, - int32_t ne02, - uint64_t nb01, - uint64_t nb02, - int32_t ne11, - int32_t ne12, - uint64_t nb10, - uint64_t nb11, - uint64_t nb12, - int32_t ne0, - int32_t ne1, - int64_t ne0ne1, - device const char * src0, - device const char * src1, - threadgroup ushort2 * rowids, - device char * dst, - threadgroup char * shmem, +template +kernel void kernel_mul_mm_id_map0( + constant ggml_metal_kargs_mul_mm_id_map0 & args, + device const char * src1, + device const char * src2, + device char * hsrc1, + device char * htpe, + device char * hids, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort3 tpitg[[thread_position_in_threadgroup]], + ushort3 ntg[[threads_per_threadgroup]]) { + const int ide = tgpig[0]; // expert id + + int n_all = 0; + + device int32_t * ids_i32 = (device int32_t *) (hids); + + for (int i21 = 0; i21 < args.neh11; i21++) { // n_tokens + device const int32_t * src2_i32 = (device const int32_t *) (src2 + i21*args.nb21); + + for (int i20 = 0; i20 < args.ne20; i20++) { // n_expert_used + if (src2_i32[i20] != ide) { + continue; + } + + device const float4 * src1_f32x4 = (device const float4 *) ( src1 + i21*args.nb12 + (i20%args.ne11)*args.nb11); + device T4 * hsrc1_f32x4 = (device 
T4 *) (hsrc1 + (ide*args.neh11 + n_all)*args.nbh11); + + for (int64_t i00 = tpitg.x; i00 < args.ne10/4; i00 += ntg.x) { + hsrc1_f32x4[i00] = (T4) (src1_f32x4[i00]); + } + + if (tpitg.x == 0) { + ids_i32[i21*args.ne20 + i20] = ide*args.neh11 + n_all; + } + + ++n_all; + } + } + + if (tpitg.x == 0) { + device int32_t * tpe_i32 = (device int32_t *) (htpe); + tpe_i32[ide] = n_all; + } +} + +typedef decltype(kernel_mul_mm_id_map0) kernel_mul_mm_id_map0_t; + +template [[host_name("kernel_mul_mm_id_map0_f16")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0; + +template +kernel void kernel_mul_mm_id_map1( + constant ggml_metal_kargs_mul_mm_id_map1 & args, + device const char * hdst, + device const char * hids, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort3 tpitg[[thread_position_in_threadgroup]], + ushort3 ntg[[threads_per_threadgroup]]) { + const int i20 = tgpig[0]; // used expert + const int i21 = tgpig[1]; // token + + device const int32_t * ids_i32 = (device const int32_t *) (hids); + device float4 * dst_f32x4 = (device float4 *) (dst + i20*args.nb1 + i21*args.nb2); + + const int id = ids_i32[i21*args.ne20 + i20]; + + const int ide = id / args.neh1; + const int idt = id % args.neh1; + + device const float4 * hdst_f32x4 = (device const float4 *) (hdst + idt*args.nbh1 + ide*args.nbh2); + + for (int64_t i0 = tpitg.x; i0 < args.neh0/4; i0 += ntg.x) { + dst_f32x4[i0] = hdst_f32x4[i0]; + } +} + +typedef decltype(kernel_mul_mm_id_map1) kernel_mul_mm_id_map1_t; + +template [[host_name("kernel_mul_mm_id_map1_f32")]] kernel kernel_mul_mm_id_map1_t kernel_mul_mm_id_map1; + +template +kernel void kernel_mul_mm_id( + constant ggml_metal_kargs_mul_mm_id & args, + device const char * src0, + device const char * src1, + device const char * tpe, + device char * dst, + threadgroup char * shmem [[threadgroup(0)]], uint3 tgpig[[threadgroup_position_in_grid]], ushort tiitg[[thread_index_in_threadgroup]], ushort sgitg[[simdgroup_index_in_threadgroup]]) { - threadgroup half * sa = (threadgroup half *)(shmem); - threadgroup float * sb = (threadgroup float *)(shmem + 4096); + threadgroup T * sa = (threadgroup T *)(shmem); + threadgroup half * sb = (threadgroup half *)(shmem + 4096); const int r0 = tgpig.y; const int r1 = tgpig.x; + const int im = tgpig.z; - if (r1*BLOCK_SIZE_N >= ne1) return; + device const int32_t * tpe_i32 = (device const int32_t *) (tpe); + + const int neh1 = tpe_i32[im]; + + if (r1*BLOCK_SIZE_N >= neh1) { + return; + } // if this block is of 64x32 shape or smaller - short n_rows = (ne0 - r0 * BLOCK_SIZE_M < BLOCK_SIZE_M) ? (ne0 - r0 * BLOCK_SIZE_M) : BLOCK_SIZE_M; - short n_cols = (ne1 - r1 * BLOCK_SIZE_N < BLOCK_SIZE_N) ? (ne1 - r1 * BLOCK_SIZE_N) : BLOCK_SIZE_N; + const short n_rows = (args.neh0 - r0*BLOCK_SIZE_M < BLOCK_SIZE_M) ? (args.neh0 - r0*BLOCK_SIZE_M) : BLOCK_SIZE_M; + const short n_cols = ( neh1 - r1*BLOCK_SIZE_N < BLOCK_SIZE_N) ? ( neh1 - r1*BLOCK_SIZE_N) : BLOCK_SIZE_N; // a thread shouldn't load data outside of the matrix - short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1; - short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1; + const short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1; + const short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? 
((short)tiitg/THREAD_PER_COL) : n_cols - 1; - simdgroup_half8x8 ma[4]; - simdgroup_float8x8 mb[2]; + simdgroup_T8x8 ma[4]; + simdgroup_half8x8 mb[2]; simdgroup_float8x8 mc[8]; - for (int i = 0; i < 8; i++){ + + for (short i = 0; i < 8; i++){ mc[i] = make_filled_simdgroup_matrix(0.f); } + short il = (tiitg % THREAD_PER_ROW); - ushort offset1 = il/nl; + const int i12 = im%args.neh12; + const int i13 = im/args.neh12; - threadgroup const auto & id = rowids[r1 * BLOCK_SIZE_N + thread_col]; + const uint64_t offset0 = (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03; + const short offset1 = il/nl; - device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01) + offset1; - device const float * y = (device const float *)(src1 - + nb12 * id[1] - + nb11 * (id[0] % ne11) - + nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL))); + device const block_q * x = (device const block_q *)(src0 + + args.nb01*(r0*BLOCK_SIZE_M + thread_row) + offset0) + offset1; - for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) { + device const half * y = (device const half *)(src1 + + args.nbh13*i13 + + args.nbh12*i12 + + args.nbh11*(r1*BLOCK_SIZE_N + thread_col) + + args.nbh10*(BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL))); + + for (int loop_k = 0; loop_k < args.ne00; loop_k += BLOCK_SIZE_K) { // load data and store to threadgroup memory - half4x4 temp_a; + T4x4 temp_a; dequantize_func(x, il, temp_a); + threadgroup_barrier(mem_flags::mem_threadgroup); - for (int i = 0; i < 16; i++) { - *(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \ - + (tiitg % THREAD_PER_ROW) * 16 + (i / 8) * 8) \ - + (tiitg / THREAD_PER_ROW) % 8 + (i & 7) * 8) = temp_a[i/4][i%4]; + #pragma unroll(16) + for (short i = 0; i < 16; i++) { + *(sa + SG_MAT_SIZE * ((tiitg/THREAD_PER_ROW/8) \ + + (tiitg%THREAD_PER_ROW)*16 + (i/8)*8) \ + + (tiitg/THREAD_PER_ROW)%8 + (i&7)*8) = temp_a[i/4][i%4]; } - *(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) = *((device float2x4 *)y); + *(threadgroup half2x4 *)(sb + 32*8*(tiitg%THREAD_PER_COL) + 8*(tiitg/THREAD_PER_COL)) = *((device half2x4 *) y); il = (il + 2 < nl) ? il + 2 : il % 2; - x = (il < 2) ? x + (2+nl-1)/nl : x; + x = (il < 2) ? 
x + (2 + nl - 1)/nl : x; y += BLOCK_SIZE_K; threadgroup_barrier(mem_flags::mem_threadgroup); // load matrices from threadgroup memory and conduct outer products - threadgroup half * lsma = (sa + THREAD_MAT_M * SG_MAT_SIZE * (sgitg % 2)); - threadgroup float * lsmb = (sb + THREAD_MAT_N * SG_MAT_SIZE * (sgitg / 2)); + threadgroup const T * lsma = (sa + THREAD_MAT_M*SG_MAT_SIZE*(sgitg%2)); + threadgroup const half * lsmb = (sb + THREAD_MAT_N*SG_MAT_SIZE*(sgitg/2)); - #pragma unroll(BLOCK_SIZE_K/8) - for (int ik = 0; ik < BLOCK_SIZE_K / 8; ik++) { + #pragma unroll(4) + for (short ik = 0; ik < BLOCK_SIZE_K/8; ik++) { #pragma unroll(4) - for (int i = 0; i < 4; i++) { + for (short i = 0; i < 4; i++) { simdgroup_load(ma[i], lsma + SG_MAT_SIZE * i); } + simdgroup_barrier(mem_flags::mem_none); + #pragma unroll(2) - for (int i = 0; i < 2; i++) { + for (short i = 0; i < 2; i++) { simdgroup_load(mb[i], lsmb + SG_MAT_SIZE * i); } - lsma += BLOCK_SIZE_M / SG_MAT_ROW * SG_MAT_SIZE; - lsmb += BLOCK_SIZE_N / SG_MAT_ROW * SG_MAT_SIZE; - #pragma unroll(8) - for (int i = 0; i < 8; i++){ + for (short i = 0; i < 8; i++){ simdgroup_multiply_accumulate(mc[i], mb[i/4], ma[i%4], mc[i]); } + + lsma += (BLOCK_SIZE_M/SG_MAT_ROW)*SG_MAT_SIZE; + lsmb += (BLOCK_SIZE_N/SG_MAT_ROW)*SG_MAT_SIZE; } } - { + if ((r0 + 1) * BLOCK_SIZE_M <= args.neh0 && (r1 + 1) * BLOCK_SIZE_N <= neh1) { + device float * C = (device float *) dst + + (BLOCK_SIZE_M * r0 + 32*(sgitg & 1)) + \ + (BLOCK_SIZE_N * r1 + 16*(sgitg >> 1)) * args.neh0 + im*args.neh1*args.neh0; + + for (short i = 0; i < 8; i++) { + simdgroup_store(mc[i], C + 8 * (i%4) + 8 * args.neh0 * (i/4), args.neh0); + } + } else { + // block is smaller than 64x32, we should avoid writing data outside of the matrix threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup float * temp_str = ((threadgroup float *) shmem) \ - + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M; - for (int i = 0; i < 8; i++) { - simdgroup_store(mc[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M); + + 32*(sgitg&1) + (16*(sgitg >> 1))*BLOCK_SIZE_M; + for (short i = 0; i < 8; i++) { + simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*BLOCK_SIZE_M*(i/4), BLOCK_SIZE_M); } threadgroup_barrier(mem_flags::mem_threadgroup); if (sgitg == 0) { for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) { - threadgroup const auto & jid = rowids[r1 * BLOCK_SIZE_N + j]; - int64_t joff = jid[0]*ne0 + jid[1]*ne0ne1; - - device float * D = (device float *) dst + (r0*BLOCK_SIZE_M) + joff; + device float * D = (device float *) dst + (r0*BLOCK_SIZE_M) + (r1*BLOCK_SIZE_N + j)*args.neh0 + im*args.neh1*args.neh0; device float4 * D4 = (device float4 *) D; threadgroup float * C = temp_str + (j*BLOCK_SIZE_M); @@ -6521,66 +6759,6 @@ void kernel_mul_mm_id_impl( } } -template -kernel void kernel_mul_mm_id( - constant ggml_metal_kargs_mul_mm_id & args, - device const char * src0s, - device const char * src1, - device char * dst, - device const char * ids, - threadgroup char * shmem [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiitg[[thread_index_in_threadgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]]) { - - const int32_t i02 = tgpig.z; - - tgpig.z = 0; - - device const char * src0 = src0s + i02*args.nb02; - - // row indices - threadgroup ushort2 * rowids = (threadgroup ushort2 *)(shmem + 8192); - - // TODO: parallelize this loop - int32_t _ne1 = 0; - for (ushort ii1 = 0; ii1 < args.nei1; ii1++) { - for (ushort ii0 = 0; ii0 < args.nei0; ii0++) { - int32_t id = ((device int32_t *) (ids + 
ii1*args.nbi1))[ii0]; - if (id == i02) { - if (tiitg == 0) { - rowids[_ne1] = ushort2(ii0, ii1); - } - _ne1++; - } - } - } - - threadgroup_barrier(mem_flags::mem_threadgroup); - - kernel_mul_mm_id_impl( - args.ne00, - args.ne02, - args.nb01, - args.nb02, - args.ne11, - args.ne12, - args.nb10, - args.nb11, - args.nb12, - args.ne0, - _ne1, - (int64_t)args.ne0*args.ne1, - src0, - src1, - rowids, - dst, - shmem, - tgpig, - tiitg, - sgitg); -} - #define QK_NL 16 // @@ -6621,63 +6799,64 @@ template [[host_name("kernel_get_rows_iq4_xs")]] kernel get_rows_q_t kernel_get // matrix-matrix multiplication // -typedef decltype(kernel_mul_mm) mat_mm_t; +typedef decltype(kernel_mul_mm) mul_mm_t; -template [[host_name("kernel_mul_mm_f32_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_f16_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_f32_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_f16_f32")]] kernel mul_mm_t kernel_mul_mm; #if defined(GGML_METAL_USE_BF16) -template [[host_name("kernel_mul_mm_bf16_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_bf16_f32")]] kernel mul_mm_t kernel_mul_mm; #endif -template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q5_0_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q5_1_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q2_K_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q4_K_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q5_K_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq3_s_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq2_s_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq1_m_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q5_0_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q5_1_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q2_K_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q4_K_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q5_K_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel 
mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq3_s_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq2_s_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq1_m_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mul_mm_t kernel_mul_mm; // // indirect matrix-matrix multiplication // -typedef decltype(kernel_mul_mm_id) mat_mm_id_t; +typedef decltype(kernel_mul_mm_id) mul_mm_id; -template [[host_name("kernel_mul_mm_id_f32_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_f16_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_f32_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_f16_f16")]] kernel mul_mm_id kernel_mul_mm_id; #if defined(GGML_METAL_USE_BF16) -template [[host_name("kernel_mul_mm_id_bf16_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_bf16_f16")]] kernel mul_mm_id kernel_mul_mm_id; #endif -template [[host_name("kernel_mul_mm_id_q4_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q4_1_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q5_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q5_1_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q8_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q2_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q3_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q4_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q5_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q6_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq3_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq2_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq1_m_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q4_0_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q4_1_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q5_0_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q5_1_f16")]] kernel mul_mm_id 
kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q8_0_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q2_K_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q3_K_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q4_K_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q5_K_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q6_K_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq2_xxs_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq2_xs_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq3_xxs_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq3_s_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq2_s_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq1_s_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq1_m_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq4_nl_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq4_xs_f16")]] kernel mul_mm_id kernel_mul_mm_id; + // // matrix-vector multiplication diff --git a/ml/backend/ggml/ggml/src/ggml-opt.cpp b/ml/backend/ggml/ggml/src/ggml-opt.cpp index 7c3e2410..58d77578 100644 --- a/ml/backend/ggml/ggml/src/ggml-opt.cpp +++ b/ml/backend/ggml/ggml/src/ggml-opt.cpp @@ -28,16 +28,19 @@ struct ggml_opt_dataset { }; struct ggml_opt_context { - ggml_backend_sched_t backend_sched = nullptr; - ggml_cgraph * allocated_graph = nullptr; - ggml_cgraph * allocated_graph_copy = nullptr; - struct ggml_context * ctx_static = nullptr; - struct ggml_context * ctx_static_cpu = nullptr; - struct ggml_context * ctx_compute = nullptr; - struct ggml_context * ctx_copy = nullptr; - ggml_backend_buffer_t buf_static = nullptr; - ggml_backend_buffer_t buf_static_cpu = nullptr; - std::mt19937 rng; + ggml_backend_sched_t backend_sched = nullptr; + ggml_cgraph * allocated_graph = nullptr; + ggml_cgraph * allocated_graph_copy = nullptr; + struct ggml_context * ctx_static = nullptr; + struct ggml_context * ctx_cpu = nullptr; + struct ggml_context * ctx_compute = nullptr; + struct ggml_context * ctx_copy = nullptr; + ggml_backend_buffer_t buf_static = nullptr; + ggml_backend_buffer_t buf_cpu = nullptr; + std::mt19937 rng; + enum ggml_opt_loss_type loss_type; + enum ggml_opt_build_type build_type; + enum ggml_opt_build_type build_type_alloc; struct ggml_tensor * inputs = nullptr; struct ggml_tensor * outputs = nullptr; @@ -50,6 +53,11 @@ struct ggml_opt_context { struct ggml_cgraph * gf = nullptr; struct ggml_cgraph * gb_grad = nullptr; struct ggml_cgraph * gb_opt = nullptr; + bool static_graphs = false; + bool eval_ready = false; + std::vector grad_accs; + std::vector grad_m; + std::vector grad_v; int64_t iter = 1; int32_t opt_period = 1; @@ -73,7 +81,13 @@ struct ggml_opt_result { // ====== Dataset ====== -ggml_opt_dataset_t ggml_opt_dataset_init(int64_t ne_datapoint, int64_t ne_label, int64_t ndata, int64_t ndata_shard) { +ggml_opt_dataset_t ggml_opt_dataset_init( + enum ggml_type type_data, + enum ggml_type type_label, + int64_t ne_datapoint, + int64_t ne_label, + int64_t ndata, + int64_t ndata_shard) { GGML_ASSERT(ne_datapoint > 0); GGML_ASSERT(ne_label >= 0); GGML_ASSERT(ndata > 0); @@ -92,11 +106,11 @@ 
ggml_opt_dataset_t ggml_opt_dataset_init(int64_t ne_datapoint, int64_t ne_label, result->ctx = ggml_init(params); } - result->data = ggml_new_tensor_2d(result->ctx, GGML_TYPE_F32, ne_datapoint, ndata); + result->data = ggml_new_tensor_2d(result->ctx, type_data, ne_datapoint, ndata); result->nbs_data = ggml_nbytes(result->data) * ndata_shard/ndata; if (ne_label > 0) { - result->labels = ggml_new_tensor_2d(result->ctx, GGML_TYPE_F32, ne_label, ndata); + result->labels = ggml_new_tensor_2d(result->ctx, type_label, ne_label, ndata); result->nbs_labels = ggml_nbytes(result->labels) * ndata_shard/ndata; } else { result->labels = nullptr; @@ -119,6 +133,10 @@ void ggml_opt_dataset_free(ggml_opt_dataset_t dataset) { delete dataset; } +int64_t ggml_opt_dataset_ndata(ggml_opt_dataset_t dataset) { + return dataset->ndata; +} + struct ggml_tensor * ggml_opt_dataset_data(ggml_opt_dataset_t dataset) { return dataset->data; } @@ -144,6 +162,8 @@ void ggml_opt_dataset_get_batch(ggml_opt_dataset_t dataset, struct ggml_tensor * GGML_ASSERT( data_batch && ggml_is_contiguous(data_batch)); GGML_ASSERT(!labels_batch || ggml_is_contiguous(labels_batch)); GGML_ASSERT((labels_batch == nullptr) == (dataset->labels == nullptr)); + GGML_ASSERT( data_batch->type == dataset->data->type); + GGML_ASSERT(!labels_batch || labels_batch->type == dataset->labels->type); const size_t nb_data_batch = ggml_nbytes(data_batch); GGML_ASSERT(nb_data_batch % dataset->nbs_data == 0); @@ -171,6 +191,31 @@ void ggml_opt_dataset_get_batch(ggml_opt_dataset_t dataset, struct ggml_tensor * } } +void ggml_opt_dataset_get_batch_host(ggml_opt_dataset_t dataset, void * data_batch, size_t nb_data_batch, void * labels_batch, int64_t ibatch) { + GGML_ASSERT((labels_batch == nullptr) == (dataset->labels == nullptr)); + GGML_ASSERT(nb_data_batch % dataset->nbs_data == 0); + + const int64_t shards_per_batch = nb_data_batch / dataset->nbs_data; + + GGML_ASSERT((ibatch + 1)*shards_per_batch <= int64_t(dataset->permutation.size())); + + for (int64_t ishard_batch = 0; ishard_batch < shards_per_batch; ++ishard_batch) { + const int64_t ishard = dataset->permutation[ibatch*shards_per_batch + ishard_batch]; + + const char * ptr_data = (const char *) dataset->data->data + ishard *dataset->nbs_data; + char * ptr_data_batch = (char *) data_batch + ishard_batch*dataset->nbs_data; + memcpy(ptr_data_batch, ptr_data, dataset->nbs_data); + + if (!labels_batch) { + continue; + } + + const char * ptr_labels = (const char *) dataset->labels->data + ishard *dataset->nbs_labels; + char * ptr_labels_batch = (char *) labels_batch + ishard_batch*dataset->nbs_labels; + memcpy(ptr_labels_batch, ptr_labels, dataset->nbs_labels); + } +} + // ====== Model / Context ====== struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata) { @@ -187,17 +232,18 @@ struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * us return result; } +struct ggml_opt_optimizer_params ggml_opt_get_constant_optimizer_params(void * userdata) { + return *((struct ggml_opt_optimizer_params *) userdata); +} + struct ggml_opt_params ggml_opt_default_params( ggml_backend_sched_t backend_sched, - struct ggml_context * ctx_compute, - struct ggml_tensor * inputs, - struct ggml_tensor * outputs, enum ggml_opt_loss_type loss_type) { return { /*backend_sched =*/ backend_sched, - /*ctx_compute =*/ ctx_compute, - /*inputs =*/ inputs, - /*logits =*/ outputs, + /*ctx_compute =*/ nullptr, + /*inputs =*/ nullptr, + /*logits =*/ nullptr, /*loss_type =*/ loss_type, 
/*build_type =*/ GGML_OPT_BUILD_TYPE_OPT, /*opt_period =*/ 1, @@ -266,195 +312,246 @@ static ggml_cgraph * dup_graph(ggml_context * ctx, ggml_cgraph * src) { return dst; } -static void ggml_opt_alloc_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph) { - GGML_ASSERT(graph); - if (opt_ctx->allocated_graph == graph) { - return; - } +static void ggml_opt_build(ggml_opt_context_t opt_ctx) { + GGML_ASSERT(opt_ctx->ctx_compute && "no compute context set, either use static graphs or set one with ggml_opt_prepare_alloc"); + GGML_ASSERT((!opt_ctx->static_graphs || opt_ctx->inputs->data) && "when using static graphs the inputs must be allocated statically"); - ggml_backend_sched_reset(opt_ctx->backend_sched); // clear allocation of previous graph + const bool accumulate = opt_ctx->build_type_alloc >= GGML_OPT_BUILD_TYPE_GRAD && + !(opt_ctx->static_graphs && opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_OPT && opt_ctx->opt_period == 1); - { - ggml_init_params params = { - /*.mem_size =*/ ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE, - /*.mem_buffer =*/ nullptr, - /*.no_alloc =*/ true, - }; - ggml_free(opt_ctx->ctx_copy); - opt_ctx->ctx_copy = ggml_init(params); - } - - opt_ctx->allocated_graph_copy = dup_graph(opt_ctx->ctx_copy, graph); - - ggml_backend_sched_alloc_graph(opt_ctx->backend_sched, opt_ctx->allocated_graph_copy); - opt_ctx->allocated_graph = graph; -} - -ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) { - ggml_opt_context_t result = new struct ggml_opt_context; - result->backend_sched = params.backend_sched; - result->ctx_compute = params.ctx_compute; - result->inputs = params.inputs; - result->outputs = params.outputs; - result->opt_period = params.opt_period; - result->get_opt_pars = params.get_opt_pars; - result->get_opt_pars_ud = params.get_opt_pars_ud; - - GGML_ASSERT(result->inputs->data && "the inputs must be allocated statically"); - GGML_ASSERT(result->opt_period >= 1); - - const bool accumulate = params.build_type == GGML_OPT_BUILD_TYPE_GRAD || - (params.build_type == GGML_OPT_BUILD_TYPE_OPT && result->opt_period > 1); - - ggml_set_input(result->inputs); - ggml_set_output(result->outputs); - - result->gf = ggml_new_graph_custom(result->ctx_compute, GGML_DEFAULT_GRAPH_SIZE, /*grads =*/ true); // Forward pass. - ggml_build_forward_expand(result->gf, result->outputs); + ggml_set_input(opt_ctx->inputs); + ggml_set_output(opt_ctx->outputs); int n_param = 0; - for (int i = 0; i < result->gf->n_nodes; ++i) { - if (result->gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) { + for (int i = 0; i < opt_ctx->gf->n_nodes; ++i) { + const struct ggml_tensor * node = opt_ctx->gf->nodes[i]; + if (node->flags & GGML_TENSOR_FLAG_PARAM) { n_param++; } + GGML_ASSERT(!(node->flags & GGML_TENSOR_FLAG_LOSS) && "support for extra loss terms not implemented"); } - { + if (!opt_ctx->ctx_static) { // The static context is used for: - // - gradients (1 tensor per param if using gradient accumulation) + // - gradients (1 per loss, 1 tensor per param if using gradient accumulation) // - optimizer momenta (2 tensors per param) - // - labels - // - loss + its gradient (up to 5 tensors) - // - pred - // - ncorrect (2 tensors). - const size_t tensors_per_param = (accumulate ? 1 : 0) + (params.build_type == GGML_OPT_BUILD_TYPE_OPT ? 
2 : 0); - const size_t size_meta = (tensors_per_param*n_param + 9) * ggml_tensor_overhead(); + // - labels (if using static graphs) + // - loss (if using static graphs, up to 5 tensors) + // - pred (if using static graphs) + // - ncorrect (if using static graphs, 2 tensors). + constexpr size_t n_loss = 1; + const size_t tensors_per_param = (accumulate ? 1 : 0) + + (opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_OPT ? 2 : 0); + const size_t tensors_const = opt_ctx->static_graphs ? 9 : 0; + const size_t size_meta = (n_loss + tensors_per_param*n_param + tensors_const) * ggml_tensor_overhead(); struct ggml_init_params params = { /*.mem_size =*/ size_meta, /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ true, }; - result->ctx_static = ggml_init(params); + opt_ctx->ctx_static = ggml_init(params); } + GGML_ASSERT(opt_ctx->build_type <= opt_ctx->build_type_alloc); + { - // The static cpu context is used for: - // - optimizer parameters (1 for the entire context) + // The cpu context is allocated statically if using static graphs, dynamically otherwise. + // It is used for: + // - optimizer parameters (1 shared for all optimizer invocations) const size_t size_meta = 1 * ggml_tensor_overhead(); struct ggml_init_params params = { /*.mem_size =*/ size_meta, /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ true, }; - result->ctx_static_cpu = ggml_init(params); + ggml_free(opt_ctx->ctx_cpu); + opt_ctx->ctx_cpu = ggml_init(params); + + ggml_backend_buffer_free(opt_ctx->buf_cpu); + opt_ctx->buf_cpu = nullptr; } + struct ggml_context * ctx_results = opt_ctx->static_graphs ? opt_ctx->ctx_static : opt_ctx->ctx_compute; - switch (params.loss_type) { + switch (opt_ctx->loss_type) { case GGML_OPT_LOSS_TYPE_MEAN: { - result->loss = ggml_sum(result->ctx_static, result->outputs); - ggml_set_name(result->loss, "loss_sum"); - const float scale = 1.0f / (result->opt_period * ggml_nelements(result->outputs)); - result->loss = ggml_scale(result->ctx_static, result->loss, scale); - ggml_set_name(result->loss, "loss_mean"); - result->loss_per_datapoint = true; + opt_ctx->loss = ggml_sum(ctx_results, opt_ctx->outputs); + ggml_set_name(opt_ctx->loss, "loss_sum"); + const float scale = 1.0f / (opt_ctx->opt_period * ggml_nelements(opt_ctx->outputs)); + opt_ctx->loss = ggml_scale(ctx_results, opt_ctx->loss, scale); + ggml_set_name(opt_ctx->loss, "loss_mean"); + opt_ctx->loss_per_datapoint = true; break; } case GGML_OPT_LOSS_TYPE_SUM: { - result->loss = ggml_sum(result->ctx_static, result->outputs); - ggml_set_name(result->loss, "loss_sum"); - result->loss_per_datapoint = false; + opt_ctx->loss = ggml_sum(ctx_results, opt_ctx->outputs); + ggml_set_name(opt_ctx->loss, "loss_sum"); + opt_ctx->loss_per_datapoint = false; break; } case GGML_OPT_LOSS_TYPE_CROSS_ENTROPY: { - result->labels = ggml_dup_tensor(result->ctx_static, result->outputs); - ggml_set_input(result->labels); - ggml_set_name(result->labels, "labels"); - result->loss = ggml_cross_entropy_loss(result->ctx_static, result->outputs, result->labels); - ggml_set_name(result->loss, "loss_cross_entropy"); - if (result->opt_period > 1) { - result->loss = ggml_scale(result->ctx_static, result->loss, 1.0f / result->opt_period); - ggml_set_name(result->loss, "loss_cross_entropy_scaled"); + opt_ctx->labels = ggml_dup_tensor(ctx_results, opt_ctx->outputs); + ggml_set_input(opt_ctx->labels); + ggml_set_name(opt_ctx->labels, "labels"); + opt_ctx->loss = ggml_cross_entropy_loss(ctx_results, opt_ctx->outputs, opt_ctx->labels); + ggml_set_name(opt_ctx->loss, "loss_cross_entropy"); + if 
(opt_ctx->opt_period > 1) { + opt_ctx->loss = ggml_scale(ctx_results, opt_ctx->loss, 1.0f / opt_ctx->opt_period); + ggml_set_name(opt_ctx->loss, "loss_cross_entropy_scaled"); } - result->loss_per_datapoint = true; + opt_ctx->loss_per_datapoint = true; break; } case GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR: { - result->labels = ggml_dup_tensor(result->ctx_static, result->outputs); - ggml_set_input(result->labels); - ggml_set_name(result->labels, "labels"); - result->loss = ggml_sub(result->ctx_static, result->outputs, result->labels); - ggml_set_name(result->loss, "loss_error"); - result->loss = ggml_sqr(result->ctx_static, result->loss); - ggml_set_name(result->loss, "loss_squared_error"); - result->loss = ggml_sum(result->ctx_static, result->loss); - ggml_set_name(result->loss, "loss_sum_squared_error"); - const float scale = 1.0f / (result->opt_period * ggml_nelements(result->outputs)); - result->loss = ggml_scale(result->ctx_static, result->loss, scale); - ggml_set_name(result->loss, "loss_mean_squared_error"); - result->loss_per_datapoint = true; + opt_ctx->labels = ggml_dup_tensor(ctx_results, opt_ctx->outputs); + ggml_set_input(opt_ctx->labels); + ggml_set_name(opt_ctx->labels, "labels"); + opt_ctx->loss = ggml_sub(ctx_results, opt_ctx->outputs, opt_ctx->labels); + ggml_set_name(opt_ctx->loss, "loss_error"); + opt_ctx->loss = ggml_sqr(ctx_results, opt_ctx->loss); + ggml_set_name(opt_ctx->loss, "loss_squared_error"); + opt_ctx->loss = ggml_sum(ctx_results, opt_ctx->loss); + ggml_set_name(opt_ctx->loss, "loss_sum_squared_error"); + const float scale = 1.0f / (opt_ctx->opt_period * ggml_nelements(opt_ctx->outputs)); + opt_ctx->loss = ggml_scale(ctx_results, opt_ctx->loss, scale); + ggml_set_name(opt_ctx->loss, "loss_mean_squared_error"); + opt_ctx->loss_per_datapoint = true; break; } } - ggml_set_output(result->loss); - ggml_set_loss(result->loss); - ggml_build_forward_expand(result->gf, result->loss); + ggml_set_output(opt_ctx->loss); + ggml_set_loss(opt_ctx->loss); + ggml_build_forward_expand(opt_ctx->gf, opt_ctx->loss); - result->pred = ggml_argmax(result->ctx_static, result->outputs); - ggml_set_name(result->pred, "pred"); - ggml_set_output(result->pred); - ggml_build_forward_expand(result->gf, result->pred); + if (opt_ctx->loss_type == GGML_OPT_LOSS_TYPE_CROSS_ENTROPY) { + opt_ctx->pred = ggml_argmax(ctx_results, opt_ctx->outputs); + ggml_set_name(opt_ctx->pred, "pred"); + ggml_set_output(opt_ctx->pred); + ggml_build_forward_expand(opt_ctx->gf, opt_ctx->pred); - if (result->labels) { - result->ncorrect = ggml_count_equal(result->ctx_static, result->pred, ggml_argmax(result->ctx_static, result->labels)); - ggml_set_name(result->ncorrect, "ncorrect"); - ggml_set_output(result->ncorrect); - ggml_build_forward_expand(result->gf, result->ncorrect); - } else { - result->ncorrect = nullptr; + opt_ctx->ncorrect = ggml_count_equal(ctx_results, opt_ctx->pred, ggml_argmax(ctx_results, opt_ctx->labels)); + ggml_set_name(opt_ctx->ncorrect, "ncorrect"); + ggml_set_output(opt_ctx->ncorrect); + ggml_build_forward_expand(opt_ctx->gf, opt_ctx->ncorrect); } - if (params.build_type == GGML_OPT_BUILD_TYPE_FORWARD) { - result->buf_static = ggml_backend_alloc_ctx_tensors(result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0)); - return result; + if (opt_ctx->buf_static) { + if (opt_ctx->build_type == GGML_OPT_BUILD_TYPE_FORWARD) { + return; + } + } else if (opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_FORWARD) { + opt_ctx->buf_static = ggml_backend_alloc_ctx_tensors( + 
opt_ctx->ctx_static, ggml_backend_sched_get_backend(opt_ctx->backend_sched, 0)); + return; } - // gb_grad == graph backward gradients, forward pass, then backward pass to calculate gradients. - result->gb_grad = ggml_graph_dup(result->ctx_compute, result->gf); - ggml_build_backward_expand(result->ctx_static, result->ctx_compute, result->gb_grad, accumulate); + if (opt_ctx->grad_accs.empty()) { + GGML_ASSERT(opt_ctx->build_type_alloc >= GGML_OPT_BUILD_TYPE_GRAD); - if (params.build_type == GGML_OPT_BUILD_TYPE_GRAD) { - result->buf_static = ggml_backend_alloc_ctx_tensors(result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0)); - ggml_graph_reset(result->gb_grad); - return result; - } + const int n_nodes = opt_ctx->gf->n_nodes; + opt_ctx->grad_accs.resize(n_nodes); + for (int i = 0; i < n_nodes; ++i) { + ggml_tensor * node = opt_ctx->gf->nodes[i]; + if ((accumulate && (node->flags & GGML_TENSOR_FLAG_PARAM)) || (node->flags & GGML_TENSOR_FLAG_LOSS)) { + opt_ctx->grad_accs[i] = ggml_new_tensor(opt_ctx->ctx_static, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne); + } else { + opt_ctx->grad_accs[i] = nullptr; + } + } - GGML_ASSERT(params.build_type == GGML_OPT_BUILD_TYPE_OPT); - - // gb_opt == graph backward optimize, forward pass, then backward pass to calculate gradients, then optimizer step. - result->gb_opt = ggml_graph_dup(result->ctx_compute, result->gb_grad); - - result->adamw_params = ggml_new_tensor_1d(result->ctx_static_cpu, GGML_TYPE_F32, 7); - ggml_set_input(result->adamw_params); - ggml_set_name(result->adamw_params, "adamw_params"); - - for (int i = result->gf->n_nodes-1; i >= 0; --i) { - struct ggml_tensor * node = result->gb_opt->nodes[i]; - struct ggml_tensor * grad = ggml_graph_get_grad(result->gb_opt, node); - - if (node->flags & GGML_TENSOR_FLAG_PARAM) { - struct ggml_tensor * m = ggml_dup_tensor(result->ctx_static, node); - struct ggml_tensor * v = ggml_dup_tensor(result->ctx_static, node); - struct ggml_tensor * opt_step = ggml_opt_step_adamw(result->ctx_compute, node, grad, m, v, result->adamw_params); - ggml_build_forward_expand(result->gb_opt, opt_step); + if (opt_ctx->build_type_alloc >= GGML_OPT_BUILD_TYPE_OPT) { + opt_ctx->grad_m.resize(n_nodes); + opt_ctx->grad_v.resize(n_nodes); + for (int i = 0; i < n_nodes; ++i) { + ggml_tensor * node = opt_ctx->gf->nodes[i]; + if (node->flags & GGML_TENSOR_FLAG_PARAM) { + opt_ctx->grad_m[i] = ggml_new_tensor(opt_ctx->ctx_static, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne); + opt_ctx->grad_v[i] = ggml_new_tensor(opt_ctx->ctx_static, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne); + } else { + opt_ctx->grad_m[i] = nullptr; + opt_ctx->grad_v[i] = nullptr; + } + } } } - result->buf_static = ggml_backend_alloc_ctx_tensors( - result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0)); + // gb_grad == graph backward gradients, forward pass, then backward pass to calculate gradients. 
+ opt_ctx->gb_grad = ggml_graph_dup(opt_ctx->ctx_compute, opt_ctx->gf, /*force_grads =*/ true); + ggml_build_backward_expand(opt_ctx->ctx_compute, opt_ctx->gb_grad, opt_ctx->grad_accs.data()); - result->buf_static_cpu = ggml_backend_alloc_ctx_tensors_from_buft(result->ctx_static_cpu, ggml_backend_cpu_buffer_type()); + if (opt_ctx->buf_static) { + if (opt_ctx->build_type == GGML_OPT_BUILD_TYPE_GRAD) { + return; + } + } else if (opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_GRAD) { + opt_ctx->buf_static = ggml_backend_alloc_ctx_tensors(opt_ctx->ctx_static, ggml_backend_sched_get_backend(opt_ctx->backend_sched, 0)); + ggml_graph_reset(opt_ctx->gb_grad); + } - ggml_graph_reset(result->gb_opt); + GGML_ASSERT(opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_OPT); + + // gb_opt == graph backward optimize, forward pass, then backward pass to calculate gradients, then optimizer step. + opt_ctx->gb_opt = ggml_graph_dup(opt_ctx->ctx_compute, opt_ctx->gb_grad, /*force_grads =*/ true); + + opt_ctx->adamw_params = ggml_new_tensor_1d(opt_ctx->ctx_cpu, GGML_TYPE_F32, 7); + ggml_set_input(opt_ctx->adamw_params); + ggml_set_name(opt_ctx->adamw_params, "adamw_params"); + + for (int i = opt_ctx->gf->n_nodes-1; i >= 0; --i) { + struct ggml_tensor * node = opt_ctx->gb_opt->nodes[i]; + struct ggml_tensor * grad = ggml_graph_get_grad(opt_ctx->gb_opt, node); + + if (grad && (node->flags & GGML_TENSOR_FLAG_PARAM)) { + struct ggml_tensor * m = opt_ctx->grad_m[i]; + struct ggml_tensor * v = opt_ctx->grad_v[i]; + struct ggml_tensor * opt_step = ggml_opt_step_adamw(opt_ctx->ctx_compute, node, grad, m, v, opt_ctx->adamw_params); + + ggml_set_name(m, (std::string("AdamW m for ") + std::string(node->name)).c_str()); + ggml_set_name(v, (std::string("AdamW v for ") + std::string(node->name)).c_str()); + ggml_set_name(opt_step, (std::string("AdamW step for ") + std::string(node->name)).c_str()); + + ggml_build_forward_expand(opt_ctx->gb_opt, opt_step); + } + } + + if (!opt_ctx->buf_static) { + opt_ctx->buf_static = ggml_backend_alloc_ctx_tensors( + opt_ctx->ctx_static, ggml_backend_sched_get_backend(opt_ctx->backend_sched, 0)); + ggml_graph_reset(opt_ctx->gb_opt); + } + + opt_ctx->buf_cpu = ggml_backend_alloc_ctx_tensors_from_buft(opt_ctx->ctx_cpu, ggml_backend_cpu_buffer_type()); +} + +ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) { + ggml_opt_context_t result = new struct ggml_opt_context; + result->backend_sched = params.backend_sched; + result->ctx_compute = params.ctx_compute; + result->loss_type = params.loss_type; + result->build_type = params.build_type; + result->build_type_alloc = params.build_type; + result->inputs = params.inputs; + result->outputs = params.outputs; + result->opt_period = params.opt_period; + result->get_opt_pars = params.get_opt_pars; + result->get_opt_pars_ud = params.get_opt_pars_ud; + + GGML_ASSERT(result->opt_period >= 1); + + result->static_graphs = result->ctx_compute; + + if (!result->static_graphs) { + GGML_ASSERT(!result->inputs); + GGML_ASSERT(!result->outputs); + return result; + } + + GGML_ASSERT(result->inputs); + GGML_ASSERT(result->outputs); + + result->gf = ggml_new_graph_custom(result->ctx_compute, GGML_DEFAULT_GRAPH_SIZE, /*grads =*/ true); // Forward pass. 
+ ggml_build_forward_expand(result->gf, result->outputs); + + ggml_opt_build(result); return result; } @@ -464,9 +561,9 @@ void ggml_opt_free(ggml_opt_context_t opt_ctx) { return; } ggml_backend_buffer_free(opt_ctx->buf_static); - ggml_backend_buffer_free(opt_ctx->buf_static_cpu); + ggml_backend_buffer_free(opt_ctx->buf_cpu); ggml_free(opt_ctx->ctx_static); - ggml_free(opt_ctx->ctx_static_cpu); + ggml_free(opt_ctx->ctx_cpu); delete opt_ctx; } @@ -582,8 +679,79 @@ void ggml_opt_result_accuracy(ggml_opt_result_t result, double * accuracy, doubl // ====== Computation ====== -static void ggml_opt_eval_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph, ggml_opt_result * result) { - if (graph != opt_ctx->gf) { +void ggml_opt_prepare_alloc( + ggml_opt_context_t opt_ctx, + struct ggml_context * ctx_compute, + struct ggml_cgraph * gf, + struct ggml_tensor * inputs, + struct ggml_tensor * outputs) { + GGML_ASSERT(!opt_ctx->static_graphs); + opt_ctx->ctx_compute = ctx_compute; + opt_ctx->gf = gf; + opt_ctx->inputs = inputs; + opt_ctx->outputs = outputs; +} + +void ggml_opt_alloc(ggml_opt_context_t opt_ctx, bool backward) { + GGML_ASSERT(!opt_ctx->eval_ready); + if (opt_ctx->build_type == GGML_OPT_BUILD_TYPE_OPT && opt_ctx->opt_period > 1 && opt_ctx->opt_i == 0) { + ggml_graph_reset(opt_ctx->gb_grad); + } + if (backward) { + const int32_t opt_i_next = (opt_ctx->opt_i + 1) % opt_ctx->opt_period; + opt_ctx->build_type = opt_i_next == 0 ? GGML_OPT_BUILD_TYPE_OPT : GGML_OPT_BUILD_TYPE_GRAD; + } else { + opt_ctx->build_type = GGML_OPT_BUILD_TYPE_FORWARD; + } + + if (!opt_ctx->static_graphs) { + ggml_opt_build(opt_ctx); + } + + struct ggml_cgraph * graph = nullptr; + switch (opt_ctx->build_type) { + case GGML_OPT_BUILD_TYPE_FORWARD: { + graph = opt_ctx->gf; + } break; + case GGML_OPT_BUILD_TYPE_GRAD: { + graph = opt_ctx->gb_grad; + } break; + case GGML_OPT_BUILD_TYPE_OPT: { + graph = opt_ctx->gb_opt; + } break; + } + GGML_ASSERT(graph); + + if (opt_ctx->allocated_graph == graph) { + opt_ctx->eval_ready = true; + return; + } + + ggml_backend_sched_reset(opt_ctx->backend_sched); // clear allocation of previous graph + + if (opt_ctx->static_graphs) { + ggml_init_params params = { + /*.mem_size =*/ graph->size*ggml_tensor_overhead() + ggml_graph_overhead_custom(graph->size, graph->grads), + /*.mem_buffer =*/ nullptr, + /*.no_alloc =*/ true, + }; + ggml_free(opt_ctx->ctx_copy); + opt_ctx->ctx_copy = ggml_init(params); + + opt_ctx->allocated_graph_copy = dup_graph(opt_ctx->ctx_copy, graph); + } else { + opt_ctx->allocated_graph_copy = graph; + } + + ggml_backend_sched_alloc_graph(opt_ctx->backend_sched, opt_ctx->allocated_graph_copy); + opt_ctx->allocated_graph = graph; + + opt_ctx->eval_ready = true; +} + +void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result) { + GGML_ASSERT(opt_ctx->eval_ready); + if (opt_ctx->allocated_graph == opt_ctx->gb_opt) { struct ggml_opt_optimizer_params opt_pars = opt_ctx->get_opt_pars(opt_ctx->get_opt_pars_ud); GGML_ASSERT(opt_pars.adamw.alpha > 0.0f); @@ -609,9 +777,19 @@ static void ggml_opt_eval_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph, adamw_par_data[6] = beta2h; } - ggml_opt_alloc_graph(opt_ctx, graph); ggml_backend_sched_graph_compute(opt_ctx->backend_sched, opt_ctx->allocated_graph_copy); opt_ctx->iter += opt_ctx->allocated_graph == opt_ctx->gb_opt; + opt_ctx->opt_i = (opt_ctx->opt_i + 1) % opt_ctx->opt_period; + + if (!opt_ctx->static_graphs) { + opt_ctx->gf = nullptr; + opt_ctx->gb_grad = nullptr; + opt_ctx->gb_opt = nullptr; + 
opt_ctx->allocated_graph = nullptr; + opt_ctx->allocated_graph_copy = nullptr; + } + + opt_ctx->eval_ready = false; if (!result) { return; @@ -635,12 +813,14 @@ static void ggml_opt_eval_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph, ggml_backend_tensor_get(opt_ctx->loss, &loss, 0, ggml_nbytes(opt_ctx->loss)); result->loss.push_back(loss); - GGML_ASSERT(opt_ctx->pred->type == GGML_TYPE_I32); - std::vector pred(ndata); - ggml_backend_tensor_get(opt_ctx->pred, pred.data(), 0, ggml_nbytes(opt_ctx->pred)); - result->pred.insert(result->pred.end(), pred.begin(), pred.end()); + if (opt_ctx->pred) { + GGML_ASSERT(opt_ctx->pred->type == GGML_TYPE_I32); + std::vector pred(ndata); + ggml_backend_tensor_get(opt_ctx->pred, pred.data(), 0, ggml_nbytes(opt_ctx->pred)); + result->pred.insert(result->pred.end(), pred.begin(), pred.end()); + } - if (!opt_ctx->labels || result->ncorrect < 0) { + if (!opt_ctx->ncorrect || result->ncorrect < 0) { result->ncorrect = -1; return; } @@ -652,26 +832,6 @@ static void ggml_opt_eval_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph, result->ncorrect += ncorrect; } -void ggml_opt_forward(ggml_opt_context_t opt_ctx, ggml_opt_result * result) { - ggml_opt_eval_graph(opt_ctx, opt_ctx->gf, result); -} - -void ggml_opt_forward_backward(ggml_opt_context_t opt_ctx, ggml_opt_result * result) { - if (opt_ctx->opt_period == 1) { - ggml_opt_eval_graph(opt_ctx, opt_ctx->gb_opt, result); - return; - } - - const int32_t opt_i_next = (opt_ctx->opt_i + 1) % opt_ctx->opt_period; - if (opt_i_next == 0) { - ggml_opt_eval_graph(opt_ctx, opt_ctx->gb_opt, result); - ggml_opt_reset(opt_ctx, /*optimizer =*/ false); - } else { - ggml_opt_eval_graph(opt_ctx, opt_ctx->gb_grad, result); - } - opt_ctx->opt_i = opt_i_next; -} - // ====== High-Level Functions ====== void ggml_opt_epoch( @@ -700,16 +860,18 @@ void ggml_opt_epoch( int64_t ibatch = 0; int64_t t_loop_start = ggml_time_us(); for (; ibatch < ibatch_split; ++ibatch) { + ggml_opt_alloc(opt_ctx, /*backward =*/ true); ggml_opt_dataset_get_batch(dataset, inputs, labels, ibatch); - ggml_opt_forward_backward(opt_ctx, result_train); + ggml_opt_eval(opt_ctx, result_train); if (callback_train) { callback_train(true, opt_ctx, dataset, result_train, ibatch+1, ibatch_split, t_loop_start); } } t_loop_start = ggml_time_us(); for (; ibatch < nbatches; ++ibatch) { + ggml_opt_alloc(opt_ctx, /*backward =*/ false); ggml_opt_dataset_get_batch(dataset, inputs, labels, ibatch); - ggml_opt_forward(opt_ctx, result_eval); + ggml_opt_eval(opt_ctx, result_eval); if (callback_eval) { callback_eval(false, opt_ctx, dataset, result_eval, ibatch+1-ibatch_split, nbatches-ibatch_split, t_loop_start); } @@ -726,13 +888,26 @@ void ggml_opt_epoch_callback_progress_bar( int64_t t_start_us) { fprintf(stderr, "%s[", train ? "train: " : "val: "); - constexpr int64_t bar_length = 25; + // The progress bar consists of partially filled blocks, unicode has 8 separate fill levels. 
+ constexpr int64_t bar_length = 8; + const int64_t ibatch8 = 8 * ibatch; for (int64_t j = 0; j < bar_length; ++j) { - const int64_t ibatch_j = ibatch_max * j/bar_length; - if (ibatch_j < ibatch) { - fprintf(stderr, "="); - } else if (ibatch_max * (j - 1)/bar_length < ibatch) { - fprintf(stderr, ">"); + if (ibatch_max * (8*j + 8) / bar_length < ibatch8) { + fprintf(stderr, "\u2588"); // full block + } else if (ibatch_max * (8*j + 7) / bar_length < ibatch8) { + fprintf(stderr, "\u2589"); // 7/8 filled + } else if (ibatch_max * (8*j + 6) / bar_length < ibatch8) { + fprintf(stderr, "\u258A"); // 6/8 filled + } else if (ibatch_max * (8*j + 5) / bar_length < ibatch8) { + fprintf(stderr, "\u258B"); // 5/8 filled + } else if (ibatch_max * (8*j + 4) / bar_length < ibatch8) { + fprintf(stderr, "\u258C"); // 4/8 filled + } else if (ibatch_max * (8*j + 3) / bar_length < ibatch8) { + fprintf(stderr, "\u258D"); // 3/8 filled + } else if (ibatch_max * (8*j + 2) / bar_length < ibatch8) { + fprintf(stderr, "\u258E"); // 2/8 filled + } else if (ibatch_max * (8*j + 1) / bar_length < ibatch8) { + fprintf(stderr, "\u258F"); // 1/8 filled } else { fprintf(stderr, " "); } @@ -764,8 +939,8 @@ void ggml_opt_epoch_callback_progress_bar( const int64_t t_eta_m = t_eta_s / 60; t_eta_s -= t_eta_m * 60; - fprintf(stderr, "| data=%06" PRId64 "/%06" PRId64 ", loss=%.6lf+-%.6lf, accuracy=%.2lf+-%.2lf%%, " - "t=%02" PRId64 ":%02" PRId64 ":%02" PRId64 ", ETA=%02" PRId64 ":%02" PRId64 ":%02" PRId64 "]\r", + fprintf(stderr, "] data=%07" PRId64 "/%07" PRId64 " loss=%.5lf±%.5lf acc=%.2lf±%.2lf%% " + "t=%02" PRId64 ":%02" PRId64 ":%02" PRId64 " ETA=%02" PRId64 ":%02" PRId64 ":%02" PRId64 " \r", idata, idata_max, loss, loss_unc, 100.0*accuracy, 100.0*accuracy_unc, t_ibatch_h, t_ibatch_m, t_ibatch_s, t_eta_h, t_eta_m, t_eta_s); if (ibatch == ibatch_max) { @@ -806,7 +981,10 @@ void ggml_opt_fit( int64_t epoch = 1; - ggml_opt_params params = ggml_opt_default_params(backend_sched, ctx_compute, inputs, outputs, loss_type); + ggml_opt_params params = ggml_opt_default_params(backend_sched, loss_type); + params.ctx_compute = ctx_compute; + params.inputs = inputs; + params.outputs = outputs; params.opt_period = opt_period; params.get_opt_pars = get_opt_pars; params.get_opt_pars_ud = &epoch; diff --git a/ml/backend/ggml/ggml/src/ggml-quants.c b/ml/backend/ggml/ggml/src/ggml-quants.c index ac918a60..84ec6dfe 100644 --- a/ml/backend/ggml/ggml/src/ggml-quants.c +++ b/ml/backend/ggml/ggml/src/ggml-quants.c @@ -19,12 +19,6 @@ #define GROUP_MAX_EPS_IQ1_M 1e-7f #define GROUP_MAX_EPS_IQ1_S 1e-12f -#if defined(_MSC_VER) -// disable "possible loss of data" to avoid warnings for hundreds of casts -// we should just be careful :) -#pragma warning(disable: 4244 4267) -#endif - #define UNUSED GGML_UNUSED // reference implementation for deterministic creation of model files diff --git a/ml/backend/ggml/ggml/src/ggml.c b/ml/backend/ggml/ggml/src/ggml.c index 3c57aff8..6b034d35 100644 --- a/ml/backend/ggml/ggml/src/ggml.c +++ b/ml/backend/ggml/ggml/src/ggml.c @@ -1301,6 +1301,10 @@ bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) { return ggml_is_contiguous_n(tensor, 2); } +bool ggml_is_contiguously_allocated(const struct ggml_tensor * tensor) { + return ggml_nbytes(tensor) == ggml_nelements(tensor) * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type); +} + bool ggml_is_permuted(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); @@ -2730,11 +2734,11 @@ void 
ggml_mul_mat_set_prec( c = ggml_mul_mat_id(ctx, as, b, ids); as -> [cols, rows, n_expert] - ids -> [n_experts_used, n_tokens] (i32) b -> [cols, n_expert_used, n_tokens] + ids -> [n_expert_used, n_tokens] (i32) c -> [rows, n_expert_used, n_tokens] - in b, n_experts_used can be broadcasted to match the n_expert_used of ids + in b, n_expert_used can be broadcasted to match the n_expert_used of ids c ~= as[:,:,i] @ b[:,i%r,t], i = ids[e,t] for all e,t in ids */ @@ -5516,7 +5520,7 @@ static void ggml_compute_backward( // tensor = src0 * 1 + src1 * 0 if (src0_needs_grads) { // dsrc0 = dtensor * 1 - ggml_add_or_set(ctx, cgraph, isrc0, grad); + ggml_add_or_set(ctx, cgraph, isrc0, ggml_reshape(ctx, grad, src0)); } if (src1_needs_grads) { // dsrc1 = dtensor * 0 -> noop @@ -5797,10 +5801,9 @@ void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * } void ggml_build_backward_expand( - struct ggml_context * ctx_static, - struct ggml_context * ctx_compute, - struct ggml_cgraph * cgraph, - bool accumulate) { + struct ggml_context * ctx, + struct ggml_cgraph * cgraph, + struct ggml_tensor ** grad_accs) { GGML_ASSERT(cgraph->n_nodes > 0); GGML_ASSERT(cgraph->grads); GGML_ASSERT(cgraph->grad_accs); @@ -5873,21 +5876,24 @@ void ggml_build_backward_expand( GGML_ASSERT(!node->view_src || node->op == GGML_OP_CPY || node->op == GGML_OP_VIEW || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE); - const size_t igrad = ggml_hash_find(&cgraph->visited_hash_set, node); - GGML_ASSERT(igrad != GGML_HASHSET_FULL); - GGML_ASSERT(ggml_bitset_get(cgraph->visited_hash_set.used, igrad)); - if ((accumulate && (node->flags & GGML_TENSOR_FLAG_PARAM)) || (node->flags & GGML_TENSOR_FLAG_LOSS)) { - cgraph->grad_accs[igrad] = ggml_dup_tensor(ctx_static, node); - cgraph->grads[igrad] = cgraph->grad_accs[igrad]; - ggml_format_name(cgraph->grad_accs[igrad], "grad acc for %s", node->name); + const size_t ihash = ggml_hash_find(&cgraph->visited_hash_set, node); + GGML_ASSERT(ihash != GGML_HASHSET_FULL); + GGML_ASSERT(ggml_bitset_get(cgraph->visited_hash_set.used, ihash)); + if (grad_accs && grad_accs[i]) { + cgraph->grad_accs[ihash] = grad_accs[i]; + cgraph->grads[ihash] = cgraph->grad_accs[ihash]; + } else if (node->flags & GGML_TENSOR_FLAG_LOSS) { + // loss tensors always need a gradient accumulator + cgraph->grad_accs[ihash] = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne); + cgraph->grads[ihash] = cgraph->grad_accs[ihash]; } - grads_needed[igrad] = true; + grads_needed[ihash] = true; } for (int i = n_nodes_f - 1; i >= 0; --i) { // inplace operations to add gradients are not created by ggml_compute_backward except for gradient accumulation // use allocator to automatically make inplace operations - ggml_compute_backward(ctx_compute, cgraph, i, grads_needed); + ggml_compute_backward(ctx, cgraph, i, grads_needed); } free(grads_needed); @@ -6033,8 +6039,8 @@ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) { } } -struct ggml_cgraph * ggml_graph_dup(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { - struct ggml_cgraph * result = ggml_new_graph_custom(ctx, cgraph->size, cgraph->grads != NULL); +struct ggml_cgraph * ggml_graph_dup(struct ggml_context * ctx, struct ggml_cgraph * cgraph, bool force_grads) { + struct ggml_cgraph * result = ggml_new_graph_custom(ctx, cgraph->size, cgraph->grads || force_grads); ggml_graph_cpy(cgraph, result); return result; } @@ -6053,6 +6059,9 @@ struct ggml_tensor * ggml_set_zero(struct 
ggml_tensor * tensor) { } void ggml_graph_reset(struct ggml_cgraph * cgraph) { + if (!cgraph) { + return; + } GGML_ASSERT(cgraph->grads != NULL); for (int i = 0; i < cgraph->n_nodes; i++) { @@ -6362,8 +6371,8 @@ void ggml_set_output(struct ggml_tensor * tensor) { tensor->flags |= GGML_TENSOR_FLAG_OUTPUT; } -void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor) { - GGML_UNUSED(ctx); // TODO: remove this parameter +void ggml_set_param(struct ggml_tensor * tensor) { + GGML_ASSERT(tensor->op == GGML_OP_NONE); tensor->flags |= GGML_TENSOR_FLAG_PARAM; } From 9d6df9080502adcb6f25950e3d829ab05ec8cfc8 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 12 May 2025 15:23:31 -0700 Subject: [PATCH 012/108] Follow up to #10363 (#10647) The quantization PR didn't block all unsupported file types, which this PR fixes. It also updates the API docs to reflect the now reduced set of supported types. --- docs/api.md | 67 ++++++----- fs/ggml/type.go | 125 +++++++++----------- server/quantization.go | 55 +-------- server/quantization_test.go | 223 ------------------------------------ 4 files changed, 88 insertions(+), 382 deletions(-) diff --git a/docs/api.md b/docs/api.md index b2b11573..abd27615 100644 --- a/docs/api.md +++ b/docs/api.md @@ -19,7 +19,7 @@ ### Model names -Model names follow a `model:tag` format, where `model` can have an optional namespace such as `example/model`. Some examples are `orca-mini:3b-q4_1` and `llama3:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version. +Model names follow a `model:tag` format, where `model` can have an optional namespace such as `example/model`. Some examples are `orca-mini:3b-q8_0` and `llama3:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version. ### Durations @@ -952,19 +952,8 @@ If you are creating a model from a safetensors directory or from a GGUF file, yo | Type | Recommended | | --- | :-: | -| q2_K | | -| q3_K_L | | -| q3_K_M | | -| q3_K_S | | -| q4_0 | | -| q4_1 | | | q4_K_M | * | | q4_K_S | | -| q5_0 | | -| q5_1 | | -| q5_K_M | | -| q5_K_S | | -| q6_K | | | q8_0 | * | ### Examples @@ -1009,8 +998,8 @@ Quantize a non-quantized model. 
```shell curl http://localhost:11434/api/create -d '{ - "model": "llama3.1:quantized", - "from": "llama3.1:8b-instruct-fp16", + "model": "llama3.2:quantized", + "from": "llama3.2:3b-instruct-fp16", "quantize": "q4_K_M" }' ``` @@ -1020,12 +1009,14 @@ curl http://localhost:11434/api/create -d '{ A stream of JSON objects is returned: ```json -{"status":"quantizing F16 model to Q4_K_M"} -{"status":"creating new layer sha256:667b0c1932bc6ffc593ed1d03f895bf2dc8dc6df21db3042284a6f4416b06a29"} -{"status":"using existing layer sha256:11ce4ee3e170f6adebac9a991c22e22ab3f8530e154ee669954c4bc73061c258"} -{"status":"using existing layer sha256:0ba8f0e314b4264dfd19df045cde9d4c394a52474bf92ed6a3de22a4ca31a177"} +{"status":"quantizing F16 model to Q4_K_M","digest":"0","total":6433687776,"completed":12302} +{"status":"quantizing F16 model to Q4_K_M","digest":"0","total":6433687776,"completed":6433687552} +{"status":"verifying conversion"} +{"status":"creating new layer sha256:fb7f4f211b89c6c4928ff4ddb73db9f9c0cfca3e000c3e40d6cf27ddc6ca72eb"} +{"status":"using existing layer sha256:966de95ca8a62200913e3f8bfbf84c8494536f1b94b49166851e76644e966396"} +{"status":"using existing layer sha256:fcc5a6bec9daf9b561a68827b67ab6088e1dba9d1fa2a50d7bbcc8384e0a265d"} +{"status":"using existing layer sha256:a70ff7e570d97baaf4e62ac6e6ad9975e04caa6d900d3742d37698494479e0cd"} {"status":"using existing layer sha256:56bb8bd477a519ffa694fc449c2413c6f0e1d3b1c88fa7e3c9d88d3ae49d4dcb"} -{"status":"creating new layer sha256:455f34728c9b5dd3376378bfb809ee166c145b0b4c1f1a6feca069055066ef9a"} {"status":"writing manifest"} {"status":"success"} ``` @@ -1163,29 +1154,37 @@ A single JSON object will be returned. { "models": [ { - "name": "codellama:13b", - "modified_at": "2023-11-04T14:56:49.277302595-07:00", - "size": 7365960935, - "digest": "9f438cb9cd581fc025612d27f7c1a6669ff83a8bb0ed86c94fcf4c5440555697", + "name": "deepseek-r1:latest", + "model": "deepseek-r1:latest", + "modified_at": "2025-05-10T08:06:48.639712648-07:00", + "size": 4683075271, + "digest": "0a8c266910232fd3291e71e5ba1e058cc5af9d411192cf88b6d30e92b6e73163", "details": { + "parent_model": "", "format": "gguf", - "family": "llama", - "families": null, - "parameter_size": "13B", - "quantization_level": "Q4_0" + "family": "qwen2", + "families": [ + "qwen2" + ], + "parameter_size": "7.6B", + "quantization_level": "Q4_K_M" } }, { - "name": "llama3:latest", - "modified_at": "2023-12-07T09:32:18.757212583-08:00", - "size": 3825819519, - "digest": "fe938a131f40e6f6d40083c9f0f430a515233eb2edaa6d72eb85c50d64f2300e", + "name": "llama3.2:latest", + "model": "llama3.2:latest", + "modified_at": "2025-05-04T17:37:44.706015396-07:00", + "size": 2019393189, + "digest": "a80c4f17acd55265feec403c7aef86be0c25983ab279d83f3bcd3abbcb5b8b72", "details": { + "parent_model": "", "format": "gguf", "family": "llama", - "families": null, - "parameter_size": "7B", - "quantization_level": "Q4_0" + "families": [ + "llama" + ], + "parameter_size": "3.2B", + "quantization_level": "Q4_K_M" } } ] diff --git a/fs/ggml/type.go b/fs/ggml/type.go index 8172c46d..4d3d5bca 100644 --- a/fs/ggml/type.go +++ b/fs/ggml/type.go @@ -12,42 +12,42 @@ type FileType uint32 const ( FileTypeF32 FileType = iota FileTypeF16 - FileTypeQ4_0 - FileTypeQ4_1 + fileTypeQ4_0 + fileTypeQ4_1 fileTypeQ4_1_F16 // unused by GGML fileTypeQ4_2 // unused by GGML fileTypeQ4_3 // unused by GGML FileTypeQ8_0 - FileTypeQ5_0 - FileTypeQ5_1 - FileTypeQ2_K - FileTypeQ3_K_S - FileTypeQ3_K_M - FileTypeQ3_K_L + fileTypeQ5_0 + fileTypeQ5_1 + 
fileTypeQ2_K + fileTypeQ3_K_S + fileTypeQ3_K_M + fileTypeQ3_K_L FileTypeQ4_K_S FileTypeQ4_K_M - FileTypeQ5_K_S - FileTypeQ5_K_M - FileTypeQ6_K - fileTypeIQ2_XXS // not supported by ollama - fileTypeIQ2_XS // not supported by ollama - FileTypeQ2_K_S - fileTypeIQ3_XS // not supported by ollama - fileTypeIQ3_XXS // not supported by ollama - fileTypeIQ1_S // not supported by ollama - fileTypeIQ4_NL // not supported by ollama - fileTypeIQ3_S // not supported by ollama - fileTypeIQ3_M // not supported by ollama - fileTypeIQ2_S // not supported by ollama - fileTypeIQ2_M // not supported by ollama - fileTypeIQ4_XS // not supported by ollama - fileTypeIQ1_M // not supported by ollama + fileTypeQ5_K_S + fileTypeQ5_K_M + fileTypeQ6_K + fileTypeIQ2_XXS + fileTypeIQ2_XS + fileTypeQ2_K_S + fileTypeIQ3_XS + fileTypeIQ3_XXS + fileTypeIQ1_S + fileTypeIQ4_NL + fileTypeIQ3_S + fileTypeIQ3_M + fileTypeIQ2_S + fileTypeIQ2_M + fileTypeIQ4_XS + fileTypeIQ1_M FileTypeBF16 fileTypeQ4_0_4_4 // unused by GGML fileTypeQ4_0_4_8 // unused by GGML fileTypeQ4_0_8_8 // unused by GGML - fileTypeTQ1_0 // not supported by ollama - fileTypeTQ2_0 // not supported by ollama + fileTypeTQ1_0 + fileTypeTQ2_0 FileTypeUnknown = 1024 ) @@ -60,36 +60,12 @@ func ParseFileType(s string) (FileType, error) { return FileTypeF32, nil case "F16": return FileTypeF16, nil - case "Q4_0": - return FileTypeQ4_0, nil - case "Q4_1": - return FileTypeQ4_1, nil case "Q8_0": return FileTypeQ8_0, nil - case "Q5_0": - return FileTypeQ5_0, nil - case "Q5_1": - return FileTypeQ5_1, nil - case "Q2_K": - return FileTypeQ2_K, nil - case "Q3_K_S": - return FileTypeQ3_K_S, nil - case "Q3_K_M": - return FileTypeQ3_K_M, nil - case "Q3_K_L": - return FileTypeQ3_K_L, nil case "Q4_K_S": return FileTypeQ4_K_S, nil case "Q4_K_M", "Q4_K": return FileTypeQ4_K_M, nil - case "Q5_K_S": - return FileTypeQ5_K_S, nil - case "Q5_K_M", "Q5_K": - return FileTypeQ5_K_M, nil - case "Q6_K": - return FileTypeQ6_K, nil - case "Q2_K_S": - return FileTypeQ2_K_S, nil case "BF16": return FileTypeBF16, nil default: @@ -111,40 +87,41 @@ func ParseFileType(s string) (FileType, error) { } func (t FileType) String() string { + // Note: this routine will return a broader set of file types for existing models switch t { case FileTypeF32: return "F32" case FileTypeF16: return "F16" - case FileTypeQ4_0: + case fileTypeQ4_0: return "Q4_0" - case FileTypeQ4_1: + case fileTypeQ4_1: return "Q4_1" case FileTypeQ8_0: return "Q8_0" - case FileTypeQ5_0: + case fileTypeQ5_0: return "Q5_0" - case FileTypeQ5_1: + case fileTypeQ5_1: return "Q5_1" - case FileTypeQ2_K: + case fileTypeQ2_K: return "Q2_K" - case FileTypeQ3_K_S: + case fileTypeQ3_K_S: return "Q3_K_S" - case FileTypeQ3_K_M: + case fileTypeQ3_K_M: return "Q3_K_M" - case FileTypeQ3_K_L: + case fileTypeQ3_K_L: return "Q3_K_L" case FileTypeQ4_K_S: return "Q4_K_S" case FileTypeQ4_K_M: return "Q4_K_M" - case FileTypeQ5_K_S: + case fileTypeQ5_K_S: return "Q5_K_S" - case FileTypeQ5_K_M: + case fileTypeQ5_K_M: return "Q5_K_M" - case FileTypeQ6_K: + case fileTypeQ6_K: return "Q6_K" - case FileTypeQ2_K_S: + case fileTypeQ2_K_S: return "Q2_K_S" case FileTypeBF16: return "BF16" @@ -163,35 +140,35 @@ func (ftype FileType) ToTensorType() TensorType { return TensorTypeF32 case FileTypeF16: return TensorTypeF16 - case FileTypeQ4_0: + case fileTypeQ4_0: return TensorTypeQ4_0 - case FileTypeQ4_1: + case fileTypeQ4_1: return TensorTypeQ4_1 case FileTypeQ8_0: return TensorTypeQ8_0 - case FileTypeQ5_0: + case fileTypeQ5_0: return TensorTypeQ5_0 - case FileTypeQ5_1: + 
case fileTypeQ5_1: return TensorTypeQ5_1 - case FileTypeQ2_K: + case fileTypeQ2_K: return TensorTypeQ2_K - case FileTypeQ3_K_S: + case fileTypeQ3_K_S: return TensorTypeQ3_K - case FileTypeQ3_K_M: + case fileTypeQ3_K_M: return TensorTypeQ3_K - case FileTypeQ3_K_L: + case fileTypeQ3_K_L: return TensorTypeQ3_K case FileTypeQ4_K_S: return TensorTypeQ4_K case FileTypeQ4_K_M: return TensorTypeQ4_K - case FileTypeQ5_K_S: + case fileTypeQ5_K_S: return TensorTypeQ5_K - case FileTypeQ5_K_M: + case fileTypeQ5_K_M: return TensorTypeQ5_K - case FileTypeQ6_K: + case fileTypeQ6_K: return TensorTypeQ6_K - case FileTypeQ2_K_S: + case fileTypeQ2_K_S: return TensorTypeQ2_K case FileTypeBF16: return TensorTypeBF16 diff --git a/server/quantization.go b/server/quantization.go index 80bc093d..adfc948e 100644 --- a/server/quantization.go +++ b/server/quantization.go @@ -70,23 +70,7 @@ func getTensorNewType(kv fsggml.KV, qs *quantizeState, newType fsggml.TensorType newType = fsggml.TensorTypeQ6_K } } else if strings.Contains(name, "attn_v.weight") { - if ftype == fsggml.FileTypeQ2_K { - if kv.GQA() >= 4 { - newType = fsggml.TensorTypeQ4_K - } else { - newType = fsggml.TensorTypeQ3_K - } - } else if ftype == fsggml.FileTypeQ2_K_S && kv.GQA() >= 4 { - newType = fsggml.TensorTypeQ4_K - } else if ftype == fsggml.FileTypeQ3_K_M { - if qs.iAttnV < 2 { - newType = fsggml.TensorTypeQ5_K - } else { - newType = fsggml.TensorTypeQ4_K - } - } else if ftype == fsggml.FileTypeQ3_K_L { - newType = fsggml.TensorTypeQ5_K - } else if (ftype == fsggml.FileTypeQ4_K_M || ftype == fsggml.FileTypeQ5_K_M) && + if (ftype == fsggml.FileTypeQ4_K_M) && useMoreBits(qs.iAttnV, qs.nAttnV) { newType = fsggml.TensorTypeQ6_K } else if ftype == fsggml.FileTypeQ4_K_S && qs.iAttnV < 4 { @@ -114,54 +98,23 @@ func getTensorNewType(kv fsggml.KV, qs *quantizeState, newType fsggml.TensorType } else if strings.Contains(name, "ffn_down") { iLayer := qs.iFfnDown n_layer := qs.nFfnDown - if ftype == fsggml.FileTypeQ2_K { - newType = fsggml.TensorTypeQ3_K - } else if ftype == fsggml.FileTypeQ2_K_S { - if iLayer < n_layer/8 { - newType = fsggml.TensorTypeQ4_K - } - } else if ftype == fsggml.FileTypeQ3_K_M { - if iLayer < n_layer/16 { - newType = fsggml.TensorTypeQ5_K - } else if useMoreBits(iLayer, n_layer) { - newType = fsggml.TensorTypeQ4_K - } else { - newType = fsggml.TensorTypeQ3_K - } - } else if ftype == fsggml.FileTypeQ3_K_L { - newType = fsggml.TensorTypeQ5_K - } else if ftype == fsggml.FileTypeQ4_K_M { + if ftype == fsggml.FileTypeQ4_K_M { if useMoreBits(iLayer, n_layer) { newType = fsggml.TensorTypeQ6_K } - } else if ftype == fsggml.FileTypeQ5_K_M && useMoreBits(iLayer, n_layer) { - newType = fsggml.TensorTypeQ6_K } else if ftype == fsggml.FileTypeQ4_K_S && iLayer < n_layer/8 { newType = fsggml.TensorTypeQ5_K } qs.iFfnDown++ } else if strings.Contains(name, "attn_output.weight") { if nExperts == 8 { - if ftype == fsggml.FileTypeQ2_K || ftype == fsggml.FileTypeQ3_K_S || ftype == fsggml.FileTypeQ3_K_M || - ftype == fsggml.FileTypeQ4_K_S || ftype == fsggml.FileTypeQ4_K_M { - newType = fsggml.TensorTypeQ5_K - } - } else { - if ftype == fsggml.FileTypeQ2_K { - newType = fsggml.TensorTypeQ3_K - } else if ftype == fsggml.FileTypeQ3_K_M { - newType = fsggml.TensorTypeQ4_K - } else if ftype == fsggml.FileTypeQ3_K_L { + if ftype == fsggml.FileTypeQ4_K_S || ftype == fsggml.FileTypeQ4_K_M { newType = fsggml.TensorTypeQ5_K } } } else if strings.Contains(name, "attn_qkv.weight") { - if ftype == fsggml.FileTypeQ3_K_M || ftype == fsggml.FileTypeQ3_K_L { - newType = 
fsggml.TensorTypeQ4_K - } else if ftype == fsggml.FileTypeQ4_K_M { + if ftype == fsggml.FileTypeQ4_K_M { newType = fsggml.TensorTypeQ5_K - } else if ftype == fsggml.FileTypeQ5_K_M { - newType = fsggml.TensorTypeQ6_K } } diff --git a/server/quantization_test.go b/server/quantization_test.go index b7e13350..495297df 100644 --- a/server/quantization_test.go +++ b/server/quantization_test.go @@ -42,71 +42,6 @@ func TestGetTensorNewType(t *testing.T) { ftype: fsggml.FileTypeF32, expected: fsggml.TensorTypeQ6_K, }, - { - name: "attn_v.weight_q4_k", - kv: map[string]any{ - "general.architecture": "foo", - "foo.attention.head_count": uint32(4), - "foo.attention.head_count_kv": uint32(1), - }, - newType: fsggml.TensorTypeQ4_0, - tensor_name: "blk.0.attn_v.weight", - shape: []uint64{256}, - ftype: fsggml.FileTypeQ2_K, - expected: fsggml.TensorTypeQ4_K, - }, - { - name: "attn_v.weight_q3_k", - kv: map[string]any{}, - newType: fsggml.TensorTypeQ4_0, - tensor_name: "blk.0.attn_v.weight", - shape: []uint64{256}, - ftype: fsggml.FileTypeQ2_K, - expected: fsggml.TensorTypeQ3_K, - }, - { - name: "attn_v.weight_q2_k_s_q4_k", - kv: map[string]any{ - "general.architecture": "foo", - "foo.attention.head_count": uint32(4), - "foo.attention.head_count_kv": uint32(1), - }, - newType: fsggml.TensorTypeQ4_0, - tensor_name: "blk.0.attn_v.weight", - shape: []uint64{256}, - ftype: fsggml.FileTypeQ2_K_S, - expected: fsggml.TensorTypeQ4_K, - }, - { - name: "attn_v.weight_q3_k_m", - kv: map[string]any{}, - newType: fsggml.TensorTypeQ4_0, - tensor_name: "blk.0.attn_v.weight", - shape: []uint64{256}, - ftype: fsggml.FileTypeQ3_K_M, - expected: fsggml.TensorTypeQ5_K, - }, - { - name: "attn_v.weight_q3_k_m_i", - qs: quantizeState{ - iAttnV: 2, - }, - kv: map[string]any{}, - newType: fsggml.TensorTypeQ4_0, - tensor_name: "blk.0.attn_v.weight", - shape: []uint64{256}, - ftype: fsggml.FileTypeQ3_K_M, - expected: fsggml.TensorTypeQ4_K, - }, - { - name: "attn_v.weight_q3_k_l", - kv: map[string]any{}, - newType: fsggml.TensorTypeQ4_0, - tensor_name: "blk.0.attn_v.weight", - shape: []uint64{256}, - ftype: fsggml.FileTypeQ3_K_L, - expected: fsggml.TensorTypeQ5_K, - }, { name: "attn_v.weight_q4_k_m", qs: quantizeState{ @@ -156,88 +91,6 @@ func TestGetTensorNewType(t *testing.T) { ftype: fsggml.FileTypeF32, expected: fsggml.TensorTypeQ8_0, }, - { - name: "ffn_down_q2_k", - qs: quantizeState{}, - kv: map[string]any{}, - newType: fsggml.TensorTypeQ4_0, - tensor_name: "ffn_down", - shape: []uint64{256}, - ftype: fsggml.FileTypeQ2_K, - expected: fsggml.TensorTypeQ3_K, - }, - { - name: "ffn_down_q2_k_s", - qs: quantizeState{}, - kv: map[string]any{}, - newType: fsggml.TensorTypeQ4_0, - tensor_name: "ffn_down", - shape: []uint64{256}, - ftype: fsggml.FileTypeQ2_K_S, - expected: fsggml.TensorTypeQ4_0, - }, - { - name: "ffn_down_q2_k_s_layers", - qs: quantizeState{ - iFfnDown: 2, - nFfnDown: 3 * 8, - }, - kv: map[string]any{}, - newType: fsggml.TensorTypeQ4_0, - tensor_name: "ffn_down", - shape: []uint64{256}, - ftype: fsggml.FileTypeQ2_K_S, - expected: fsggml.TensorTypeQ4_K, - }, - { - name: "ffn_down_q3_k_m_base", - qs: quantizeState{ - iFfnDown: 1, - nFfnDown: 8, - }, - kv: map[string]any{}, - newType: fsggml.TensorTypeQ4_0, - tensor_name: "ffn_down", - shape: []uint64{256}, - ftype: fsggml.FileTypeQ3_K_M, - expected: fsggml.TensorTypeQ3_K, - }, - { - name: "ffn_down_q3_k_m_16", - qs: quantizeState{ - iFfnDown: 2, - nFfnDown: 3 * 16, - }, - kv: map[string]any{}, - newType: fsggml.TensorTypeQ4_0, - tensor_name: "ffn_down", - shape: 
[]uint64{256}, - ftype: fsggml.FileTypeQ3_K_M, - expected: fsggml.TensorTypeQ5_K, - }, - { - name: "ffn_down_q3_k_m_8", - qs: quantizeState{ - iFfnDown: 2, - nFfnDown: 3 * 8, - }, - kv: map[string]any{}, - newType: fsggml.TensorTypeQ4_0, - tensor_name: "ffn_down", - shape: []uint64{256}, - ftype: fsggml.FileTypeQ3_K_M, - expected: fsggml.TensorTypeQ4_K, - }, - { - name: "ffn_down_q3_k_l", - qs: quantizeState{}, - kv: map[string]any{}, - newType: fsggml.TensorTypeQ4_0, - tensor_name: "ffn_down", - shape: []uint64{256}, - ftype: fsggml.FileTypeQ3_K_L, - expected: fsggml.TensorTypeQ5_K, - }, { name: "ffn_down_q4_k_m", qs: quantizeState{ @@ -264,19 +117,6 @@ func TestGetTensorNewType(t *testing.T) { ftype: fsggml.FileTypeQ4_K_M, expected: fsggml.TensorTypeQ6_K, }, - { - name: "ffn_down_q5_k_m", - qs: quantizeState{ - iFfnDown: 2, - nFfnDown: 3 * 8, - }, - kv: map[string]any{}, - newType: fsggml.TensorTypeQ4_0, - tensor_name: "ffn_down", - shape: []uint64{256}, - ftype: fsggml.FileTypeQ5_K_M, - expected: fsggml.TensorTypeQ6_K, - }, { name: "ffn_down_q4_k_s", qs: quantizeState{ @@ -290,59 +130,6 @@ func TestGetTensorNewType(t *testing.T) { ftype: fsggml.FileTypeQ4_K_S, expected: fsggml.TensorTypeQ5_K, }, - { - name: "attn_output.weight_8_expert", - qs: quantizeState{}, - kv: map[string]any{ - "general.architecture": "foo", - "foo.expert_count": uint32(8), - }, - newType: fsggml.TensorTypeQ4_0, - tensor_name: "blk.0.attn_output.weight", - shape: []uint64{256}, - ftype: fsggml.FileTypeQ2_K, - expected: fsggml.TensorTypeQ5_K, - }, - { - name: "attn_output.weight_q2", - qs: quantizeState{}, - kv: map[string]any{}, - newType: fsggml.TensorTypeQ4_0, - tensor_name: "blk.0.attn_output.weight", - shape: []uint64{256}, - ftype: fsggml.FileTypeQ2_K, - expected: fsggml.TensorTypeQ3_K, - }, - { - name: "attn_output.weight_q3_k_m", - qs: quantizeState{}, - kv: map[string]any{}, - newType: fsggml.TensorTypeQ4_0, - tensor_name: "blk.0.attn_output.weight", - shape: []uint64{256}, - ftype: fsggml.FileTypeQ3_K_M, - expected: fsggml.TensorTypeQ4_K, - }, - { - name: "attn_output.weight_q3_k_l", - qs: quantizeState{}, - kv: map[string]any{}, - newType: fsggml.TensorTypeQ4_0, - tensor_name: "blk.0.attn_output.weight", - shape: []uint64{256}, - ftype: fsggml.FileTypeQ3_K_L, - expected: fsggml.TensorTypeQ5_K, - }, - { - name: "attn_qkv.weight_q3_k_m", - qs: quantizeState{}, - kv: map[string]any{}, - newType: fsggml.TensorTypeQ4_0, - tensor_name: "blk.0.attn_qkv.weight", - shape: []uint64{256}, - ftype: fsggml.FileTypeQ3_K_M, - expected: fsggml.TensorTypeQ4_K, - }, { name: "attn_qkv.weight_q4_k_m", qs: quantizeState{}, @@ -353,16 +140,6 @@ func TestGetTensorNewType(t *testing.T) { ftype: fsggml.FileTypeQ4_K_M, expected: fsggml.TensorTypeQ5_K, }, - { - name: "attn_qkv.weight_q5_k_m", - qs: quantizeState{}, - kv: map[string]any{}, - newType: fsggml.TensorTypeQ4_0, - tensor_name: "blk.0.attn_qkv.weight", - shape: []uint64{256}, - ftype: fsggml.FileTypeQ5_K_M, - expected: fsggml.TensorTypeQ6_K, - }, } for _, tt := range cases { t.Run(tt.name, func(t *testing.T) { From a7240c6d636836f0bca01790038d7194f519604b Mon Sep 17 00:00:00 2001 From: Bruce MacDonald Date: Mon, 12 May 2025 16:08:42 -0700 Subject: [PATCH 013/108] models: remove unused qwen2vl processing (#10677) --- model/models/qwen2vl/imageproc.go | 74 ------------------------ model/models/qwen2vl/imageproc_test.go | 78 -------------------------- 2 files changed, 152 deletions(-) delete mode 100644 model/models/qwen2vl/imageproc.go delete mode 100644 
model/models/qwen2vl/imageproc_test.go diff --git a/model/models/qwen2vl/imageproc.go b/model/models/qwen2vl/imageproc.go deleted file mode 100644 index 964b3907..00000000 --- a/model/models/qwen2vl/imageproc.go +++ /dev/null @@ -1,74 +0,0 @@ -package qwen2vl - -import ( - "fmt" - "image" - _ "image/jpeg" - _ "image/png" - "io" - "math" - - "github.com/ollama/ollama/model/imageproc" -) - -const ( - DefaultFactor = 28 - DefaultMinPixels = 56 * 56 - DefaultMaxPixels = 14 * 14 * 4 * 1280 -) - -// smartResize calculates the size of the image to resize to based on the -// factor, minPixels, and maxPixels. -func smartResize(size image.Point, factor, minPixels, maxPixels int) image.Point { - // 1. Both dimensions of size are divisible by factor - // 2. The area of the image is between minPixels and maxPixels - // 3. The aspect ratio of the image is as close to 1:1 as possible - - if size.Y < factor || size.X < factor { - panic("image is too small to resize") - } else if max(size.X, size.Y)/min(size.X, size.Y) > 200 { - panic("aspect ratio must be less than 200:1") - } - - f := float64(factor) - width := float64(size.X) - height := float64(size.Y) - - xBar := math.Round(width/f) * f - yBar := math.Round(height/f) * f - - if xBar*yBar > float64(maxPixels) { - beta := math.Sqrt(height * width / float64(maxPixels)) - xBar = math.Floor(width/beta/f) * f - yBar = math.Floor(height/beta/f) * f - } else if xBar*yBar < float64(minPixels) { - beta := math.Sqrt(float64(minPixels) / (height * width)) - xBar = math.Ceil(width*beta/f) * f - yBar = math.Ceil(height*beta/f) * f - } - - return image.Point{int(xBar), int(yBar)} -} - -func resizeImage(img image.Image, format string, size image.Point) image.Image { - if format == "png" { - img = imageproc.Composite(img) - } - - return imageproc.Resize(img, size, imageproc.ResizeBilinear) -} - -func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) { - img, format, err := image.Decode(imageData) - if err != nil { - return nil, nil, fmt.Errorf("failed to decode image: %w", err) - } - - size := smartResize(img.Bounds().Max, DefaultFactor, DefaultMinPixels, DefaultMaxPixels) - img = resizeImage(img, format, size) - - data := imageproc.Normalize(img, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, true, true) - - opts := map[string]any{} - return data, opts, nil -} diff --git a/model/models/qwen2vl/imageproc_test.go b/model/models/qwen2vl/imageproc_test.go deleted file mode 100644 index 817b61a5..00000000 --- a/model/models/qwen2vl/imageproc_test.go +++ /dev/null @@ -1,78 +0,0 @@ -package qwen2vl - -import ( - "bytes" - "image" - "image/png" - "testing" -) - -func TestSmartResize(t *testing.T) { - type smartResizeCase struct { - TestImage image.Image - Expected image.Point - } - - cases := []smartResizeCase{ - { - TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 1024)), - Expected: image.Point{980, 980}, - }, - { - TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)), - Expected: image.Point{1036, 756}, - }, - { - TestImage: image.NewRGBA(image.Rect(0, 0, 2000, 2000)), - Expected: image.Point{980, 980}, - }, - } - - for _, c := range cases { - b := c.TestImage.Bounds().Max - actual := smartResize(b, DefaultFactor, DefaultMinPixels, DefaultMaxPixels) - if actual != c.Expected { - t.Errorf("expected: %v, actual: %v", c.Expected, actual) - } - } -} - -func TestPreprocess(t *testing.T) { - type preprocessCase struct { - TestImage image.Image - ExpectedLen int - } - - cases := []preprocessCase{ - { - TestImage: image.NewRGBA(image.Rect(0, 0, 256, 256)), 
- ExpectedLen: 252 * 252 * 3 * 1, - }, - { - TestImage: image.NewRGBA(image.Rect(0, 0, 2000, 2000)), - ExpectedLen: 980 * 980 * 3 * 1, - }, - } - - for _, c := range cases { - var buf bytes.Buffer - err := png.Encode(&buf, c.TestImage) - if err != nil { - t.Fatal(err) - } - - imgData, _, err := Preprocess(&buf) - if err != nil { - t.Fatalf("error processing: %q", err) - } - - switch len(imgData) { - case 0: - t.Errorf("no image data returned") - case c.ExpectedLen: - // ok - default: - t.Errorf("unexpected image data length: %d, expected: %d", len(imgData), c.ExpectedLen) - } - } -} From 526b2ed10296cc3d1ae89121eedcbbbe257741a3 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Mon, 12 May 2025 17:29:46 -0700 Subject: [PATCH 014/108] fix vocabulary (#10679) --- model/models/gemma2/model.go | 2 ++ model/models/gemma3/model_text.go | 13 ------------- model/models/llama/model.go | 3 +++ model/models/llama4/model.go | 3 +++ model/models/mistral3/model.go | 17 +++++++++++++++++ model/models/mistral3/model_text.go | 13 ------------- model/models/mllama/model.go | 3 +++ 7 files changed, 28 insertions(+), 26 deletions(-) diff --git a/model/models/gemma2/model.go b/model/models/gemma2/model.go index d418f682..3156b006 100644 --- a/model/models/gemma2/model.go +++ b/model/models/gemma2/model.go @@ -45,6 +45,8 @@ func New(c fs.Config) (model.Model, error) { Types: c.Ints("tokenizer.ggml.token_type"), BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")), EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")), + // TODO: set EOT to EOS otherwise 0 will stop generation + EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")), }, ), Layers: make([]Layer, c.Uint("block_count")), diff --git a/model/models/gemma3/model_text.go b/model/models/gemma3/model_text.go index c1e843d8..741818a2 100644 --- a/model/models/gemma3/model_text.go +++ b/model/models/gemma3/model_text.go @@ -7,7 +7,6 @@ import ( "github.com/ollama/ollama/kvcache" "github.com/ollama/ollama/ml" "github.com/ollama/ollama/ml/nn" - "github.com/ollama/ollama/model" "github.com/ollama/ollama/model/input" ) @@ -20,9 +19,6 @@ type TextConfig struct { } type TextModel struct { - model.Base - model.SentencePieceModel - TokenEmbedding *nn.Embedding `gguf:"token_embd"` Layers []TextLayer `gguf:"blk"` OutputNorm *nn.RMSNorm `gguf:"output_norm"` @@ -45,15 +41,6 @@ func newTextModel(c fs.Config) *TextModel { numBlocks := int(c.Uint("block_count")) m := TextModel{ - SentencePieceModel: model.NewSentencePieceModel( - &model.Vocabulary{ - Values: c.Strings("tokenizer.ggml.tokens"), - Scores: c.Floats("tokenizer.ggml.scores"), - Types: c.Ints("tokenizer.ggml.token_type"), - BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")), - EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")), - }, - ), Layers: make([]TextLayer, numBlocks), TextConfig: &TextConfig{ hiddenSize: int(c.Uint("embedding_length")), diff --git a/model/models/llama/model.go b/model/models/llama/model.go index 3e5a5427..c75d7eb2 100644 --- a/model/models/llama/model.go +++ b/model/models/llama/model.go @@ -47,6 +47,9 @@ func New(c fs.Config) (model.Model, error) { AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true), EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")), AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false), + // TODO: set EOT to EOS otherwise 0 will stop generation + EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")), + AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false), }, ), Layers: make([]Layer, c.Uint("block_count")), diff --git a/model/models/llama4/model.go b/model/models/llama4/model.go index 
632d313e..798f0d16 100644 --- a/model/models/llama4/model.go +++ b/model/models/llama4/model.go @@ -45,6 +45,9 @@ func New(c fs.Config) (model.Model, error) { AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true), EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")), AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false), + // TODO: set EOT to EOS otherwise 0 will stop generation + EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")), + AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false), }, ), ImageProcessor: newImageProcessor(c), diff --git a/model/models/mistral3/model.go b/model/models/mistral3/model.go index f749fdcd..c9685244 100644 --- a/model/models/mistral3/model.go +++ b/model/models/mistral3/model.go @@ -16,6 +16,8 @@ import ( type Model struct { model.Base + model.BytePairEncoding + *TextModel *VisionModel `gguf:"v,vision"` *MultiModalProjector `gguf:"mm"` @@ -40,6 +42,21 @@ func New(c fs.Config) (model.Model, error) { VisionModel: newVisionModel(c), ImageProcessor: newImageProcessor(c), MultiModalProjector: newMultiModalProjector(c), + BytePairEncoding: model.NewBytePairEncoding( + c.String("tokenizer.ggml.pretokenizer", `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`), + &model.Vocabulary{ + Values: c.Strings("tokenizer.ggml.tokens"), + Types: c.Ints("tokenizer.ggml.token_type"), + Merges: c.Strings("tokenizer.ggml.merges"), + BOS: int32(c.Uint("tokenizer.ggml.bos_token_id", 1)), + AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true), + EOS: int32(c.Uint("tokenizer.ggml.eos_token_id", 2)), + AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false), + // TODO: set EOT to EOS otherwise 0 will stop generation + EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")), + AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false), + }, + ), } m.Cache = kvcache.NewCausalCache(m.TextModel.Shift) diff --git a/model/models/mistral3/model_text.go b/model/models/mistral3/model_text.go index 1bf72acd..565b001a 100644 --- a/model/models/mistral3/model_text.go +++ b/model/models/mistral3/model_text.go @@ -21,7 +21,6 @@ type TextOptions struct { type TextModel struct { model.Base - model.BytePairEncoding TokenEmbedding *nn.Embedding `gguf:"token_embd"` Layers []Layer `gguf:"blk"` @@ -148,18 +147,6 @@ func NewTextModel(c fs.Config) (*TextModel, error) { } textModel := &TextModel{ - BytePairEncoding: model.NewBytePairEncoding( - c.String("tokenizer.ggml.pretokenizer", `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`), - &model.Vocabulary{ - Values: c.Strings("tokenizer.ggml.tokens"), - Types: c.Ints("tokenizer.ggml.token_type"), - Merges: c.Strings("tokenizer.ggml.merges"), - BOS: int32(c.Uint("tokenizer.ggml.bos_token_id", 1)), - AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true), - EOS: int32(c.Uint("tokenizer.ggml.eos_token_id", 2)), - AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false), - }, - ), Layers: make([]Layer, c.Uint("block_count")), TextOptions: &TextOptions{ hiddenSize: int(c.Uint("embedding_length")), diff --git a/model/models/mllama/model.go b/model/models/mllama/model.go index 149876c9..3fa26ded 100644 --- a/model/models/mllama/model.go +++ b/model/models/mllama/model.go @@ -49,6 +49,9 @@ func New(c fs.Config) (model.Model, error) { AddBOS: 
c.Bool("tokenizer.ggml.add_bos_token", true), EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")), AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false), + // TODO: set EOT to EOS otherwise 0 will stop generation + EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")), + AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false), }, ), ImageProcessor: newImageProcessor(c), From c7f4ae7b9c8976b4d50c59eb87e9582ea9c5c82f Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Mon, 12 May 2025 20:41:42 -0700 Subject: [PATCH 015/108] server: add webp image input support (#10653) --- cmd/interactive.go | 6 +++--- cmd/interactive_test.go | 19 +++++++++++++++---- server/routes.go | 6 ++++++ 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/cmd/interactive.go b/cmd/interactive.go index 70222711..d7e6fbcf 100644 --- a/cmd/interactive.go +++ b/cmd/interactive.go @@ -44,7 +44,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error { fmt.Fprintln(os.Stderr, "Use \"\"\" to begin a multi-line message.") if opts.MultiModal { - fmt.Fprintf(os.Stderr, "Use %s to include .jpg or .png images.\n", filepath.FromSlash("/path/to/file")) + fmt.Fprintf(os.Stderr, "Use %s to include .jpg, .png, or .webp images.\n", filepath.FromSlash("/path/to/file")) } fmt.Fprintln(os.Stderr, "") @@ -511,7 +511,7 @@ func extractFileNames(input string) []string { // Regex to match file paths starting with optional drive letter, / ./ \ or .\ and include escaped or unescaped spaces (\ or %20) // and followed by more characters and a file extension // This will capture non filename strings, but we'll check for file existence to remove mismatches - regexPattern := `(?:[a-zA-Z]:)?(?:\./|/|\\)[\S\\ ]+?\.(?i:jpg|jpeg|png)\b` + regexPattern := `(?:[a-zA-Z]:)?(?:\./|/|\\)[\S\\ ]+?\.(?i:jpg|jpeg|png|webp)\b` re := regexp.MustCompile(regexPattern) return re.FindAllString(input, -1) @@ -553,7 +553,7 @@ func getImageData(filePath string) ([]byte, error) { } contentType := http.DetectContentType(buf) - allowedTypes := []string{"image/jpeg", "image/jpg", "image/png"} + allowedTypes := []string{"image/jpeg", "image/jpg", "image/png", "image/webp"} if !slices.Contains(allowedTypes, contentType) { return nil, fmt.Errorf("invalid image type: %s", contentType) } diff --git a/cmd/interactive_test.go b/cmd/interactive_test.go index ba833191..809f53ff 100644 --- a/cmd/interactive_test.go +++ b/cmd/interactive_test.go @@ -12,14 +12,17 @@ func TestExtractFilenames(t *testing.T) { // Unix style paths input := ` some preamble ./relative\ path/one.png inbetween1 ./not a valid two.jpg inbetween2 ./1.svg -/unescaped space /three.jpeg inbetween3 /valid\ path/dir/four.png "./quoted with spaces/five.JPG` +/unescaped space /three.jpeg inbetween3 /valid\ path/dir/four.png "./quoted with spaces/five.JPG +/unescaped space /six.webp inbetween6 /valid\ path/dir/seven.WEBP` res := extractFileNames(input) - assert.Len(t, res, 5) + assert.Len(t, res, 7) assert.Contains(t, res[0], "one.png") assert.Contains(t, res[1], "two.jpg") assert.Contains(t, res[2], "three.jpeg") assert.Contains(t, res[3], "four.png") assert.Contains(t, res[4], "five.JPG") + assert.Contains(t, res[5], "six.webp") + assert.Contains(t, res[6], "seven.WEBP") assert.NotContains(t, res[4], '"') assert.NotContains(t, res, "inbetween1") assert.NotContains(t, res, "./1.svg") @@ -30,10 +33,12 @@ func TestExtractFilenames(t *testing.T) { /absolute/nospace/three.jpeg inbetween3 /absolute/with space/four.png inbetween4 ./relative\ path/five.JPG inbetween5 "./relative with/spaces/six.png inbetween6 
d:\path with\spaces\seven.JPEG inbetween7 c:\users\jdoe\eight.png inbetween8 - d:\program files\someplace\nine.png inbetween9 "E:\program files\someplace\ten.PNG some ending + d:\program files\someplace\nine.png inbetween9 "E:\program files\someplace\ten.PNG +c:/users/jdoe/eleven.webp inbetween11 c:/program files/someplace/twelve.WebP inbetween12 +d:\path with\spaces\thirteen.WEBP some ending ` res = extractFileNames(input) - assert.Len(t, res, 10) + assert.Len(t, res, 13) assert.NotContains(t, res, "inbetween2") assert.Contains(t, res[0], "one.png") assert.Contains(t, res[0], "c:") @@ -51,6 +56,12 @@ d:\path with\spaces\seven.JPEG inbetween7 c:\users\jdoe\eight.png inbetween8 assert.Contains(t, res[8], "d:") assert.Contains(t, res[9], "ten.PNG") assert.Contains(t, res[9], "E:") + assert.Contains(t, res[10], "eleven.webp") + assert.Contains(t, res[10], "c:") + assert.Contains(t, res[11], "twelve.WebP") + assert.Contains(t, res[11], "c:") + assert.Contains(t, res[12], "thirteen.WEBP") + assert.Contains(t, res[12], "d:") } // Ensure that file paths wrapped in single quotes are removed with the quotes. diff --git a/server/routes.go b/server/routes.go index f1706f74..fd65669a 100644 --- a/server/routes.go +++ b/server/routes.go @@ -8,6 +8,7 @@ import ( "encoding/json" "errors" "fmt" + "image" "io" "io/fs" "log/slog" @@ -25,6 +26,7 @@ import ( "github.com/gin-contrib/cors" "github.com/gin-gonic/gin" + "golang.org/x/image/webp" "golang.org/x/sync/errgroup" "github.com/ollama/ollama/api" @@ -1304,6 +1306,10 @@ func Serve(ln net.Listener) error { s.sched.Run(schedCtx) + // register the experimental webp decoder + // so webp images can be used in multimodal inputs + image.RegisterFormat("webp", "RIFF????WEBP", webp.Decode, webp.DecodeConfig) + // At startup we retrieve GPU information so we can get log messages before loading a model // This will log warnings to the log in case we have problems with detected GPUs gpus := discover.GetGPUInfo() From 4b903f088aa8d14404e5650d42db8c15530803d5 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Tue, 13 May 2025 13:11:11 -0700 Subject: [PATCH 016/108] llama: fix crash on snowflake embedding model (#10690) --- llama/llama.cpp/src/llama-vocab.cpp | 2 -- llama/patches/0014-fix-string-arr-kv-loading.patch | 14 ++++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/llama/llama.cpp/src/llama-vocab.cpp b/llama/llama.cpp/src/llama-vocab.cpp index b098bb25..9f5fd57b 100644 --- a/llama/llama.cpp/src/llama-vocab.cpp +++ b/llama/llama.cpp/src/llama-vocab.cpp @@ -1469,8 +1469,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str()); if (precompiled_charsmap_keyidx != -1) { const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx); - GGML_ASSERT(pc_type == GGUF_TYPE_INT8 || pc_type == GGUF_TYPE_UINT8); - const size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx); const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx); precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap); diff --git a/llama/patches/0014-fix-string-arr-kv-loading.patch b/llama/patches/0014-fix-string-arr-kv-loading.patch index 07cb397b..f879c50e 100644 --- a/llama/patches/0014-fix-string-arr-kv-loading.patch +++ b/llama/patches/0014-fix-string-arr-kv-loading.patch @@ -9,8 +9,8 @@ such as vocab fields --- ggml/include/gguf.h | 1 + ggml/src/gguf.cpp | 7 +++++-- - 
src/llama-vocab.cpp | 2 +- - 3 files changed, 7 insertions(+), 3 deletions(-) + src/llama-vocab.cpp | 4 +--- + 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/ggml/include/gguf.h b/ggml/include/gguf.h index 79ee2020..3efb22f0 100644 @@ -53,13 +53,15 @@ index 381a9c7d..e45b453d 100644 } diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp -index 10f34d33..b098bb25 100644 +index 10f34d33..9f5fd57b 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp -@@ -1471,7 +1471,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { +@@ -1469,9 +1469,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { + const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str()); + if (precompiled_charsmap_keyidx != -1) { const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx); - GGML_ASSERT(pc_type == GGUF_TYPE_INT8 || pc_type == GGUF_TYPE_UINT8); - +- GGML_ASSERT(pc_type == GGUF_TYPE_INT8 || pc_type == GGUF_TYPE_UINT8); +- - const size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx); + const size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx); const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx); From c6bcdc4223c50071b59a19c42cc54ec9932f696f Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 13 May 2025 13:12:54 -0700 Subject: [PATCH 017/108] Revert "remove cuda v11 (#10569)" (#10692) Bring back v11 until we can better warn users that their driver is too old. This reverts commit fa393554b927f154145488c852297a2330cb5f13. --- .github/workflows/release.yaml | 6 ++++++ .github/workflows/test.yaml | 6 +++--- CMakePresets.json | 13 +++++++++++++ Dockerfile | 17 ++++++++++++++++- discover/cuda_common.go | 3 --- discover/path.go | 2 +- docs/gpu.md | 2 +- docs/troubleshooting.md | 2 +- llm/server.go | 2 +- scripts/build_windows.ps1 | 14 ++++++++++++++ scripts/env.sh | 2 ++ 11 files changed, 58 insertions(+), 11 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index b2e12246..f423106e 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -103,6 +103,11 @@ jobs: arch: [amd64] preset: ['CPU'] include: + - os: windows + arch: amd64 + preset: 'CUDA 11' + install: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe + cuda-version: '11.3' - os: windows arch: amd64 preset: 'CUDA 12' @@ -319,6 +324,7 @@ jobs: case "$COMPONENT" in bin/ollama) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;; lib/ollama/*.so) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;; + lib/ollama/cuda_v11) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;; lib/ollama/cuda_v12) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;; lib/ollama/cuda_jetpack5) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;; lib/ollama/cuda_jetpack6) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;; diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 2e709339..27e229fc 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -46,7 +46,7 @@ jobs: include: - preset: CPU - preset: CUDA - container: nvidia/cuda:12.8.1-devel-ubuntu22.04 + container: nvidia/cuda:11.8.0-devel-ubuntu22.04 flags: '-DCMAKE_CUDA_ARCHITECTURES=87' - preset: ROCm 
container: rocm/dev-ubuntu-22.04:6.1.2 @@ -78,7 +78,7 @@ jobs: include: - preset: CPU - preset: CUDA - install: https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_571.96_windows.exe + install: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe flags: '-DCMAKE_CUDA_ARCHITECTURES=80' - preset: ROCm install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe @@ -102,7 +102,7 @@ jobs: $ErrorActionPreference = "Stop" if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') { Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe" - Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_12.8", "nvcc_12.8", "cublas_12.8", "cublas_dev_12.8")) -NoNewWindow -Wait + Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_11.3", "nvcc_11.3", "cublas_11.3", "cublas_dev_11.3")) -NoNewWindow -Wait } $cudaPath = (Resolve-Path "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*").path diff --git a/CMakePresets.json b/CMakePresets.json index 2f29e041..0b70d8ba 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -17,6 +17,14 @@ "name": "CUDA", "inherits": [ "Default" ] }, + { + "name": "CUDA 11", + "inherits": [ "CUDA" ], + "cacheVariables": { + "CMAKE_CUDA_ARCHITECTURES": "50;52;53;60;61;70;75;80;86", + "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets" + } + }, { "name": "CUDA 12", "inherits": [ "CUDA" ], @@ -70,6 +78,11 @@ "configurePreset": "CUDA", "targets": [ "ggml-cuda" ] }, + { + "name": "CUDA 11", + "inherits": [ "CUDA" ], + "configurePreset": "CUDA 11" + }, { "name": "CUDA 12", "inherits": [ "CUDA" ], diff --git a/Dockerfile b/Dockerfile index 1196dc53..4c6619e7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,10 +7,14 @@ ARG JETPACK5VERSION=r35.4.1 ARG JETPACK6VERSION=r36.4.0 ARG CMAKEVERSION=3.31.2 +# CUDA v11 requires gcc v10. v10.3 has regressions, so the rockylinux 8.5 AppStream has the latest compatible version FROM --platform=linux/amd64 rocm/dev-almalinux-8:${ROCMVERSION}-complete AS base-amd64 RUN yum install -y yum-utils \ - && dnf install -y ccache \ + && yum-config-manager --add-repo https://dl.rockylinux.org/vault/rocky/8.5/AppStream/\$basearch/os/ \ + && rpm --import https://dl.rockylinux.org/pub/rocky/RPM-GPG-KEY-Rocky-8 \ + && dnf install -y yum-utils ccache gcc-toolset-10-gcc-10.2.1-8.2.el8 gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 gcc-toolset-10-binutils-2.35-11.el8 \ && yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo +ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH FROM --platform=linux/arm64 almalinux:8 AS base-arm64 # install epel-release for ccache @@ -34,6 +38,15 @@ RUN --mount=type=cache,target=/root/.ccache \ && cmake --build --parallel --preset 'CPU' \ && cmake --install build --component CPU --strip --parallel 8 +FROM base AS cuda-11 +ARG CUDA11VERSION=11.3 +RUN dnf install -y cuda-toolkit-${CUDA11VERSION//./-} +ENV PATH=/usr/local/cuda-11/bin:$PATH +RUN --mount=type=cache,target=/root/.ccache \ + cmake --preset 'CUDA 11' \ + && cmake --build --parallel --preset 'CUDA 11' \ + && cmake --install build --component CUDA --strip --parallel 8 + FROM base AS cuda-12 ARG CUDA12VERSION=12.8 RUN dnf install -y cuda-toolkit-${CUDA12VERSION//./-} @@ -85,9 +98,11 @@ RUN --mount=type=cache,target=/root/.cache/go-build \ go build -trimpath -buildmode=pie -o /bin/ollama . 
FROM --platform=linux/amd64 scratch AS amd64 +COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11 COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12 FROM --platform=linux/arm64 scratch AS arm64 +COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11 COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12 COPY --from=jetpack-5 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_jetpack5 COPY --from=jetpack-6 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_jetpack6 diff --git a/discover/cuda_common.go b/discover/cuda_common.go index f46c7cfa..04829529 100644 --- a/discover/cuda_common.go +++ b/discover/cuda_common.go @@ -3,7 +3,6 @@ package discover import ( - "fmt" "log/slog" "os" "regexp" @@ -60,8 +59,6 @@ func cudaVariant(gpuInfo CudaGPUInfo) string { // driver 12.0 has problems with the cuda v12 library, so run v11 on those older drivers if gpuInfo.DriverMajor < 12 || (gpuInfo.DriverMajor == 12 && gpuInfo.DriverMinor == 0) { - // The detected driver is older than Feb 2023 - slog.Warn("old CUDA driver detected - please upgrade to a newer driver", "version", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor)) return "v11" } return "v12" diff --git a/discover/path.go b/discover/path.go index 68e63009..8a20d8c2 100644 --- a/discover/path.go +++ b/discover/path.go @@ -12,7 +12,7 @@ import ( // '../lib/ollama' on Linux and the executable's directory on macOS // note: distribution builds, additional GPU-specific libraries are // found in subdirectories of the returned path, such as -// 'cuda_v12', 'rocm', etc. +// 'cuda_v11', 'cuda_v12', 'rocm', etc. var LibOllamaPath string = func() string { exe, err := os.Executable() if err != nil { diff --git a/docs/gpu.md b/docs/gpu.md index 61ff6e45..b54c66ab 100644 --- a/docs/gpu.md +++ b/docs/gpu.md @@ -1,6 +1,6 @@ # GPU ## Nvidia -Ollama supports Nvidia GPUs with compute capability 5.0+ and driver version 531 and newer. +Ollama supports Nvidia GPUs with compute capability 5.0+. Check your compute compatibility to see if your card is supported: [https://developer.nvidia.com/cuda-gpus](https://developer.nvidia.com/cuda-gpus) diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 995b33ac..ba5487fe 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -43,7 +43,7 @@ Ollama includes multiple LLM libraries compiled for different GPUs and CPU vecto In the server log, you will see a message that looks something like this (varies from release to release): ``` -Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v12 rocm_v5] +Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v11 rocm_v5] ``` **Experimental LLM Library Override** diff --git a/llm/server.go b/llm/server.go index 085e0980..a64669c2 100644 --- a/llm/server.go +++ b/llm/server.go @@ -311,7 +311,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a params = append(params, "--mmproj", projectors[0]) } - // iterate through compatible GPU libraries such as 'cuda_v12', 'rocm', etc. + // iterate through compatible GPU libraries such as 'cuda_v12', 'cuda_v11', 'rocm', etc. 
// adding each library's respective path to the LD_LIBRARY_PATH, until finally running // without any LD_LIBRARY_PATH flags for { diff --git a/scripts/build_windows.ps1 b/scripts/build_windows.ps1 index eaac2c60..e4c0b3d9 100644 --- a/scripts/build_windows.ps1 +++ b/scripts/build_windows.ps1 @@ -27,6 +27,7 @@ function checkEnv() { $env:VCToolsRedistDir=(get-item "${MSVC_INSTALL}\VC\Redist\MSVC\*")[0] } # Locate CUDA versions + # Note: this assumes every version found will be built $cudaList=(get-item "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v*\bin\" -ea 'silentlycontinue') if ($cudaList.length -eq 0) { $d=(get-command -ea 'silentlycontinue' nvcc).path @@ -93,6 +94,19 @@ function buildOllama() { $hashEnv = @{} Get-ChildItem env: | foreach { $hashEnv[$_.Name] = $_.Value } + if ("$script:CUDA_DIRS".Contains("v11")) { + $hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V11")) { $v11="$_" }} + $env:CUDAToolkit_ROOT=$hashEnv[$v11] + write-host "Building CUDA v11 backend libraries" + # Note: cuda v11 requires msvc 2019 so force the older generator + # to avoid 2022 (or newer) from being used as the default + & cmake --fresh --preset "CUDA 11" -G "Visual Studio 16 2019" --install-prefix $script:DIST_DIR + if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} + & cmake --build --preset "CUDA 11" --config Release --parallel $script:JOBS + if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} + & cmake --install build --component "CUDA" --strip + if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} + } if ("$script:CUDA_DIRS".Contains("v12")) { $hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V12")) { $v12="$_" }} $env:CUDAToolkit_ROOT=$hashEnv[$v12] diff --git a/scripts/env.sh b/scripts/env.sh index 65a970bd..c5e6f530 100644 --- a/scripts/env.sh +++ b/scripts/env.sh @@ -10,7 +10,9 @@ OLLAMA_COMMON_BUILD_ARGS="--build-arg=VERSION \ --build-arg=GOFLAGS \ --build-arg=OLLAMA_CUSTOM_CPU_DEFS \ --build-arg=OLLAMA_SKIP_CUDA_GENERATE \ + --build-arg=OLLAMA_SKIP_CUDA_11_GENERATE \ --build-arg=OLLAMA_SKIP_CUDA_12_GENERATE \ + --build-arg=CUDA_V11_ARCHITECTURES \ --build-arg=CUDA_V12_ARCHITECTURES \ --build-arg=OLLAMA_SKIP_ROCM_GENERATE \ --build-arg=OLLAMA_FAST_BUILD \ From f46df4e5d2e964ccfd0f23f9377240b6d9897ed8 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Tue, 13 May 2025 14:02:08 -0700 Subject: [PATCH 018/108] llama: fix defrag patch to defragment when no slots are available (#10695) --- llama/llama.cpp/src/llama-context.cpp | 18 ++++++--- ...nsure-KV-cache-is-fully-defragmented.patch | 39 ++++++++++++++++++- 2 files changed, 50 insertions(+), 7 deletions(-) diff --git a/llama/llama.cpp/src/llama-context.cpp b/llama/llama.cpp/src/llama-context.cpp index c22687e4..c5948e8f 100644 --- a/llama/llama.cpp/src/llama-context.cpp +++ b/llama/llama.cpp/src/llama-context.cpp @@ -950,9 +950,12 @@ int llama_context::decode(llama_batch & inp_batch) { // find KV slot if (!kv_self->find_slot(ubatch)) { - LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens); - - return 1; + kv_self->defrag_sched(-1.0f); + kv_self->update(*this); + if (!kv_self->find_slot(ubatch)) { + LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens); + return 1; + } } ggml_backend_sched_reset(sched.get()); @@ -1967,9 +1970,12 @@ void llama_context::opt_epoch_iter( // TODO: not sure if this is needed if (!kv_self->find_slot(ubatch)) { - LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens); - - 
GGML_ABORT("TODO: handle this error"); + kv_self->defrag_sched(-1.0f); + kv_self->update(*this); + if (!kv_self->find_slot(ubatch)) { + LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens); + GGML_ABORT("TODO: handle this error"); + } } auto * gf = graph_init(); diff --git a/llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch b/llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch index 81c17969..c5faeaaa 100644 --- a/llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch +++ b/llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch @@ -15,11 +15,48 @@ but this can leave a cache that still does not have adequate space even after defragmentation is triggered. Instead, we should do multiple batches of processing until everything is complete. --- + src/llama-context.cpp | 18 ++++--- src/llama-context.h | 1 + src/llama-kv-cache.cpp | 107 ++++++++++++++--------------------------- src/llama-kv-cache.h | 12 ++++- - 3 files changed, 47 insertions(+), 73 deletions(-) + 4 files changed, 59 insertions(+), 79 deletions(-) +diff --git a/src/llama-context.cpp b/src/llama-context.cpp +index c22687e4..c5948e8f 100644 +--- a/src/llama-context.cpp ++++ b/src/llama-context.cpp +@@ -950,9 +950,12 @@ int llama_context::decode(llama_batch & inp_batch) { + + // find KV slot + if (!kv_self->find_slot(ubatch)) { +- LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens); +- +- return 1; ++ kv_self->defrag_sched(-1.0f); ++ kv_self->update(*this); ++ if (!kv_self->find_slot(ubatch)) { ++ LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens); ++ return 1; ++ } + } + + ggml_backend_sched_reset(sched.get()); +@@ -1967,9 +1970,12 @@ void llama_context::opt_epoch_iter( + + // TODO: not sure if this is needed + if (!kv_self->find_slot(ubatch)) { +- LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens); +- +- GGML_ABORT("TODO: handle this error"); ++ kv_self->defrag_sched(-1.0f); ++ kv_self->update(*this); ++ if (!kv_self->find_slot(ubatch)) { ++ LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens); ++ GGML_ABORT("TODO: handle this error"); ++ } + } + + auto * gf = graph_init(); diff --git a/src/llama-context.h b/src/llama-context.h index c4ab242a..9970dfc6 100644 --- a/src/llama-context.h From 8cc33f4c2ba9347b3de3b5fe197d486df741d3e4 Mon Sep 17 00:00:00 2001 From: Parth Sareen Date: Tue, 13 May 2025 15:39:27 -0700 Subject: [PATCH 019/108] llama: fix memory leak for grammar (#10696) --- llama/sampling_ext.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llama/sampling_ext.cpp b/llama/sampling_ext.cpp index 78b889bd..1cacd301 100644 --- a/llama/sampling_ext.cpp +++ b/llama/sampling_ext.cpp @@ -114,6 +114,9 @@ void grammar_free(struct llama_grammar *g) { if (g->vocab != nullptr) { delete g->vocab; } + if (g->o_vocab != nullptr) { + delete g->o_vocab; + } llama_grammar_free_impl(g); } } From 0478d440f0ba62202bc4b98043ae4a7d0b85e4ba Mon Sep 17 00:00:00 2001 From: tej <37236721+itej89@users.noreply.github.com> Date: Tue, 13 May 2025 18:42:39 -0500 Subject: [PATCH 020/108] Fixed over vram allcation dure to small initial layer sizes. 
Co-authored-by: Tej Kiran Co-authored-by: Michael Yang Co-authored-by: Tej Kiran --- llm/memory.go | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/llm/memory.go b/llm/memory.go index e05327f7..76082bf7 100644 --- a/llm/memory.go +++ b/llm/memory.go @@ -1,9 +1,12 @@ package llm import ( + "cmp" "fmt" "log/slog" + "maps" "os" + "slices" "strconv" "strings" @@ -120,12 +123,10 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin } layers := f.Tensors().GroupLayers() - // add one layer worth of memory as a buffer - if blk0, ok := layers["blk.0"]; ok { - layerSize = blk0.Size() - } else { - slog.Warn("model missing blk.0 layer size") - } + // add one layer (chosing the max layer) worth of memory as a buffer + layerSize = slices.MaxFunc(slices.Collect(maps.Values(layers)), func(a, b ggml.Layer) int { + return cmp.Compare(a.Size(), b.Size()) + }).Size() var kvct string if envconfig.FlashAttention() && @@ -219,7 +220,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin } // For all the layers, find where they can fit on the GPU(s) - for i := range int(f.KV().BlockCount()) { + for i := int(f.KV().BlockCount()) - 1; i >= 0; i-- { // Some models have inconsistent layer sizes if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok { layerSize = blk.Size() @@ -229,6 +230,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin if opts.NumGPU >= 0 && layerCount >= opts.NumGPU { // Stop allocating on GPU(s) once we hit the users target NumGPU + overflow += layerSize continue } @@ -245,13 +247,13 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...) } } + + if len(gpusWithSpace) == 0 { + overflow += layerSize + } } if layerCount >= int(f.KV().BlockCount()) { fullyLoaded = true - } else { - for i := layerCount; i < int(f.KV().BlockCount()); i++ { - overflow += layerSize - } } // Determine if we need to consider output then find where it fits From 23125648b8748d9f2ec93c3038db8689f5693f6e Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Tue, 13 May 2025 17:36:02 -0700 Subject: [PATCH 021/108] chore: update mllama to use ollama engine (#10637) --- Makefile.sync | 17 +- convert/convert.go | 28 +- convert/convert_mllama.go | 160 +++ convert/reader.go | 5 +- fs/ggml/ggml.go | 1 + llama/llama.cpp/include/llama.h | 6 - llama/llama.cpp/src/llama-arch.cpp | 44 - llama/llama.cpp/src/llama-arch.h | 10 - llama/llama.cpp/src/llama-batch.cpp | 3 - llama/llama.cpp/src/llama-context.cpp | 23 +- llama/llama.cpp/src/llama-context.h | 1 - llama/llama.cpp/src/llama-cparams.h | 1 - llama/llama.cpp/src/llama-graph.cpp | 25 - llama/llama.cpp/src/llama-graph.h | 12 - llama/llama.cpp/src/llama-hparams.cpp | 4 - llama/llama.cpp/src/llama-hparams.h | 7 - llama/llama.cpp/src/llama-kv-cache.cpp | 14 +- llama/llama.cpp/src/llama-model-loader.cpp | 2 - llama/llama.cpp/src/llama-model.cpp | 309 +---- llama/llama.cpp/src/llama-model.h | 12 - llama/llama.cpp/src/llama-quant.cpp | 4 +- llama/llama.cpp/tools/mtmd/llava.cpp | 5 +- llama/llama.go | 58 - llama/mllama.cpp | 887 -------------- llama/mllama.h | 61 - llama/patches/0005-solar-pro.patch | 2 +- llama/patches/0006-add-mllama-support.patch | 1027 ----------------- ... 
=> 0006-fix-deepseek-deseret-regex.patch} | 0 llama/patches/0007-add-unpad-operator.patch | 419 ------- ...tain-ordering-for-rules-for-grammar.patch} | 0 ...sure-KV-cache-is-fully-defragmented.patch} | 26 +- ...patch => 0009-sort-devices-by-score.patch} | 2 +- ...arget-ggml-cpu-for-all-cpu-variants.patch} | 0 ...remove-amx.patch => 0011-remove-amx.patch} | 0 ...h => 0012-fix-string-arr-kv-loading.patch} | 0 ...r.patch => 0013-ollama-debug-tensor.patch} | 4 +- ...dd-ollama-vocab-for-grammar-support.patch} | 0 llm/memory.go | 41 +- llm/server.go | 5 +- ml/backend.go | 1 - ml/backend/ggml/ggml.go | 11 - ml/backend/ggml/ggml/include/ggml.h | 10 - ml/backend/ggml/ggml/src/ggml-backend-reg.cpp | 6 +- ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c | 5 - ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp | 55 - ml/backend/ggml/ggml/src/ggml-cpu/ops.h | 1 - .../ggml/ggml/src/ggml-cuda/ggml-cuda.cu | 4 - ml/backend/ggml/ggml/src/ggml-cuda/pad.cu | 46 - ml/backend/ggml/ggml/src/ggml-cuda/pad.cuh | 1 - .../src/ggml-metal/ggml-metal-embed.metal | 45 - .../ggml/ggml/src/ggml-metal/ggml-metal.m | 33 - .../ggml/ggml/src/ggml-metal/ggml-metal.metal | 45 - ml/backend/ggml/ggml/src/ggml.c | 25 +- model/models/llama4/model_vision.go | 2 +- model/models/mllama/imageproc.go | 201 ---- model/models/mllama/imageproc_test.go | 420 ------- model/models/mllama/model.go | 51 +- model/models/mllama/model_text.go | 18 +- model/models/mllama/model_vision.go | 73 +- model/models/mllama/process_image.go | 164 ++- model/models/mllama/process_image_test.go | 387 +++++++ runner/llamarunner/image.go | 42 +- runner/llamarunner/runner.go | 22 +- server/prompt.go | 79 +- server/prompt_test.go | 119 +- server/routes.go | 39 +- server/sched.go | 9 +- 67 files changed, 785 insertions(+), 4354 deletions(-) create mode 100644 convert/convert_mllama.go delete mode 100644 llama/mllama.cpp delete mode 100644 llama/mllama.h delete mode 100644 llama/patches/0006-add-mllama-support.patch rename llama/patches/{0008-fix-deepseek-deseret-regex.patch => 0006-fix-deepseek-deseret-regex.patch} (100%) delete mode 100644 llama/patches/0007-add-unpad-operator.patch rename llama/patches/{0009-maintain-ordering-for-rules-for-grammar.patch => 0007-maintain-ordering-for-rules-for-grammar.patch} (100%) rename llama/patches/{0010-ensure-KV-cache-is-fully-defragmented.patch => 0008-ensure-KV-cache-is-fully-defragmented.patch} (94%) rename llama/patches/{0011-sort-devices-by-score.patch => 0009-sort-devices-by-score.patch} (99%) rename llama/patches/{0012-add-phony-target-ggml-cpu-for-all-cpu-variants.patch => 0010-add-phony-target-ggml-cpu-for-all-cpu-variants.patch} (100%) rename llama/patches/{0013-remove-amx.patch => 0011-remove-amx.patch} (100%) rename llama/patches/{0014-fix-string-arr-kv-loading.patch => 0012-fix-string-arr-kv-loading.patch} (100%) rename llama/patches/{0015-ollama-debug-tensor.patch => 0013-ollama-debug-tensor.patch} (91%) rename llama/patches/{0016-add-ollama-vocab-for-grammar-support.patch => 0014-add-ollama-vocab-for-grammar-support.patch} (100%) delete mode 100644 model/models/mllama/imageproc.go delete mode 100644 model/models/mllama/imageproc_test.go create mode 100644 model/models/mllama/process_image_test.go diff --git a/Makefile.sync b/Makefile.sync index bceae7f5..711667c9 100644 --- a/Makefile.sync +++ b/Makefile.sync @@ -15,11 +15,13 @@ help: @echo " make -f $(lastword $(MAKEFILE_LIST)) clean sync" .PHONY: sync -sync: llama/build-info.cpp llama/llama.cpp ml/backend/ggml/ggml +sync: llama/build-info.cpp 
ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal -.PHONY: llama/build-info.cpp -llama/build-info.cpp: llama/build-info.cpp.in - sed -e 's|@FETCH_HEAD@|$(FETCH_HEAD)|' $< > $@ +llama/build-info.cpp: llama/build-info.cpp.in llama/llama.cpp + sed -e 's|@FETCH_HEAD@|$(FETCH_HEAD)|' <$< >$@ + +ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal: ml/backend/ggml/ggml + go generate ./$(@D) .PHONY: llama/llama.cpp llama/llama.cpp: llama/vendor/ @@ -30,12 +32,13 @@ ml/backend/ggml/ggml: llama/vendor/ggml/ rsync -arvzc -f "merge $@/.rsync-filter" $< $@ PATCHES=$(wildcard llama/patches/*.patch) +PATCHED=$(join $(dir $(PATCHES)), $(addsuffix ed, $(addprefix ., $(notdir $(PATCHES))))) .PHONY: apply-patches .NOTPARALLEL: -apply-patches: $(addsuffix ed, $(PATCHES)) +apply-patches: $(PATCHED) -%.patched: %.patch +llama/patches/.%.patched: llama/patches/%.patch @if git -c user.name=nobody -c 'user.email=<>' -C $(WORKDIR) am -3 $(realpath $<); then touch $@; else git -C $(WORKDIR) am --abort; exit 1; fi .PHONY: checkout @@ -57,4 +60,4 @@ format-patches: llama/patches .PHONE: clean clean: checkout - $(RM) $(addsuffix ed, $(PATCHES)) + $(RM) llama/patches/.*.patched diff --git a/convert/convert.go b/convert/convert.go index 249ec807..48804d7f 100644 --- a/convert/convert.go +++ b/convert/convert.go @@ -1,6 +1,7 @@ package convert import ( + "cmp" "encoding/json" "errors" "fmt" @@ -14,13 +15,12 @@ import ( ) type ModelParameters struct { - Architectures []string `json:"architectures"` - VocabSize uint32 `json:"vocab_size"` - TextModel TextParameters `json:"text_config"` -} + Architectures []string `json:"architectures"` + VocabSize uint32 `json:"vocab_size"` -type TextParameters struct { - VocabSize uint32 `json:"vocab_size"` + TextModel struct { + VocabSize uint32 `json:"vocab_size"` + } `json:"text_config"` } type AdapterParameters struct { @@ -173,6 +173,8 @@ func ConvertModel(fsys fs.FS, f *os.File) error { switch p.Architectures[0] { case "LlamaForCausalLM": conv = &llamaModel{} + case "MllamaForConditionalGeneration": + conv = &mllamaModel{} case "Llama4ForConditionalGeneration": conv = &llama4Model{} case "Mistral3ForConditionalGeneration": @@ -212,24 +214,22 @@ func ConvertModel(fsys fs.FS, f *os.File) error { return err } - vocabSize := int(p.VocabSize) - if vocabSize == 0 { - tVocabSize := int(p.TextModel.VocabSize) - vocabSize = tVocabSize - } + vocabSize := int(cmp.Or(p.VocabSize, p.TextModel.VocabSize)) switch { case vocabSize == 0: - slog.Warn("vocabulary size was not explicitly set by the model", "default size", len(t.Vocabulary.Tokens)) + slog.Debug("vocabulary size was not explicitly set by the model", "default size", len(t.Vocabulary.Tokens)) case vocabSize > len(t.Vocabulary.Tokens): - slog.Warn("vocabulary is smaller than expected, padding with dummy tokens", "expect", vocabSize, "actual", len(t.Vocabulary.Tokens)) + slog.Debug("vocabulary is smaller than expected, padding with dummy tokens", "expect", vocabSize, "actual", len(t.Vocabulary.Tokens)) for i := range vocabSize - len(t.Vocabulary.Tokens) { t.Vocabulary.Tokens = append(t.Vocabulary.Tokens, fmt.Sprintf("[PAD%d]", i)) t.Vocabulary.Scores = append(t.Vocabulary.Scores, -1) t.Vocabulary.Types = append(t.Vocabulary.Types, tokenTypeUserDefined) } case vocabSize < len(t.Vocabulary.Tokens): - return fmt.Errorf("vocabulary is larger than expected '%d' instead of '%d'", len(t.Vocabulary.Tokens), vocabSize) + slog.Debug("vocabulary is larger than expected", "want", vocabSize, "got", len(t.Vocabulary.Tokens)) + p.VocabSize = 
uint32(len(t.Vocabulary.Tokens)) + p.TextModel.VocabSize = uint32(len(t.Vocabulary.Tokens)) default: slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens)) } diff --git a/convert/convert_mllama.go b/convert/convert_mllama.go new file mode 100644 index 00000000..12478be7 --- /dev/null +++ b/convert/convert_mllama.go @@ -0,0 +1,160 @@ +package convert + +import ( + "strings" + + "github.com/ollama/ollama/fs/ggml" + "github.com/pdevine/tensor" + "github.com/pdevine/tensor/native" +) + +type mllamaModel struct { + ModelParameters + TextModel struct { + llamaModel + + CrossAttentionLayers []int32 `json:"cross_attention_layers"` + } `json:"text_config"` + VisionModel struct { + NumHiddenLayers uint32 `json:"num_hidden_layers"` + NumGlobalLayers uint32 `json:"num_global_layers"` + IntermediateLayersIndices []int32 `json:"intermediate_layers_indices"` + + HiddenSize uint32 `json:"hidden_size"` + IntermediateSize uint32 `json:"intermediate_size"` + + AttentionHeads uint32 `json:"attention_heads"` + + ImageSize uint32 `json:"image_size"` + PatchSize uint32 `json:"patch_size"` + NumChannels uint32 `json:"num_channels"` + MaxNumTiles uint32 `json:"max_num_tiles"` + NormEpsilon float32 `json:"norm_eps"` + RopeTheta float32 `json:"rope.freq_base"` + } `json:"vision_config"` +} + +func (m *mllamaModel) KV(t *Tokenizer) ggml.KV { + kv := m.ModelParameters.KV(t) + kv["general.architecture"] = "mllama" + + for k, v := range m.TextModel.KV(t) { + if strings.HasPrefix(k, "llama.") { + kv[strings.ReplaceAll(k, "llama.", "mllama.")] = v + } + } + + kv["mllama.attention.cross_attention_layers"] = m.TextModel.CrossAttentionLayers + + kv["mllama.vision.block_count"] = m.VisionModel.NumHiddenLayers + kv["mllama.vision.global.block_count"] = m.VisionModel.NumGlobalLayers + kv["mllama.vision.intermediate_layers_indices"] = m.VisionModel.IntermediateLayersIndices + + kv["mllama.vision.embedding_length"] = m.VisionModel.HiddenSize + kv["mllama.vision.feed_forward_length"] = m.VisionModel.IntermediateSize + + kv["mllama.vision.attention.head_count"] = m.VisionModel.AttentionHeads + kv["mllama.vision.attention.layer_norm_epsilon"] = m.VisionModel.NormEpsilon + + kv["mllama.vision.image_size"] = m.VisionModel.ImageSize + kv["mllama.vision.patch_size"] = m.VisionModel.PatchSize + kv["mllama.vision.max_num_tiles"] = m.VisionModel.MaxNumTiles + kv["mllama.vision.num_channels"] = m.VisionModel.NumChannels + + return kv +} + +func (m *mllamaModel) Replacements() []string { + return append( + m.TextModel.Replacements(), + "language_model.", "", + "gate_attn", "attn_gate", + "gate_ffn", "ffn_gate", + "cross_attn.", "cross_attn_", + "vision_model", "v", + "class_embedding", "class_embd", + "patch_embedding", "patch_embd", + "gated_positional_embedding.tile_embedding", "tile_position_embd", + "gated_positional_embedding.embedding", "position_embd.weight", + "gated_positional_embedding", "position_embd", + "embedding.weight", "weight", + "pre_tile_positional_embedding", "pre_tile_position_embd", + "post_tile_positional_embedding", "post_tile_position_embd", + "layernorm_pre", "pre_ln", + "layernorm_post", "post_ln", + "global_transformer.layers", "global.blk", + "transformer.layers", "blk", + "mlp.fc1", "ffn_up", + "mlp.fc2", "ffn_down", + "multi_modal_projector", "mm.0", + ) +} + +func (m *mllamaModel) Tensors(ts []Tensor) []*ggml.Tensor { + var out []*ggml.Tensor + var text []Tensor + for _, t := range ts { + if t.Name() == "v.position_embd.gate" { + for _, name := range []string{"v.position_embd.gate", 
"v.tile_position_embd.gate"} { + tt := t.Clone() + tt.SetRepacker(m.repack(name)) + out = append(out, &ggml.Tensor{ + Name: name, + Kind: t.Kind(), + Shape: t.Shape(), + WriterTo: tt, + }) + } + } else if t.Name() == "v.pre_tile_position_embd.gate" || t.Name() == "v.post_tile_position_embd.gate" { + t.SetRepacker(m.repack(t.Name())) + out = append(out, &ggml.Tensor{ + Name: t.Name(), + Kind: t.Kind(), + Shape: t.Shape(), + WriterTo: t, + }) + } else if strings.HasPrefix(t.Name(), "v.") || strings.HasPrefix(t.Name(), "mm.") { + out = append(out, &ggml.Tensor{ + Name: t.Name(), + Kind: t.Kind(), + Shape: t.Shape(), + WriterTo: t, + }) + } else { + text = append(text, t) + } + } + + return append(out, m.TextModel.Tensors(text)...) +} + +func (m *mllamaModel) repack(name string) Repacker { + return func(_ string, data []float32, shape []uint64) (_ []float32, err error) { + dims := make([]int, len(shape)) + for i, dim := range shape { + dims[i] = int(dim) + } + + var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data)) + + t, err = tensor.Tanh(t) + if err != nil { + return nil, err + } + + if name == "v.position_embd.gate" { + t, err = tensor.Sub(float32(1), t) + if err != nil { + return nil, err + } + } + + t = tensor.Materialize(t) + // flatten tensor so it can be return as a vector + if err := t.Reshape(t.Shape().TotalSize()); err != nil { + return nil, err + } + + return native.VectorF32(t.(*tensor.Dense)) + } +} diff --git a/convert/reader.go b/convert/reader.go index ab81d5c0..07d12f0d 100644 --- a/convert/reader.go +++ b/convert/reader.go @@ -38,7 +38,10 @@ const ( func (t tensorBase) Kind() uint32 { if strings.HasSuffix(t.name, ".ffn_gate_inp.weight") || t.name == "token_types.weight" || - t.name == "v.positional_embedding_vlm" { + t.name == "v.positional_embedding_vlm" || + t.name == "v.tile_position_embd.weight" || + t.name == "v.pre_tile_position_embd.weight" || + t.name == "v.post_tile_position_embd.weight" { // these tensors are always F32 return 0 } diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go index 735d41fa..c29d715b 100644 --- a/fs/ggml/ggml.go +++ b/fs/ggml/ggml.go @@ -125,6 +125,7 @@ func (kv KV) OllamaEngineRequired() bool { "gemma3", "mistral3", "llama4", + "mllama", }, kv.Architecture()) } diff --git a/llama/llama.cpp/include/llama.h b/llama/llama.cpp/include/llama.h index 41beef21..abedebdb 100644 --- a/llama/llama.cpp/include/llama.h +++ b/llama/llama.cpp/include/llama.h @@ -258,7 +258,6 @@ extern "C" { llama_token * token; float * embd; - int32_t n_embd; llama_pos * pos; int32_t * n_seq_id; llama_seq_id ** seq_id; @@ -366,7 +365,6 @@ extern "C" { bool flash_attn; // whether to use flash attention [EXPERIMENTAL] bool no_perf; // whether to measure performance timings bool op_offload; // whether to offload host tensor operations to device - bool cross_attn; // whether to use cross attention }; // model quantization parameters @@ -466,10 +464,6 @@ extern "C" { struct llama_context_params params), "use llama_init_from_model instead"); - // TODO (jmorganca): this should most likely be passed in as part of a batch - // and not set on the context for all batches. 
- LLAMA_API void llama_set_cross_attention(struct llama_context * ctx, bool cross_attn_state); - // Frees all allocated memory LLAMA_API void llama_free(struct llama_context * ctx); diff --git a/llama/llama.cpp/src/llama-arch.cpp b/llama/llama.cpp/src/llama-arch.cpp index eb7b5325..5ab3f572 100644 --- a/llama/llama.cpp/src/llama-arch.cpp +++ b/llama/llama.cpp/src/llama-arch.cpp @@ -6,7 +6,6 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_LLAMA, "llama" }, - { LLM_ARCH_MLLAMA, "mllama" }, { LLM_ARCH_LLAMA4, "llama4" }, { LLM_ARCH_DECI, "deci" }, { LLM_ARCH_FALCON, "falcon" }, @@ -145,7 +144,6 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" }, { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" }, - { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" }, { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" }, { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" }, @@ -275,40 +273,6 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, }, }, - { - LLM_ARCH_MLLAMA, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, - { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" }, - { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" }, - { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" }, - { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, - { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, - { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, - { LLM_TENSOR_CROSS_ATTN_K_NORM, "blk.%d.cross_attn_k_norm" }, - { LLM_TENSOR_CROSS_ATTN_K_PROJ, "blk.%d.cross_attn_k_proj" }, - { LLM_TENSOR_CROSS_ATTN_O_PROJ, "blk.%d.cross_attn_o_proj" }, - { LLM_TENSOR_CROSS_ATTN_Q_NORM, "blk.%d.cross_attn_q_norm" }, - { LLM_TENSOR_CROSS_ATTN_Q_PROJ, "blk.%d.cross_attn_q_proj" }, - { LLM_TENSOR_CROSS_ATTN_V_PROJ, "blk.%d.cross_attn_v_proj" }, - { LLM_TENSOR_CROSS_ATTN_ATTN_GATE, "blk.%d.cross_attn_attn_gate" }, - { LLM_TENSOR_CROSS_ATTN_MLP_GATE, "blk.%d.cross_attn_mlp_gate" }, - }, - }, { LLM_ARCH_DECI, { @@ -1737,14 +1701,6 @@ static const std::map LLM_TENSOR_INFOS = { // this tensor is loaded for T5, but never used {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}}, {LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, - {LLM_TENSOR_CROSS_ATTN_K_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, - {LLM_TENSOR_CROSS_ATTN_K_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_CROSS_ATTN_O_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_CROSS_ATTN_Q_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, - {LLM_TENSOR_CROSS_ATTN_Q_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_CROSS_ATTN_V_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_CROSS_ATTN_ATTN_GATE, 
{LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, - {LLM_TENSOR_CROSS_ATTN_MLP_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_CONV1D, {LLM_TENSOR_LAYER_INPUT, GGML_OP_IM2COL}}, {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, diff --git a/llama/llama.cpp/src/llama-arch.h b/llama/llama.cpp/src/llama-arch.h index bc8a4f0b..525c1b7d 100644 --- a/llama/llama.cpp/src/llama-arch.h +++ b/llama/llama.cpp/src/llama-arch.h @@ -11,7 +11,6 @@ enum llm_arch { LLM_ARCH_LLAMA, LLM_ARCH_LLAMA4, - LLM_ARCH_MLLAMA, LLM_ARCH_DECI, LLM_ARCH_FALCON, LLM_ARCH_BAICHUAN, @@ -149,7 +148,6 @@ enum llm_kv { LLM_KV_ATTENTION_SLIDING_WINDOW, LLM_KV_ATTENTION_SCALE, LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, - LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, LLM_KV_ATTENTION_KEY_LENGTH_MLA, LLM_KV_ATTENTION_VALUE_LENGTH_MLA, @@ -351,14 +349,6 @@ enum llm_tensor { LLM_TENSOR_CLS, LLM_TENSOR_CLS_OUT, LLM_TENSOR_BSKCN_TV, - LLM_TENSOR_CROSS_ATTN_K_NORM, - LLM_TENSOR_CROSS_ATTN_K_PROJ, - LLM_TENSOR_CROSS_ATTN_O_PROJ, - LLM_TENSOR_CROSS_ATTN_Q_NORM, - LLM_TENSOR_CROSS_ATTN_Q_PROJ, - LLM_TENSOR_CROSS_ATTN_V_PROJ, - LLM_TENSOR_CROSS_ATTN_ATTN_GATE, - LLM_TENSOR_CROSS_ATTN_MLP_GATE, LLM_TENSOR_CONV1D, LLM_TENSOR_CONVNEXT_DW, LLM_TENSOR_CONVNEXT_NORM, diff --git a/llama/llama.cpp/src/llama-batch.cpp b/llama/llama.cpp/src/llama-batch.cpp index 241b316e..a88b2fe3 100644 --- a/llama/llama.cpp/src/llama-batch.cpp +++ b/llama/llama.cpp/src/llama-batch.cpp @@ -320,7 +320,6 @@ struct llama_batch llama_batch_get_one( /*n_tokens =*/ n_tokens, /*tokens =*/ tokens, /*embd =*/ nullptr, - /*n_embd =*/ 0, /*pos =*/ nullptr, /*n_seq_id =*/ nullptr, /*seq_id =*/ nullptr, @@ -333,7 +332,6 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_ /*n_tokens =*/ 0, /*tokens =*/ nullptr, /*embd =*/ nullptr, - /*n_embd =*/ 0, /*pos =*/ nullptr, /*n_seq_id =*/ nullptr, /*seq_id =*/ nullptr, @@ -342,7 +340,6 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_ if (embd) { batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd); - batch.n_embd = embd; } else { batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc); } diff --git a/llama/llama.cpp/src/llama-context.cpp b/llama/llama.cpp/src/llama-context.cpp index c5948e8f..1f3a3956 100644 --- a/llama/llama.cpp/src/llama-context.cpp +++ b/llama/llama.cpp/src/llama-context.cpp @@ -514,7 +514,7 @@ float * llama_context::get_logits_ith(int32_t i) { throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs)); } - return logits + j*model.hparams.n_vocab; + return logits + j*model.vocab.n_tokens(); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what()); #ifndef NDEBUG @@ -632,10 +632,6 @@ void llama_context::set_warmup(bool value) { cparams.warmup = value; } -void llama_context::set_cross_attn(bool value) { - cparams.cross_attn = value; -} - void llama_context::set_adapter_lora( llama_adapter_lora * adapter, float scale) { @@ -713,7 +709,7 @@ int llama_context::encode(llama_batch & inp_batch) { const int64_t n_embd = hparams.n_embd; - llama_sbatch sbatch = llama_sbatch(batch, batch.n_embd, /* simple_split */ true, /* logits_all */ true); + llama_sbatch sbatch = llama_sbatch(batch, n_embd, /* simple_split */ true, /* logits_all */ true); const llama_ubatch ubatch = sbatch.split_simple(n_tokens); @@ -867,9 +863,10 @@ int 
llama_context::decode(llama_batch & inp_batch) { const llama_batch & batch = batch_allocr.batch; + const auto & vocab = model.vocab; const auto & hparams = model.hparams; - const int32_t n_vocab = hparams.n_vocab; + const int32_t n_vocab = vocab.n_tokens(); const int64_t n_tokens_all = batch.n_tokens; const int64_t n_embd = hparams.n_embd; @@ -1093,7 +1090,7 @@ int llama_context::decode(llama_batch & inp_batch) { // make the outputs have the same order they had in the user-provided batch // note: this is mostly relevant for recurrent models atm if (!sorted_output) { - const uint32_t n_vocab = model.hparams.n_vocab; + const uint32_t n_vocab = model.vocab.n_tokens(); const uint32_t n_embd = model.hparams.n_embd; GGML_ASSERT((size_t) n_outputs == out_ids.size()); @@ -1148,11 +1145,12 @@ int llama_context::decode(llama_batch & inp_batch) { int32_t llama_context::output_reserve(int32_t n_outputs) { const auto & hparams = model.hparams; + const auto & vocab = model.vocab; const int64_t n_outputs_max = std::max(n_outputs, n_seq_max()); const auto n_batch = cparams.n_batch; - const auto n_vocab = hparams.n_vocab; + const auto n_vocab = vocab.n_tokens(); const auto n_embd = hparams.n_embd; // TODO: use a per-batch flag for logits presence instead @@ -1687,7 +1685,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) { { LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__); - const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.hparams.n_vocab); + const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.vocab.n_tokens()); io.write(&logits_size, sizeof(logits_size)); @@ -2099,7 +2097,6 @@ llama_context_params llama_context_default_params() { /*.flash_attn =*/ false, /*.no_perf =*/ true, /*.op_offload =*/ true, - /*.cross_attn =*/ false, }; return result; @@ -2225,10 +2222,6 @@ void llama_set_warmup(llama_context * ctx, bool warmup) { ctx->set_warmup(warmup); } -void llama_set_cross_attention(struct llama_context * ctx, bool cross_attention) { - ctx->set_cross_attn(cross_attention); -} - void llama_synchronize(llama_context * ctx) { ctx->synchronize(); } diff --git a/llama/llama.cpp/src/llama-context.h b/llama/llama.cpp/src/llama-context.h index 9970dfc6..0264e937 100644 --- a/llama/llama.cpp/src/llama-context.h +++ b/llama/llama.cpp/src/llama-context.h @@ -72,7 +72,6 @@ struct llama_context { void set_embeddings (bool value); void set_causal_attn(bool value); void set_warmup(bool value); - void set_cross_attn(bool value); void set_adapter_lora( llama_adapter_lora * adapter, diff --git a/llama/llama.cpp/src/llama-cparams.h b/llama/llama.cpp/src/llama-cparams.h index 7a6156ce..246fa577 100644 --- a/llama/llama.cpp/src/llama-cparams.h +++ b/llama/llama.cpp/src/llama-cparams.h @@ -31,7 +31,6 @@ struct llama_cparams { bool no_perf; bool warmup; bool op_offload; - bool cross_attn; enum llama_pooling_type pooling_type; diff --git a/llama/llama.cpp/src/llama-graph.cpp b/llama/llama.cpp/src/llama-graph.cpp index f14869cf..b0e3f635 100644 --- a/llama/llama.cpp/src/llama-graph.cpp +++ b/llama/llama.cpp/src/llama-graph.cpp @@ -532,12 +532,6 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) { } } -void llm_graph_input_cross_attn_state::set_input(const llama_ubatch * ubatch) { - if (ubatch->embd) { - ggml_backend_tensor_set(cross_attn_state, ubatch->embd, 0, ggml_nbytes(cross_attn_state)); - } -} - // // llm_graph_context // @@ -1520,25 +1514,6 @@ llm_graph_input_attn_cross * 
llm_graph_context::build_attn_inp_cross() const { return (llm_graph_input_attn_cross *) res->add_input(std::move(inp)); } -ggml_tensor * llm_graph_context::build_inp_cross_attn_state() const { - const int64_t n_embd = hparams.n_embd; - - auto inp = std::make_unique(); - - ggml_tensor * cur = nullptr; - - inp->cross_attn_state = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd, 1601, 4); - ggml_set_input(inp->cross_attn_state); - - cur = inp->cross_attn_state; - - cb(cur, "inp_cross_attn_state", -1); - - res->add_input(std::move(inp)); - - return cur; -} - ggml_tensor * llm_graph_context::build_attn( llm_graph_input_attn_cross * inp, ggml_cgraph * gf, diff --git a/llama/llama.cpp/src/llama-graph.h b/llama/llama.cpp/src/llama-graph.h index 5a322785..832a8c09 100644 --- a/llama/llama.cpp/src/llama-graph.h +++ b/llama/llama.cpp/src/llama-graph.h @@ -87,7 +87,6 @@ public: ggml_tensor * tokens = nullptr; // I32 [n_batch] ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch] - ggml_tensor * cross_attn_state; // F32 [4, n_embd, 1061] }; class llm_graph_input_pos : public llm_graph_input_i { @@ -285,16 +284,6 @@ public: const llama_cross * cross = nullptr; }; -class llm_graph_input_cross_attn_state : public llm_graph_input_i { -public: - llm_graph_input_cross_attn_state() = default; - virtual ~llm_graph_input_cross_attn_state() = default; - - void set_input(const llama_ubatch * ubatch) override; - - ggml_tensor * cross_attn_state; // F32 [4, n_embd, 1061] -}; - // // llm_graph_result // @@ -506,7 +495,6 @@ struct llm_graph_context { ggml_tensor * build_inp_cls() const; ggml_tensor * build_inp_s_copy() const; ggml_tensor * build_inp_s_mask() const; - ggml_tensor * build_inp_cross_attn_state() const; ggml_tensor * build_inp_cross_embd() const; ggml_tensor * build_inp_pos_bucket_enc() const; diff --git a/llama/llama.cpp/src/llama-hparams.cpp b/llama/llama.cpp/src/llama-hparams.cpp index 6a02de03..8a667960 100644 --- a/llama/llama.cpp/src/llama-hparams.cpp +++ b/llama/llama.cpp/src/llama-hparams.cpp @@ -85,7 +85,3 @@ bool llama_hparams::is_swa(uint32_t il) const { GGML_ABORT("fatal error"); } - -bool llama_hparams::cross_attention_layers(uint32_t il) const { - return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end(); -} diff --git a/llama/llama.cpp/src/llama-hparams.h b/llama/llama.cpp/src/llama-hparams.h index b6fc7e6d..48dce407 100644 --- a/llama/llama.cpp/src/llama-hparams.h +++ b/llama/llama.cpp/src/llama-hparams.h @@ -2,8 +2,6 @@ #include "llama.h" -#include - #include // bump if necessary @@ -44,7 +42,6 @@ struct llama_hparams { uint32_t n_expert = 0; uint32_t n_expert_used = 0; uint32_t n_rel_attn_bkts = 0; - uint32_t n_vocab = 0; // note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA uint32_t n_embd_head_k_mla = 0; @@ -59,7 +56,6 @@ struct llama_hparams { std::array n_ff_arr; std::array, 4> n_bskcn_arr = {}; - std::array cross_attn_layers; uint32_t n_layer_dense_lead = 0; uint32_t n_lora_q = 0; @@ -163,9 +159,6 @@ struct llama_hparams { // Block skip connection bool n_bskcn(uint32_t n, uint32_t il) const; - // cross attention layers - bool cross_attention_layers(uint32_t il) const; - bool is_swa(uint32_t il) const; }; diff --git a/llama/llama.cpp/src/llama-kv-cache.cpp b/llama/llama.cpp/src/llama-kv-cache.cpp index 1a50c034..60e67b03 100644 --- a/llama/llama.cpp/src/llama-kv-cache.cpp +++ b/llama/llama.cpp/src/llama-kv-cache.cpp @@ -100,16 +100,8 @@ llama_kv_cache_unified::llama_kv_cache_unified( throw 
std::runtime_error("failed to create ggml context for kv cache"); } - ggml_tensor * k, *v; - - // for cross attention layers - if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layers(i)) { - k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i)); - v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i)); - } else { - k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size); - v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size); - } + ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size); + ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size); ggml_format_name(k, "cache_k_l%d", i); ggml_format_name(v, "cache_v_l%d", i); k_l.push_back(k); @@ -459,7 +451,7 @@ void llama_kv_cache_unified::set_full() { llama_sbatch llama_kv_cache_unified::sbatch_init( const llama_batch & batch, bool logits_all) { - return llama_sbatch(batch, batch.n_embd, true, logits_all); + return llama_sbatch(batch, hparams.n_embd, true, logits_all); } llama_ubatch llama_kv_cache_unified::ubatch_next( diff --git a/llama/llama.cpp/src/llama-model-loader.cpp b/llama/llama.cpp/src/llama-model-loader.cpp index 2acfd4a8..7f6617fa 100644 --- a/llama/llama.cpp/src/llama-model-loader.cpp +++ b/llama/llama.cpp/src/llama-model-loader.cpp @@ -315,8 +315,6 @@ namespace GGUFMeta { return true; } - template bool llama_model_loader::get_arr>(enum llm_kv kid, std::array& result, bool required); - template bool llama_model_loader::get_arr(const std::string & key, std::array & result, bool required) { const int kid = gguf_find_key(meta.get(), key.c_str()); diff --git a/llama/llama.cpp/src/llama-model.cpp b/llama/llama.cpp/src/llama-model.cpp index e8298f56..db62973f 100644 --- a/llama/llama.cpp/src/llama-model.cpp +++ b/llama/llama.cpp/src/llama-model.cpp @@ -433,7 +433,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { // get general kv ml.get_key(LLM_KV_GENERAL_NAME, name, false); - ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false); // everything past this point is not vocab-related if (hparams.vocab_only) { @@ -445,7 +444,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer); ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false); ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false); - ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false); if (arch == LLM_ARCH_WAVTOKENIZER_DEC) { ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features); @@ -469,11 +467,9 @@ void llama_model::load_hparams(llama_model_loader & ml) { std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0); std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0); std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0); - std::fill(hparams.cross_attn_layers.begin(), hparams.cross_attn_layers.end(), -1); ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false); ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false); - ml.get_arr(LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, hparams.cross_attn_layers, false); // n_head_kv is optional, default to n_head hparams.n_head_kv_arr = hparams.n_head_arr; @@ -526,7 +522,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false); - if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_MLLAMA || arch == 
LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) { + if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) { if (hparams.n_rot != hparams.n_embd_head_k) { throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k)); } @@ -589,16 +585,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.use_kq_norm = false; } } break; - case LLM_ARCH_MLLAMA: - { - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - - switch (hparams.n_layer) { - case 40: type = LLM_TYPE_11B; break; - case 100: type = LLM_TYPE_90B; break; - default: type = LLM_TYPE_UNKNOWN; - } - } break; case LLM_ARCH_DECI: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -1595,7 +1581,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { const int64_t n_embd_head_v = hparams.n_embd_head_v; const int64_t n_ff = hparams.n_ff(); const int64_t n_embd_gqa = n_embd_v_gqa; - const int64_t n_vocab = hparams.n_vocab; + const int64_t n_vocab = vocab.n_tokens(); const int64_t n_token_types = vocab.n_token_types(); const int64_t n_rot = hparams.n_rot; const int64_t n_expert = hparams.n_expert; @@ -1854,52 +1840,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } } } break; - case LLM_ARCH_MLLAMA: - { - tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab+8}, 0); - - // output - { - output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); - output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); - - // if output is NULL, init from the input tok embed - if (output == NULL) { - output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); - } - } - - for (int i = 0; i < n_layer; ++i) { - auto & layer = layers[i]; - - if (hparams.cross_attention_layers(i)) { - layer.cross_attn_k_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_NORM, "weight", i), {128}, 0); - layer.cross_attn_k_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_PROJ, "weight", i), {n_embd, 1024}, 0); - layer.cross_attn_o_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_O_PROJ, "weight", i), {n_embd, n_embd}, 0); - layer.cross_attn_q_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_NORM, "weight", i), {128}, 0); - layer.cross_attn_q_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_PROJ, "weight", i), {n_embd, n_embd}, 0); - layer.cross_attn_v_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_V_PROJ, "weight", i), {n_embd, 1024}, 0); - layer.cross_attn_attn_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_ATTN_GATE, i), {1}, 0); - layer.cross_attn_mlp_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_MLP_GATE, i), {1}, 0); - layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); - layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); - layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); - layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); - layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); - } else { - layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); - layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); - layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0); - layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, 
"weight", i), {n_embd, n_embd_v_gqa}, 0); - layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); - layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); - layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); - layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); - layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); - layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); - } - } - } break; case LLM_ARCH_DECI: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -4816,246 +4756,6 @@ struct llm_build_llama : public llm_graph_context { } }; -struct llm_build_mllama: public llm_graph_context { - llm_build_mllama(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - ggml_tensor * cur; - ggml_tensor * inpL; - ggml_tensor * inpCAS; - - inpL = build_inp_embd(model.tok_embd); - inpCAS = build_inp_cross_attn_state(); - - // inp_pos - contains the positions - ggml_tensor * inp_pos = build_inp_pos(); - - auto * inp_attn = build_attn_inp_kv_unified(); - const llama_kv_cache_unified * kv_self = static_cast(memory); - - for (int il = 0; il < n_layer; ++il) { - ggml_tensor * inpSA = inpL; - - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - if (hparams.cross_attention_layers(il)) { - if (!ubatch.embd && !cparams.cross_attn) { - continue; - } - - // cross attention layer - ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_q_proj, cur); - cb(Qcur, "Qcur", il); - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cb(Qcur, "Qcur", il); - - Qcur = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 0, 2, 1, 3)); - cb(Qcur, "Qcur", il); - - Qcur = build_norm(Qcur, model.layers[il].cross_attn_q_norm, NULL, LLM_NORM_RMS, il); - cb(Qcur, "Qcur", il); - - ggml_tensor * Kcur, * Vcur; - if (ubatch.embd) { - Kcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_k_proj, inpCAS); - cb(Kcur, "Kcur", il); - - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, 6404); - cb(Kcur, "Kcur", il); - - Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); - cb(Kcur, "Kcur", il); - - Kcur = build_norm(Kcur, model.layers[il].cross_attn_k_norm, NULL, LLM_NORM_RMS, il); - cb(Kcur, "Kcur", il); - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, kv_self->k_l[il])); - - Vcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_v_proj, inpCAS); - cb(Vcur, "Vcur", il); - - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, 6404); - cb(Vcur, "Vcur", il); - - Vcur = ggml_permute(ctx0, Vcur, 0, 2, 1, 3); - cb(Vcur, "Vcur", il); - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, kv_self->v_l[il])); - } else { - Kcur = ggml_view_tensor(ctx0, kv_self->k_l[il]); - cb(Kcur, "Kcur (view)", il); - - Vcur = ggml_view_tensor(ctx0, kv_self->v_l[il]); - cb(Vcur, "Vcur (view)", il); - } - - struct ggml_tensor * kq = ggml_mul_mat(ctx0, Kcur, Qcur); - 
cb(kq, "kq", il); - - // TODO: apply causal masks - struct ggml_tensor * kq_soft_max = ggml_soft_max_ext(ctx0, kq, nullptr, 1.f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias); - cb(kq_soft_max, "kq_soft_max", il); - - Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, Vcur)); - cb(Vcur, "Vcur", il); - - struct ggml_tensor * kqv = ggml_mul_mat(ctx0, Vcur, kq_soft_max); - cb(kqv, "kqv", il); - - struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); - - cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens); - cb(cur, "kqv_merged_cont", il); - - cur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_o_proj, cur); - cb(cur, "cur", il); - - // TODO: do this in place once? - cur = ggml_mul(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_attn_gate)); - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, il); - cb(cur, "ffn_out", il); - - // TODO: do this inplace once? - cur = ggml_add_inplace(ctx0, ggml_mul_inplace(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_mlp_gate)), ffn_inp); - cb(cur, "ffn_out", il); - - cur = build_cvec(cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } else { - // self attention layer - - // rope freq factors for llama3; may return nullptr for llama2 and other models - ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il); - - // compute Q and K and RoPE them - ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - if (model.layers[il].bq) { - Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); - cb(Qcur, "Qcur", il); - } - - ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - if (model.layers[il].bk) { - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - cb(Kcur, "Kcur", il); - } - - ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - if (model.layers[il].bv) { - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - cb(Vcur, "Vcur", il); - } - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - - Qcur = ggml_rope_ext( - ctx0, Qcur, inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - - Kcur = ggml_rope_ext( - ctx0, Kcur, inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - cur = build_attn(inp_attn, gf, - model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); 
- cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, il); - cb(cur, "ffn_out", il); - - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "ffn_out", il); - - cur = build_cvec(cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - res->t_embd = cur; - - // lm_head - cur = build_lora_mm(model.output, cur); - - cb(cur, "result_output", -1); - res->t_logits = cur; - - ggml_build_forward_expand(gf, cur); - } -}; - struct llm_build_deci : public llm_graph_context { llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; @@ -13428,10 +13128,6 @@ llm_graph_result_ptr llama_model::build_graph( { llm = std::make_unique(*this, params, gf); } break; - case LLM_ARCH_MLLAMA: - { - llm = std::make_unique(*this, params, gf); - } break; case LLM_ARCH_DECI: { llm = std::make_unique(*this, params, gf); @@ -13793,7 +13489,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { // use what we call a normal RoPE, operating on pairs of consecutive head values case LLM_ARCH_LLAMA: case LLM_ARCH_LLAMA4: - case LLM_ARCH_MLLAMA: case LLM_ARCH_DECI: case LLM_ARCH_BAICHUAN: case LLM_ARCH_STARCODER: diff --git a/llama/llama.cpp/src/llama-model.h b/llama/llama.cpp/src/llama-model.h index 9281e629..43746c7d 100644 --- a/llama/llama.cpp/src/llama-model.h +++ b/llama/llama.cpp/src/llama-model.h @@ -11,7 +11,6 @@ #include #include #include -#include struct llama_cparams; struct llama_ubatch; @@ -75,7 +74,6 @@ enum llm_type { LLM_TYPE_40B, LLM_TYPE_65B, LLM_TYPE_70B, - LLM_TYPE_90B, LLM_TYPE_236B, LLM_TYPE_290B, LLM_TYPE_314B, @@ -320,16 +318,6 @@ struct llama_layer { struct ggml_tensor * bskcn_tv = nullptr; - // cross attention - struct ggml_tensor * cross_attn_k_norm = nullptr; - struct ggml_tensor * cross_attn_k_proj = nullptr; - struct ggml_tensor * cross_attn_o_proj = nullptr; - struct ggml_tensor * cross_attn_q_norm = nullptr; - struct ggml_tensor * cross_attn_q_proj = nullptr; - struct ggml_tensor * cross_attn_v_proj = nullptr; - struct ggml_tensor * cross_attn_attn_gate = nullptr; - struct ggml_tensor * cross_attn_mlp_gate = nullptr; - struct llama_layer_posnet posnet; struct llama_layer_convnext convnext; diff --git a/llama/llama.cpp/src/llama-quant.cpp b/llama/llama.cpp/src/llama-quant.cpp index 56531980..820d5128 100644 --- a/llama/llama.cpp/src/llama-quant.cpp +++ b/llama/llama.cpp/src/llama-quant.cpp @@ -639,9 +639,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (llama_model_has_encoder(&model)) { n_attn_layer *= 3; } - if (qs.n_attention_wv != n_attn_layer) { - LLAMA_LOG_WARN("%s: n_attention_wv is unexpected, expected: %d, found: %d\n", __func__, n_attn_layer, qs.n_attention_wv); - } + GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected"); } size_t total_size_org = 0; diff --git a/llama/llama.cpp/tools/mtmd/llava.cpp b/llama/llama.cpp/tools/mtmd/llava.cpp index b0eb79bb..ebef8b3c 100644 --- 
a/llama/llama.cpp/tools/mtmd/llava.cpp +++ b/llama/llama.cpp/tools/mtmd/llava.cpp @@ -462,7 +462,7 @@ struct llava_embd_batch { std::vector seq_ids; std::vector logits; llama_batch batch; - llava_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { + llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { pos .resize(n_tokens); n_seq_id.resize(n_tokens); seq_ids .resize(n_tokens + 1); @@ -474,7 +474,6 @@ struct llava_embd_batch { /*n_tokens =*/ n_tokens, /*tokens =*/ nullptr, /*embd =*/ embd, - /*n_embd =*/ n_embd, /*pos =*/ pos.data(), /*n_seq_id =*/ n_seq_id.data(), /*seq_id =*/ seq_ids.data(), @@ -498,7 +497,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_ n_eval = n_batch; } float * embd = image_embed->embed+i*n_embd; - llava_embd_batch llava_batch = llava_embd_batch(embd, n_embd, n_eval, *n_past, 0); + llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0); if (llama_decode(ctx_llama, llava_batch.batch)) { LOG_ERR("%s : failed to eval\n", __func__); return false; diff --git a/llama/llama.go b/llama/llama.go index f0f2af82..1251be3a 100644 --- a/llama/llama.go +++ b/llama/llama.go @@ -17,7 +17,6 @@ package llama #include "llava.h" #include "gguf.h" -#include "mllama.h" #include "sampling_ext.h" extern bool llamaProgressCallback(float progress, void *user_data); @@ -510,63 +509,6 @@ func (c *ClipContext) NewEmbed(llamaContext *Context, data []byte) ([][]float32, return embed, nil } -type MllamaContext struct { - c *C.struct_mllama_ctx -} - -func NewMllamaContext(llamaContext *Context, modelPath string) (*MllamaContext, error) { - mp := C.CString(modelPath) - defer C.free(unsafe.Pointer(mp)) - c := C.mllama_model_load(mp, 1) - if c == nil { - return nil, fmt.Errorf("unable to load mllama model: %v", modelPath) - } - - projEmbedSize := int(C.mllama_n_embd(c)) - modelEmbedSize := llamaContext.Model().NEmbd() - if projEmbedSize != modelEmbedSize { - return nil, fmt.Errorf("projector embedding size (%d) does not match model (%d)", projEmbedSize, modelEmbedSize) - } - - return &MllamaContext{c: c}, nil -} - -func (m *MllamaContext) Free() { - C.mllama_free(m.c) -} - -func (m *MllamaContext) NewEmbed(llamaContext *Context, data []byte, aspectRatioId int) ([][]float32, error) { - img := C.mllama_image_init() - defer C.mllama_image_free(img) - - ok := bool(C.mllama_image_load_from_data(unsafe.Pointer(&data[0]), C.int(len(data)), 560, 560, 3, 4, C.int(aspectRatioId), img)) - if !ok { - return nil, errors.New("unable to load mllama image data") - } - - rows := make([]float32, m.EmbedSize(llamaContext)) - ok = bool(C.mllama_image_encode(m.c, C.int(llamaContext.numThreads), img, (*C.float)(unsafe.Pointer(&rows[0])))) - if !ok { - return nil, errors.New("unable to make mllama embedding from image") - } - - embed := make([][]float32, 1) - embed[0] = rows - - return embed, nil -} - -func (m *MllamaContext) EmbedSize(llamaContext *Context) int { - numTokens := int(C.mllama_n_positions(m.c) * C.mllama_n_tiles(m.c)) - numEmbed := llamaContext.Model().NEmbd() - - return numTokens * numEmbed -} - -func (c *Context) SetCrossAttention(state bool) { - C.llama_set_cross_attention(c.c, C.bool(state)) -} - func (c *Context) Synchronize() { C.llama_synchronize(c.c) } diff --git a/llama/mllama.cpp b/llama/mllama.cpp deleted file mode 100644 index 1ba8f5be..00000000 --- a/llama/mllama.cpp +++ /dev/null @@ -1,887 +0,0 @@ -// NOTE: This is modified from clip.cpp for Mllama only 
-#include "mllama.h" - -#include "ggml-alloc.h" -#include "ggml-backend.h" -#include "ggml-cpu.h" -#include "ggml.h" -#include "gguf.h" - -#ifdef GGML_USE_CUDA -#include "ggml-cuda.h" -#endif - -#ifdef GGML_USE_METAL -#include "ggml-metal.h" -#endif - -#ifdef GGML_USE_CANN -#include "ggml-cann.h" -#endif - -#ifdef GGML_USE_VULKAN -#include "ggml-vulkan.h" -#endif - -#include -#include -#include -#include -#include -#include -#include -#include - -#define REQUIRE(x) \ - do { \ - if (!(x)) { \ - throw std::runtime_error("REQUIRE failed: " #x); \ - } \ - } while (0) - -#define LOG(fmt, ...) fprintf(stderr, "%s: " fmt "\n", __func__, ##__VA_ARGS__) - -#if defined(_WIN32) -#define WIN32_LEAN_AND_MEAN -#ifndef NOMINMAX - #define NOMINMAX -#endif -#include -#if __GLIBCXX__ -#include -#include -#include -#endif -#endif - -struct mllama_image { - int width; - int height; - - int num_channels = 3; - int num_tiles = 4; - - int aspect_ratio_id; - - std::vector data; -}; - -static std::string format(const char *fmt, ...) { - va_list args; - va_start(args, fmt); - std::vector b(128); - int n = vsnprintf(b.data(), b.size(), fmt, args); - REQUIRE(n >= 0 && n < b.size()); - va_end(args); - return std::string(b.data(), b.size()); -} - -// -// utilities to get data from a gguf file -// - -static int get_key_index(const gguf_context *ctx, const char *key) { - int key_index = gguf_find_key(ctx, key); - REQUIRE(key_index != -1); - return key_index; -} - -static std::vector get_u32_array(const gguf_context *ctx, const std::string &key) { - const int i = get_key_index(ctx, key.c_str()); - const int n = gguf_get_arr_n(ctx, i); - const uint32_t *data = (uint32_t *)gguf_get_arr_data(ctx, i); - - std::vector s(n); - for (size_t j = 0; j < s.size(); j++) { - s[j] = data[j]; - } - - return s; -} - -static uint32_t get_u32(const gguf_context *ctx, const std::string &key) { - return gguf_get_val_u32(ctx, get_key_index(ctx, key.c_str())); -} - -static float get_f32(const gguf_context *ctx, const std::string &key) { - return gguf_get_val_f32(ctx, get_key_index(ctx, key.c_str())); -} - -static std::string get_ftype(int ftype) { - return ggml_type_name(static_cast(ftype)); -} - -// -// mllama layers -// - -struct mllama_hparams { - uint32_t image_size; - uint32_t patch_size; - uint32_t hidden_size; - uint32_t n_intermediate; - uint32_t projection_dim; - uint32_t n_head; - uint32_t n_layer; - uint32_t n_global_layer; - uint32_t n_tiles; - - float eps; - - std::vector intermediate_layers; -}; - -struct mllama_layer { - // attention - struct ggml_tensor *k_w; - struct ggml_tensor *k_b; - struct ggml_tensor *q_w; - struct ggml_tensor *q_b; - struct ggml_tensor *v_w; - struct ggml_tensor *v_b; - - struct ggml_tensor *o_w; - struct ggml_tensor *o_b; - - struct ggml_tensor *attn_gate; - - // layernorm 1 - struct ggml_tensor *ln_1_w; - struct ggml_tensor *ln_1_b; - - // ff - struct ggml_tensor *ff_i_w; - struct ggml_tensor *ff_i_b; - - struct ggml_tensor *ff_o_w; - struct ggml_tensor *ff_o_b; - - struct ggml_tensor *ff_gate; - - // layernorm 2 - struct ggml_tensor *ln_2_w; - struct ggml_tensor *ln_2_b; -}; - -struct mllama_vision_model { - struct mllama_hparams hparams; - - // embeddings - struct ggml_tensor *class_embedding; - struct ggml_tensor *patch_embeddings; - struct ggml_tensor *position_embeddings; - struct ggml_tensor *position_embeddings_gate; - struct ggml_tensor *tile_position_embeddings; - struct ggml_tensor *tile_position_embeddings_gate; - struct ggml_tensor *pre_tile_position_embeddings; - struct ggml_tensor 
*pre_tile_position_embeddings_gate; - struct ggml_tensor *post_tile_position_embeddings; - struct ggml_tensor *post_tile_position_embeddings_gate; - - struct ggml_tensor *pre_ln_w; - struct ggml_tensor *pre_ln_b; - - std::vector layers; - std::vector global_layers; - - struct ggml_tensor *post_ln_w; - struct ggml_tensor *post_ln_b; - - struct ggml_tensor *mm_0_w; - struct ggml_tensor *mm_0_b; -}; - -struct mllama_ctx { - struct mllama_vision_model vision_model; - - uint32_t ftype = 1; - - struct gguf_context *ctx_gguf; - struct ggml_context *ctx_data; - - std::vector buf_compute_meta; - - // memory buffers to evaluate the model - ggml_backend_buffer_t params_buffer = nullptr; - - ggml_backend_t backend = nullptr; - ggml_gallocr_t compute_alloc = nullptr; -}; - -static ggml_tensor *mllama_image_build_encoder_layer( - struct ggml_context *ctx0, const size_t il, const struct mllama_layer &layer, struct ggml_tensor *embeddings, - const float eps, const int hidden_size, const int batch_size, const int n_head, const int d_head) { - struct ggml_tensor *cur = embeddings; - - { - // layernorm1 - cur = ggml_norm(ctx0, cur, eps); - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.ln_1_w), layer.ln_1_b); - ggml_set_name(cur, format("%d pre layernorm", il).c_str()); - } - - { - // self-attention - struct ggml_tensor *Q = ggml_mul_mat(ctx0, layer.q_w, cur); - if (layer.q_b != nullptr) { - Q = ggml_add(ctx0, Q, layer.q_b); - } - - Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, Q->ne[1], batch_size); - Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); - ggml_set_name(Q, format("%d query", il).c_str()); - - struct ggml_tensor *K = ggml_mul_mat(ctx0, layer.k_w, cur); - if (layer.k_b != nullptr) { - K = ggml_add(ctx0, K, layer.k_b); - } - - K = ggml_reshape_4d(ctx0, K, d_head, n_head, K->ne[1], batch_size); - K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); - ggml_set_name(K, format("%d key", il).c_str()); - - struct ggml_tensor *V = ggml_mul_mat(ctx0, layer.v_w, cur); - if (layer.v_b != nullptr) { - V = ggml_add(ctx0, V, layer.v_b); - } - - V = ggml_reshape_4d(ctx0, V, d_head, n_head, V->ne[1], batch_size); - V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); - ggml_set_name(V, format("%d value", il).c_str()); - - struct ggml_tensor *KQ = ggml_mul_mat(ctx0, K, Q); - KQ = ggml_scale_inplace(ctx0, KQ, 1.0f / sqrtf((float)d_head)); - KQ = ggml_soft_max_inplace(ctx0, KQ); - ggml_set_name(KQ, format("%d KQ", il).c_str()); - - struct ggml_tensor *KQV = ggml_mul_mat(ctx0, V, KQ); - KQV = ggml_reshape_4d(ctx0, KQV, d_head, KQV->ne[1], n_head, batch_size); - KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - KQV = ggml_cont_3d(ctx0, KQV, hidden_size, KQV->ne[2], batch_size); - ggml_set_name(KQV, format("%d KQV", il).c_str()); - - cur = ggml_mul_mat(ctx0, layer.o_w, KQV); - if (layer.o_b != nullptr) { - cur = ggml_add(ctx0, cur, layer.o_b); - } - ggml_set_name(cur, format("%d self attention", il).c_str()); - - if (layer.attn_gate != nullptr) { - cur = ggml_mul_inplace(ctx0, cur, layer.attn_gate); - ggml_set_name(cur, format("%d self attention gate", il).c_str()); - } - } - - cur = ggml_add(ctx0, cur, embeddings); - ggml_set_name(cur, format("%d residual", il).c_str()); - - embeddings = cur; - - { - // layernorm2 - cur = ggml_norm(ctx0, cur, eps); - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.ln_2_w), layer.ln_2_b); - ggml_set_name(cur, format("%d post layernorm", il).c_str()); - } - - { - // feed forward - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.ff_i_w, cur), layer.ff_i_b); - cur = 
ggml_gelu_inplace(ctx0, cur); - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.ff_o_w, cur), layer.ff_o_b); - ggml_set_name(cur, format("%d feed forward", il).c_str()); - - if (layer.ff_gate != nullptr) { - cur = ggml_mul_inplace(ctx0, cur, layer.ff_gate); - ggml_set_name(cur, format("%d feed forward gate", il).c_str()); - } - } - - // residual 2 - cur = ggml_add(ctx0, cur, embeddings); - ggml_set_name(cur, format("%d residual", il).c_str()); - - embeddings = cur; - - return embeddings; -} - -static ggml_cgraph *mllama_image_build_graph(mllama_ctx *ctx, const mllama_image_batch *imgs) { - const auto &model = ctx->vision_model; - const auto &hparams = model.hparams; - - const int image_size = hparams.image_size; - const int image_size_width = image_size; - const int image_size_height = image_size; - - const int patch_size = hparams.patch_size; - const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); - const int num_positions = num_patches + (model.class_embedding == nullptr ? 0 : 1); - const int hidden_size = hparams.hidden_size; - const int n_head = hparams.n_head; - const int d_head = hidden_size / n_head; - - const int batch_size = imgs->size; - REQUIRE(batch_size == 1); - - int num_tiles = 4; - int num_channels = 3; - if (imgs->data != nullptr) { - num_tiles = imgs->data[0].num_tiles > 0 ? imgs->data[0].num_tiles : num_tiles; - num_channels = imgs->data[0].num_channels > 0 ? imgs->data[0].num_channels : num_channels; - } - - struct ggml_init_params params = { - ctx->buf_compute_meta.size(), // mem_size - ctx->buf_compute_meta.data(), // mem_buffer - true, // no_alloc - }; - - struct ggml_context *ctx0 = ggml_init(params); - struct ggml_cgraph *gf = ggml_new_graph(ctx0); - - struct ggml_tensor *inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, num_channels, num_tiles); - ggml_set_name(inp_raw, "inp_raw"); - ggml_set_input(inp_raw); - - struct ggml_tensor *inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - - inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, num_tiles); - inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); - - struct ggml_tensor *aspect_ratios = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, imgs->size); - ggml_set_name(aspect_ratios, "aspect_ratios"); - ggml_set_input(aspect_ratios); - - if (model.pre_tile_position_embeddings != nullptr) { - struct ggml_tensor *pre_tile_position_embeddings = ggml_get_rows(ctx0, model.pre_tile_position_embeddings, aspect_ratios); - ggml_set_name(pre_tile_position_embeddings, "pre_tile_position_embeddings"); - - pre_tile_position_embeddings = ggml_reshape_3d(ctx0, pre_tile_position_embeddings, hidden_size, 1, num_tiles); - if (model.pre_tile_position_embeddings_gate != nullptr) { - pre_tile_position_embeddings = ggml_mul_inplace(ctx0, pre_tile_position_embeddings, model.pre_tile_position_embeddings_gate); - } - - inp = ggml_add(ctx0, inp, pre_tile_position_embeddings); - } - - struct ggml_tensor *embeddings = inp; - - if (model.class_embedding != nullptr) { - // concat class_embeddings and patch_embeddings - embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, num_tiles); - ggml_set_name(embeddings, "embeddings"); - ggml_set_input(embeddings); - for (int i = 0; i < num_tiles; ++i) { - // repeat class embeddings for each tile - embeddings = ggml_acc_inplace(ctx0, embeddings, model.class_embedding, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], i * embeddings->nb[2]); - } - - 
embeddings = ggml_acc_inplace(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]); - } - - struct ggml_tensor *positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions); - ggml_set_name(positions, "positions"); - ggml_set_input(positions); - - struct ggml_tensor *position_embd = ggml_get_rows(ctx0, model.position_embeddings, positions); - if (model.position_embeddings_gate != nullptr) { - position_embd = ggml_mul_inplace(ctx0, position_embd, model.position_embeddings_gate); - } - - embeddings = ggml_add(ctx0, embeddings, position_embd); - - if (model.tile_position_embeddings != nullptr) { - struct ggml_tensor *tile_position_embeddings = ggml_get_rows(ctx0, model.tile_position_embeddings, aspect_ratios); - ggml_set_name(tile_position_embeddings, "tile_position_embeddings"); - - tile_position_embeddings = ggml_reshape_3d(ctx0, tile_position_embeddings, hidden_size, num_positions, num_tiles); - if (model.tile_position_embeddings_gate != nullptr) { - tile_position_embeddings = ggml_mul_inplace(ctx0, tile_position_embeddings, model.tile_position_embeddings_gate); - } - - embeddings = ggml_add(ctx0, embeddings, tile_position_embeddings); - } - - // pre-layernorm - if (model.pre_ln_w != nullptr) { - embeddings = ggml_mul(ctx0, ggml_norm(ctx0, embeddings, hparams.eps), model.pre_ln_w); - if (model.pre_ln_b != nullptr) { - embeddings = ggml_add(ctx0, embeddings, model.pre_ln_b); - } - - ggml_set_name(embeddings, "pre layernorm"); - } - - const int num_padding_patches = 8 - (embeddings->ne[1] % 8) % 8; - - embeddings = ggml_pad(ctx0, embeddings, 0, num_padding_patches, 0, 0); - embeddings = ggml_view_3d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1] * embeddings->ne[2], batch_size, embeddings->nb[1], embeddings->nb[2] * embeddings->ne[3], 0); - - std::vector intermediate_embeddings; - - // encoder - for (size_t il = 0; il < model.layers.size(); il++) { - if (hparams.intermediate_layers[il]) { - intermediate_embeddings.push_back(embeddings); - } - - embeddings = mllama_image_build_encoder_layer( - ctx0, il, model.layers[il], embeddings, - hparams.eps, hidden_size, batch_size, n_head, d_head); - } - - // post-layernorm - if (model.post_ln_w != nullptr) { - embeddings = ggml_mul(ctx0, ggml_norm(ctx0, embeddings, hparams.eps), model.post_ln_w); - if (model.post_ln_b != nullptr) { - embeddings = ggml_add(ctx0, embeddings, model.post_ln_b); - } - - ggml_set_name(embeddings, "post layernorm"); - } - - embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, num_positions + num_padding_patches, num_tiles); - - if (model.post_tile_position_embeddings != nullptr) { - struct ggml_tensor *post_tile_position_embeddings = ggml_get_rows(ctx0, model.post_tile_position_embeddings, aspect_ratios); - ggml_set_name(post_tile_position_embeddings, "post_tile_position_embeddings"); - - post_tile_position_embeddings = ggml_reshape_3d(ctx0, post_tile_position_embeddings, hidden_size, 1, num_tiles); - if (model.post_tile_position_embeddings_gate != nullptr) { - post_tile_position_embeddings = ggml_mul(ctx0, post_tile_position_embeddings, model.post_tile_position_embeddings_gate); - } - - embeddings = ggml_add(ctx0, embeddings, post_tile_position_embeddings); - } - - embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, num_tiles * (num_positions + num_padding_patches), 1); - - // global encoder - for (size_t il = 0; il < model.global_layers.size(); il++) { - embeddings = mllama_image_build_encoder_layer( - ctx0, il, model.global_layers[il], 
embeddings, - hparams.eps, hidden_size, batch_size, n_head, d_head); - } - - struct ggml_tensor *stacked_embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 0, hidden_size, (num_positions + num_padding_patches) * num_tiles); - for (size_t i = 0; i < intermediate_embeddings.size(); ++i) { - stacked_embeddings = ggml_concat(ctx0, stacked_embeddings, ggml_reshape_3d(ctx0, intermediate_embeddings[i], 1, intermediate_embeddings[i]->ne[0], intermediate_embeddings[i]->ne[1]), 0); - } - - stacked_embeddings = ggml_reshape_4d(ctx0, stacked_embeddings, intermediate_embeddings.size() * hidden_size, num_positions + num_padding_patches, num_tiles, batch_size); - stacked_embeddings = ggml_unpad(ctx0, stacked_embeddings, 0, num_padding_patches, 0, 0); - - embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, num_positions + num_padding_patches, num_tiles); - embeddings = ggml_unpad(ctx0, embeddings, 0, num_padding_patches, 0, 0); - embeddings = ggml_concat(ctx0, embeddings, stacked_embeddings, 0); - - // mllama projector - embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_0_w, embeddings), model.mm_0_b); - ggml_set_name(embeddings, "multi modal projector"); - - // build the graph - ggml_build_forward_expand(gf, embeddings); - - ggml_free(ctx0); - - return gf; -} - -static struct ggml_tensor *mllama_tensor_load(struct ggml_context *ctx, const char *name, const bool optional) { - struct ggml_tensor *cur = ggml_get_tensor(ctx, name); - REQUIRE(cur != nullptr || optional); - return cur; -} - -static std::vector mllama_layers_load(struct ggml_context *ctx, const char *prefix, const int n) { - std::vector layers(n); - for (size_t i = 0; i < layers.size(); i++) { - auto &layer = layers[i]; - layer.ln_1_w = mllama_tensor_load(ctx, format("%s.blk.%d.ln1.weight", prefix, i).c_str(), false); - layer.ln_1_b = mllama_tensor_load(ctx, format("%s.blk.%d.ln1.bias", prefix, i).c_str(), false); - layer.ln_2_w = mllama_tensor_load(ctx, format("%s.blk.%d.ln2.weight", prefix, i).c_str(), false); - layer.ln_2_b = mllama_tensor_load(ctx, format("%s.blk.%d.ln2.bias", prefix, i).c_str(), false); - - layer.k_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_k.weight", prefix, i).c_str(), false); - layer.k_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_k.bias", prefix, i).c_str(), true); - layer.q_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_q.weight", prefix, i).c_str(), false); - layer.q_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_q.bias", prefix, i).c_str(), true); - layer.v_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_v.weight", prefix, i).c_str(), false); - layer.v_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_v.bias", prefix, i).c_str(), true); - layer.o_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_out.weight", prefix, i).c_str(), false); - layer.o_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_out.bias", prefix, i).c_str(), true); - - layer.ff_i_w = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_down.weight", prefix, i).c_str(), false); - layer.ff_i_b = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_down.bias", prefix, i).c_str(), false); - layer.ff_o_w = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_up.weight", prefix, i).c_str(), false); - layer.ff_o_b = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_up.bias", prefix, i).c_str(), false); - - layer.attn_gate = mllama_tensor_load(ctx, format("%s.blk.%d.attn_gate", prefix, i).c_str(), true); - layer.ff_gate = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_gate", prefix, i).c_str(), true); - } - - return layers; -} - -// read 
and create ggml_context containing the tensors and their data -struct mllama_ctx *mllama_model_load(const char *fname, const int verbosity = 1) { - struct ggml_context *meta = nullptr; - - struct gguf_init_params params = { - true, // no_alloc - &meta, // ctx - }; - - struct gguf_context *ctx = gguf_init_from_file(fname, params); - REQUIRE(ctx != nullptr); - - if (verbosity >= 1) { - const int n_tensors = gguf_get_n_tensors(ctx); - const int n_kv = gguf_get_n_kv(ctx); - const std::string ftype = get_ftype(get_u32(ctx, "general.file_type")); - const int idx_desc = get_key_index(ctx, "general.description"); - const std::string description = gguf_get_val_str(ctx, idx_desc); - const int idx_name = gguf_find_key(ctx, "general.name"); - if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug - const std::string name = gguf_get_val_str(ctx, idx_name); - LOG("model name: %s", name.c_str()); - } - LOG("description: %s", description.c_str()); - LOG("GGUF version: %d", gguf_get_version(ctx)); - LOG("alignment: %zu", gguf_get_alignment(ctx)); - LOG("n_tensors: %d", n_tensors); - LOG("n_kv: %d", n_kv); - LOG("ftype: %s", ftype.c_str()); - LOG(""); - } - const int n_tensors = gguf_get_n_tensors(ctx); - - mllama_ctx *new_mllama = new mllama_ctx{}; - - ggml_backend_t backend = ggml_backend_init_best(); - if (backend == nullptr) { - LOG("%s: failed to initialize backend\n", __func__); - mllama_free(new_mllama); - gguf_free(ctx); - return nullptr; - } - LOG("%s: using %s backend\n", __func__, ggml_backend_name(backend)); - new_mllama->backend = backend; - - // load tensors - { - std::vector read_buf; - struct ggml_init_params params = { - (n_tensors + 1) * ggml_tensor_overhead(), // mem_size - nullptr, // mem_buffer - true, // no_alloc - }; - - new_mllama->ctx_data = ggml_init(params); - if (!new_mllama->ctx_data) { - LOG("ggml_init() failed"); - mllama_free(new_mllama); - gguf_free(ctx); - return nullptr; - } - -#ifdef _WIN32 - int wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0); - if (!wlen) { - return NULL; - } - wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t)); - wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, wbuf, wlen); - if (!wlen) { - free(wbuf); - return NULL; - } -#if __GLIBCXX__ - int fd = _wopen(wbuf, _O_RDONLY | _O_BINARY); - __gnu_cxx::stdio_filebuf buffer(fd, std::ios_base::in); - std::istream fin(&buffer); -#else // MSVC - // unused in our current build - auto fin = std::ifstream(wbuf, std::ios::binary); -#endif - free(wbuf); -#else - auto fin = std::ifstream(fname, std::ios::binary); -#endif - if (!fin) { - LOG("cannot open model file for loading tensors\n"); - mllama_free(new_mllama); - gguf_free(ctx); - return nullptr; - } - - // add tensors to context - for (int i = 0; i < n_tensors; ++i) { - const char *name = gguf_get_tensor_name(ctx, i); - struct ggml_tensor *t = ggml_get_tensor(meta, name); - struct ggml_tensor *cur = ggml_dup_tensor(new_mllama->ctx_data, t); - ggml_set_name(cur, name); - } - - // alloc memory and offload data - new_mllama->params_buffer = ggml_backend_alloc_ctx_tensors(new_mllama->ctx_data, new_mllama->backend); - for (int i = 0; i < n_tensors; ++i) { - const char *name = gguf_get_tensor_name(ctx, i); - struct ggml_tensor *cur = ggml_get_tensor(new_mllama->ctx_data, name); - const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i); - fin.seekg(offset, std::ios::beg); - if (!fin) { - LOG("failed to seek for tensor %s\n", name); - mllama_free(new_mllama); - 
gguf_free(ctx); - return nullptr; - } - int num_bytes = ggml_nbytes(cur); - if (ggml_backend_buffer_is_host(new_mllama->params_buffer)) { - // for the CPU and Metal backend, we can read directly into the tensor - fin.read(reinterpret_cast(cur->data), num_bytes); - } else { - // read into a temporary buffer first, then copy to device memory - read_buf.resize(num_bytes); - fin.read(reinterpret_cast(read_buf.data()), num_bytes); - ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); - } - } - -#if defined(_WIN32) && defined(__GLIBCXX__) - close(fd); -#else - fin.close(); -#endif - } - - // vision model - // load vision model - auto &vision_model = new_mllama->vision_model; - auto &hparams = vision_model.hparams; - hparams.hidden_size = get_u32(ctx, "mllama.vision.embedding_length"); - hparams.n_head = get_u32(ctx, "mllama.vision.attention.head_count"); - hparams.n_intermediate = get_u32(ctx, "mllama.vision.feed_forward_length"); - hparams.n_layer = get_u32(ctx, "mllama.vision.block_count"); - hparams.n_global_layer = get_u32(ctx, "mllama.vision.global.block_count"); - hparams.n_tiles = get_u32(ctx, "mllama.vision.max_num_tiles"); - hparams.image_size = get_u32(ctx, "mllama.vision.image_size"); - hparams.patch_size = get_u32(ctx, "mllama.vision.patch_size"); - hparams.projection_dim = get_u32(ctx, "mllama.vision.projection_dim"); - hparams.eps = get_f32(ctx, "mllama.vision.attention.layer_norm_epsilon"); - - std::vector intermediate_layers_indices = get_u32_array(ctx, "mllama.vision.intermediate_layers_indices"); - hparams.intermediate_layers.resize(hparams.n_layer); - for (size_t i = 0; i < intermediate_layers_indices.size(); i++) { - hparams.intermediate_layers[intermediate_layers_indices[i]] = true; - } - - if (verbosity >= 2) { - LOG(""); - LOG("vision model hparams"); - LOG("image_size %d", hparams.image_size); - LOG("patch_size %d", hparams.patch_size); - LOG("v_hidden_size %d", hparams.hidden_size); - LOG("v_n_intermediate %d", hparams.n_intermediate); - LOG("v_projection_dim %d", hparams.projection_dim); - LOG("v_n_head %d", hparams.n_head); - LOG("v_n_layer %d", hparams.n_layer); - LOG("v_n_global_layer %d", hparams.n_global_layer); - LOG("v_eps %f", hparams.eps); - } - - vision_model.class_embedding = mllama_tensor_load(new_mllama->ctx_data, "v.class_embd", true); - vision_model.patch_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.patch_embd.weight", true); - - vision_model.position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.position_embd.weight", true); - vision_model.position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.position_embd.gate", true); - - vision_model.pre_ln_w = mllama_tensor_load(new_mllama->ctx_data, "v.pre_ln.weight", true); - vision_model.pre_ln_b = mllama_tensor_load(new_mllama->ctx_data, "v.pre_ln.bias", true); - vision_model.post_ln_w = mllama_tensor_load(new_mllama->ctx_data, "v.post_ln.weight", true); - vision_model.post_ln_b = mllama_tensor_load(new_mllama->ctx_data, "v.post_ln.bias", true); - - vision_model.tile_position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.tile_position_embd.weight", true); - vision_model.tile_position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.tile_position_embd.gate", true); - - vision_model.pre_tile_position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.pre_tile_position_embd.weight", true); - vision_model.pre_tile_position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.pre_tile_position_embd.gate", true); - - 
vision_model.post_tile_position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.post_tile_position_embd.weight", true); - vision_model.post_tile_position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.post_tile_position_embd.gate", true); - - vision_model.mm_0_w = mllama_tensor_load(new_mllama->ctx_data, "mm.0.weight", false); - vision_model.mm_0_b = mllama_tensor_load(new_mllama->ctx_data, "mm.0.bias", false); - - vision_model.layers = mllama_layers_load(new_mllama->ctx_data, "v", hparams.n_layer); - vision_model.global_layers = mllama_layers_load(new_mllama->ctx_data, "v.global", hparams.n_global_layer); - - ggml_free(meta); - - new_mllama->ctx_gguf = ctx; - - { - // measure mem requirement and allocate - new_mllama->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead()); - new_mllama->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_mllama->backend)); - struct mllama_image_batch batch; - batch.size = 1; - ggml_cgraph *gf = mllama_image_build_graph(new_mllama, &batch); - ggml_gallocr_reserve(new_mllama->compute_alloc, gf); - size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_mllama->compute_alloc, 0); - LOG("compute allocated memory: %.2f MB", compute_memory_buffer_size / 1024.0 / 1024.0); - } - - return new_mllama; -} - -struct mllama_image *mllama_image_init() { - return new mllama_image(); -} - -void mllama_image_free(struct mllama_image *img) { delete img; } -void mllama_image_batch_free(struct mllama_image_batch *batch) { - if (batch->size > 0) { - delete[] batch->data; - batch->size = 0; - } -} - -bool mllama_image_load_from_data(const void *data, const int n, const int width, const int height, const int num_channels, const int num_tiles, const int aspect_ratio_id, struct mllama_image *img) { - img->width = width; - img->height = height; - img->num_channels = num_channels; - img->num_tiles = num_tiles; - img->aspect_ratio_id = aspect_ratio_id; - img->data.resize(n); - - memcpy(img->data.data(), data, n); - return true; -} - -inline int mllama(int x, int lower, int upper) { - return std::max(lower, std::min(x, upper)); -} - -void mllama_free(mllama_ctx *ctx) { - ggml_free(ctx->ctx_data); - gguf_free(ctx->ctx_gguf); - - ggml_backend_buffer_free(ctx->params_buffer); - ggml_backend_free(ctx->backend); - ggml_gallocr_free(ctx->compute_alloc); - delete ctx; -} - -bool mllama_image_encode(struct mllama_ctx *ctx, const int n_threads, mllama_image *img, float *vec) { - mllama_image_batch imgs{}; - imgs.size = 1; - imgs.data = img; - return mllama_image_batch_encode(ctx, n_threads, &imgs, vec); -} - -bool mllama_image_batch_encode(mllama_ctx *ctx, const int n_threads, const mllama_image_batch *imgs, float *vec) { - int batch_size = imgs->size; - REQUIRE(batch_size == 1); - - // build the inference graph - ggml_cgraph *gf = mllama_image_build_graph(ctx, imgs); - ggml_gallocr_alloc_graph(ctx->compute_alloc, gf); - - // set inputs - const auto &model = ctx->vision_model; - const auto &hparams = model.hparams; - - const int image_size = hparams.image_size; - int image_size_width = image_size; - int image_size_height = image_size; - - const int patch_size = hparams.patch_size; - const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); - const int num_positions = num_patches + (model.class_embedding == nullptr ? 
0 : 1); - - { - struct ggml_tensor *inp_raw = ggml_graph_get_tensor(gf, "inp_raw"); - ggml_backend_tensor_set(inp_raw, imgs->data[0].data.data(), 0, ggml_nbytes(inp_raw)); - } - - { - struct ggml_tensor *embeddings = ggml_graph_get_tensor(gf, "embeddings"); - if (embeddings != nullptr) { - void *zeros = malloc(ggml_nbytes(embeddings)); - memset(zeros, 0, ggml_nbytes(embeddings)); - ggml_backend_tensor_set(embeddings, zeros, 0, ggml_nbytes(embeddings)); - free(zeros); - } - } - - { - struct ggml_tensor *positions = ggml_graph_get_tensor(gf, "positions"); - if (positions != nullptr) { - int *positions_data = (int *)malloc(ggml_nbytes(positions)); - for (int i = 0; i < num_positions; i++) { - positions_data[i] = i; - } - ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); - free(positions_data); - } - } - - { - struct ggml_tensor *aspect_ratios = ggml_graph_get_tensor(gf, "aspect_ratios"); - if (aspect_ratios != nullptr) { - int *aspect_ratios_data = (int *)malloc(ggml_nbytes(aspect_ratios)); - aspect_ratios_data[0] = imgs->data[0].aspect_ratio_id; - ggml_backend_tensor_set(aspect_ratios, aspect_ratios_data, 0, ggml_nbytes(aspect_ratios)); - free(aspect_ratios_data); - } - } - - if (ggml_backend_is_cpu(ctx->backend)) { - ggml_backend_cpu_set_n_threads(ctx->backend, n_threads); - } - - ggml_backend_graph_compute(ctx->backend, gf); - - // the last node is the embedding tensor - struct ggml_tensor *embeddings = ggml_graph_node(gf, ggml_graph_n_nodes(gf) - 1); - - // copy the embeddings to the location passed by the user - ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); - - return true; -} - -int32_t mllama_image_size(const struct mllama_ctx *ctx) { - return ctx->vision_model.hparams.image_size; -} - -int32_t mllama_patch_size(const struct mllama_ctx *ctx) { - return ctx->vision_model.hparams.patch_size; -} - -int32_t mllama_hidden_size(const struct mllama_ctx *ctx) { - return ctx->vision_model.hparams.hidden_size; -} - -int mllama_n_patches(const struct mllama_ctx *ctx) { - const auto &hparams = ctx->vision_model.hparams; - return (hparams.image_size / hparams.patch_size) * (hparams.image_size / hparams.patch_size); -} - -int mllama_n_positions(const struct mllama_ctx *ctx) { - return mllama_n_patches(ctx) + (ctx->vision_model.class_embedding == nullptr ? 
0 : 1); -} - -int mllama_n_tiles(const struct mllama_ctx *ctx) { - return ctx->vision_model.hparams.n_tiles; -} - -int mllama_n_embd(const struct mllama_ctx *ctx) { - return ctx->vision_model.hparams.projection_dim; -} - -size_t mllama_n_embd_bytes(const struct mllama_ctx *ctx) { - return mllama_n_positions(ctx) * mllama_n_embd(ctx) * mllama_n_tiles(ctx) * sizeof(float); -} diff --git a/llama/mllama.h b/llama/mllama.h deleted file mode 100644 index 446dbb9e..00000000 --- a/llama/mllama.h +++ /dev/null @@ -1,61 +0,0 @@ -#ifndef MLLAMA_H -#define MLLAMA_H - -#include -#include - -#ifdef LLAMA_SHARED -#if defined(_WIN32) && !defined(__MINGW32__) -#ifdef LLAMA_BUILD -#define MLLAMA_API __declspec(dllexport) -#else -#define MLLAMA_API __declspec(dllimport) -#endif -#else -#define MLLAMA_API __attribute__((visibility("default"))) -#endif -#else -#define MLLAMA_API -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -struct mllama_ctx; - -struct mllama_image_batch { - struct mllama_image *data; - size_t size; -}; - -MLLAMA_API struct mllama_ctx *mllama_model_load(const char *fname, int verbosity); -MLLAMA_API struct mllama_ctx *mllama_model_load_cpu(const char *fname, int verbosity); - -MLLAMA_API void mllama_free(struct mllama_ctx *ctx); - -MLLAMA_API int32_t mllama_image_size(const struct mllama_ctx *ctx); -MLLAMA_API int32_t mllama_patch_size(const struct mllama_ctx *ctx); -MLLAMA_API int32_t mllama_hidden_size(const struct mllama_ctx *ctx); - -MLLAMA_API int mllama_n_patches(const struct mllama_ctx *ctx); -MLLAMA_API int mllama_n_positions(const struct mllama_ctx *ctx); -MLLAMA_API int mllama_n_tiles(const struct mllama_ctx *ctx); -MLLAMA_API int mllama_n_embd(const struct mllama_ctx *ctx); -MLLAMA_API size_t mllama_n_embd_bytes(const struct mllama_ctx *ctx); - -MLLAMA_API struct mllama_image *mllama_image_init(); - -MLLAMA_API void mllama_image_free(struct mllama_image *img); -MLLAMA_API void mllama_image_batch_free(struct mllama_image_batch *batch); - -MLLAMA_API bool mllama_image_load_from_data(const void *data, const int n, const int nx, const int ny, const int nc, const int nt, const int aspect_ratio_id, struct mllama_image *img); - -MLLAMA_API bool mllama_image_encode(struct mllama_ctx *ctx, int n_threads, struct mllama_image *img, float *vec); -MLLAMA_API bool mllama_image_batch_encode(struct mllama_ctx *ctx, int n_threads, const struct mllama_image_batch *imgs, float *vec); - -#ifdef __cplusplus -} -#endif - -#endif // MLLAMA_H diff --git a/llama/patches/0005-solar-pro.patch b/llama/patches/0005-solar-pro.patch index c630f243..deb53c22 100644 --- a/llama/patches/0005-solar-pro.patch +++ b/llama/patches/0005-solar-pro.patch @@ -270,7 +270,7 @@ index 3a4e72a3..831b68c0 100644 + // self-attention + { + // rope freq factors for llama3; may return nullptr for llama2 and other models -+ ggml_tensor * rope_factors = static_cast(memory)->cbs.get_rope_factors(n_ctx_per_seq, il); ++ ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il); + + // compute Q and K and RoPE them + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); diff --git a/llama/patches/0006-add-mllama-support.patch b/llama/patches/0006-add-mllama-support.patch deleted file mode 100644 index 05f85ec3..00000000 --- a/llama/patches/0006-add-mllama-support.patch +++ /dev/null @@ -1,1027 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: jmorganca -Date: Sun, 20 Apr 2025 16:12:36 -0700 -Subject: [PATCH] add mllama support - -adds support for the llama 3.2 vision 
architecture ---- - ggml/src/ggml-backend-reg.cpp | 6 +- - include/llama.h | 6 + - src/llama-arch.cpp | 44 +++++ - src/llama-arch.h | 10 ++ - src/llama-batch.cpp | 3 + - src/llama-context.cpp | 23 ++- - src/llama-context.h | 1 + - src/llama-cparams.h | 1 + - src/llama-graph.cpp | 25 +++ - src/llama-graph.h | 12 ++ - src/llama-hparams.cpp | 4 + - src/llama-hparams.h | 7 + - src/llama-kv-cache.cpp | 14 +- - src/llama-model-loader.cpp | 2 + - src/llama-model.cpp | 311 +++++++++++++++++++++++++++++++++- - src/llama-model.h | 12 ++ - src/llama-quant.cpp | 4 +- - tools/mtmd/llava.cpp | 5 +- - tools/mtmd/mtmd-helper.cpp | 7 +- - 19 files changed, 475 insertions(+), 22 deletions(-) - -diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp -index 405d8e31..82ae1b5b 100644 ---- a/ggml/src/ggml-backend-reg.cpp -+++ b/ggml/src/ggml-backend-reg.cpp -@@ -178,9 +178,9 @@ struct ggml_backend_registry { - #ifdef GGML_USE_CANN - register_backend(ggml_backend_cann_reg()); - #endif --#ifdef GGML_USE_BLAS -- register_backend(ggml_backend_blas_reg()); --#endif -+// #ifdef GGML_USE_BLAS -+// register_backend(ggml_backend_blas_reg()); -+// #endif - #ifdef GGML_USE_RPC - register_backend(ggml_backend_rpc_reg()); - #endif -diff --git a/include/llama.h b/include/llama.h -index abedebdb..41beef21 100644 ---- a/include/llama.h -+++ b/include/llama.h -@@ -258,6 +258,7 @@ extern "C" { - - llama_token * token; - float * embd; -+ int32_t n_embd; - llama_pos * pos; - int32_t * n_seq_id; - llama_seq_id ** seq_id; -@@ -365,6 +366,7 @@ extern "C" { - bool flash_attn; // whether to use flash attention [EXPERIMENTAL] - bool no_perf; // whether to measure performance timings - bool op_offload; // whether to offload host tensor operations to device -+ bool cross_attn; // whether to use cross attention - }; - - // model quantization parameters -@@ -464,6 +466,10 @@ extern "C" { - struct llama_context_params params), - "use llama_init_from_model instead"); - -+ // TODO (jmorganca): this should most likely be passed in as part of a batch -+ // and not set on the context for all batches. 
-+ LLAMA_API void llama_set_cross_attention(struct llama_context * ctx, bool cross_attn_state); -+ - // Frees all allocated memory - LLAMA_API void llama_free(struct llama_context * ctx); - -diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp -index 5ab3f572..eb7b5325 100644 ---- a/src/llama-arch.cpp -+++ b/src/llama-arch.cpp -@@ -6,6 +6,7 @@ - - static const std::map LLM_ARCH_NAMES = { - { LLM_ARCH_LLAMA, "llama" }, -+ { LLM_ARCH_MLLAMA, "mllama" }, - { LLM_ARCH_LLAMA4, "llama4" }, - { LLM_ARCH_DECI, "deci" }, - { LLM_ARCH_FALCON, "falcon" }, -@@ -144,6 +145,7 @@ static const std::map LLM_KV_NAMES = { - { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" }, - { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, - { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" }, -+ { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" }, - { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" }, - { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" }, - -@@ -273,6 +275,40 @@ static const std::map> LLM_TENSOR_N - { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, - }, - }, -+ { -+ LLM_ARCH_MLLAMA, -+ { -+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, -+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, -+ { LLM_TENSOR_OUTPUT, "output" }, -+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, -+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, -+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, -+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, -+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, -+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, -+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, -+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, -+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, -+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, -+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, -+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, -+ { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" }, -+ { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" }, -+ { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" }, -+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, -+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, -+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, -+ { LLM_TENSOR_CROSS_ATTN_K_NORM, "blk.%d.cross_attn_k_norm" }, -+ { LLM_TENSOR_CROSS_ATTN_K_PROJ, "blk.%d.cross_attn_k_proj" }, -+ { LLM_TENSOR_CROSS_ATTN_O_PROJ, "blk.%d.cross_attn_o_proj" }, -+ { LLM_TENSOR_CROSS_ATTN_Q_NORM, "blk.%d.cross_attn_q_norm" }, -+ { LLM_TENSOR_CROSS_ATTN_Q_PROJ, "blk.%d.cross_attn_q_proj" }, -+ { LLM_TENSOR_CROSS_ATTN_V_PROJ, "blk.%d.cross_attn_v_proj" }, -+ { LLM_TENSOR_CROSS_ATTN_ATTN_GATE, "blk.%d.cross_attn_attn_gate" }, -+ { LLM_TENSOR_CROSS_ATTN_MLP_GATE, "blk.%d.cross_attn_mlp_gate" }, -+ }, -+ }, - { - LLM_ARCH_DECI, - { -@@ -1701,6 +1737,14 @@ static const std::map LLM_TENSOR_INFOS = { - // this tensor is loaded for T5, but never used - {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}}, - {LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, -+ {LLM_TENSOR_CROSS_ATTN_K_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, -+ {LLM_TENSOR_CROSS_ATTN_K_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, -+ {LLM_TENSOR_CROSS_ATTN_O_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, -+ {LLM_TENSOR_CROSS_ATTN_Q_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, -+ {LLM_TENSOR_CROSS_ATTN_Q_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, -+ {LLM_TENSOR_CROSS_ATTN_V_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, -+ 
{LLM_TENSOR_CROSS_ATTN_ATTN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, -+ {LLM_TENSOR_CROSS_ATTN_MLP_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, - {LLM_TENSOR_CONV1D, {LLM_TENSOR_LAYER_INPUT, GGML_OP_IM2COL}}, - {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, - {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, -diff --git a/src/llama-arch.h b/src/llama-arch.h -index 525c1b7d..bc8a4f0b 100644 ---- a/src/llama-arch.h -+++ b/src/llama-arch.h -@@ -11,6 +11,7 @@ - enum llm_arch { - LLM_ARCH_LLAMA, - LLM_ARCH_LLAMA4, -+ LLM_ARCH_MLLAMA, - LLM_ARCH_DECI, - LLM_ARCH_FALCON, - LLM_ARCH_BAICHUAN, -@@ -148,6 +149,7 @@ enum llm_kv { - LLM_KV_ATTENTION_SLIDING_WINDOW, - LLM_KV_ATTENTION_SCALE, - LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, -+ LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, - LLM_KV_ATTENTION_KEY_LENGTH_MLA, - LLM_KV_ATTENTION_VALUE_LENGTH_MLA, - -@@ -349,6 +351,14 @@ enum llm_tensor { - LLM_TENSOR_CLS, - LLM_TENSOR_CLS_OUT, - LLM_TENSOR_BSKCN_TV, -+ LLM_TENSOR_CROSS_ATTN_K_NORM, -+ LLM_TENSOR_CROSS_ATTN_K_PROJ, -+ LLM_TENSOR_CROSS_ATTN_O_PROJ, -+ LLM_TENSOR_CROSS_ATTN_Q_NORM, -+ LLM_TENSOR_CROSS_ATTN_Q_PROJ, -+ LLM_TENSOR_CROSS_ATTN_V_PROJ, -+ LLM_TENSOR_CROSS_ATTN_ATTN_GATE, -+ LLM_TENSOR_CROSS_ATTN_MLP_GATE, - LLM_TENSOR_CONV1D, - LLM_TENSOR_CONVNEXT_DW, - LLM_TENSOR_CONVNEXT_NORM, -diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp -index a88b2fe3..241b316e 100644 ---- a/src/llama-batch.cpp -+++ b/src/llama-batch.cpp -@@ -320,6 +320,7 @@ struct llama_batch llama_batch_get_one( - /*n_tokens =*/ n_tokens, - /*tokens =*/ tokens, - /*embd =*/ nullptr, -+ /*n_embd =*/ 0, - /*pos =*/ nullptr, - /*n_seq_id =*/ nullptr, - /*seq_id =*/ nullptr, -@@ -332,6 +333,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_ - /*n_tokens =*/ 0, - /*tokens =*/ nullptr, - /*embd =*/ nullptr, -+ /*n_embd =*/ 0, - /*pos =*/ nullptr, - /*n_seq_id =*/ nullptr, - /*seq_id =*/ nullptr, -@@ -340,6 +342,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_ - - if (embd) { - batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd); -+ batch.n_embd = embd; - } else { - batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc); - } -diff --git a/src/llama-context.cpp b/src/llama-context.cpp -index dca22d8b..c22687e4 100644 ---- a/src/llama-context.cpp -+++ b/src/llama-context.cpp -@@ -514,7 +514,7 @@ float * llama_context::get_logits_ith(int32_t i) { - throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs)); - } - -- return logits + j*model.vocab.n_tokens(); -+ return logits + j*model.hparams.n_vocab; - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what()); - #ifndef NDEBUG -@@ -632,6 +632,10 @@ void llama_context::set_warmup(bool value) { - cparams.warmup = value; - } - -+void llama_context::set_cross_attn(bool value) { -+ cparams.cross_attn = value; -+} -+ - void llama_context::set_adapter_lora( - llama_adapter_lora * adapter, - float scale) { -@@ -709,7 +713,7 @@ int llama_context::encode(llama_batch & inp_batch) { - - const int64_t n_embd = hparams.n_embd; - -- llama_sbatch sbatch = llama_sbatch(batch, n_embd, /* simple_split */ true, /* logits_all */ true); -+ llama_sbatch sbatch = llama_sbatch(batch, batch.n_embd, /* simple_split */ true, /* logits_all */ true); - - const llama_ubatch ubatch = sbatch.split_simple(n_tokens); - -@@ -863,10 +867,9 @@ int 
llama_context::decode(llama_batch & inp_batch) { - - const llama_batch & batch = batch_allocr.batch; - -- const auto & vocab = model.vocab; - const auto & hparams = model.hparams; - -- const int32_t n_vocab = vocab.n_tokens(); -+ const int32_t n_vocab = hparams.n_vocab; - - const int64_t n_tokens_all = batch.n_tokens; - const int64_t n_embd = hparams.n_embd; -@@ -1087,7 +1090,7 @@ int llama_context::decode(llama_batch & inp_batch) { - // make the outputs have the same order they had in the user-provided batch - // note: this is mostly relevant for recurrent models atm - if (!sorted_output) { -- const uint32_t n_vocab = model.vocab.n_tokens(); -+ const uint32_t n_vocab = model.hparams.n_vocab; - const uint32_t n_embd = model.hparams.n_embd; - - GGML_ASSERT((size_t) n_outputs == out_ids.size()); -@@ -1142,12 +1145,11 @@ int llama_context::decode(llama_batch & inp_batch) { - - int32_t llama_context::output_reserve(int32_t n_outputs) { - const auto & hparams = model.hparams; -- const auto & vocab = model.vocab; - - const int64_t n_outputs_max = std::max(n_outputs, n_seq_max()); - - const auto n_batch = cparams.n_batch; -- const auto n_vocab = vocab.n_tokens(); -+ const auto n_vocab = hparams.n_vocab; - const auto n_embd = hparams.n_embd; - - // TODO: use a per-batch flag for logits presence instead -@@ -1682,7 +1684,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) { - { - LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__); - -- const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.vocab.n_tokens()); -+ const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.hparams.n_vocab); - - io.write(&logits_size, sizeof(logits_size)); - -@@ -2091,6 +2093,7 @@ llama_context_params llama_context_default_params() { - /*.flash_attn =*/ false, - /*.no_perf =*/ true, - /*.op_offload =*/ true, -+ /*.cross_attn =*/ false, - }; - - return result; -@@ -2216,6 +2219,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) { - ctx->set_warmup(warmup); - } - -+void llama_set_cross_attention(struct llama_context * ctx, bool cross_attention) { -+ ctx->set_cross_attn(cross_attention); -+} -+ - void llama_synchronize(llama_context * ctx) { - ctx->synchronize(); - } -diff --git a/src/llama-context.h b/src/llama-context.h -index c0ceacb1..c4ab242a 100644 ---- a/src/llama-context.h -+++ b/src/llama-context.h -@@ -71,6 +71,7 @@ struct llama_context { - void set_embeddings (bool value); - void set_causal_attn(bool value); - void set_warmup(bool value); -+ void set_cross_attn(bool value); - - void set_adapter_lora( - llama_adapter_lora * adapter, -diff --git a/src/llama-cparams.h b/src/llama-cparams.h -index 246fa577..7a6156ce 100644 ---- a/src/llama-cparams.h -+++ b/src/llama-cparams.h -@@ -31,6 +31,7 @@ struct llama_cparams { - bool no_perf; - bool warmup; - bool op_offload; -+ bool cross_attn; - - enum llama_pooling_type pooling_type; - -diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp -index b0e3f635..f14869cf 100644 ---- a/src/llama-graph.cpp -+++ b/src/llama-graph.cpp -@@ -532,6 +532,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) { - } - } - -+void llm_graph_input_cross_attn_state::set_input(const llama_ubatch * ubatch) { -+ if (ubatch->embd) { -+ ggml_backend_tensor_set(cross_attn_state, ubatch->embd, 0, ggml_nbytes(cross_attn_state)); -+ } -+} -+ - // - // llm_graph_context - // -@@ -1514,6 +1520,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const 
{ - return (llm_graph_input_attn_cross *) res->add_input(std::move(inp)); - } - -+ggml_tensor * llm_graph_context::build_inp_cross_attn_state() const { -+ const int64_t n_embd = hparams.n_embd; -+ -+ auto inp = std::make_unique(); -+ -+ ggml_tensor * cur = nullptr; -+ -+ inp->cross_attn_state = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd, 1601, 4); -+ ggml_set_input(inp->cross_attn_state); -+ -+ cur = inp->cross_attn_state; -+ -+ cb(cur, "inp_cross_attn_state", -1); -+ -+ res->add_input(std::move(inp)); -+ -+ return cur; -+} -+ - ggml_tensor * llm_graph_context::build_attn( - llm_graph_input_attn_cross * inp, - ggml_cgraph * gf, -diff --git a/src/llama-graph.h b/src/llama-graph.h -index 832a8c09..5a322785 100644 ---- a/src/llama-graph.h -+++ b/src/llama-graph.h -@@ -87,6 +87,7 @@ public: - - ggml_tensor * tokens = nullptr; // I32 [n_batch] - ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch] -+ ggml_tensor * cross_attn_state; // F32 [4, n_embd, 1061] - }; - - class llm_graph_input_pos : public llm_graph_input_i { -@@ -284,6 +285,16 @@ public: - const llama_cross * cross = nullptr; - }; - -+class llm_graph_input_cross_attn_state : public llm_graph_input_i { -+public: -+ llm_graph_input_cross_attn_state() = default; -+ virtual ~llm_graph_input_cross_attn_state() = default; -+ -+ void set_input(const llama_ubatch * ubatch) override; -+ -+ ggml_tensor * cross_attn_state; // F32 [4, n_embd, 1061] -+}; -+ - // - // llm_graph_result - // -@@ -495,6 +506,7 @@ struct llm_graph_context { - ggml_tensor * build_inp_cls() const; - ggml_tensor * build_inp_s_copy() const; - ggml_tensor * build_inp_s_mask() const; -+ ggml_tensor * build_inp_cross_attn_state() const; - - ggml_tensor * build_inp_cross_embd() const; - ggml_tensor * build_inp_pos_bucket_enc() const; -diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp -index 8a667960..6a02de03 100644 ---- a/src/llama-hparams.cpp -+++ b/src/llama-hparams.cpp -@@ -85,3 +85,7 @@ bool llama_hparams::is_swa(uint32_t il) const { - - GGML_ABORT("fatal error"); - } -+ -+bool llama_hparams::cross_attention_layers(uint32_t il) const { -+ return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end(); -+} -diff --git a/src/llama-hparams.h b/src/llama-hparams.h -index 48dce407..b6fc7e6d 100644 ---- a/src/llama-hparams.h -+++ b/src/llama-hparams.h -@@ -2,6 +2,8 @@ - - #include "llama.h" - -+#include -+ - #include - - // bump if necessary -@@ -42,6 +44,7 @@ struct llama_hparams { - uint32_t n_expert = 0; - uint32_t n_expert_used = 0; - uint32_t n_rel_attn_bkts = 0; -+ uint32_t n_vocab = 0; - - // note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA - uint32_t n_embd_head_k_mla = 0; -@@ -56,6 +59,7 @@ struct llama_hparams { - std::array n_ff_arr; - - std::array, 4> n_bskcn_arr = {}; -+ std::array cross_attn_layers; - - uint32_t n_layer_dense_lead = 0; - uint32_t n_lora_q = 0; -@@ -159,6 +163,9 @@ struct llama_hparams { - // Block skip connection - bool n_bskcn(uint32_t n, uint32_t il) const; - -+ // cross attention layers -+ bool cross_attention_layers(uint32_t il) const; -+ - bool is_swa(uint32_t il) const; - }; - -diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp -index 3dcad65b..a7b0a7eb 100644 ---- a/src/llama-kv-cache.cpp -+++ b/src/llama-kv-cache.cpp -@@ -100,8 +100,16 @@ llama_kv_cache_unified::llama_kv_cache_unified( - throw std::runtime_error("failed to create ggml context for kv cache"); - } - -- ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size); 
-- ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size); -+ ggml_tensor * k, *v; -+ -+ // for cross attention layers -+ if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layers(i)) { -+ k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i)); -+ v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i)); -+ } else { -+ k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size); -+ v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size); -+ } - ggml_format_name(k, "cache_k_l%d", i); - ggml_format_name(v, "cache_v_l%d", i); - k_l.push_back(k); -@@ -446,7 +454,7 @@ void llama_kv_cache_unified::set_full() { - llama_sbatch llama_kv_cache_unified::sbatch_init( - const llama_batch & batch, - bool logits_all) { -- return llama_sbatch(batch, hparams.n_embd, true, logits_all); -+ return llama_sbatch(batch, batch.n_embd, true, logits_all); - } - - llama_ubatch llama_kv_cache_unified::ubatch_next( -diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp -index 7f6617fa..2acfd4a8 100644 ---- a/src/llama-model-loader.cpp -+++ b/src/llama-model-loader.cpp -@@ -315,6 +315,8 @@ namespace GGUFMeta { - return true; - } - -+ template bool llama_model_loader::get_arr>(enum llm_kv kid, std::array& result, bool required); -+ - template - bool llama_model_loader::get_arr(const std::string & key, std::array & result, bool required) { - const int kid = gguf_find_key(meta.get(), key.c_str()); -diff --git a/src/llama-model.cpp b/src/llama-model.cpp -index 831b68c0..e8298f56 100644 ---- a/src/llama-model.cpp -+++ b/src/llama-model.cpp -@@ -433,6 +433,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { - - // get general kv - ml.get_key(LLM_KV_GENERAL_NAME, name, false); -+ ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false); - - // everything past this point is not vocab-related - if (hparams.vocab_only) { -@@ -444,6 +445,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { - ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer); - ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false); - ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false); -+ ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false); - - if (arch == LLM_ARCH_WAVTOKENIZER_DEC) { - ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features); -@@ -467,9 +469,11 @@ void llama_model::load_hparams(llama_model_loader & ml) { - std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0); - std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0); - std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0); -+ std::fill(hparams.cross_attn_layers.begin(), hparams.cross_attn_layers.end(), -1); - - ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false); - ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false); -+ ml.get_arr(LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, hparams.cross_attn_layers, false); - - // n_head_kv is optional, default to n_head - hparams.n_head_kv_arr = hparams.n_head_arr; -@@ -522,7 +526,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { - - ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false); - -- if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) { -+ if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_MLLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) { - if (hparams.n_rot != 
hparams.n_embd_head_k) { - throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k)); - } -@@ -585,6 +589,16 @@ void llama_model::load_hparams(llama_model_loader & ml) { - hparams.use_kq_norm = false; - } - } break; -+ case LLM_ARCH_MLLAMA: -+ { -+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); -+ -+ switch (hparams.n_layer) { -+ case 40: type = LLM_TYPE_11B; break; -+ case 100: type = LLM_TYPE_90B; break; -+ default: type = LLM_TYPE_UNKNOWN; -+ } -+ } break; - case LLM_ARCH_DECI: - { - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); -@@ -1581,7 +1595,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { - const int64_t n_embd_head_v = hparams.n_embd_head_v; - const int64_t n_ff = hparams.n_ff(); - const int64_t n_embd_gqa = n_embd_v_gqa; -- const int64_t n_vocab = vocab.n_tokens(); -+ const int64_t n_vocab = hparams.n_vocab; - const int64_t n_token_types = vocab.n_token_types(); - const int64_t n_rot = hparams.n_rot; - const int64_t n_expert = hparams.n_expert; -@@ -1840,6 +1854,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) { - } - } - } break; -+ case LLM_ARCH_MLLAMA: -+ { -+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab+8}, 0); -+ -+ // output -+ { -+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); -+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); -+ -+ // if output is NULL, init from the input tok embed -+ if (output == NULL) { -+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); -+ } -+ } -+ -+ for (int i = 0; i < n_layer; ++i) { -+ auto & layer = layers[i]; -+ -+ if (hparams.cross_attention_layers(i)) { -+ layer.cross_attn_k_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_NORM, "weight", i), {128}, 0); -+ layer.cross_attn_k_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_PROJ, "weight", i), {n_embd, 1024}, 0); -+ layer.cross_attn_o_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_O_PROJ, "weight", i), {n_embd, n_embd}, 0); -+ layer.cross_attn_q_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_NORM, "weight", i), {128}, 0); -+ layer.cross_attn_q_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_PROJ, "weight", i), {n_embd, n_embd}, 0); -+ layer.cross_attn_v_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_V_PROJ, "weight", i), {n_embd, 1024}, 0); -+ layer.cross_attn_attn_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_ATTN_GATE, i), {1}, 0); -+ layer.cross_attn_mlp_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_MLP_GATE, i), {1}, 0); -+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); -+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); -+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); -+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); -+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); -+ } else { -+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); -+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); -+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0); -+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0); -+ layer.wo = 
create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); -+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); -+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); -+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); -+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); -+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); -+ } -+ } -+ } break; - case LLM_ARCH_DECI: - { - tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); -@@ -4756,6 +4816,246 @@ struct llm_build_llama : public llm_graph_context { - } - }; - -+struct llm_build_mllama: public llm_graph_context { -+ llm_build_mllama(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { -+ // mutable variable, needed during the last layer of the computation to skip unused tokens -+ int32_t n_tokens = this->n_tokens; -+ -+ const int64_t n_embd_head = hparams.n_embd_head_v; -+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); -+ GGML_ASSERT(n_embd_head == hparams.n_rot); -+ -+ ggml_tensor * cur; -+ ggml_tensor * inpL; -+ ggml_tensor * inpCAS; -+ -+ inpL = build_inp_embd(model.tok_embd); -+ inpCAS = build_inp_cross_attn_state(); -+ -+ // inp_pos - contains the positions -+ ggml_tensor * inp_pos = build_inp_pos(); -+ -+ auto * inp_attn = build_attn_inp_kv_unified(); -+ const llama_kv_cache_unified * kv_self = static_cast(memory); -+ -+ for (int il = 0; il < n_layer; ++il) { -+ ggml_tensor * inpSA = inpL; -+ -+ // norm -+ cur = build_norm(inpL, -+ model.layers[il].attn_norm, NULL, -+ LLM_NORM_RMS, il); -+ cb(cur, "attn_norm", il); -+ -+ if (hparams.cross_attention_layers(il)) { -+ if (!ubatch.embd && !cparams.cross_attn) { -+ continue; -+ } -+ -+ // cross attention layer -+ ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_q_proj, cur); -+ cb(Qcur, "Qcur", il); -+ -+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); -+ cb(Qcur, "Qcur", il); -+ -+ Qcur = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 0, 2, 1, 3)); -+ cb(Qcur, "Qcur", il); -+ -+ Qcur = build_norm(Qcur, model.layers[il].cross_attn_q_norm, NULL, LLM_NORM_RMS, il); -+ cb(Qcur, "Qcur", il); -+ -+ ggml_tensor * Kcur, * Vcur; -+ if (ubatch.embd) { -+ Kcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_k_proj, inpCAS); -+ cb(Kcur, "Kcur", il); -+ -+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, 6404); -+ cb(Kcur, "Kcur", il); -+ -+ Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); -+ cb(Kcur, "Kcur", il); -+ -+ Kcur = build_norm(Kcur, model.layers[il].cross_attn_k_norm, NULL, LLM_NORM_RMS, il); -+ cb(Kcur, "Kcur", il); -+ -+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, kv_self->k_l[il])); -+ -+ Vcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_v_proj, inpCAS); -+ cb(Vcur, "Vcur", il); -+ -+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, 6404); -+ cb(Vcur, "Vcur", il); -+ -+ Vcur = ggml_permute(ctx0, Vcur, 0, 2, 1, 3); -+ cb(Vcur, "Vcur", il); -+ -+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, kv_self->v_l[il])); -+ } else { -+ Kcur = ggml_view_tensor(ctx0, kv_self->k_l[il]); -+ cb(Kcur, "Kcur (view)", il); -+ -+ Vcur = ggml_view_tensor(ctx0, kv_self->v_l[il]); -+ cb(Vcur, "Vcur (view)", il); -+ } -+ -+ struct 
ggml_tensor * kq = ggml_mul_mat(ctx0, Kcur, Qcur); -+ cb(kq, "kq", il); -+ -+ // TODO: apply causal masks -+ struct ggml_tensor * kq_soft_max = ggml_soft_max_ext(ctx0, kq, nullptr, 1.f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias); -+ cb(kq_soft_max, "kq_soft_max", il); -+ -+ Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, Vcur)); -+ cb(Vcur, "Vcur", il); -+ -+ struct ggml_tensor * kqv = ggml_mul_mat(ctx0, Vcur, kq_soft_max); -+ cb(kqv, "kqv", il); -+ -+ struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); -+ cb(kqv_merged, "kqv_merged", il); -+ -+ cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens); -+ cb(cur, "kqv_merged_cont", il); -+ -+ cur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_o_proj, cur); -+ cb(cur, "cur", il); -+ -+ // TODO: do this in place once? -+ cur = ggml_mul(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_attn_gate)); -+ -+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); -+ cb(ffn_inp, "ffn_inp", il); -+ -+ // feed-forward network -+ cur = build_norm(ffn_inp, -+ model.layers[il].ffn_norm, NULL, -+ LLM_NORM_RMS, il); -+ cb(cur, "ffn_norm", il); -+ -+ cur = build_ffn(cur, -+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, -+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, -+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, -+ NULL, -+ LLM_FFN_SILU, LLM_FFN_PAR, il); -+ cb(cur, "ffn_out", il); -+ -+ // TODO: do this inplace once? -+ cur = ggml_add_inplace(ctx0, ggml_mul_inplace(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_mlp_gate)), ffn_inp); -+ cb(cur, "ffn_out", il); -+ -+ cur = build_cvec(cur, il); -+ cb(cur, "l_out", il); -+ -+ // input for next layer -+ inpL = cur; -+ } else { -+ // self attention layer -+ -+ // rope freq factors for llama3; may return nullptr for llama2 and other models -+ ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il); -+ -+ // compute Q and K and RoPE them -+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); -+ cb(Qcur, "Qcur", il); -+ if (model.layers[il].bq) { -+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); -+ cb(Qcur, "Qcur", il); -+ } -+ -+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); -+ cb(Kcur, "Kcur", il); -+ if (model.layers[il].bk) { -+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); -+ cb(Kcur, "Kcur", il); -+ } -+ -+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); -+ cb(Vcur, "Vcur", il); -+ if (model.layers[il].bv) { -+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); -+ cb(Vcur, "Vcur", il); -+ } -+ -+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); -+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); -+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); -+ -+ Qcur = ggml_rope_ext( -+ ctx0, Qcur, inp_pos, rope_factors, -+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, -+ ext_factor, attn_factor, beta_fast, beta_slow -+ ); -+ -+ Kcur = ggml_rope_ext( -+ ctx0, Kcur, inp_pos, rope_factors, -+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, -+ ext_factor, attn_factor, beta_fast, beta_slow -+ ); -+ -+ cb(Qcur, "Qcur", il); -+ cb(Kcur, "Kcur", il); -+ cb(Vcur, "Vcur", il); -+ -+ cur = build_attn(inp_attn, gf, -+ model.layers[il].wo, model.layers[il].bo, -+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); -+ -+ if (il == n_layer - 1) { -+ // skip computing output for unused tokens -+ struct ggml_tensor * inp_out_ids = build_inp_out_ids(); -+ n_tokens = n_outputs; -+ cur 
= ggml_get_rows(ctx0, cur, inp_out_ids); -+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); -+ } -+ -+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); -+ cb(ffn_inp, "ffn_inp", il); -+ -+ // feed-forward network -+ cur = build_norm(ffn_inp, -+ model.layers[il].ffn_norm, NULL, -+ LLM_NORM_RMS, il); -+ cb(cur, "ffn_norm", il); -+ -+ cur = build_ffn(cur, -+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, -+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, -+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, -+ NULL, -+ LLM_FFN_SILU, LLM_FFN_PAR, il); -+ cb(cur, "ffn_out", il); -+ -+ cur = ggml_add(ctx0, cur, ffn_inp); -+ cb(cur, "ffn_out", il); -+ -+ cur = build_cvec(cur, il); -+ cb(cur, "l_out", il); -+ -+ // input for next layer -+ inpL = cur; -+ } -+ } -+ -+ cur = inpL; -+ -+ cur = build_norm(cur, -+ model.output_norm, NULL, -+ LLM_NORM_RMS, -1); -+ cb(cur, "result_norm", -1); -+ res->t_embd = cur; -+ -+ // lm_head -+ cur = build_lora_mm(model.output, cur); -+ -+ cb(cur, "result_output", -1); -+ res->t_logits = cur; -+ -+ ggml_build_forward_expand(gf, cur); -+ } -+}; -+ - struct llm_build_deci : public llm_graph_context { - llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; -@@ -12496,7 +12796,7 @@ struct llm_build_solar : public llm_graph_context { - // self-attention - { - // rope freq factors for llama3; may return nullptr for llama2 and other models -- ggml_tensor * rope_factors = static_cast(memory)->cbs.get_rope_factors(n_ctx_per_seq, il); -+ ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il); - - // compute Q and K and RoPE them - ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); -@@ -13128,6 +13428,10 @@ llm_graph_result_ptr llama_model::build_graph( - { - llm = std::make_unique(*this, params, gf); - } break; -+ case LLM_ARCH_MLLAMA: -+ { -+ llm = std::make_unique(*this, params, gf); -+ } break; - case LLM_ARCH_DECI: - { - llm = std::make_unique(*this, params, gf); -@@ -13489,6 +13793,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { - // use what we call a normal RoPE, operating on pairs of consecutive head values - case LLM_ARCH_LLAMA: - case LLM_ARCH_LLAMA4: -+ case LLM_ARCH_MLLAMA: - case LLM_ARCH_DECI: - case LLM_ARCH_BAICHUAN: - case LLM_ARCH_STARCODER: -diff --git a/src/llama-model.h b/src/llama-model.h -index 43746c7d..9281e629 100644 ---- a/src/llama-model.h -+++ b/src/llama-model.h -@@ -11,6 +11,7 @@ - #include - #include - #include -+#include - - struct llama_cparams; - struct llama_ubatch; -@@ -74,6 +75,7 @@ enum llm_type { - LLM_TYPE_40B, - LLM_TYPE_65B, - LLM_TYPE_70B, -+ LLM_TYPE_90B, - LLM_TYPE_236B, - LLM_TYPE_290B, - LLM_TYPE_314B, -@@ -318,6 +320,16 @@ struct llama_layer { - - struct ggml_tensor * bskcn_tv = nullptr; - -+ // cross attention -+ struct ggml_tensor * cross_attn_k_norm = nullptr; -+ struct ggml_tensor * cross_attn_k_proj = nullptr; -+ struct ggml_tensor * cross_attn_o_proj = nullptr; -+ struct ggml_tensor * cross_attn_q_norm = nullptr; -+ struct ggml_tensor * cross_attn_q_proj = nullptr; -+ struct ggml_tensor * cross_attn_v_proj = nullptr; -+ struct ggml_tensor * cross_attn_attn_gate = nullptr; -+ struct ggml_tensor * cross_attn_mlp_gate = nullptr; -+ - struct llama_layer_posnet posnet; - - struct llama_layer_convnext convnext; -diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp -index 820d5128..56531980 100644 ---- 
a/src/llama-quant.cpp -+++ b/src/llama-quant.cpp -@@ -639,7 +639,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: - if (llama_model_has_encoder(&model)) { - n_attn_layer *= 3; - } -- GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected"); -+ if (qs.n_attention_wv != n_attn_layer) { -+ LLAMA_LOG_WARN("%s: n_attention_wv is unexpected, expected: %d, found: %d\n", __func__, n_attn_layer, qs.n_attention_wv); -+ } - } - - size_t total_size_org = 0; -diff --git a/tools/mtmd/llava.cpp b/tools/mtmd/llava.cpp -index ebef8b3c..b0eb79bb 100644 ---- a/tools/mtmd/llava.cpp -+++ b/tools/mtmd/llava.cpp -@@ -462,7 +462,7 @@ struct llava_embd_batch { - std::vector seq_ids; - std::vector logits; - llama_batch batch; -- llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { -+ llava_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { - pos .resize(n_tokens); - n_seq_id.resize(n_tokens); - seq_ids .resize(n_tokens + 1); -@@ -474,6 +474,7 @@ struct llava_embd_batch { - /*n_tokens =*/ n_tokens, - /*tokens =*/ nullptr, - /*embd =*/ embd, -+ /*n_embd =*/ n_embd, - /*pos =*/ pos.data(), - /*n_seq_id =*/ n_seq_id.data(), - /*seq_id =*/ seq_ids.data(), -@@ -497,7 +498,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_ - n_eval = n_batch; - } - float * embd = image_embed->embed+i*n_embd; -- llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0); -+ llava_embd_batch llava_batch = llava_embd_batch(embd, n_embd, n_eval, *n_past, 0); - if (llama_decode(ctx_llama, llava_batch.batch)) { - LOG_ERR("%s : failed to eval\n", __func__); - return false; -diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp -index 7a328867..61ebdd43 100644 ---- a/tools/mtmd/mtmd-helper.cpp -+++ b/tools/mtmd/mtmd-helper.cpp -@@ -58,7 +58,7 @@ struct decode_embd_batch { - std::vector seq_ids; - std::vector logits; - llama_batch batch; -- decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) { -+ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) { - pos .resize(n_tokens * n_pos_per_embd); - n_seq_id.resize(n_tokens); - seq_ids .resize(n_tokens + 1); -@@ -69,6 +69,7 @@ struct decode_embd_batch { - /*n_tokens =*/ n_tokens, - /*tokens =*/ nullptr, - /*embd =*/ embd, -+ /*n_embd =*/ n_embd, - /*pos =*/ pos.data(), - /*n_seq_id =*/ n_seq_id.data(), - /*seq_id =*/ seq_ids.data(), -@@ -131,6 +132,7 @@ struct decode_embd_batch { - /*n_tokens =*/ n_tokens, - /*tokens =*/ nullptr, - /*embd =*/ batch.embd + offset * n_mmproj_embd, -+ /*n_embd =*/ batch.n_embd, - /*pos =*/ pos_ptr, - /*n_seq_id =*/ batch.n_seq_id + offset, - /*seq_id =*/ batch.seq_id + offset, -@@ -166,7 +168,8 @@ int32_t mtmd_helper_decode_image_chunk( - int32_t n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens); - int32_t i_batch = 0; - int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch; -- decode_embd_batch batch_embd(encoded_embd, n_tokens, n_pos_per_embd, n_mmproj_embd); -+ int n_embd = llama_model_n_embd(llama_get_model(lctx)); -+ decode_embd_batch batch_embd(encoded_embd, n_embd, n_tokens, n_past, seq_id); - - const int nx = mtmd_image_tokens_get_nx(image_tokens); - const int ny = mtmd_image_tokens_get_ny(image_tokens); diff --git 
a/llama/patches/0008-fix-deepseek-deseret-regex.patch b/llama/patches/0006-fix-deepseek-deseret-regex.patch similarity index 100% rename from llama/patches/0008-fix-deepseek-deseret-regex.patch rename to llama/patches/0006-fix-deepseek-deseret-regex.patch diff --git a/llama/patches/0007-add-unpad-operator.patch b/llama/patches/0007-add-unpad-operator.patch deleted file mode 100644 index fc45aeff..00000000 --- a/llama/patches/0007-add-unpad-operator.patch +++ /dev/null @@ -1,419 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: jmorganca -Date: Sun, 13 Apr 2025 22:10:06 -0400 -Subject: [PATCH] add unpad operator - -adds the unpad operator to GGML ---- - ggml/include/ggml.h | 10 +++++ - ggml/src/ggml-cpu/ggml-cpu.c | 5 +++ - ggml/src/ggml-cpu/ops.cpp | 55 ++++++++++++++++++++++++++++ - ggml/src/ggml-cpu/ops.h | 1 + - ggml/src/ggml-cuda/ggml-cuda.cu | 4 ++ - ggml/src/ggml-cuda/pad.cu | 46 +++++++++++++++++++++++ - ggml/src/ggml-cuda/pad.cuh | 1 + - ggml/src/ggml-metal/ggml-metal.m | 33 +++++++++++++++++ - ggml/src/ggml-metal/ggml-metal.metal | 45 +++++++++++++++++++++++ - ggml/src/ggml.c | 25 ++++++++++++- - 10 files changed, 223 insertions(+), 2 deletions(-) - -diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h -index e91dedf1..8dc107ba 100644 ---- a/ggml/include/ggml.h -+++ b/ggml/include/ggml.h -@@ -489,6 +489,7 @@ extern "C" { - GGML_OP_UPSCALE, // nearest interpolate - GGML_OP_PAD, - GGML_OP_PAD_REFLECT_1D, -+ GGML_OP_UNPAD, - GGML_OP_ARANGE, - GGML_OP_TIMESTEP_EMBEDDING, - GGML_OP_ARGSORT, -@@ -1781,6 +1782,15 @@ extern "C" { - int p0, - int p1); - -+ // unpad each dimension: [x, ..., x, y, ..., y] -> [x, ..., x] -+ GGML_API struct ggml_tensor * ggml_unpad( -+ struct ggml_context * ctx, -+ struct ggml_tensor * a, -+ int p0, -+ int p1, -+ int p2, -+ int p3); -+ - // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151 - // timesteps: [N,] - // return: [N, dim] -diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c -index a30e67f2..835e6495 100644 ---- a/ggml/src/ggml-cpu/ggml-cpu.c -+++ b/ggml/src/ggml-cpu/ggml-cpu.c -@@ -1951,6 +1951,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm - { - ggml_compute_forward_pad_reflect_1d(params, tensor); - } break; -+ case GGML_OP_UNPAD: -+ { -+ ggml_compute_forward_unpad(params, tensor); -+ } break; - case GGML_OP_ARANGE: - { - ggml_compute_forward_arange(params, tensor); -@@ -2274,6 +2278,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { - case GGML_OP_UPSCALE: - case GGML_OP_PAD: - case GGML_OP_PAD_REFLECT_1D: -+ case GGML_OP_UNPAD: - case GGML_OP_ARANGE: - case GGML_OP_TIMESTEP_EMBEDDING: - case GGML_OP_ARGSORT: -diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp -index 955fec59..1868a10c 100644 ---- a/ggml/src/ggml-cpu/ops.cpp -+++ b/ggml/src/ggml-cpu/ops.cpp -@@ -6690,6 +6690,61 @@ void ggml_compute_forward_pad_reflect_1d( - } - } - -+// ggml_compute_forward_unpad -+ -+static void ggml_compute_forward_unpad_f32( -+ const struct ggml_compute_params *params, -+ struct ggml_tensor *dst) { -+ -+ const struct ggml_tensor * src0 = dst->src[0]; -+ -+ GGML_ASSERT(src0->nb[0] == sizeof(float)); -+ GGML_ASSERT( dst->nb[0] == sizeof(float)); -+ -+ const int ith = params->ith; -+ const int nth = params->nth; -+ -+ GGML_TENSOR_UNARY_OP_LOCALS -+ -+ float * dst_ptr = (float *) dst->data; -+ -+ // TODO: optimize -+ -+ for (int64_t i2 = 0; i2 < ne2; ++i2) { -+ for (int64_t i1 
= ith; i1 < ne1; i1 += nth) { -+ for (int64_t i0 = 0; i0 < ne0; ++i0) { -+ for (int64_t i3 = 0; i3 < ne3; ++i3) { -+ const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0; -+ -+ const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); -+ -+ if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { -+ dst_ptr[dst_idx] = *src_ptr; -+ } -+ } -+ } -+ } -+ } -+} -+ -+void ggml_compute_forward_unpad( -+ const struct ggml_compute_params * params, -+ struct ggml_tensor * dst) { -+ -+ const struct ggml_tensor * src0 = dst->src[0]; -+ -+ switch (src0->type) { -+ case GGML_TYPE_F32: -+ { -+ ggml_compute_forward_unpad_f32(params, dst); -+ } break; -+ default: -+ { -+ GGML_ABORT("fatal error"); -+ } -+ } -+} -+ - // ggml_compute_forward_arange - - static void ggml_compute_forward_arange_f32( -diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h -index dc081b9e..a7125555 100644 ---- a/ggml/src/ggml-cpu/ops.h -+++ b/ggml/src/ggml-cpu/ops.h -@@ -72,6 +72,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params - void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst); - void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst); - void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst); -+void ggml_compute_forward_unpad(const struct ggml_compute_params * params, struct ggml_tensor * dst); - void ggml_compute_forward_arange(const struct ggml_compute_params * params, struct ggml_tensor * dst); - void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst); - void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst); -diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index cb0d8528..6fe86674 100644 ---- a/ggml/src/ggml-cuda/ggml-cuda.cu -+++ b/ggml/src/ggml-cuda/ggml-cuda.cu -@@ -2238,6 +2238,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg - case GGML_OP_PAD: - ggml_cuda_op_pad(ctx, dst); - break; -+ case GGML_OP_UNPAD: -+ ggml_cuda_op_unpad(ctx, dst); -+ break; - case GGML_OP_ARANGE: - ggml_cuda_op_arange(ctx, dst); - break; -@@ -3212,6 +3215,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g - case GGML_OP_UPSCALE: - return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST; - case GGML_OP_PAD: -+ case GGML_OP_UNPAD: - case GGML_OP_ARANGE: - case GGML_OP_TIMESTEP_EMBEDDING: - case GGML_OP_LEAKY_RELU: -diff --git a/ggml/src/ggml-cuda/pad.cu b/ggml/src/ggml-cuda/pad.cu -index 77432b04..7d45a7e1 100644 ---- a/ggml/src/ggml-cuda/pad.cu -+++ b/ggml/src/ggml-cuda/pad.cu -@@ -47,3 +47,49 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], - dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream); - } -+ -+static __global__ void unpad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) { -+ // blockIdx.z: idx of ne2*ne3, aka ne02*ne03 -+ // blockIdx.y: idx of ne1 -+ // blockIDx.x: idx of ne0 / BLOCK_SIZE -+ int nidx = threadIdx.x + blockIdx.x * blockDim.x; -+ if (nidx >= ne0) { -+ return; -+ } -+ -+ // operation -+ int offset_dst = -+ nidx + -+ blockIdx.y * ne0 + -+ blockIdx.z * ne0 * gridDim.y; -+ if (nidx < ne00 
&& blockIdx.y < ne01 && blockIdx.z < ne02*ne03) { -+ int offset_src = -+ nidx + -+ blockIdx.y * ne00 + -+ blockIdx.z * ne00 * ne01; -+ dst[offset_dst] = x[offset_src]; -+ } -+} -+ -+static void unpad_f32_cuda(const float * x, float * dst, -+ const int ne00, const int ne01, const int ne02, const int ne03, -+ const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) { -+ int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE; -+ dim3 gridDim(num_blocks, ne1, ne2*ne3); -+ unpad_f32<<>>(x, dst, ne0, ne00, ne01, ne02, ne03); -+} -+ -+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { -+ const ggml_tensor * src0 = dst->src[0]; -+ const float * src0_d = (const float *)src0->data; -+ float * dst_d = (float *)dst->data; -+ cudaStream_t stream = ctx.stream(); -+ -+ GGML_ASSERT(src0->type == GGML_TYPE_F32); -+ GGML_ASSERT(dst->type == GGML_TYPE_F32); -+ GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors -+ -+ unpad_f32_cuda(src0_d, dst_d, -+ src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], -+ dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream); -+} -\ No newline at end of file -diff --git a/ggml/src/ggml-cuda/pad.cuh b/ggml/src/ggml-cuda/pad.cuh -index 8fd386b0..e2ededc3 100644 ---- a/ggml/src/ggml-cuda/pad.cuh -+++ b/ggml/src/ggml-cuda/pad.cuh -@@ -3,3 +3,4 @@ - #define CUDA_PAD_BLOCK_SIZE 256 - - void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); -+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); -diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m -index 1b56f858..7641247e 100644 ---- a/ggml/src/ggml-metal/ggml-metal.m -+++ b/ggml/src/ggml-metal/ggml-metal.m -@@ -347,6 +347,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte - GGML_METAL_KERNEL_TYPE_UPSCALE_F32, - GGML_METAL_KERNEL_TYPE_PAD_F32, - GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, -+ GGML_METAL_KERNEL_TYPE_UNPAD_F32, - GGML_METAL_KERNEL_TYPE_ARANGE_F32, - GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, - GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, -@@ -1294,6 +1295,7 @@ @implementation GGMLMetalClass - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true); -+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UNPAD_F32, unpad_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true); -@@ -1655,6 +1657,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex - case GGML_OP_POOL_2D: - case GGML_OP_PAD: - case GGML_OP_PAD_REFLECT_1D: -+ case GGML_OP_UNPAD: - case GGML_OP_TIMESTEP_EMBEDDING: - case GGML_OP_ARGSORT: - case GGML_OP_LEAKY_RELU: -@@ -4184,6 +4187,36 @@ static bool ggml_metal_encode_node( - - const int nth = MIN(1024, ne0); - -+ [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; -+ } break; -+ case GGML_OP_UNPAD: -+ { -+ GGML_ASSERT(src0->type == GGML_TYPE_F32); -+ -+ id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UNPAD_F32].pipeline; -+ -+ [encoder setComputePipelineState:pipeline]; -+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; 
-+ [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; -+ [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; -+ [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; -+ [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; -+ [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5]; -+ [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; -+ [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; -+ [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; -+ [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9]; -+ [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10]; -+ [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11]; -+ [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12]; -+ [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13]; -+ [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14]; -+ [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15]; -+ [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16]; -+ [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17]; -+ -+ const int nth = MIN(1024, ne0); -+ - [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_ARANGE: -diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal -index 9cfddf45..080a943b 100644 ---- a/ggml/src/ggml-metal/ggml-metal.metal -+++ b/ggml/src/ggml-metal/ggml-metal.metal -@@ -3121,6 +3121,51 @@ kernel void kernel_pad_reflect_1d_f32( - } - } - -+kernel void kernel_unpad_f32( -+ device const char * src0, -+ device char * dst, -+ constant int64_t & ne00, -+ constant int64_t & ne01, -+ constant int64_t & ne02, -+ constant int64_t & ne03, -+ constant uint64_t & nb00, -+ constant uint64_t & nb01, -+ constant uint64_t & nb02, -+ constant uint64_t & nb03, -+ constant int64_t & ne0, -+ constant int64_t & ne1, -+ constant int64_t & ne2, -+ constant int64_t & ne3, -+ constant uint64_t & nb0, -+ constant uint64_t & nb1, -+ constant uint64_t & nb2, -+ constant uint64_t & nb3, -+ uint3 tgpig[[threadgroup_position_in_grid]], -+ uint3 tpitg[[thread_position_in_threadgroup]], -+ uint3 ntg[[threads_per_threadgroup]]) { -+ -+ const int64_t i3 = tgpig.z; -+ const int64_t i2 = tgpig.y; -+ const int64_t i1 = tgpig.x; -+ -+ const int64_t i03 = i3; -+ const int64_t i02 = i2; -+ const int64_t i01 = i1; -+ -+ device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01); -+ device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1); -+ -+ if (i1 < ne01 && i2 < ne02 && i3 < ne03) { -+ for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) { -+ if (i0 < ne00) { -+ dst_ptr[i0] = src0_ptr[i0]; -+ } -+ } -+ -+ return; -+ } -+} -+ - kernel void kernel_arange_f32( - device char * dst, - constant ggml_metal_kargs_arange & args, -diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c -index 8a654624..6b034d35 100644 ---- a/ggml/src/ggml.c -+++ b/ggml/src/ggml.c -@@ -923,6 +923,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { - "UPSCALE", - "PAD", - "PAD_REFLECT_1D", -+ "UNPAD", - "ARANGE", - "TIMESTEP_EMBEDDING", - "ARGSORT", -@@ -953,7 +954,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { - "OPT_STEP_ADAMW", - }; - --static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); -+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83"); - - static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { - "none", -@@ -1018,6 +1019,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { - "upscale(x)", - "pad(x)", - "pad_reflect_1d(x)", -+ "unpad(x)", - "arange(start, stop, step)", - 
"timestep_embedding(timesteps, dim, max_period)", - "argsort(x)", -@@ -1048,7 +1050,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { - "adamw(x)", - }; - --static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); -+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83"); - - static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); - -@@ -4274,6 +4276,25 @@ struct ggml_tensor * ggml_pad_reflect_1d( - return result; - } - -+// ggml_unpad -+ -+struct ggml_tensor * ggml_unpad( -+ struct ggml_context * ctx, -+ struct ggml_tensor * a, -+ int p0, int p1, int p2, int p3) { -+ -+ struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, -+ a->ne[0] - p0, -+ a->ne[1] - p1, -+ a->ne[2] - p2, -+ a->ne[3] - p3); -+ -+ result->op = GGML_OP_UNPAD; -+ result->src[0] = a; -+ -+ return result; -+} -+ - // ggml_arange - - struct ggml_tensor * ggml_arange( diff --git a/llama/patches/0009-maintain-ordering-for-rules-for-grammar.patch b/llama/patches/0007-maintain-ordering-for-rules-for-grammar.patch similarity index 100% rename from llama/patches/0009-maintain-ordering-for-rules-for-grammar.patch rename to llama/patches/0007-maintain-ordering-for-rules-for-grammar.patch diff --git a/llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch b/llama/patches/0008-ensure-KV-cache-is-fully-defragmented.patch similarity index 94% rename from llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch rename to llama/patches/0008-ensure-KV-cache-is-fully-defragmented.patch index c5faeaaa..52116ce3 100644 --- a/llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch +++ b/llama/patches/0008-ensure-KV-cache-is-fully-defragmented.patch @@ -58,7 +58,7 @@ index c22687e4..c5948e8f 100644 auto * gf = graph_init(); diff --git a/src/llama-context.h b/src/llama-context.h -index c4ab242a..9970dfc6 100644 +index c0ceacb1..0264e937 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -5,6 +5,7 @@ @@ -70,10 +70,10 @@ index c4ab242a..9970dfc6 100644 #include "ggml-cpp.h" #include "ggml-opt.h" diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp -index a7b0a7eb..1a50c034 100644 +index 3dcad65b..60e67b03 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp -@@ -372,8 +372,6 @@ void llama_kv_cache_unified::commit() { +@@ -364,8 +364,6 @@ void llama_kv_cache_unified::commit() { } bool llama_kv_cache_unified::update(llama_context & lctx) { @@ -82,7 +82,7 @@ index a7b0a7eb..1a50c034 100644 auto * sched = lctx.get_sched(); if (has_shift) { -@@ -396,8 +394,6 @@ bool llama_kv_cache_unified::update(llama_context & lctx) { +@@ -388,8 +386,6 @@ bool llama_kv_cache_unified::update(llama_context & lctx) { res->set_inputs(nullptr); lctx.graph_compute(gf, false); @@ -91,7 +91,7 @@ index a7b0a7eb..1a50c034 100644 } { -@@ -411,27 +407,36 @@ bool llama_kv_cache_unified::update(llama_context & lctx) { +@@ -403,27 +399,36 @@ bool llama_kv_cache_unified::update(llama_context & lctx) { if (do_defrag) { LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__); @@ -133,7 +133,7 @@ index a7b0a7eb..1a50c034 100644 } void llama_kv_cache_unified::defrag_sched(float thold) { -@@ -715,11 +720,10 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift( +@@ -707,11 +712,10 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift( llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag( const llama_cparams & cparams, ggml_context * ctx, @@ -147,7 +147,7 @@ index a7b0a7eb..1a50c034 100644 #if 0 // CPU defrag // -@@ -791,32 +795,20 @@ llm_graph_result_ptr 
llama_kv_cache_unified::build_graph_defrag( +@@ -783,32 +787,20 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag( ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size()); } #else @@ -185,7 +185,7 @@ index a7b0a7eb..1a50c034 100644 ggml_tensor * view_v_src; ggml_tensor * view_v_dst; -@@ -824,31 +816,29 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag( +@@ -816,31 +808,29 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag( if (cparams.flash_attn) { // NOTE: the V cache is not transposed when using flash attention view_v_src = ggml_view_2d(ctx, v_l[il], @@ -225,7 +225,7 @@ index a7b0a7eb..1a50c034 100644 } //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes); -@@ -865,17 +855,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { +@@ -857,17 +847,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { assert(n_used <= n_kv); @@ -244,7 +244,7 @@ index a7b0a7eb..1a50c034 100644 // determine which KV cells to move where // -@@ -883,10 +863,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { +@@ -875,10 +855,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { // // if ids[i] == i || ids[i] == n_kv, then cell i is not moved // @@ -256,7 +256,7 @@ index a7b0a7eb..1a50c034 100644 for (uint32_t i0 = 0; i0 < n_used; ++i0) { const auto & cell0 = cells[i0]; -@@ -935,19 +912,11 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { +@@ -927,19 +904,11 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { // are we moving a continuous block of memory? bool cont = false; @@ -276,7 +276,7 @@ index a7b0a7eb..1a50c034 100644 cont = false; continue; } -@@ -963,8 +932,10 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { +@@ -955,8 +924,10 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { head = n_used; if (!cont) { @@ -288,7 +288,7 @@ index a7b0a7eb..1a50c034 100644 } nf++; -@@ -974,22 +945,16 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { +@@ -966,22 +937,16 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { } } diff --git a/llama/patches/0011-sort-devices-by-score.patch b/llama/patches/0009-sort-devices-by-score.patch similarity index 99% rename from llama/patches/0011-sort-devices-by-score.patch rename to llama/patches/0009-sort-devices-by-score.patch index 8c3908cf..e27d1ae9 100644 --- a/llama/patches/0011-sort-devices-by-score.patch +++ b/llama/patches/0009-sort-devices-by-score.patch @@ -11,7 +11,7 @@ with the fastest acceleration is loaded 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp -index 82ae1b5b..1487f322 100644 +index 405d8e31..4e67d243 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -157,7 +157,7 @@ struct ggml_backend_reg_entry { diff --git a/llama/patches/0012-add-phony-target-ggml-cpu-for-all-cpu-variants.patch b/llama/patches/0010-add-phony-target-ggml-cpu-for-all-cpu-variants.patch similarity index 100% rename from llama/patches/0012-add-phony-target-ggml-cpu-for-all-cpu-variants.patch rename to llama/patches/0010-add-phony-target-ggml-cpu-for-all-cpu-variants.patch diff --git a/llama/patches/0013-remove-amx.patch b/llama/patches/0011-remove-amx.patch similarity index 100% rename from llama/patches/0013-remove-amx.patch rename to llama/patches/0011-remove-amx.patch diff --git a/llama/patches/0014-fix-string-arr-kv-loading.patch 
b/llama/patches/0012-fix-string-arr-kv-loading.patch similarity index 100% rename from llama/patches/0014-fix-string-arr-kv-loading.patch rename to llama/patches/0012-fix-string-arr-kv-loading.patch diff --git a/llama/patches/0015-ollama-debug-tensor.patch b/llama/patches/0013-ollama-debug-tensor.patch similarity index 91% rename from llama/patches/0015-ollama-debug-tensor.patch rename to llama/patches/0013-ollama-debug-tensor.patch index d8f9fc8a..53d91127 100644 --- a/llama/patches/0015-ollama-debug-tensor.patch +++ b/llama/patches/0013-ollama-debug-tensor.patch @@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor 1 file changed, 6 insertions(+) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c -index 835e6495..3902894b 100644 +index a30e67f2..2462d2b8 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -15,6 +15,8 @@ @@ -20,7 +20,7 @@ index 835e6495..3902894b 100644 #if defined(_MSC_VER) || defined(__MINGW32__) #include // using malloc.h with MSC/MINGW #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) -@@ -2846,6 +2848,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { +@@ -2841,6 +2843,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { ggml_compute_forward(¶ms, node); diff --git a/llama/patches/0016-add-ollama-vocab-for-grammar-support.patch b/llama/patches/0014-add-ollama-vocab-for-grammar-support.patch similarity index 100% rename from llama/patches/0016-add-ollama-vocab-for-grammar-support.patch rename to llama/patches/0014-add-ollama-vocab-for-grammar-support.patch diff --git a/llm/memory.go b/llm/memory.go index 76082bf7..b5a8dd5c 100644 --- a/llm/memory.go +++ b/llm/memory.go @@ -111,9 +111,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList) for _, projector := range projectors { - weight, graph := projectorMemoryRequirements(projector) + weight := projectorMemoryRequirements(projector) projectorWeights += weight - projectorGraph += graph // multimodal models require at least 2048 context opts.NumCtx = max(opts.NumCtx, 2048) @@ -409,51 +408,21 @@ func (m MemoryEstimate) LogValue() slog.Value { return slog.GroupValue(attrs...) 
} -func projectorMemoryRequirements(filename string) (weights, graphSize uint64) { +func projectorMemoryRequirements(filename string) (weights uint64) { file, err := os.Open(filename) if err != nil { - return 0, 0 + return 0 } defer file.Close() ggml, _, err := ggml.Decode(file, 1024) if err != nil { - return 0, 0 + return 0 } for _, layer := range ggml.Tensors().GroupLayers() { weights += layer.Size() } - switch arch := ggml.KV().Architecture(); arch { - case "mllama": - kv := func(n string) uint64 { - if v, ok := ggml.KV()[arch+".vision."+n].(uint32); ok { - return uint64(v) - } - - return 0 - } - - imageSize := kv("image_size") - - maxNumTiles := kv("max_num_tiles") - embeddingLength := kv("embedding_length") - headCount := kv("attention.head_count") - - numPatches := (imageSize / kv("patch_size")) * (imageSize / kv("patch_size")) - if _, ok := ggml.Tensors().GroupLayers()["v"]["class_embd"]; ok { - numPatches++ - } - - numPaddedPatches := numPatches + 8 - (numPatches%8)%8 - - graphSize = 4 * (8 + - imageSize*imageSize*kv("num_channels")*maxNumTiles + - embeddingLength*numPatches*maxNumTiles + - 9*embeddingLength*numPaddedPatches*maxNumTiles + - numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount) - } - - return weights, graphSize + return weights } diff --git a/llm/server.go b/llm/server.go index a64669c2..c07315fa 100644 --- a/llm/server.go +++ b/llm/server.go @@ -679,9 +679,8 @@ ws ::= ([ \t\n] ws)? const maxBufferSize = 512 * format.KiloByte type ImageData struct { - Data []byte `json:"data"` - ID int `json:"id"` - AspectRatioID int `json:"aspect_ratio_id"` + Data []byte `json:"data"` + ID int `json:"id"` } type CompletionRequest struct { diff --git a/ml/backend.go b/ml/backend.go index ba24ecb4..f84a9984 100644 --- a/ml/backend.go +++ b/ml/backend.go @@ -161,7 +161,6 @@ type Tensor interface { Set(ctx Context, t2 Tensor, offset int, strides ...int) Tensor Pad(ctx Context, shape ...int) Tensor - Unpad(ctx Context, shape ...int) Tensor Stack(ctx Context, dim int, s ...Tensor) Tensor diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index e97795a6..e1aa687c 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -1017,17 +1017,6 @@ func (t *Tensor) Sigmoid(ctx ml.Context) ml.Tensor { } } -func (t *Tensor) Unpad(ctx ml.Context, shape ...int) ml.Tensor { - if len(shape) != 4 { - panic("expected 4 dimensions") - } - - return &Tensor{ - b: t.b, - t: C.ggml_unpad(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])), - } -} - func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor { switch len(shape) { case 1: diff --git a/ml/backend/ggml/ggml/include/ggml.h b/ml/backend/ggml/ggml/include/ggml.h index 8dc107ba..e91dedf1 100644 --- a/ml/backend/ggml/ggml/include/ggml.h +++ b/ml/backend/ggml/ggml/include/ggml.h @@ -489,7 +489,6 @@ extern "C" { GGML_OP_UPSCALE, // nearest interpolate GGML_OP_PAD, GGML_OP_PAD_REFLECT_1D, - GGML_OP_UNPAD, GGML_OP_ARANGE, GGML_OP_TIMESTEP_EMBEDDING, GGML_OP_ARGSORT, @@ -1782,15 +1781,6 @@ extern "C" { int p0, int p1); - // unpad each dimension: [x, ..., x, y, ..., y] -> [x, ..., x] - GGML_API struct ggml_tensor * ggml_unpad( - struct ggml_context * ctx, - struct ggml_tensor * a, - int p0, - int p1, - int p2, - int p3); - // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151 // timesteps: [N,] // return: [N, dim] diff --git a/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp 
b/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp index 1487f322..4e67d243 100644 --- a/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp +++ b/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp @@ -178,9 +178,9 @@ struct ggml_backend_registry { #ifdef GGML_USE_CANN register_backend(ggml_backend_cann_reg()); #endif -// #ifdef GGML_USE_BLAS -// register_backend(ggml_backend_blas_reg()); -// #endif +#ifdef GGML_USE_BLAS + register_backend(ggml_backend_blas_reg()); +#endif #ifdef GGML_USE_RPC register_backend(ggml_backend_rpc_reg()); #endif diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c index 3902894b..2462d2b8 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c @@ -1953,10 +1953,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_pad_reflect_1d(params, tensor); } break; - case GGML_OP_UNPAD: - { - ggml_compute_forward_unpad(params, tensor); - } break; case GGML_OP_ARANGE: { ggml_compute_forward_arange(params, tensor); @@ -2280,7 +2276,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { case GGML_OP_UPSCALE: case GGML_OP_PAD: case GGML_OP_PAD_REFLECT_1D: - case GGML_OP_UNPAD: case GGML_OP_ARANGE: case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_ARGSORT: diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp b/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp index 1868a10c..955fec59 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp @@ -6690,61 +6690,6 @@ void ggml_compute_forward_pad_reflect_1d( } } -// ggml_compute_forward_unpad - -static void ggml_compute_forward_unpad_f32( - const struct ggml_compute_params *params, - struct ggml_tensor *dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - GGML_ASSERT(src0->nb[0] == sizeof(float)); - GGML_ASSERT( dst->nb[0] == sizeof(float)); - - const int ith = params->ith; - const int nth = params->nth; - - GGML_TENSOR_UNARY_OP_LOCALS - - float * dst_ptr = (float *) dst->data; - - // TODO: optimize - - for (int64_t i2 = 0; i2 < ne2; ++i2) { - for (int64_t i1 = ith; i1 < ne1; i1 += nth) { - for (int64_t i0 = 0; i0 < ne0; ++i0) { - for (int64_t i3 = 0; i3 < ne3; ++i3) { - const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0; - - const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - - if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { - dst_ptr[dst_idx] = *src_ptr; - } - } - } - } - } -} - -void ggml_compute_forward_unpad( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_unpad_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - // ggml_compute_forward_arange static void ggml_compute_forward_arange_f32( diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/ops.h b/ml/backend/ggml/ggml/src/ggml-cpu/ops.h index a7125555..dc081b9e 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/ops.h +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ops.h @@ -72,7 +72,6 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_pad_reflect_1d(const struct 
ggml_compute_params * params, struct ggml_tensor * dst); -void ggml_compute_forward_unpad(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_arange(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu index 6fe86674..cb0d8528 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2238,9 +2238,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_PAD: ggml_cuda_op_pad(ctx, dst); break; - case GGML_OP_UNPAD: - ggml_cuda_op_unpad(ctx, dst); - break; case GGML_OP_ARANGE: ggml_cuda_op_arange(ctx, dst); break; @@ -3215,7 +3212,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_UPSCALE: return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST; case GGML_OP_PAD: - case GGML_OP_UNPAD: case GGML_OP_ARANGE: case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_LEAKY_RELU: diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/pad.cu b/ml/backend/ggml/ggml/src/ggml-cuda/pad.cu index 7d45a7e1..77432b04 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/pad.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/pad.cu @@ -47,49 +47,3 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream); } - -static __global__ void unpad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) { - // blockIdx.z: idx of ne2*ne3, aka ne02*ne03 - // blockIdx.y: idx of ne1 - // blockIDx.x: idx of ne0 / BLOCK_SIZE - int nidx = threadIdx.x + blockIdx.x * blockDim.x; - if (nidx >= ne0) { - return; - } - - // operation - int offset_dst = - nidx + - blockIdx.y * ne0 + - blockIdx.z * ne0 * gridDim.y; - if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) { - int offset_src = - nidx + - blockIdx.y * ne00 + - blockIdx.z * ne00 * ne01; - dst[offset_dst] = x[offset_src]; - } -} - -static void unpad_f32_cuda(const float * x, float * dst, - const int ne00, const int ne01, const int ne02, const int ne03, - const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) { - int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE; - dim3 gridDim(num_blocks, ne1, ne2*ne3); - unpad_f32<<>>(x, dst, ne0, ne00, ne01, ne02, ne03); -} - -void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - const ggml_tensor * src0 = dst->src[0]; - const float * src0_d = (const float *)src0->data; - float * dst_d = (float *)dst->data; - cudaStream_t stream = ctx.stream(); - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors - - unpad_f32_cuda(src0_d, dst_d, - src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], - dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream); -} \ No newline at end of file diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/pad.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/pad.cuh index e2ededc3..8fd386b0 100644 --- 
a/ml/backend/ggml/ggml/src/ggml-cuda/pad.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/pad.cuh @@ -3,4 +3,3 @@ #define CUDA_PAD_BLOCK_SIZE 256 void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); -void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal index 56fdb3cd..3656c238 100644 --- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal @@ -5599,51 +5599,6 @@ kernel void kernel_pad_reflect_1d_f32( } } -kernel void kernel_unpad_f32( - device const char * src0, - device char * dst, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant int64_t & ne03, - constant uint64_t & nb00, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant uint64_t & nb03, - constant int64_t & ne0, - constant int64_t & ne1, - constant int64_t & ne2, - constant int64_t & ne3, - constant uint64_t & nb0, - constant uint64_t & nb1, - constant uint64_t & nb2, - constant uint64_t & nb3, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]) { - - const int64_t i3 = tgpig.z; - const int64_t i2 = tgpig.y; - const int64_t i1 = tgpig.x; - - const int64_t i03 = i3; - const int64_t i02 = i2; - const int64_t i01 = i1; - - device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01); - device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1); - - if (i1 < ne01 && i2 < ne02 && i3 < ne03) { - for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) { - if (i0 < ne00) { - dst_ptr[i0] = src0_ptr[i0]; - } - } - - return; - } -} - kernel void kernel_arange_f32( device char * dst, constant ggml_metal_kargs_arange & args, diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m index 7641247e..1b56f858 100644 --- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m @@ -347,7 +347,6 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_UPSCALE_F32, GGML_METAL_KERNEL_TYPE_PAD_F32, GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, - GGML_METAL_KERNEL_TYPE_UNPAD_F32, GGML_METAL_KERNEL_TYPE_ARANGE_F32, GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, @@ -1295,7 +1294,6 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UNPAD_F32, unpad_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true); @@ -1657,7 +1655,6 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex case GGML_OP_POOL_2D: case GGML_OP_PAD: case GGML_OP_PAD_REFLECT_1D: - case GGML_OP_UNPAD: case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_ARGSORT: case GGML_OP_LEAKY_RELU: @@ -4187,36 +4184,6 @@ static bool ggml_metal_encode_node( const int nth = MIN(1024, ne0); 
- [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_UNPAD: - { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - - id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UNPAD_F32].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; - [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; - [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5]; - [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; - [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; - [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; - [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10]; - [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11]; - [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12]; - [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13]; - [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14]; - [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15]; - [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16]; - [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17]; - - const int nth = MIN(1024, ne0); - [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; case GGML_OP_ARANGE: diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal index 080a943b..9cfddf45 100644 --- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal @@ -3121,51 +3121,6 @@ kernel void kernel_pad_reflect_1d_f32( } } -kernel void kernel_unpad_f32( - device const char * src0, - device char * dst, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant int64_t & ne03, - constant uint64_t & nb00, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant uint64_t & nb03, - constant int64_t & ne0, - constant int64_t & ne1, - constant int64_t & ne2, - constant int64_t & ne3, - constant uint64_t & nb0, - constant uint64_t & nb1, - constant uint64_t & nb2, - constant uint64_t & nb3, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]) { - - const int64_t i3 = tgpig.z; - const int64_t i2 = tgpig.y; - const int64_t i1 = tgpig.x; - - const int64_t i03 = i3; - const int64_t i02 = i2; - const int64_t i01 = i1; - - device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01); - device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1); - - if (i1 < ne01 && i2 < ne02 && i3 < ne03) { - for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) { - if (i0 < ne00) { - dst_ptr[i0] = src0_ptr[i0]; - } - } - - return; - } -} - kernel void kernel_arange_f32( device char * dst, constant ggml_metal_kargs_arange & args, diff --git a/ml/backend/ggml/ggml/src/ggml.c b/ml/backend/ggml/ggml/src/ggml.c index 6b034d35..8a654624 100644 --- a/ml/backend/ggml/ggml/src/ggml.c +++ b/ml/backend/ggml/ggml/src/ggml.c @@ -923,7 +923,6 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "UPSCALE", "PAD", "PAD_REFLECT_1D", - "UNPAD", "ARANGE", "TIMESTEP_EMBEDDING", "ARGSORT", @@ -954,7 +953,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { 
"OPT_STEP_ADAMW", }; -static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83"); +static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1019,7 +1018,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "upscale(x)", "pad(x)", "pad_reflect_1d(x)", - "unpad(x)", "arange(start, stop, step)", "timestep_embedding(timesteps, dim, max_period)", "argsort(x)", @@ -1050,7 +1048,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "adamw(x)", }; -static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83"); +static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -4276,25 +4274,6 @@ struct ggml_tensor * ggml_pad_reflect_1d( return result; } -// ggml_unpad - -struct ggml_tensor * ggml_unpad( - struct ggml_context * ctx, - struct ggml_tensor * a, - int p0, int p1, int p2, int p3) { - - struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, - a->ne[0] - p0, - a->ne[1] - p1, - a->ne[2] - p2, - a->ne[3] - p3); - - result->op = GGML_OP_UNPAD; - result->src[0] = a; - - return result; -} - // ggml_arange struct ggml_tensor * ggml_arange( diff --git a/model/models/llama4/model_vision.go b/model/models/llama4/model_vision.go index 3bf9cee7..e6b1afef 100644 --- a/model/models/llama4/model_vision.go +++ b/model/models/llama4/model_vision.go @@ -208,7 +208,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor { } hiddenStates = m.LayerNormPost.Forward(ctx, hiddenStates, m.eps) - hiddenStates = hiddenStates.Unpad(ctx, 0, 1, 0, 0) + hiddenStates = hiddenStates.Pad(ctx, 0, -1, 0, 0) hiddenStates = m.VisionAdapter.Forward(ctx, hiddenStates, m.VisionOptions) return hiddenStates } diff --git a/model/models/mllama/imageproc.go b/model/models/mllama/imageproc.go deleted file mode 100644 index 13f2fb8b..00000000 --- a/model/models/mllama/imageproc.go +++ /dev/null @@ -1,201 +0,0 @@ -package mllama - -import ( - "fmt" - "image" - _ "image/jpeg" - _ "image/png" - "io" - "math" - "slices" - - "golang.org/x/image/draw" - - "github.com/ollama/ollama/model/imageproc" -) - -func getSupportedAspectRatios(maxTiles int) []image.Point { - ratios := []image.Point{} - - for w := range maxTiles { - for h := range maxTiles { - if (w+1)*(h+1) <= maxTiles { - ratios = append(ratios, image.Point{w + 1, h + 1}) - } - } - } - - return ratios -} - -func clip(a, a_min, a_max int) int { - if a < a_min { - return a_min - } else if a > a_max { - return a_max - } - - return a -} - -func getOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point { - possibleTileArrangements := getSupportedAspectRatios(maxImageTiles) - possibleCanvasSizes := []image.Point{} - for _, pta := range possibleTileArrangements { - possibleCanvasSizes = append(possibleCanvasSizes, image.Point{pta.X * tileSize, pta.Y * tileSize}) - } - - scales := []float64{} - - for _, pcs := range possibleCanvasSizes { - scaleHeight := float64(pcs.Y) / float64(imageSize.Y) - scaleWidth := float64(pcs.X) / float64(imageSize.X) - - if scaleWidth > scaleHeight { - scales = append(scales, scaleHeight) - } else { - scales = append(scales, scaleWidth) - } - } - - var minUpscale float64 - var maxDownscale float64 - var upscale bool - - for _, s := range scales { - if s > 1.0 { - upscale = true - if minUpscale == 0 { - minUpscale = s - } else { - minUpscale = math.Min(minUpscale, s) - } - } else { - maxDownscale = math.Max(maxDownscale, s) - } - } - - selectedScale := maxDownscale - 
if upscale { - selectedScale = minUpscale - } - - var selectedCanvas image.Point - for n, pcs := range possibleCanvasSizes { - if scales[n] == selectedScale { - // choose the smallest possible canvas - if selectedCanvas.X == 0 && selectedCanvas.Y == 0 { - selectedCanvas = pcs - } else if pcs.X*pcs.Y < selectedCanvas.X*selectedCanvas.Y { - selectedCanvas = pcs - } - } - } - return selectedCanvas -} - -func getImageSizeFitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point { - targetWidth := clip(imageSize.X, tileSize, canvasSize.X) - targetHeight := clip(imageSize.Y, tileSize, canvasSize.Y) - - scaleWidth := float64(targetWidth) / float64(imageSize.X) - scaleHeight := float64(targetHeight) / float64(imageSize.Y) - - var w, h int - - if scaleWidth < scaleHeight { - w = targetWidth - h = min(int(math.Floor(float64(imageSize.Y)*scaleWidth)), targetHeight) - } else { - w = min(int(math.Floor(float64(imageSize.X)*scaleHeight)), targetWidth) - h = targetHeight - } - - return image.Point{w, h} -} - -func resizeImage(img image.Image, format string, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) { - if format == "png" { - img = imageproc.Composite(img) - } - - b := img.Bounds() - tileSize := outputSize.Y - - canvasSize := getOptimalTiledCanvas(b.Max, maxImageTiles, tileSize) - aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize} - newSize := getImageSizeFitToCanvas(b.Max, canvasSize, tileSize) - - return imageproc.Resize(img, newSize, imageproc.ResizeBilinear), aspectRatio -} - -func padImage(img image.Image, outputSize, aspectRatio image.Point) image.Image { - paddedSize := image.Point{ - X: outputSize.X * aspectRatio.X, - Y: outputSize.Y * aspectRatio.Y, - } - - dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y)) - draw.Draw(dst, img.Bounds(), img, image.Point{0, 0}, draw.Over) - - return dst -} - -func splitToTiles(img image.Image, numTilesSize image.Point) []image.Image { - b := img.Bounds() - width := b.Max.X - b.Min.X - height := b.Max.Y - b.Min.Y - tileHeight := height / numTilesSize.Y - tileWidth := width / numTilesSize.X - - images := []image.Image{} - - for h := range numTilesSize.Y { - for w := range numTilesSize.X { - rect := image.Rect(tileWidth*w, tileHeight*h, tileWidth*(w+1), tileHeight*(h+1)) - images = append(images, img.(interface { - SubImage(image.Rectangle) image.Image - }).SubImage(rect)) - } - } - - return images -} - -func packImages(img image.Image, aspectRatio image.Point) []float32 { - subImages := splitToTiles(img, aspectRatio) - - var pixelVals []float32 - - rescale := true - channelFirst := true - - for _, subImg := range subImages { - vals := imageproc.Normalize(subImg, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, rescale, channelFirst) - pixelVals = append(pixelVals, vals...) 
- } - - return pixelVals -} - -func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) { - outputSize := image.Point{560, 560} - maxTiles := 4 - - img, format, err := image.Decode(imageData) - if err != nil { - return nil, nil, fmt.Errorf("failed to decode image: %w", err) - } - - newImage, aspectRatio := resizeImage(img, format, outputSize, maxTiles) - newImage = padImage(newImage, outputSize, aspectRatio) - - data := packImages(newImage, aspectRatio) - aspectRatioIndex := slices.Index(getSupportedAspectRatios(maxTiles), aspectRatio) + 1 - - opts := map[string]any{ - "aspectRatioIndex": aspectRatioIndex, - } - - return data, opts, nil -} diff --git a/model/models/mllama/imageproc_test.go b/model/models/mllama/imageproc_test.go deleted file mode 100644 index a14b91bd..00000000 --- a/model/models/mllama/imageproc_test.go +++ /dev/null @@ -1,420 +0,0 @@ -package mllama - -import ( - "bytes" - "image" - "image/png" - "testing" - - "github.com/google/go-cmp/cmp" -) - -func TestAspectRatios(t *testing.T) { - type aspectCase struct { - MaxTiles int - Expected []image.Point - } - - cases := []aspectCase{ - { - MaxTiles: 1, - Expected: []image.Point{{1, 1}}, - }, - { - MaxTiles: 2, - Expected: []image.Point{{1, 1}, {1, 2}, {2, 1}}, - }, - { - MaxTiles: 3, - Expected: []image.Point{{1, 1}, {1, 2}, {1, 3}, {2, 1}, {3, 1}}, - }, - { - MaxTiles: 4, - Expected: []image.Point{{1, 1}, {1, 2}, {1, 3}, {1, 4}, {2, 1}, {2, 2}, {3, 1}, {4, 1}}, - }, - } - - for _, c := range cases { - actual := getSupportedAspectRatios(c.MaxTiles) - - if diff := cmp.Diff(actual, c.Expected); diff != "" { - t.Errorf("mismatch (-got +want):\n%s", diff) - } - } -} - -func TestGetImageSizeFitToCanvas(t *testing.T) { - type imageSizeCase struct { - ImageRect image.Point - CanvasRect image.Point - TileSize int - Expected image.Point - } - - cases := []imageSizeCase{ - { - ImageRect: image.Point{400, 400}, - CanvasRect: image.Point{640, 480}, - TileSize: 200, - Expected: image.Point{400, 400}, - }, - { - ImageRect: image.Point{1024, 768}, - CanvasRect: image.Point{640, 480}, - TileSize: 200, - Expected: image.Point{640, 480}, - }, - { - ImageRect: image.Point{500, 500}, - CanvasRect: image.Point{1000, 1000}, - TileSize: 750, - Expected: image.Point{750, 750}, - }, - { - ImageRect: image.Point{500, 1000}, - CanvasRect: image.Point{2000, 2000}, - TileSize: 2000, - Expected: image.Point{1000, 2000}, - }, - { - ImageRect: image.Point{4000, 3000}, - CanvasRect: image.Point{2000, 1000}, - TileSize: 1000, - Expected: image.Point{1333, 1000}, - }, - { - ImageRect: image.Point{667, 1000}, - CanvasRect: image.Point{1000, 1000}, - TileSize: 560, - Expected: image.Point{667, 1000}, - }, - } - - for _, c := range cases { - actual := getImageSizeFitToCanvas(c.ImageRect, c.CanvasRect, c.TileSize) - - if actual != c.Expected { - t.Errorf("incorrect image rect: '%#v'. 
expected: '%#v'", actual, c.Expected) - } - } -} - -func TestGetOptimalTiledCanvas(t *testing.T) { - type tiledCanvasSizeCase struct { - ImageSize image.Point - MaxImageTiles int - TileSize int - Expected image.Point - } - - cases := []tiledCanvasSizeCase{ - { - ImageSize: image.Point{1024, 768}, - MaxImageTiles: 4, - TileSize: 1000, - Expected: image.Point{2000, 1000}, - }, - { - ImageSize: image.Point{1024, 768}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{1120, 1120}, - }, - { - ImageSize: image.Point{800, 600}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{1120, 1120}, - }, - { - ImageSize: image.Point{640, 480}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{1120, 560}, - }, - { - ImageSize: image.Point{320, 200}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{560, 560}, - }, - { - ImageSize: image.Point{1320, 200}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{1680, 560}, - }, - { - ImageSize: image.Point{2000, 200}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{2240, 560}, - }, - { - ImageSize: image.Point{10000, 200}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{2240, 560}, - }, - { - ImageSize: image.Point{480, 640}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{560, 1120}, - }, - { - ImageSize: image.Point{200, 320}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{560, 560}, - }, - { - ImageSize: image.Point{200, 1320}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{560, 1680}, - }, - { - ImageSize: image.Point{200, 2000}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{560, 2240}, - }, - { - ImageSize: image.Point{200, 10000}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{560, 2240}, - }, - { - ImageSize: image.Point{10000, 10000}, - MaxImageTiles: 4, - TileSize: 560, - Expected: image.Point{1120, 1120}, - }, - } - - for _, c := range cases { - actual := getOptimalTiledCanvas(c.ImageSize, c.MaxImageTiles, c.TileSize) - - if actual != c.Expected { - t.Errorf("incorrect tiled canvas: '%#v'. 
expected: '%#v'", actual, c.Expected) - } - } -} - -func TestSplitToTiles(t *testing.T) { - type splitCase struct { - TestImage image.Image - NumTilesSize image.Point - Expected []image.Image - } - - cases := []splitCase{ - { - TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)), - NumTilesSize: image.Point{1, 1}, - Expected: []image.Image{image.NewRGBA(image.Rect(0, 0, 1024, 768))}, - }, - { - TestImage: image.NewRGBA(image.Rect(0, 0, 1000, 500)), - NumTilesSize: image.Point{2, 1}, - Expected: []image.Image{ - image.NewRGBA(image.Rect(0, 0, 500, 500)), - image.NewRGBA(image.Rect(500, 0, 1000, 500)), - }, - }, - { - TestImage: image.NewRGBA(image.Rect(0, 0, 1000, 1000)), - NumTilesSize: image.Point{2, 2}, - Expected: []image.Image{ - image.NewRGBA(image.Rect(0, 0, 500, 500)), - image.NewRGBA(image.Rect(500, 0, 1000, 500)), - image.NewRGBA(image.Rect(0, 500, 500, 1000)), - image.NewRGBA(image.Rect(500, 500, 1000, 1000)), - }, - }, - } - - for _, c := range cases { - actual := splitToTiles(c.TestImage, c.NumTilesSize) - - if len(actual) != len(c.Expected) { - t.Errorf("incorrect number of images '%d': expected: '%d'", len(actual), len(c.Expected)) - } - - for i := range actual { - if actual[i].Bounds() != c.Expected[i].Bounds() { - t.Errorf("image size incorrect: '%#v': expected: '%#v'", actual[i].Bounds(), c.Expected[i].Bounds()) - } - } - } -} - -func TestResize(t *testing.T) { - type resizeCase struct { - TestImage image.Image - OutputSize image.Point - MaxImageTiles int - ExpectedImage image.Image - ExpectedAspectRatio image.Point - } - - cases := []resizeCase{ - { - TestImage: image.NewRGBA(image.Rect(0, 0, 200, 200)), - OutputSize: image.Point{100, 100}, - MaxImageTiles: 1, - ExpectedImage: image.NewRGBA(image.Rect(0, 0, 100, 100)), - ExpectedAspectRatio: image.Point{1, 1}, - }, - { - TestImage: image.NewRGBA(image.Rect(0, 0, 200, 200)), - OutputSize: image.Point{100, 100}, - MaxImageTiles: 2, - ExpectedImage: image.NewRGBA(image.Rect(0, 0, 100, 100)), - ExpectedAspectRatio: image.Point{1, 1}, - }, - { - TestImage: image.NewRGBA(image.Rect(0, 0, 10, 10)), - OutputSize: image.Point{560, 560}, - MaxImageTiles: 4, - ExpectedImage: image.NewRGBA(image.Rect(0, 0, 560, 560)), - ExpectedAspectRatio: image.Point{1, 1}, - }, - { - TestImage: image.NewRGBA(image.Rect(0, 0, 2560, 1920)), - OutputSize: image.Point{560, 560}, - MaxImageTiles: 4, - ExpectedImage: image.NewRGBA(image.Rect(0, 0, 1120, 840)), - ExpectedAspectRatio: image.Point{2, 2}, - }, - { - TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)), - OutputSize: image.Point{560, 560}, - MaxImageTiles: 4, - ExpectedImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)), - ExpectedAspectRatio: image.Point{2, 2}, - }, - } - - for _, c := range cases { - actualImage, actualAspectRatio := resizeImage(c.TestImage, "png", c.OutputSize, c.MaxImageTiles) - - if actualImage.Bounds() != c.ExpectedImage.Bounds() { - t.Errorf("image size incorrect: '%#v': expected: '%#v'", actualImage.Bounds(), c.ExpectedImage.Bounds()) - } - - if actualAspectRatio != c.ExpectedAspectRatio { - t.Errorf("aspect ratio incorrect: '%#v': expected: '%#v'", actualAspectRatio, c.ExpectedAspectRatio) - } - } -} - -func TestPad(t *testing.T) { - type padCase struct { - TestImage image.Image - OutputSize image.Point - AspectRatio image.Point - Expected image.Image - } - - cases := []padCase{ - { - TestImage: image.NewRGBA(image.Rect(0, 0, 1000, 667)), - OutputSize: image.Point{560, 560}, - AspectRatio: image.Point{2, 2}, - Expected: image.NewRGBA(image.Rect(0, 0, 1120, 1120)), 
- }, - } - - for _, c := range cases { - actual := padImage(c.TestImage, c.OutputSize, c.AspectRatio) - - if actual.Bounds() != c.Expected.Bounds() { - t.Errorf("image size incorrect: '%#v': expected: '%#v'", actual.Bounds(), c.Expected.Bounds()) - } - } -} - -func TestPackImages(t *testing.T) { - type packCase struct { - TestImage image.Image - AspectRatio image.Point - ExpectedVals int - } - - cases := []packCase{ - { - TestImage: image.NewRGBA(image.Rect(0, 0, 1120, 1120)), - AspectRatio: image.Point{2, 2}, - ExpectedVals: 2 * 2 * 3 * 560 * 560, - }, - { - TestImage: image.NewRGBA(image.Rect(0, 0, 560, 560)), - AspectRatio: image.Point{1, 1}, - ExpectedVals: 1 * 1 * 3 * 560 * 560, - }, - { - TestImage: image.NewRGBA(image.Rect(0, 0, 1120, 560)), - AspectRatio: image.Point{1, 2}, - ExpectedVals: 1 * 2 * 3 * 560 * 560, - }, - } - - for _, c := range cases { - actualVals := packImages(c.TestImage, c.AspectRatio) - if len(actualVals) != c.ExpectedVals { - t.Errorf("packed image size incorrect: '%d': expected: '%d'", len(actualVals), c.ExpectedVals) - } - } -} - -func TestPreprocess(t *testing.T) { - type preprocessCase struct { - TestImage image.Image - ExpectedVals int - ExpectedAspectRatioID int - } - - cases := []preprocessCase{ - { - TestImage: image.NewRGBA(image.Rect(0, 0, 10, 10)), - ExpectedVals: 0, - ExpectedAspectRatioID: 1, - }, - { - TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)), - ExpectedVals: 0, - ExpectedAspectRatioID: 6, - }, - } - - for _, c := range cases { - var buf bytes.Buffer - err := png.Encode(&buf, c.TestImage) - if err != nil { - t.Fatal(err) - } - - imgData, opts, err := Preprocess(&buf) - if err != nil { - t.Fatalf("error processing: %q", err) - } - - if len(imgData) == 0 { - t.Errorf("no image data returned") - } - - ar, ok := opts["aspectRatioIndex"] - if !ok { - t.Fatalf("no aspect ratio found") - } - - aspectRatioID := ar.(int) - - if aspectRatioID != c.ExpectedAspectRatioID { - t.Errorf("aspect ratio incorrect: '%d': expected: '%d'", aspectRatioID, c.ExpectedAspectRatioID) - } - } -} diff --git a/model/models/mllama/model.go b/model/models/mllama/model.go index 3fa26ded..93b443ef 100644 --- a/model/models/mllama/model.go +++ b/model/models/mllama/model.go @@ -2,11 +2,7 @@ package mllama import ( "bytes" - "encoding/binary" - "fmt" - "hash/fnv" "image" - "slices" "github.com/ollama/ollama/fs" "github.com/ollama/ollama/kvcache" @@ -34,10 +30,6 @@ const ( ) func New(c fs.Config) (model.Model, error) { - // Verify unified config - if c.Uint("vision.block_count") == 0 { - return nil, fmt.Errorf("non-unified vision model not supported") - } m := Model{ BytePairEncoding: model.NewBytePairEncoding( c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`), @@ -76,22 +68,19 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er return nil, err } - f32s, aspectRatioID, err := m.ImageProcessor.ProcessImage(image) + f32s, ratio, err := m.ImageProcessor.ProcessImage(image) if err != nil { return nil, err } - pixelValues, err := ctx.Input().FromFloatSlice(f32s, - m.ImageProcessor.imageSize, - m.ImageProcessor.imageSize, - m.ImageProcessor.numChannels, - m.ImageProcessor.maxNumTiles, - ) + pixelValues, err := ctx.Input().FromFloatSlice(f32s, m.imageSize, m.imageSize, m.numChannels, ratio.numTiles()) if err != nil { return nil, err } - aspectRatio, err := ctx.Input().FromIntSlice([]int32{int32(aspectRatioID)}, 1) + pixelValues = 
pixelValues.Pad(ctx, 0, 0, 0, m.ImageProcessor.maxNumTiles-ratio.numTiles()) + + aspectRatio, err := ctx.Input().FromIntSlice([]int32{int32(ratio.rank)}, 1) if err != nil { return nil, err } @@ -102,41 +91,19 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er } func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) { - var images []input.Input - fnvHash := fnv.New64a() - for i := range inputs { - if inputs[i].Multimodal == nil { - if len(images) > 0 { - inputs[i].Multimodal = []ml.Tensor{images[0].Multimodal.(ml.Tensor)} - inputs[i].MultimodalHash = images[0].MultimodalHash - for j := 1; j < len(images); j++ { - inputs[i].Multimodal = append(inputs[i].Multimodal.([]ml.Tensor), images[0].Multimodal.(ml.Tensor)) - fnvHash.Reset() - binary.Write(fnvHash, binary.NativeEndian, inputs[i].MultimodalHash) - binary.Write(fnvHash, binary.NativeEndian, inputs[j].MultimodalHash) - inputs[i].MultimodalHash = fnvHash.Sum64() - } - images = nil - } - } else { - images = append(images, inputs[i]) - inputs[i].Token = -1 + if inputs[i].Multimodal != nil { + inputs[i].Token = 128256 // <|image|> } } - inputs = slices.DeleteFunc(inputs, func(input input.Input) bool { return input.Token == -1 }) - return inputs, nil } func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { var crossAttentionStates ml.Tensor if len(batch.Multimodal) > 0 { - images := batch.Multimodal[len(batch.Multimodal)-1].Multimodal.([]ml.Tensor) - if len(images) > 0 { - crossAttentionStates = images[len(images)-1] - } + crossAttentionStates = batch.Multimodal[len(batch.Multimodal)-1].Multimodal.(ml.Tensor) } positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions)) @@ -150,7 +117,7 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { } // TODO: attention mask, cross attention mask - return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, nil, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache)), nil + return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache)), nil } func init() { diff --git a/model/models/mllama/model_text.go b/model/models/mllama/model_text.go index 490eb696..9bd414af 100644 --- a/model/models/mllama/model_text.go +++ b/model/models/mllama/model_text.go @@ -18,7 +18,7 @@ type TextSelfAttention struct { RopeFactors ml.Tensor `gguf:"rope_freqs.weight"` } -func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor { +func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor { batchSize := hiddenState.Dim(1) headDim := opts.hiddenSize / opts.numHeads ropeType := uint32(0) @@ -69,11 +69,11 @@ type TextSelfAttentionDecoderLayer struct { MLP *TextMLP } -func (d *TextSelfAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, positions, outputs, mask, _, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor { +func (d *TextSelfAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, positions, outputs, _, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor { residual := hiddenState hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps) - hiddenState = d.SelfAttention.Forward(ctx, hiddenState, positions, mask, cache, opts) + hiddenState = d.SelfAttention.Forward(ctx, 
hiddenState, positions, cache, opts) // In the final layer (outputs != nil), optimize by pruning to just the token positions // we need logits for. @@ -151,7 +151,7 @@ type TextCrossAttentionDecoderLayer struct { MLPGate ml.Tensor `gguf:"cross_attn_mlp_gate"` } -func (d *TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _, _, _, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor { +func (d *TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _, _, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor { residual := hiddenState hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps) @@ -167,14 +167,14 @@ func (d *TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _, } type TextDecoderLayer interface { - Forward(ctx ml.Context, hiddenState, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor + Forward(ctx ml.Context, hiddenState, positionIDs, outputs, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor } type TextDecoder struct { Layers []TextDecoderLayer } -func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor { +func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor { for i, layer := range d.Layers { layerType := selfAttentionLayer if slices.Contains(opts.crossAttentionLayers, int32(i)) { @@ -190,7 +190,7 @@ func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs, lastLayerOutputs = outputs } - hiddenState = layer.Forward(ctx, hiddenState, positionIDs, lastLayerOutputs, mask, crossAttentionStates, crossAttentionMask, cache, opts) + hiddenState = layer.Forward(ctx, hiddenState, positionIDs, lastLayerOutputs, crossAttentionStates, crossAttentionMask, cache, opts) } } @@ -214,9 +214,9 @@ type TextModel struct { *TextModelOptions } -func (m *TextModel) Forward(ctx ml.Context, inputIDs, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache) ml.Tensor { +func (m *TextModel) Forward(ctx ml.Context, inputIDs, positionIDs, outputs, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache) ml.Tensor { hiddenState := m.TokenEmbedding.Forward(ctx, inputIDs) - hiddenState = m.Transformer.Forward(ctx, hiddenState, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask, cache, m.TextModelOptions) + hiddenState = m.Transformer.Forward(ctx, hiddenState, positionIDs, outputs, crossAttentionStates, crossAttentionMask, cache, m.TextModelOptions) hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps) return m.Output.Forward(ctx, hiddenState) } diff --git a/model/models/mllama/model_vision.go b/model/models/mllama/model_vision.go index bd3d150a..77ea5373 100644 --- a/model/models/mllama/model_vision.go +++ b/model/models/mllama/model_vision.go @@ -15,7 +15,7 @@ type VisionSelfAttention struct { Query *nn.Linear `gguf:"attn_q"` Key *nn.Linear `gguf:"attn_k"` Value *nn.Linear `gguf:"attn_v"` - Output *nn.Linear `gguf:"attn_out"` + Output *nn.Linear `gguf:"attn_output"` Gate 
ml.Tensor `gguf:"attn_gate"` } @@ -45,36 +45,29 @@ func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, op attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize) hiddenState = sa.Output.Forward(ctx, attention) - if sa.Gate != nil { - hiddenState = hiddenState.Mul(ctx, sa.Gate) - } - return hiddenState } type VisionMLP struct { - Down *nn.Linear `gguf:"ffn_down"` Up *nn.Linear `gguf:"ffn_up"` - - Gate ml.Tensor `gguf:"ffn_gate"` + Down *nn.Linear `gguf:"ffn_down"` } func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor { - hiddenState = mlp.Down.Forward(ctx, hiddenState).GELU(ctx) - hiddenState = mlp.Up.Forward(ctx, hiddenState) - if mlp.Gate != nil { - hiddenState = hiddenState.Mul(ctx, mlp.Gate) - } + hiddenState = mlp.Up.Forward(ctx, hiddenState).GELU(ctx) + hiddenState = mlp.Down.Forward(ctx, hiddenState) return hiddenState } type VisionEncoderLayer struct { - AttentionNorm *nn.LayerNorm `gguf:"ln1"` + AttentionNorm *nn.LayerNorm `gguf:"attn_norm"` SelfAttention *VisionSelfAttention + AttentionGate ml.Tensor `gguf:"attn_gate"` - MLPNorm *nn.LayerNorm `gguf:"ln2"` + MLPNorm *nn.LayerNorm `gguf:"ffn_norm"` MLP *VisionMLP + MLPGate ml.Tensor `gguf:"ffn_gate"` } func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor { @@ -83,13 +76,22 @@ func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts // self attention hiddenState = e.AttentionNorm.Forward(ctx, hiddenState, opts.eps) hiddenState = e.SelfAttention.Forward(ctx, hiddenState, opts) + + if e.AttentionGate != nil { + hiddenState = hiddenState.Mul(ctx, e.AttentionGate) + } hiddenState = hiddenState.Add(ctx, residual) residual = hiddenState // feed forward hiddenState = e.MLPNorm.Forward(ctx, hiddenState, opts.eps) hiddenState = e.MLP.Forward(ctx, hiddenState, opts) - return hiddenState.Add(ctx, residual) + hiddenState = hiddenState.Add(ctx, residual) + if e.MLPGate != nil { + hiddenState = hiddenState.Mul(ctx, e.MLPGate) + } + + return hiddenState } type VisionEncoder struct { @@ -114,9 +116,9 @@ type PrecomputedAspectRatioEmbedding struct { Gate ml.Tensor `gguf:"gate"` } -func (e *PrecomputedAspectRatioEmbedding) Forward(ctx ml.Context, hiddenState ml.Tensor, aspectRatioIDs ml.Tensor, opts *VisionModelOptions) ml.Tensor { +func (e *PrecomputedAspectRatioEmbedding) Forward(ctx ml.Context, hiddenState ml.Tensor, aspectRatioIDs ml.Tensor, numTiles int, opts *VisionModelOptions) ml.Tensor { embeddings := e.Embedding.Forward(ctx, aspectRatioIDs) - embeddings = embeddings.Reshape(ctx, opts.hiddenSize, 1, opts.numTiles) + embeddings = embeddings.Reshape(ctx, opts.hiddenSize, 1, numTiles) if e.Gate != nil { embeddings = embeddings.Mul(ctx, e.Gate) } @@ -132,7 +134,7 @@ type PrecomputedPositionEmbedding struct { TilePositionEmbeddingGate ml.Tensor `gguf:"tile_position_embd.gate"` } -func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, positionIDs, aspectRatioIDs ml.Tensor, numPositions int, opts *VisionModelOptions) ml.Tensor { +func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, positionIDs, aspectRatioIDs ml.Tensor, numPositions, numTiles int, opts *VisionModelOptions) ml.Tensor { positionEmbedding := e.PositionEmbedding.Forward(ctx, positionIDs) if e.PositionEmbeddingGate != nil { positionEmbedding = positionEmbedding.Mul(ctx, e.PositionEmbeddingGate) @@ -141,7 +143,7 @@ func (e *PrecomputedPositionEmbedding) 
Forward(ctx ml.Context, hiddenState, posi hiddenState = hiddenState.Add(ctx, positionEmbedding) tilePositionEmbedding := e.TilePositionEmbedding.Forward(ctx, aspectRatioIDs) - tilePositionEmbedding = tilePositionEmbedding.Reshape(ctx, opts.hiddenSize, numPositions, opts.numTiles) + tilePositionEmbedding = tilePositionEmbedding.Reshape(ctx, opts.hiddenSize, numPositions, numTiles) if e.TilePositionEmbeddingGate != nil { tilePositionEmbedding = tilePositionEmbedding.Mul(ctx, e.TilePositionEmbeddingGate) } @@ -150,9 +152,9 @@ func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, posi } type VisionModelOptions struct { - hiddenSize, numHeads, numTiles int - imageSize, patchSize int - eps float32 + hiddenSize, numHeads int + imageSize, patchSize int + eps float32 intermediateLayersIndices []int32 } @@ -181,14 +183,16 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues, positionIDs, aspectRa numPositions++ } + numTiles := pixelValues.Dim(3) + hiddenState := m.PatchEmbeddings.Forward(ctx, pixelValues, m.patchSize, m.patchSize, 0, 0, 1, 1) - hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize, m.numTiles) + hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize, numTiles) hiddenState = hiddenState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx) - hiddenState = m.PreTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, m.VisionModelOptions) - hiddenState = m.ClassEmbedding.Repeat(ctx, 2, m.numTiles).Concat(ctx, hiddenState, 1) + hiddenState = m.PreTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, numTiles, m.VisionModelOptions) + hiddenState = m.ClassEmbedding.Repeat(ctx, 2, numTiles).Concat(ctx, hiddenState, 1) - hiddenState = m.PositionEmbedding.Forward(ctx, hiddenState, positionIDs, aspectRatioIDs, numPositions, m.VisionModelOptions) + hiddenState = m.PositionEmbedding.Forward(ctx, hiddenState, positionIDs, aspectRatioIDs, numPositions, numTiles, m.VisionModelOptions) hiddenState = m.PreLayerNorm.Forward(ctx, hiddenState, m.eps) numPaddingPatches := 8 - (hiddenState.Dim(1)%8)%8 @@ -199,18 +203,18 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues, positionIDs, aspectRa hiddenState = m.PostLayerNorm.Forward(ctx, hiddenState, m.eps) - hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize) - hiddenState = m.PostTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, m.VisionModelOptions) + hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, numTiles, batchSize) + hiddenState = m.PostTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, numTiles, m.VisionModelOptions) - hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, m.numTiles*(numPositions+numPaddingPatches), batchSize) + hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numTiles*(numPositions+numPaddingPatches), batchSize) hiddenState, _ = m.GlobalTransformer.Forward(ctx, hiddenState, nil, m.VisionModelOptions) hiddenStates := intermediateHiddenStates[0].Stack(ctx, 0, intermediateHiddenStates[1:]...) 
- hiddenStates = hiddenStates.Reshape(ctx, len(intermediateHiddenStates)*m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize) - hiddenStates = hiddenStates.Unpad(ctx, 0, numPaddingPatches, 0, 0) + hiddenStates = hiddenStates.Reshape(ctx, len(intermediateHiddenStates)*m.hiddenSize, numPositions+numPaddingPatches, numTiles, batchSize) + hiddenStates = hiddenStates.Pad(ctx, 0, -numPaddingPatches, 0, 0) - hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize) - hiddenState = hiddenState.Unpad(ctx, 0, numPaddingPatches, 0, 0) + hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, numTiles, batchSize) + hiddenState = hiddenState.Pad(ctx, 0, -numPaddingPatches, 0, 0) return hiddenState.Concat(ctx, hiddenStates, 0) } @@ -222,7 +226,6 @@ func newVisionModel(c fs.Config) *VisionModel { VisionModelOptions: &VisionModelOptions{ hiddenSize: int(c.Uint("vision.embedding_length")), numHeads: int(c.Uint("vision.attention.head_count")), - numTiles: int(c.Uint("vision.max_num_tiles")), imageSize: int(c.Uint("vision.image_size")), patchSize: int(c.Uint("vision.patch_size")), diff --git a/model/models/mllama/process_image.go b/model/models/mllama/process_image.go index 1b0506d3..8e60508f 100644 --- a/model/models/mllama/process_image.go +++ b/model/models/mllama/process_image.go @@ -2,17 +2,31 @@ package mllama import ( "image" - "image/color" "math" "slices" "golang.org/x/image/draw" "github.com/ollama/ollama/fs" + "github.com/ollama/ollama/model/imageproc" ) +type supportedAspectRatio struct { + rank, width, height int +} + +func (a supportedAspectRatio) Point() image.Point { + return image.Point{a.width, a.height} +} + +func (a supportedAspectRatio) numTiles() int { + return a.width * a.height +} + type ImageProcessor struct { imageSize, numChannels, maxNumTiles int + + mean, std [3]float32 } func newImageProcessor(c fs.Config) ImageProcessor { @@ -20,71 +34,49 @@ func newImageProcessor(c fs.Config) ImageProcessor { imageSize: int(c.Uint("vision.image_size")), numChannels: int(c.Uint("vision.num_channels")), maxNumTiles: int(c.Uint("vision.max_num_tiles")), + + mean: imageproc.ClipDefaultMean, + std: imageproc.ClipDefaultSTD, } } -func (p *ImageProcessor) supportedAspectRatios(maxTiles int) []image.Point { - ratios := []image.Point{} - - for w := range maxTiles { - for h := range maxTiles { - if (w+1)*(h+1) <= maxTiles { - ratios = append(ratios, image.Point{w + 1, h + 1}) - } +func (p ImageProcessor) supportedAspectRatios() (ratios []supportedAspectRatio) { + for w := 1; w <= p.maxNumTiles; w++ { + for h := 1; h <= p.maxNumTiles/w; h++ { + ratios = append(ratios, supportedAspectRatio{len(ratios) + 1, w, h}) } } - return ratios } -func (p *ImageProcessor) clip(a, a_min, a_max int) int { - if a < a_min { - return a_min - } else if a > a_max { - return a_max - } +func (p ImageProcessor) fitToCanvas(imageSize, canvasSize image.Point) image.Point { + tw := min(max(imageSize.X, p.imageSize), canvasSize.X) + th := min(max(imageSize.Y, p.imageSize), canvasSize.Y) - return a -} + r := math.Min( + float64(tw)/float64(imageSize.X), + float64(th)/float64(imageSize.Y), + ) -func (p *ImageProcessor) fitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point { - targetWidth := p.clip(imageSize.X, tileSize, canvasSize.X) - targetHeight := p.clip(imageSize.Y, tileSize, canvasSize.Y) - - scaleWidth := float64(targetWidth) / float64(imageSize.X) - scaleHeight := float64(targetHeight) / float64(imageSize.Y) - - 
var w, h int - - if scaleWidth < scaleHeight { - w = targetWidth - h = min(int(math.Floor(float64(imageSize.Y)*scaleWidth)), targetHeight) - } else { - w = min(int(math.Floor(float64(imageSize.X)*scaleHeight)), targetWidth) - h = targetHeight - } + w := min(int(math.Floor(float64(imageSize.X)*r)), tw) + h := min(int(math.Floor(float64(imageSize.Y)*r)), th) return image.Point{w, h} } -func (p *ImageProcessor) optimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point { - possibleTileArrangements := p.supportedAspectRatios(maxImageTiles) - possibleCanvasSizes := []image.Point{} - for _, pta := range possibleTileArrangements { - possibleCanvasSizes = append(possibleCanvasSizes, image.Point{pta.X * tileSize, pta.Y * tileSize}) +func (p ImageProcessor) optimalTiledCanvas(imageSize image.Point) image.Point { + possibleTileArrangements := p.supportedAspectRatios() + possibleCanvasSizes := make([]image.Point, len(possibleTileArrangements)) + for i, pta := range possibleTileArrangements { + possibleCanvasSizes[i] = image.Point{pta.width * p.imageSize, pta.height * p.imageSize} } - scales := []float64{} - - for _, pcs := range possibleCanvasSizes { - scaleHeight := float64(pcs.Y) / float64(imageSize.Y) - scaleWidth := float64(pcs.X) / float64(imageSize.X) - - if scaleWidth > scaleHeight { - scales = append(scales, scaleHeight) - } else { - scales = append(scales, scaleWidth) - } + scales := make([]float64, len(possibleCanvasSizes)) + for i, pcs := range possibleCanvasSizes { + scales[i] = min( + float64(pcs.Y)/float64(imageSize.Y), + float64(pcs.X)/float64(imageSize.X), + ) } var minUpscale float64 @@ -123,47 +115,41 @@ func (p *ImageProcessor) optimalTiledCanvas(imageSize image.Point, maxImageTiles return selectedCanvas } -func (p *ImageProcessor) splitToTiles(img image.Image, numTilesSize image.Point) []image.Image { +func (p ImageProcessor) splitToTiles(img image.Image, numTilesSize image.Point) []image.Image { b := img.Bounds() width := b.Max.X - b.Min.X height := b.Max.Y - b.Min.Y tileHeight := height / numTilesSize.Y tileWidth := width / numTilesSize.X - images := []image.Image{} + images := make([]image.Image, 0, numTilesSize.Y*numTilesSize.X) for h := range numTilesSize.Y { for w := range numTilesSize.X { rect := image.Rect(tileWidth*w, tileHeight*h, tileWidth*(w+1), tileHeight*(h+1)) - images = append(images, img.(interface { + if subImg, ok := img.(interface { SubImage(image.Rectangle) image.Image - }).SubImage(rect)) + }); ok { + images = append(images, subImg.SubImage(rect)) + } else { + // Handle the case where img does not implement SubImage + // This is a fallback and may not be efficient + newImg := image.NewRGBA(rect) + draw.Draw(newImg, rect, img, rect.Min, draw.Src) + images = append(images, newImg) + } } } return images } -// remove the "alpha" channel by drawing over a prefilled image -// -//nolint:unused -func (p *ImageProcessor) compositeImage(img image.Image) image.Image { - dst := image.NewRGBA(img.Bounds()) - - white := color.RGBA{255, 255, 255, 255} - draw.Draw(dst, dst.Bounds(), &image.Uniform{white}, image.Point{}, draw.Src) - draw.Draw(dst, dst.Bounds(), img, img.Bounds().Min, draw.Over) - - return dst -} - -func (p *ImageProcessor) resize(img image.Image, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) { +func (p ImageProcessor) resize(img image.Image) (image.Image, image.Point) { b := img.Bounds() - tileSize := outputSize.Y - canvasSize := p.optimalTiledCanvas(b.Max, maxImageTiles, tileSize) - aspectRatio := 
image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize} - newSize := p.fitToCanvas(b.Max, canvasSize, tileSize) + canvasSize := p.optimalTiledCanvas(b.Max) + aspectRatio := image.Point{canvasSize.X / p.imageSize, canvasSize.Y / p.imageSize} + newSize := p.fitToCanvas(b.Max, canvasSize) dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y)) @@ -177,10 +163,10 @@ func (p *ImageProcessor) resize(img image.Image, outputSize image.Point, maxImag return dst, aspectRatio } -func (p *ImageProcessor) pad(img image.Image, outputSize, aspectRatio image.Point) image.Image { +func (p ImageProcessor) pad(img image.Image, aspectRatio image.Point) image.Image { paddedSize := image.Point{ - X: outputSize.X * aspectRatio.X, - Y: outputSize.Y * aspectRatio.Y, + X: p.imageSize * aspectRatio.X, + Y: p.imageSize * aspectRatio.Y, } dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y)) @@ -189,7 +175,7 @@ func (p *ImageProcessor) pad(img image.Image, outputSize, aspectRatio image.Poin return dst } -func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, std [3]float32) []float32 { +func (p ImageProcessor) pack(img image.Image, aspectRatio image.Point) []float32 { subImages := p.splitToTiles(img, aspectRatio) var pixelVals []float32 @@ -205,9 +191,9 @@ func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, st gVal := float32(g>>8) / 255.0 bVal := float32(b>>8) / 255.0 - rVal = (rVal - mean[0]) / std[0] - gVal = (gVal - mean[1]) / std[1] - bVal = (bVal - mean[2]) / std[2] + rVal = (rVal - p.mean[0]) / p.std[0] + gVal = (gVal - p.mean[1]) / p.std[1] + bVal = (bVal - p.mean[2]) / p.std[2] rVals = append(rVals, rVal) gVals = append(gVals, gVal) @@ -222,17 +208,15 @@ func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, st return pixelVals } -func (p ImageProcessor) ProcessImage(img image.Image) ([]float32, int, error) { - outputSize := image.Point{p.imageSize, p.imageSize} +func (p ImageProcessor) ProcessImage(img image.Image) ([]float32, supportedAspectRatio, error) { + newImage, newImageRatio := p.resize(img) + newImage = p.pad(newImage, newImageRatio) + pixelValues := p.pack(newImage, newImageRatio) - // clip values - mean := [3]float32{0.48145466, 0.4578275, 0.40821073} - std := [3]float32{0.26862954, 0.26130258, 0.27577711} + supportedAspectRatios := p.supportedAspectRatios() + aspectRatioID := slices.IndexFunc(supportedAspectRatios, func(i supportedAspectRatio) bool { + return i.width == newImageRatio.X && i.height == newImageRatio.Y + }) - newImage, aspectRatio := p.resize(img, outputSize, p.maxNumTiles) - newImage = p.pad(newImage, outputSize, aspectRatio) - - data := p.pack(newImage, aspectRatio, mean, std) - aspectRatioIndex := slices.Index(p.supportedAspectRatios(p.maxNumTiles), aspectRatio) + 1 - return data, aspectRatioIndex, nil + return pixelValues, supportedAspectRatios[aspectRatioID], nil } diff --git a/model/models/mllama/process_image_test.go b/model/models/mllama/process_image_test.go new file mode 100644 index 00000000..a9669b18 --- /dev/null +++ b/model/models/mllama/process_image_test.go @@ -0,0 +1,387 @@ +package mllama + +import ( + "image" + "testing" + + "github.com/google/go-cmp/cmp" +) + +func TestSupportedAspectRatios(t *testing.T) { + cases := []struct { + p ImageProcessor + want []supportedAspectRatio + }{ + { + p: ImageProcessor{maxNumTiles: 1}, + want: []supportedAspectRatio{ + {1, 1, 1}, + }, + }, + { + p: ImageProcessor{maxNumTiles: 2}, + want: []supportedAspectRatio{ + {1, 1, 1}, + {2, 1, 
2}, + {3, 2, 1}, + }, + }, + { + p: ImageProcessor{maxNumTiles: 3}, + want: []supportedAspectRatio{ + {1, 1, 1}, + {2, 1, 2}, + {3, 1, 3}, + {4, 2, 1}, + {5, 3, 1}, + }, + }, + { + p: ImageProcessor{maxNumTiles: 4}, + want: []supportedAspectRatio{ + {1, 1, 1}, + {2, 1, 2}, + {3, 1, 3}, + {4, 1, 4}, + {5, 2, 1}, + {6, 2, 2}, + {7, 3, 1}, + {8, 4, 1}, + }, + }, + } + + for _, tt := range cases { + actual := tt.p.supportedAspectRatios() + if diff := cmp.Diff(actual, tt.want, cmp.AllowUnexported(supportedAspectRatio{})); diff != "" { + t.Errorf("mismatch (-got +want):\n%s", diff) + } + } +} + +func TestFitToCanvas(t *testing.T) { + cases := []struct { + p ImageProcessor + image image.Point + canvas image.Point + expect image.Point + }{ + { + p: ImageProcessor{imageSize: 200}, + image: image.Point{400, 400}, + canvas: image.Point{640, 480}, + expect: image.Point{400, 400}, + }, + { + p: ImageProcessor{imageSize: 200}, + image: image.Point{1024, 768}, + canvas: image.Point{640, 480}, + expect: image.Point{640, 480}, + }, + { + p: ImageProcessor{imageSize: 750}, + image: image.Point{500, 500}, + canvas: image.Point{1000, 1000}, + expect: image.Point{750, 750}, + }, + { + p: ImageProcessor{imageSize: 2000}, + image: image.Point{500, 1000}, + canvas: image.Point{2000, 2000}, + expect: image.Point{1000, 2000}, + }, + { + p: ImageProcessor{imageSize: 1000}, + image: image.Point{4000, 3000}, + canvas: image.Point{2000, 1000}, + expect: image.Point{1333, 1000}, + }, + { + p: ImageProcessor{imageSize: 560}, + image: image.Point{667, 1000}, + canvas: image.Point{1000, 1000}, + expect: image.Point{667, 1000}, + }, + } + + for _, tt := range cases { + actual := tt.p.fitToCanvas(tt.image, tt.canvas) + if diff := cmp.Diff(actual, tt.expect); diff != "" { + t.Errorf("mismatch (-got +want):\n%s", diff) + } + } +} + +func TestOptimalTiledCanvas(t *testing.T) { + cases := []struct { + p ImageProcessor + image image.Point + expect image.Point + }{ + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 1000}, + image: image.Point{1024, 768}, + expect: image.Point{2000, 1000}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{1024, 768}, + expect: image.Point{1120, 1120}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{800, 600}, + expect: image.Point{1120, 1120}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{640, 480}, + expect: image.Point{1120, 560}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{320, 200}, + expect: image.Point{560, 560}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{1320, 200}, + expect: image.Point{1680, 560}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{2000, 200}, + expect: image.Point{2240, 560}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{10000, 200}, + expect: image.Point{2240, 560}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{480, 640}, + expect: image.Point{560, 1120}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{200, 320}, + expect: image.Point{560, 560}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{200, 1320}, + expect: image.Point{560, 1680}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{200, 2000}, + expect: image.Point{560, 2240}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 
560}, + image: image.Point{200, 10000}, + expect: image.Point{560, 2240}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + image: image.Point{10000, 10000}, + expect: image.Point{1120, 1120}, + }, + } + + for _, tt := range cases { + actual := tt.p.optimalTiledCanvas(tt.image) + if diff := cmp.Diff(actual, tt.expect); diff != "" { + t.Errorf("mismatch (-got +want):\n%s", diff) + } + } +} + +func TestSplitToTiles(t *testing.T) { + cases := []struct { + imageMax image.Point + numTiles image.Point + expect []image.Image + }{ + { + imageMax: image.Point{1024, 768}, + numTiles: image.Point{1, 1}, + expect: []image.Image{image.NewRGBA(image.Rect(0, 0, 1024, 768))}, + }, + { + imageMax: image.Point{1000, 500}, + numTiles: image.Point{2, 1}, + expect: []image.Image{ + image.NewRGBA(image.Rect(0, 0, 500, 500)), + image.NewRGBA(image.Rect(500, 0, 1000, 500)), + }, + }, + { + imageMax: image.Point{1000, 1000}, + numTiles: image.Point{2, 2}, + expect: []image.Image{ + image.NewRGBA(image.Rect(0, 0, 500, 500)), + image.NewRGBA(image.Rect(500, 0, 1000, 500)), + image.NewRGBA(image.Rect(0, 500, 500, 1000)), + image.NewRGBA(image.Rect(500, 500, 1000, 1000)), + }, + }, + } + + var p ImageProcessor + + for _, tt := range cases { + actual := p.splitToTiles(image.NewRGBA(image.Rectangle{Max: tt.imageMax}), tt.numTiles) + + if len(actual) != len(tt.expect) { + t.Errorf("incorrect number of images '%d': expect: '%d'", len(actual), len(tt.expect)) + } + + for i := range actual { + if actual[i].Bounds() != tt.expect[i].Bounds() { + t.Errorf("image size incorrect: '%#v': expect: '%#v'", actual[i].Bounds(), tt.expect[i].Bounds()) + } + } + } +} + +func TestResize(t *testing.T) { + cases := []struct { + p ImageProcessor + imageMax image.Point + expectImage image.Image + expectAspectRatio image.Point + }{ + { + p: ImageProcessor{maxNumTiles: 1, imageSize: 100}, + imageMax: image.Point{200, 200}, + expectImage: image.NewRGBA(image.Rect(0, 0, 100, 100)), + expectAspectRatio: image.Point{1, 1}, + }, + { + p: ImageProcessor{maxNumTiles: 2, imageSize: 100}, + imageMax: image.Point{200, 200}, + expectImage: image.NewRGBA(image.Rect(0, 0, 100, 100)), + expectAspectRatio: image.Point{1, 1}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + imageMax: image.Point{10, 10}, + expectImage: image.NewRGBA(image.Rect(0, 0, 560, 560)), + expectAspectRatio: image.Point{1, 1}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + imageMax: image.Point{2560, 1920}, + expectImage: image.NewRGBA(image.Rect(0, 0, 1120, 840)), + expectAspectRatio: image.Point{2, 2}, + }, + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + imageMax: image.Point{1024, 768}, + expectImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)), + expectAspectRatio: image.Point{2, 2}, + }, + } + + for _, tt := range cases { + actualImage, actualAspectRatio := tt.p.resize(image.Rectangle{Max: tt.imageMax}) + + if actualImage.Bounds() != tt.expectImage.Bounds() { + t.Errorf("image size incorrect: '%#v': expect: '%#v'", actualImage.Bounds(), tt.expectImage.Bounds()) + } + + if actualAspectRatio != tt.expectAspectRatio { + t.Errorf("aspect ratio incorrect: '%#v': expect: '%#v'", actualAspectRatio, tt.expectAspectRatio) + } + } +} + +func TestPad(t *testing.T) { + cases := []struct { + p ImageProcessor + imageMax image.Point + aspectRatio image.Point + expect image.Image + }{ + { + p: ImageProcessor{maxNumTiles: 4, imageSize: 560}, + imageMax: image.Point{1000, 667}, + aspectRatio: image.Point{2, 2}, + expect: 
image.NewRGBA(image.Rect(0, 0, 1120, 1120)), + }, + } + + for _, tt := range cases { + actual := tt.p.pad(image.Rectangle{Max: tt.imageMax}, tt.aspectRatio) + + if actual.Bounds() != tt.expect.Bounds() { + t.Errorf("image size incorrect: '%#v': expect: '%#v'", actual.Bounds(), tt.expect.Bounds()) + } + } +} + +func TestPackImages(t *testing.T) { + cases := []struct { + imageMax image.Point + aspectRatio image.Point + expectVals int + }{ + { + imageMax: image.Point{1120, 1120}, + aspectRatio: image.Point{2, 2}, + expectVals: 2 * 2 * 3 * 560 * 560, + }, + { + imageMax: image.Point{560, 560}, + aspectRatio: image.Point{1, 1}, + expectVals: 1 * 1 * 3 * 560 * 560, + }, + { + imageMax: image.Point{1120, 560}, + aspectRatio: image.Point{1, 2}, + expectVals: 1 * 2 * 3 * 560 * 560, + }, + } + + for _, tt := range cases { + var p ImageProcessor + actualVals := p.pack(image.NewRGBA(image.Rectangle{Max: tt.imageMax}), tt.aspectRatio) + if len(actualVals) != tt.expectVals { + t.Errorf("packed image size incorrect: '%d': expect: '%d'", len(actualVals), tt.expectVals) + } + } +} + +func TestPreprocess(t *testing.T) { + cases := []struct { + imageMax image.Point + expectAspectRatioID int + }{ + { + imageMax: image.Point{10, 10}, + expectAspectRatioID: 1, + }, + { + imageMax: image.Point{1024, 768}, + expectAspectRatioID: 6, + }, + } + + p := ImageProcessor{imageSize: 560, maxNumTiles: 4} + for _, tt := range cases { + img, aspectRatio, err := p.ProcessImage(image.NewRGBA(image.Rectangle{Max: tt.imageMax})) + if err != nil { + t.Fatalf("error processing: %q", err) + } + + if len(img) == 0 { + t.Errorf("no image data returned") + } + + if aspectRatio.rank != tt.expectAspectRatioID { + t.Errorf("aspect ratio incorrect: '%d': expect: '%d'", aspectRatio, tt.expectAspectRatioID) + } + } +} diff --git a/runner/llamarunner/image.go b/runner/llamarunner/image.go index e7e30a4d..1d0c1a4f 100644 --- a/runner/llamarunner/image.go +++ b/runner/llamarunner/image.go @@ -5,7 +5,6 @@ import ( "fmt" "hash/maphash" "log/slog" - "slices" "sync" "time" @@ -18,8 +17,7 @@ type ImageContext struct { // mu is required to be held when generating embeddings or accessing the cache mu sync.Mutex - clip *llama.ClipContext - mllama *llama.MllamaContext + clip *llama.ClipContext // cache of images to embeddings images []imageCache @@ -35,8 +33,6 @@ func NewImageContext(llamaContext *llama.Context, modelPath string) (*ImageConte var c ImageContext if arch == "clip" { c.clip, err = llama.NewClipContext(llamaContext, modelPath) - } else if arch == "mllama" { - c.mllama, err = llama.NewMllamaContext(llamaContext, modelPath) } else { return nil, fmt.Errorf("unknown vision model architecture: %s", arch) } @@ -58,12 +54,9 @@ func (c *ImageContext) Free(modelPath string) { if c.clip != nil { c.clip.Free() } - if c.mllama != nil { - c.mllama.Free() - } } -func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspectRatioId int) ([][]float32, error) { +func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte) ([][]float32, error) { if c == nil { return nil, nil } @@ -79,12 +72,7 @@ func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspect embed, err := c.findImage(hash) if err != nil { - if c.mllama != nil { - embed, err = c.mllama.NewEmbed(llamaContext, data, aspectRatioId) - if err != nil { - return nil, err - } - } else if c.clip != nil { + if c.clip != nil { embed, err = c.clip.NewEmbed(llamaContext, data) if err != nil { return nil, err @@ -105,33 +93,11 @@ func (c *ImageContext) 
BatchSize(configuredBatchSize int) int { return 0 } - // Mllama maps an image to 1 embedding token (llava creates many tokens) - // and doesn't support more than a single image per request. - // The embeddings are large (100 MB), so allocating a big batch can fail - // on some systems - if c.mllama != nil { - return 1 - } - return configuredBatchSize } func (c *ImageContext) EmbedSize(llamaContext *llama.Context) int { - if c != nil && c.mllama != nil { - return c.mllama.EmbedSize(llamaContext) - } else { - return llamaContext.Model().NEmbd() - } -} - -func (c *ImageContext) NeedCrossAttention(inputs ...input) bool { - if c == nil || c.mllama == nil { - return false - } - - return slices.ContainsFunc(inputs, func(input input) bool { - return input.embed != nil - }) + return llamaContext.Model().NEmbd() } type imageCache struct { diff --git a/runner/llamarunner/runner.go b/runner/llamarunner/runner.go index 73e50ee0..7aa9b96a 100644 --- a/runner/llamarunner/runner.go +++ b/runner/llamarunner/runner.go @@ -57,10 +57,6 @@ type Sequence struct { // input cache being used by this sequence cache *InputCacheSlot - // does this sequence require cross-attention layers to be processed? - if we have seen - // an image for certain multi-modal models - crossAttention bool - // channel to send responses over responses chan string @@ -205,7 +201,7 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input, error) return nil, fmt.Errorf("invalid image index: %d", n) } - embed, err := s.image.NewEmbed(s.lc, images[imageIndex].Data, images[imageIndex].AspectRatioID) + embed, err := s.image.NewEmbed(s.lc, images[imageIndex].Data) if err != nil { return nil, err } @@ -368,7 +364,6 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch) defer s.mu.Unlock() var batch *llama.Batch - crossAttention := false seqIdx := s.nextSeq - 1 for range s.seqs { @@ -416,9 +411,8 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch) batch = tokenBatch } else { batch = embedBatch - seq.crossAttention = s.image.NeedCrossAttention(input) } - } else if embedding != batch.IsEmbedding() || crossAttention != seq.crossAttention { + } else if embedding != batch.IsEmbedding() { s.nextSeq = seqIdx break } @@ -427,7 +421,6 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch) break } - crossAttention = seq.crossAttention batch.Add(input.token, input.embed, len(seq.cache.Inputs)+len(seq.pendingInputs), i+1 == len(seq.inputs), seq.cache.Id) seq.pendingInputs = append(seq.pendingInputs, input) seq.iBatch = batch.NumTokens() - 1 @@ -440,20 +433,11 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch) return nil } - s.lc.SetCrossAttention(crossAttention) - err := s.lc.Decode(batch) if err != nil { return fmt.Errorf("failed to decode batch: %w", err) } - if crossAttention { - // synchronize state to ensure the cross attention batch is complete. - // needed specifically for multi-GPU systems otherwise an inflight - // task may be incorrectly invalidated causing a crash - s.lc.Synchronize() - } - for i, seq := range s.seqs { if seq == nil { continue @@ -622,8 +606,6 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) { return } - seq.crossAttention = s.image.NeedCrossAttention(seq.cache.Inputs...) 
- s.seqs[i] = seq s.cond.Signal() found = true diff --git a/server/prompt.go b/server/prompt.go index 5b5b958f..147a02b6 100644 --- a/server/prompt.go +++ b/server/prompt.go @@ -3,47 +3,32 @@ package server import ( "bytes" "context" - "encoding/binary" "errors" "fmt" "log/slog" + "slices" "strings" "github.com/ollama/ollama/api" "github.com/ollama/ollama/llm" - "github.com/ollama/ollama/model/models/mllama" "github.com/ollama/ollama/template" ) type tokenizeFunc func(context.Context, string) ([]int, error) -var errTooManyImages = errors.New("vision model only supports a single image per message") - // chatPrompt accepts a list of messages and returns the prompt and images that should be used for the next chat turn. // chatPrompt truncates any messages that exceed the context window of the model, making sure to always include 1) the // latest message and 2) system messages func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.Options, msgs []api.Message, tools []api.Tool) (prompt string, images []llm.ImageData, _ error) { var system []api.Message - isMllama := checkMllamaModelFamily(m) - - var imageNumTokens int // TODO: Ideally we would compute this from the projector metadata but some pieces are implementation dependent - if isMllama { - // Our mllama implementation packs all of the embeddings into a single token - imageNumTokens = 1 - } else { - // Clip images are represented as 768 tokens, each an embedding - imageNumTokens = 768 - } + // Clip images are represented as 768 tokens, each an embedding + imageNumTokens := 768 n := len(msgs) - 1 // in reverse, find all messages that fit into context window for i := n; i >= 0; i-- { - if isMllama && len(msgs[i].Images) > 1 { - return "", nil, errTooManyImages - } - // always include the last message if i == n { continue @@ -84,48 +69,17 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api. currMsgIdx := n for cnt, msg := range msgs[currMsgIdx:] { - prefix := "" - imgPrompt := "" + if slices.Contains(m.Config.ModelFamilies, "mllama") && len(msg.Images) > 1 { + return "", nil, errors.New("this model only supports one image while more than one image requested") + } + + var prefix string prompt := msg.Content for _, i := range msg.Images { - var imgData llm.ImageData - - if isMllama { - if len(m.ProjectorPaths) == 0 { - imgData = llm.ImageData{ - ID: len(images), - Data: i, - } - } else { - data, opts, err := mllama.Preprocess(bytes.NewReader(i)) - if err != nil { - return "", nil, err - } - - buf := new(bytes.Buffer) - err = binary.Write(buf, binary.LittleEndian, data) - if err != nil { - return "", nil, err - } - - ar, ok := opts["aspectRatioIndex"].(int) - if !ok { - return "", nil, fmt.Errorf("missing aspect ratio for image") - } - - imgData = llm.ImageData{ - ID: len(images), - Data: buf.Bytes(), - AspectRatioID: ar, - } - } - imgPrompt = "<|image|>" - } else { - imgData = llm.ImageData{ - ID: len(images), - Data: i, - } + imgData := llm.ImageData{ + ID: len(images), + Data: i, } imgTag := fmt.Sprintf("[img-%d]", imgData.ID) @@ -137,7 +91,7 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api. images = append(images, imgData) } - msgs[currMsgIdx+cnt].Content = prefix + imgPrompt + prompt + msgs[currMsgIdx+cnt].Content = prefix + prompt } // truncate any messages that do not fit into the context window @@ -148,12 +102,3 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api. 
return b.String(), images, nil } - -func checkMllamaModelFamily(m *Model) bool { - for _, arch := range m.Config.ModelFamilies { - if arch == "mllama" { - return true - } - } - return false -} diff --git a/server/prompt_test.go b/server/prompt_test.go index b81c01ee..fb6c96c0 100644 --- a/server/prompt_test.go +++ b/server/prompt_test.go @@ -2,8 +2,6 @@ package server import ( "bytes" - "image" - "image/png" "testing" "github.com/google/go-cmp/cmp" @@ -14,10 +12,9 @@ import ( func TestChatPrompt(t *testing.T) { type expect struct { - prompt string - images [][]byte - aspectRatioID int - error error + prompt string + images [][]byte + error error } tmpl, err := template.Parse(` @@ -28,28 +25,6 @@ func TestChatPrompt(t *testing.T) { t.Fatal(err) } visionModel := Model{Template: tmpl, ProjectorPaths: []string{"vision"}} - mllamaModel := Model{Template: tmpl, ProjectorPaths: []string{"vision"}, Config: ConfigV2{ModelFamilies: []string{"mllama"}}} - - createImg := func(width, height int) ([]byte, error) { - img := image.NewRGBA(image.Rect(0, 0, width, height)) - var buf bytes.Buffer - - if err := png.Encode(&buf, img); err != nil { - return nil, err - } - - return buf.Bytes(), nil - } - - imgBuf, err := createImg(5, 5) - if err != nil { - t.Fatal(err) - } - - imgBuf2, err := createImg(6, 6) - if err != nil { - t.Fatal(err) - } cases := []struct { name string @@ -227,90 +202,6 @@ func TestChatPrompt(t *testing.T) { images: [][]byte{[]byte("one hotdog"), []byte("two hotdogs")}, }, }, - { - name: "messages with mllama (no images)", - model: mllamaModel, - limit: 2048, - msgs: []api.Message{ - {Role: "user", Content: "You're a test, Harry!"}, - {Role: "assistant", Content: "I-I'm a what?"}, - {Role: "user", Content: "A test. And a thumping good one at that, I'd wager."}, - }, - expect: expect{ - prompt: "You're a test, Harry! I-I'm a what? A test. And a thumping good one at that, I'd wager. ", - }, - }, - { - name: "messages with mllama single prompt", - model: mllamaModel, - limit: 2048, - msgs: []api.Message{ - {Role: "user", Content: "How many hotdogs are in this image?", Images: []api.ImageData{imgBuf}}, - }, - expect: expect{ - prompt: "[img-0]<|image|>How many hotdogs are in this image? ", - images: [][]byte{imgBuf}, - aspectRatioID: 1, - }, - }, - { - name: "messages with mllama", - model: mllamaModel, - limit: 2048, - msgs: []api.Message{ - {Role: "user", Content: "You're a test, Harry!"}, - {Role: "assistant", Content: "I-I'm a what?"}, - {Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{imgBuf}}, - }, - expect: expect{ - prompt: "You're a test, Harry! I-I'm a what? [img-0]<|image|>A test. And a thumping good one at that, I'd wager. ", - images: [][]byte{imgBuf}, - aspectRatioID: 1, - }, - }, - { - name: "multiple messages with mllama", - model: mllamaModel, - limit: 2048, - msgs: []api.Message{ - {Role: "user", Content: "You're a test, Harry!", Images: []api.ImageData{imgBuf}}, - {Role: "assistant", Content: "I-I'm a what?"}, - {Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{imgBuf2}}, - }, - expect: expect{ - prompt: "[img-0]<|image|>You're a test, Harry! I-I'm a what? [img-1]<|image|>A test. And a thumping good one at that, I'd wager. 
", - images: [][]byte{imgBuf, imgBuf2}, - aspectRatioID: 1, - }, - }, - { - name: "earlier image with mllama", - model: mllamaModel, - limit: 2048, - msgs: []api.Message{ - {Role: "user", Content: "How many hotdogs are in this image?", Images: []api.ImageData{imgBuf}}, - {Role: "assistant", Content: "There are four hotdogs."}, - {Role: "user", Content: "Which ones have mustard?"}, - }, - expect: expect{ - prompt: "[img-0]<|image|>How many hotdogs are in this image? There are four hotdogs. Which ones have mustard? ", - images: [][]byte{imgBuf}, - aspectRatioID: 1, - }, - }, - { - name: "too many images with mllama", - model: mllamaModel, - limit: 2048, - msgs: []api.Message{ - {Role: "user", Content: "You're a test, Harry!"}, - {Role: "assistant", Content: "I-I'm a what?"}, - {Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{imgBuf, imgBuf}}, - }, - expect: expect{ - error: errTooManyImages, - }, - }, } for _, tt := range cases { @@ -341,10 +232,6 @@ func TestChatPrompt(t *testing.T) { if !bytes.Equal(images[i].Data, tt.images[i]) { t.Errorf("expected %q, got %q", tt.images[i], images[i].Data) } - } else { - if images[i].AspectRatioID != tt.aspectRatioID { - t.Errorf("expected aspect ratio %d, got %d", tt.aspectRatioID, images[i].AspectRatioID) - } } } }) diff --git a/server/routes.go b/server/routes.go index fd65669a..d0b8f487 100644 --- a/server/routes.go +++ b/server/routes.go @@ -4,7 +4,6 @@ import ( "bytes" "cmp" "context" - "encoding/binary" "encoding/json" "errors" "fmt" @@ -35,7 +34,6 @@ import ( "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/llm" "github.com/ollama/ollama/logutil" - "github.com/ollama/ollama/model/models/mllama" "github.com/ollama/ollama/openai" "github.com/ollama/ollama/server/internal/client/ollama" "github.com/ollama/ollama/server/internal/registry" @@ -100,6 +98,10 @@ func (s *Server) scheduleRunner(ctx context.Context, name string, caps []model.C return nil, nil, nil, err } + if slices.Contains(model.Config.ModelFamilies, "mllama") && len(model.ProjectorPaths) > 0 { + return nil, nil, nil, fmt.Errorf("'llama3.2-vision' is no longer compatible with your version of Ollama and has been replaced by a newer version. 
To re-download, run 'ollama pull llama3.2-vision'") + } + if err := model.CheckCapabilities(caps...); err != nil { return nil, nil, nil, fmt.Errorf("%s %w", name, err) } @@ -206,38 +208,14 @@ func (s *Server) GenerateHandler(c *gin.Context) { return } - isMllama := checkMllamaModelFamily(m) - if isMllama && len(req.Images) > 1 { - c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "this model only supports one image: more than one image sent"}) + if slices.Contains(m.Config.ModelFamilies, "mllama") && len(req.Images) > 1 { + c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "this model only supports one image while more than one image requested"}) return } images := make([]llm.ImageData, len(req.Images)) for i := range req.Images { - if isMllama && len(m.ProjectorPaths) > 0 { - data, opts, err := mllama.Preprocess(bytes.NewReader(req.Images[i])) - if err != nil { - c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"}) - return - } - - ar, ok := opts["aspectRatioIndex"].(int) - if !ok { - c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"}) - return - } - - buf := new(bytes.Buffer) - err = binary.Write(buf, binary.LittleEndian, data) - if err != nil { - c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"}) - return - } - - images[i] = llm.ImageData{ID: i, Data: buf.Bytes(), AspectRatioID: ar} - } else { - images[i] = llm.ImageData{ID: i, Data: req.Images[i]} - } + images[i] = llm.ImageData{ID: i, Data: req.Images[i]} } prompt := req.Prompt @@ -269,9 +247,6 @@ func (s *Server) GenerateHandler(c *gin.Context) { for _, i := range images { imgPrompt := "" - if isMllama { - imgPrompt = "<|image|>" - } msgs = append(msgs, api.Message{Role: "user", Content: fmt.Sprintf("[img-%d]"+imgPrompt, i.ID)}) } diff --git a/server/sched.go b/server/sched.go index 43da138e..3fc54e55 100644 --- a/server/sched.go +++ b/server/sched.go @@ -8,6 +8,7 @@ import ( "os" "reflect" "runtime" + "slices" "sort" "strconv" "strings" @@ -132,11 +133,11 @@ func (s *Scheduler) processPending(ctx context.Context) { continue } numParallel := int(envconfig.NumParallel()) - // TODO (jmorganca): mllama doesn't support parallel yet - // see https://github.com/ollama/ollama/issues/4165 - if checkMllamaModelFamily(pending.model) && numParallel != 1 { + // `mllama` is a snowflake and uses an encoder cache which cannot be used with num_parallel > 1 + // ref: https://github.com/ollama/ollama/issues/4165 + if slices.Contains(pending.model.Config.ModelFamilies, "mllama") && numParallel != 1 { numParallel = 1 - slog.Warn("mllama doesn't support parallel requests yet") + slog.Warn("mllama does not currently support parallel requests") } for { From 0aa8b371ddd24a2d0ce859903a9284e9544f5c78 Mon Sep 17 00:00:00 2001 From: Bruce MacDonald Date: Tue, 13 May 2025 20:58:02 -0700 Subject: [PATCH 022/108] model: add Qwen2.5-VL support (#10385) --- convert/convert.go | 2 + convert/convert_qwen2.go | 3 + convert/convert_qwen25vl.go | 102 +++++ convert/tensor.go | 56 +++ fs/ggml/ggml.go | 25 ++ ...15-add-argsort-and-cuda-copy-for-i32.patch | 277 +++++++++++++ ml/backend.go | 18 +- ml/backend/ggml/ggml.go | 34 +- ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp | 43 ++ ml/backend/ggml/ggml/src/ggml-cuda/argsort.cu | 102 ++++- ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu | 49 +++ model/models/models.go | 1 + model/models/qwen25vl/model.go | 187 +++++++++ model/models/qwen25vl/model_text.go | 155 +++++++ 
model/models/qwen25vl/model_vision.go | 391 ++++++++++++++++++ model/models/qwen25vl/process_image.go | 184 +++++++++ 16 files changed, 1619 insertions(+), 10 deletions(-) create mode 100644 convert/convert_qwen25vl.go create mode 100644 convert/tensor.go create mode 100644 llama/patches/0015-add-argsort-and-cuda-copy-for-i32.patch create mode 100644 model/models/qwen25vl/model.go create mode 100644 model/models/qwen25vl/model_text.go create mode 100644 model/models/qwen25vl/model_vision.go create mode 100644 model/models/qwen25vl/process_image.go diff --git a/convert/convert.go b/convert/convert.go index 48804d7f..309b0ce1 100644 --- a/convert/convert.go +++ b/convert/convert.go @@ -191,6 +191,8 @@ func ConvertModel(fsys fs.FS, f *os.File) error { conv = &phi3Model{} case "Qwen2ForCausalLM": conv = &qwen2Model{} + case "Qwen2_5_VLForConditionalGeneration": + conv = &qwen25VLModel{} case "BertModel": conv = &bertModel{} case "CohereForCausalLM": diff --git a/convert/convert_qwen2.go b/convert/convert_qwen2.go index edcb82e2..3647c4e5 100644 --- a/convert/convert_qwen2.go +++ b/convert/convert_qwen2.go @@ -15,6 +15,7 @@ type qwen2Model struct { Type string `json:"type"` Factor ropeFactor `json:"factor"` OriginalMaxPositionEmbeddings uint32 `json:"original_max_position_embeddings"` + MropeSection []int32 `json:"mrope_section"` } `json:"rope_scaling"` RMSNormEPS float32 `json:"rms_norm_eps"` } @@ -39,6 +40,8 @@ func (q *qwen2Model) KV(t *Tokenizer) ggml.KV { case "yarn": kv["qwen2.rope.scaling.type"] = q.RopeScaling.Type kv["qwen2.rope.scaling.factor"] = q.RopeScaling.Factor + case "mrope", "default": + kv["qwen2.rope.mrope_section"] = q.RopeScaling.MropeSection default: panic("unknown rope scaling type") } diff --git a/convert/convert_qwen25vl.go b/convert/convert_qwen25vl.go new file mode 100644 index 00000000..c2d5a633 --- /dev/null +++ b/convert/convert_qwen25vl.go @@ -0,0 +1,102 @@ +package convert + +import ( + "cmp" + "slices" + "strings" + + "github.com/ollama/ollama/fs/ggml" +) + +type qwen25VLModel struct { + qwen2Model + + VisionModel struct { + Depth uint32 `json:"depth"` + HiddenSize uint32 `json:"hidden_size"` + NumHeads uint32 `json:"num_heads"` + InChannels uint32 `json:"in_chans"` + PatchSize uint32 `json:"patch_size"` + SpatialMergeSize uint32 `json:"spatial_merge_size"` + SpatialPatchSize uint32 `json:"spatial_patch_size"` + WindowSize uint32 `json:"window_size"` + RMSNormEps float32 `json:"layer_norm_epsilon"` + RopeTheta float32 `json:"rope_theta"` + FullAttentionBlocks []int32 `json:"fullatt_block_indexes"` + TemporalPatchSize uint32 `json:"temporal_patch_size"` + } `json:"vision_config"` +} + +var _ ModelConverter = (*qwen25VLModel)(nil) + +func (q *qwen25VLModel) KV(t *Tokenizer) ggml.KV { + kv := q.ModelParameters.KV(t) + kv["general.architecture"] = "qwen25vl" + + for k, v := range q.qwen2Model.KV(t) { + if strings.HasPrefix(k, "qwen2.") { + kv[strings.Replace(k, "qwen2.", "qwen25vl.", 1)] = v + } + } + + if q.VisionModel.FullAttentionBlocks == nil { + kv["qwen25vl.vision.fullatt_block_indexes"] = []int32{7, 15, 23, 31} + } + + kv["qwen25vl.vision.block_count"] = cmp.Or(q.VisionModel.Depth, 32) + kv["qwen25vl.vision.embedding_length"] = q.VisionModel.HiddenSize + kv["qwen25vl.vision.attention.head_count"] = cmp.Or(q.VisionModel.NumHeads, 16) + kv["qwen25vl.vision.num_channels"] = q.VisionModel.InChannels + kv["qwen25vl.vision.patch_size"] = cmp.Or(q.VisionModel.PatchSize, 14) + kv["qwen25vl.vision.spatial_merge_size"] = cmp.Or(q.VisionModel.SpatialMergeSize, 2) + 
kv["qwen25vl.vision.spatial_patch_size"] = q.VisionModel.SpatialPatchSize + kv["qwen25vl.vision.window_size"] = cmp.Or(q.VisionModel.WindowSize, 112) + kv["qwen25vl.vision.attention.layer_norm_epsilon"] = cmp.Or(q.VisionModel.RMSNormEps, 1e-6) + kv["qwen25vl.vision.rope.freq_base"] = cmp.Or(q.VisionModel.RopeTheta, 1e4) + kv["qwen25vl.vision.fullatt_block_indexes"] = q.VisionModel.FullAttentionBlocks + kv["qwen25vl.vision.temporal_patch_size"] = cmp.Or(q.VisionModel.TemporalPatchSize, 2) + + return kv +} + +func (q *qwen25VLModel) Tensors(ts []Tensor) []*ggml.Tensor { + var out []*ggml.Tensor + + for _, t := range ts { + if strings.Contains(t.Name(), "patch_embed.proj") { + for t := range splitDim(t, 2, + strings.NewReplacer("patch_embed.proj", "patch_embd_0"), + strings.NewReplacer("patch_embed.proj", "patch_embd_1"), + ) { + t.Shape = slices.DeleteFunc(t.Shape, func(i uint64) bool { return i == 1 }) + out = append(out, t) + } + } else if strings.Contains(t.Name(), "attn.qkv") { + out = append(out, slices.Collect(splitDim(t, 0, + strings.NewReplacer("attn.qkv", "attn_q"), + strings.NewReplacer("attn.qkv", "attn_k"), + strings.NewReplacer("attn.qkv", "attn_v"), + ))...) + } else { + out = append(out, &ggml.Tensor{ + Name: t.Name(), + Kind: t.Kind(), + Shape: t.Shape(), + WriterTo: t, + }) + } + } + + return out +} + +func (p *qwen25VLModel) Replacements() []string { + return append( + p.qwen2Model.Replacements(), + "visual", "v", + "blocks", "blk", + "attn.proj", "attn_out", + "norm1", "ln1", + "norm2", "ln2", + ) +} diff --git a/convert/tensor.go b/convert/tensor.go new file mode 100644 index 00000000..ffb22ead --- /dev/null +++ b/convert/tensor.go @@ -0,0 +1,56 @@ +package convert + +import ( + "iter" + "slices" + "strings" + + "github.com/ollama/ollama/fs/ggml" + "github.com/pdevine/tensor" + "github.com/pdevine/tensor/native" +) + +// splitDim splits a tensor along a specified dimension into multiple tensors. The dimension +// is split evenly based on the number of replacers provided. +func splitDim(t Tensor, dim int, replacers ...*strings.Replacer) iter.Seq[*ggml.Tensor] { + return func(yield func(*ggml.Tensor) bool) { + for i, replacer := range replacers { + shape := slices.Clone(t.Shape()) + shape[dim] = shape[dim] / uint64(len(replacers)) + + slice := slices.Repeat([]tensor.Slice{nil}, len(shape)) + slice[dim] = tensor.S(i*int(shape[dim]), (i+1)*int(shape[dim])) + + tt := t.Clone() + tt.SetRepacker(func(_ string, data []float32, shape []uint64) ([]float32, error) { + dims := make([]int, len(shape)) + for i := range shape { + dims[i] = int(shape[i]) + } + + var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data)) + t, err := t.Slice(slice...) 
+ if err != nil { + return nil, err + } + + t = tensor.Materialize(t) + // flatten tensor so it can be written as a vector + if err := t.Reshape(t.Shape().TotalSize()); err != nil { + return nil, err + } + + return native.VectorF32(t.(*tensor.Dense)) + }) + + if !yield(&ggml.Tensor{ + Name: replacer.Replace(t.Name()), + Kind: t.Kind(), + Shape: shape, + WriterTo: tt, + }) { + break + } + } + } +} diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go index c29d715b..514b6011 100644 --- a/fs/ggml/ggml.go +++ b/fs/ggml/ggml.go @@ -6,6 +6,7 @@ import ( "fmt" "io" "log/slog" + "math" "slices" "strings" @@ -126,6 +127,7 @@ func (kv KV) OllamaEngineRequired() bool { "mistral3", "llama4", "mllama", + "qwen25vl", }, kv.Architecture()) } @@ -649,6 +651,29 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) { graphSize = 4 * (imageSize*imageSize*numChannels + embeddingLength*patchSize + numPatches*numPatches*headCount) + case "qwen25vl": + maxPixels := uint64(llm.KV().Uint("vision.max_pixels", 28*28*1280)) + mergeSize := uint64(llm.KV().Uint("vision.spatial_merge_size", 2)) + temporalPatchSize := uint64(2) + + // Calculate max possible patches based on max_pixels + maxHeight := uint64(math.Sqrt(float64(maxPixels))) + maxWidth := maxPixels / maxHeight + maxGridHeight := maxHeight / patchSize + maxGridWidth := maxWidth / patchSize + // Account for merged patches (2x2 grid) + numPatches := (maxGridHeight * maxGridWidth) / (mergeSize * mergeSize) + + // Calculate graph size based on typical operations in ProcessImage and createPatches + graphSize = 4 * (maxPixels*numChannels + // Original image storage + // Normalized pixels + maxPixels*numChannels + + // Patches storage (numPatches * channels * temporalPatchSize * patchSize^2) + numPatches*numChannels*temporalPatchSize*patchSize*patchSize + + // Self-attention calculations (similar to other architectures) + numPatches*numPatches*headCount + + // Additional buffer for processing + embeddingLength*numPatches) case "llama4": // vision graph is computed independently in the same schedule // and is negligible compared to the worst case text graph diff --git a/llama/patches/0015-add-argsort-and-cuda-copy-for-i32.patch b/llama/patches/0015-add-argsort-and-cuda-copy-for-i32.patch new file mode 100644 index 00000000..b71295c7 --- /dev/null +++ b/llama/patches/0015-add-argsort-and-cuda-copy-for-i32.patch @@ -0,0 +1,277 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Michael Yang +Date: Thu, 1 May 2025 13:45:12 -0700 +Subject: [PATCH] add argsort and cuda copy for i32 + +--- + ggml/src/ggml-cpu/ops.cpp | 43 ++++++++++++++ + ggml/src/ggml-cuda/argsort.cu | 102 +++++++++++++++++++++++++++++++++- + ggml/src/ggml-cuda/cpy.cu | 49 ++++++++++++++++ + 3 files changed, 192 insertions(+), 2 deletions(-) + +diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp +index becdae07..7a44b6cf 100644 +--- a/ggml/src/ggml-cpu/ops.cpp ++++ b/ggml/src/ggml-cpu/ops.cpp +@@ -6890,6 +6890,45 @@ static void ggml_compute_forward_argsort_f32( + } + } + ++static void ggml_compute_forward_argsort_i32( ++ const ggml_compute_params * params, ++ ggml_tensor * dst) { ++ ++ const ggml_tensor * src0 = dst->src[0]; ++ ++ GGML_TENSOR_UNARY_OP_LOCALS ++ ++ GGML_ASSERT(nb0 == sizeof(int32_t)); ++ ++ const int ith = params->ith; ++ const int nth = params->nth; ++ ++ const int64_t nr = ggml_nrows(src0); ++ ++ ggml_sort_order order = (ggml_sort_order) ggml_get_op_params_i32(dst, 0); ++ ++ for (int64_t i = ith; i < nr; i += nth) { ++ int32_t * dst_data = 
(int32_t *)((char *) dst->data + i*nb1); ++ const int32_t * src_data = (int32_t *)((char *) src0->data + i*nb01); ++ ++ for (int64_t j = 0; j < ne0; j++) { ++ dst_data[j] = j; ++ } ++ ++ // C doesn't have a functional sort, so we do a bubble sort instead ++ for (int64_t j = 0; j < ne0; j++) { ++ for (int64_t k = j + 1; k < ne0; k++) { ++ if ((order == GGML_SORT_ORDER_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) || ++ (order == GGML_SORT_ORDER_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) { ++ int32_t tmp = dst_data[j]; ++ dst_data[j] = dst_data[k]; ++ dst_data[k] = tmp; ++ } ++ } ++ } ++ } ++} ++ + void ggml_compute_forward_argsort( + const ggml_compute_params * params, + ggml_tensor * dst) { +@@ -6901,6 +6940,10 @@ void ggml_compute_forward_argsort( + { + ggml_compute_forward_argsort_f32(params, dst); + } break; ++ case GGML_TYPE_I32: ++ { ++ ggml_compute_forward_argsort_i32(params, dst); ++ } break; + default: + { + GGML_ABORT("fatal error"); +diff --git a/ggml/src/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu +index 607ded85..53b02634 100644 +--- a/ggml/src/ggml-cuda/argsort.cu ++++ b/ggml/src/ggml-cuda/argsort.cu +@@ -85,13 +85,107 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co + } + } + ++ ++template ++static __global__ void k_argsort_i32_i32(const int32_t * x, int * dst, const int ncols, const int ncols_pad) { ++ extern __shared__ int shared_mem[]; ++ int * indices = shared_mem; ++ ++ const int tid = threadIdx.x; ++ const int row = blockIdx.y; ++ ++ // Initialize all indices, handling the case where threads < ncols_pad ++ for (int i = tid; i < ncols_pad; i += blockDim.x) { ++ indices[i] = i < ncols ? i : 0; // Use 0 for padding indices ++ } ++ __syncthreads(); ++ ++ // Bitonic sort ++ for (int k = 2; k <= ncols_pad; k *= 2) { ++ for (int j = k/2; j > 0; j /= 2) { ++ for (int i = tid; i < ncols_pad; i += blockDim.x) { ++ const int ij = i ^ j; ++ if (ij > i) { ++ // Only compare values within the actual data range ++ if (i < ncols && ij < ncols) { ++ if ((i & k) == 0) { ++ if (order == GGML_SORT_ORDER_ASC) { ++ if (x[row * ncols + indices[i]] > x[row * ncols + indices[ij]]) { ++ int tmp = indices[i]; ++ indices[i] = indices[ij]; ++ indices[ij] = tmp; ++ } ++ } else { ++ if (x[row * ncols + indices[i]] < x[row * ncols + indices[ij]]) { ++ int tmp = indices[i]; ++ indices[i] = indices[ij]; ++ indices[ij] = tmp; ++ } ++ } ++ } else { ++ if (order == GGML_SORT_ORDER_ASC) { ++ if (x[row * ncols + indices[i]] < x[row * ncols + indices[ij]]) { ++ int tmp = indices[i]; ++ indices[i] = indices[ij]; ++ indices[ij] = tmp; ++ } ++ } else { ++ if (x[row * ncols + indices[i]] > x[row * ncols + indices[ij]]) { ++ int tmp = indices[i]; ++ indices[i] = indices[ij]; ++ indices[ij] = tmp; ++ } ++ } ++ } ++ } ++ } ++ } ++ __syncthreads(); ++ } ++ } ++ ++ // Write sorted indices to output, only threads handling valid data ++ for (int i = tid; i < ncols; i += blockDim.x) { ++ dst[row * ncols + i] = indices[i]; ++ } ++} ++ ++static void argsort_i32_i32_cuda(const int32_t * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) { ++ // Bitonic sort requires ncols to be power of 2 ++ const int ncols_pad = next_power_of_2(ncols); ++ ++ // Ensure thread count doesn't exceed maximum (typically 1024) ++ const int max_threads = 1024; // This is the typical max for most GPUs ++ const int threads_per_block = ncols_pad > max_threads ? 
max_threads : ncols_pad; ++ ++ const dim3 block_dims(threads_per_block, 1, 1); ++ const dim3 block_nums(1, nrows, 1); ++ const size_t shared_mem = ncols_pad * sizeof(int); ++ ++ // Check if shared memory size is within limits ++ const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb; ++ ++ // Instead of logging an error, use GGML_ASSERT with a descriptive message ++ GGML_ASSERT(shared_mem <= max_shared_mem && "argsort: required shared memory exceeds device limit"); ++ ++ // Launch kernels with the updated thread configuration ++ if (order == GGML_SORT_ORDER_ASC) { ++ k_argsort_i32_i32<<>>(x, dst, ncols, ncols_pad); ++ } else if (order == GGML_SORT_ORDER_DESC) { ++ k_argsort_i32_i32<<>>(x, dst, ncols, ncols_pad); ++ } else { ++ GGML_ABORT("fatal error"); ++ } ++} ++ ++ + void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const float * src0_d = (const float *)src0->data; + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + +- GGML_ASSERT(src0->type == GGML_TYPE_F32); ++ GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_I32); + GGML_ASSERT( dst->type == GGML_TYPE_I32); + GGML_ASSERT(ggml_is_contiguous(src0)); + +@@ -100,5 +194,9 @@ void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + + enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0]; + +- argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream); ++ if (src0->type == GGML_TYPE_I32) { ++ argsort_i32_i32_cuda((const int32_t *)src0_d, (int *)dst_d, ncols, nrows, order, stream); ++ } else { ++ argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream); ++ } + } +diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu +index 2d46176e..47383486 100644 +--- a/ggml/src/ggml-cuda/cpy.cu ++++ b/ggml/src/ggml-cuda/cpy.cu +@@ -38,6 +38,13 @@ static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) { + *dsti = *xi; + } + ++static __device__ void cpy_1_i32_i32(const char * cxi, char * cdsti) { ++ const int32_t * xi = (const int32_t *) cxi; ++ int32_t * dsti = (int32_t *) cdsti; ++ ++ *dsti = *xi; ++} ++ + template + static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const int ne, + const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, +@@ -68,6 +75,44 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const in + cpy_1(cx + x_offset, cdst + dst_offset); + } + ++// First, add this template function after the other template functions ++template ++static __global__ void cpy_i32_i32(const char * cx, char * cdst, const int ne, ++ const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, ++ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, ++ const int nb12, const int nb13) { ++ const int64_t i = blockDim.x*blockIdx.x + threadIdx.x; ++ ++ if (i >= ne) { ++ return; ++ } ++ ++ const int64_t i03 = i/(ne00 * ne01 * ne02); ++ const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01); ++ const int64_t i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00; ++ const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00; ++ const int64_t x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03; ++ ++ const int64_t i13 = i/(ne10 * ne11 * ne12); ++ const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11); ++ const int64_t i11 = (i - i13*ne10*ne11*ne12 - 
i12*ne10*ne11) / ne10; ++ const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10; ++ const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13; ++ ++ cpy_1(cx + x_offset, cdst + dst_offset); ++} ++ ++// Then modify the ggml_cpy_i32_i32_cuda function to use the new template ++static void ggml_cpy_i32_i32_cuda( ++ const char * cx, char * cdst, const int ne, ++ const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, ++ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int graph_cpynode_index) { ++ ++ const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; ++ cpy_i32_i32<<>> ++ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); ++} ++ + static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + block_q8_0 * dsti = (block_q8_0 *) cdsti; +@@ -631,6 +676,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg + ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); + } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) { + ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); ++ } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) { ++ ggml_cpy_i32_i32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); + } else { + GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__, + ggml_type_name(src0->type), ggml_type_name(src1->type)); +@@ -686,6 +733,8 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) { + return (void*) cpy_f32_f16; + } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) { + return (void*) cpy_f32_f16; ++ } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) { ++ return (void*) cpy_i32_i32; + } else { + GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__, + ggml_type_name(src0->type), ggml_type_name(src1->type)); diff --git a/ml/backend.go b/ml/backend.go index f84a9984..cb32d818 100644 --- a/ml/backend.go +++ b/ml/backend.go @@ -119,6 +119,21 @@ type Context interface { Layer(int) Context } +// RopeOptions contains optional parameters for RoPE function +type RopeOptions struct { + OriginalContextLen uint32 +} + +// RopeOption defines a function that modifies RopeOpts +type RopeOption func(*RopeOptions) + +// WithContextLen sets a custom context length +func WithContextLen(len uint32) RopeOption { + return func(opts *RopeOptions) { + opts.OriginalContextLen = len + } +} + type Tensor interface { Dim(n int) int Stride(n int) int @@ -144,7 +159,7 @@ type Tensor interface { AvgPool2D(ctx Context, k, s int, p float32) Tensor Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor - RoPE(ctx Context, positionIDs, ropeFactors Tensor, dim, ropeType uint32, base, scale float32) Tensor + RoPE(ctx Context, positionIDs, ropeFactors Tensor, dim, ropeType uint32, base, scale float32, options ...RopeOption) Tensor IM2Col(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 
int) Tensor Sin(ctx Context) Tensor @@ -172,6 +187,7 @@ type Tensor interface { Duplicate(ctx Context) Tensor TopK(ctx Context, k int) Tensor + Argsort(ctx Context) Tensor } // ScaledDotProductAttention implements a fused attention diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index e1aa687c..1ba07983 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -1060,7 +1060,17 @@ const ( ropeTypeVision C.int = 24 ) -func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim, ropeType uint32, ropeBase, ropeScale float32) ml.Tensor { +func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim, ropeType uint32, ropeBase, ropeScale float32, options ...ml.RopeOption) ml.Tensor { + // Default options + opts := &ml.RopeOptions{ + OriginalContextLen: 131072, + } + + // Apply any provided options + for _, option := range options { + option(opts) + } + if ropeFactors == nil { ropeFactors = &Tensor{b: t.b} } @@ -1073,16 +1083,19 @@ func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDi return &Tensor{ b: t.b, t: C.ggml_rope_ext( - ctx.(*Context).ctx, dequant, positionIDs.(*Tensor).t, ropeFactors.(*Tensor).t, + ctx.(*Context).ctx, + dequant, + positionIDs.(*Tensor).t, + ropeFactors.(*Tensor).t, C.int(ropeDim), C.int(ropeType), - 131072, // YaRN n_ctx_train + C.int(opts.OriginalContextLen), C.float(ropeBase), C.float(ropeScale), - 0., // YaRN ext_factor - 1., // YaRN attn_factor - 32., // YaRN beta_fast - 1., // YaRN beta_slow + C.float(0.0), + C.float(1.0), + C.float(32.0), + C.float(1.0), ), } } @@ -1176,3 +1189,10 @@ func (t *Tensor) TopK(ctx ml.Context, k int) ml.Tensor { t: C.ggml_top_k(ctx.(*Context).ctx, t.t, C.int(k)), } } + +func (t *Tensor) Argsort(ctx ml.Context) ml.Tensor { + return &Tensor{ + b: t.b, + t: C.ggml_argsort(ctx.(*Context).ctx, t.t, C.GGML_SORT_ORDER_ASC), + } +} diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp b/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp index 955fec59..654e2f28 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp @@ -6822,6 +6822,45 @@ static void ggml_compute_forward_argsort_f32( } } +static void ggml_compute_forward_argsort_i32( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + + GGML_TENSOR_UNARY_OP_LOCALS + + GGML_ASSERT(nb0 == sizeof(int32_t)); + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t nr = ggml_nrows(src0); + + ggml_sort_order order = (ggml_sort_order) ggml_get_op_params_i32(dst, 0); + + for (int64_t i = ith; i < nr; i += nth) { + int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1); + const int32_t * src_data = (int32_t *)((char *) src0->data + i*nb01); + + for (int64_t j = 0; j < ne0; j++) { + dst_data[j] = j; + } + + // C doesn't have a functional sort, so we do a bubble sort instead + for (int64_t j = 0; j < ne0; j++) { + for (int64_t k = j + 1; k < ne0; k++) { + if ((order == GGML_SORT_ORDER_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) || + (order == GGML_SORT_ORDER_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) { + int32_t tmp = dst_data[j]; + dst_data[j] = dst_data[k]; + dst_data[k] = tmp; + } + } + } + } +} + void ggml_compute_forward_argsort( const ggml_compute_params * params, ggml_tensor * dst) { @@ -6833,6 +6872,10 @@ void ggml_compute_forward_argsort( { ggml_compute_forward_argsort_f32(params, dst); } break; + case GGML_TYPE_I32: + { + 
ggml_compute_forward_argsort_i32(params, dst); + } break; default: { GGML_ABORT("fatal error"); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/argsort.cu b/ml/backend/ggml/ggml/src/ggml-cuda/argsort.cu index 607ded85..53b02634 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/argsort.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/argsort.cu @@ -85,13 +85,107 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co } } + +template +static __global__ void k_argsort_i32_i32(const int32_t * x, int * dst, const int ncols, const int ncols_pad) { + extern __shared__ int shared_mem[]; + int * indices = shared_mem; + + const int tid = threadIdx.x; + const int row = blockIdx.y; + + // Initialize all indices, handling the case where threads < ncols_pad + for (int i = tid; i < ncols_pad; i += blockDim.x) { + indices[i] = i < ncols ? i : 0; // Use 0 for padding indices + } + __syncthreads(); + + // Bitonic sort + for (int k = 2; k <= ncols_pad; k *= 2) { + for (int j = k/2; j > 0; j /= 2) { + for (int i = tid; i < ncols_pad; i += blockDim.x) { + const int ij = i ^ j; + if (ij > i) { + // Only compare values within the actual data range + if (i < ncols && ij < ncols) { + if ((i & k) == 0) { + if (order == GGML_SORT_ORDER_ASC) { + if (x[row * ncols + indices[i]] > x[row * ncols + indices[ij]]) { + int tmp = indices[i]; + indices[i] = indices[ij]; + indices[ij] = tmp; + } + } else { + if (x[row * ncols + indices[i]] < x[row * ncols + indices[ij]]) { + int tmp = indices[i]; + indices[i] = indices[ij]; + indices[ij] = tmp; + } + } + } else { + if (order == GGML_SORT_ORDER_ASC) { + if (x[row * ncols + indices[i]] < x[row * ncols + indices[ij]]) { + int tmp = indices[i]; + indices[i] = indices[ij]; + indices[ij] = tmp; + } + } else { + if (x[row * ncols + indices[i]] > x[row * ncols + indices[ij]]) { + int tmp = indices[i]; + indices[i] = indices[ij]; + indices[ij] = tmp; + } + } + } + } + } + } + __syncthreads(); + } + } + + // Write sorted indices to output, only threads handling valid data + for (int i = tid; i < ncols; i += blockDim.x) { + dst[row * ncols + i] = indices[i]; + } +} + +static void argsort_i32_i32_cuda(const int32_t * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) { + // Bitonic sort requires ncols to be power of 2 + const int ncols_pad = next_power_of_2(ncols); + + // Ensure thread count doesn't exceed maximum (typically 1024) + const int max_threads = 1024; // This is the typical max for most GPUs + const int threads_per_block = ncols_pad > max_threads ? 
max_threads : ncols_pad; + + const dim3 block_dims(threads_per_block, 1, 1); + const dim3 block_nums(1, nrows, 1); + const size_t shared_mem = ncols_pad * sizeof(int); + + // Check if shared memory size is within limits + const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb; + + // Instead of logging an error, use GGML_ASSERT with a descriptive message + GGML_ASSERT(shared_mem <= max_shared_mem && "argsort: required shared memory exceeds device limit"); + + // Launch kernels with the updated thread configuration + if (order == GGML_SORT_ORDER_ASC) { + k_argsort_i32_i32<<>>(x, dst, ncols, ncols_pad); + } else if (order == GGML_SORT_ORDER_DESC) { + k_argsort_i32_i32<<>>(x, dst, ncols, ncols_pad); + } else { + GGML_ABORT("fatal error"); + } +} + + void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; const float * src0_d = (const float *)src0->data; float * dst_d = (float *)dst->data; cudaStream_t stream = ctx.stream(); - GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_I32); GGML_ASSERT( dst->type == GGML_TYPE_I32); GGML_ASSERT(ggml_is_contiguous(src0)); @@ -100,5 +194,9 @@ void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0]; - argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream); + if (src0->type == GGML_TYPE_I32) { + argsort_i32_i32_cuda((const int32_t *)src0_d, (int *)dst_d, ncols, nrows, order, stream); + } else { + argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream); + } } diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu b/ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu index d027271f..4abd01d7 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu @@ -38,6 +38,13 @@ static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) { *dsti = *xi; } +static __device__ void cpy_1_i32_i32(const char * cxi, char * cdsti) { + const int32_t * xi = (const int32_t *) cxi; + int32_t * dsti = (int32_t *) cdsti; + + *dsti = *xi; +} + template static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const int ne, const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, @@ -68,6 +75,44 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const in cpy_1(cx + x_offset, cdst + dst_offset); } +// First, add this template function after the other template functions +template +static __global__ void cpy_i32_i32(const char * cx, char * cdst, const int ne, + const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, + const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13) { + const int64_t i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= ne) { + return; + } + + const int64_t i03 = i/(ne00 * ne01 * ne02); + const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01); + const int64_t i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00; + const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00; + const int64_t x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03; + + const int64_t i13 = i/(ne10 * ne11 * ne12); + const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11); + const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10; + const int64_t i10 = i 
- i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10; + const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13; + + cpy_1(cx + x_offset, cdst + dst_offset); +} + +// Then modify the ggml_cpy_i32_i32_cuda function to use the new template +static void ggml_cpy_i32_i32_cuda( + const char * cx, char * cdst, const int ne, + const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, + const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int graph_cpynode_index) { + + const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; + cpy_i32_i32<<>> + (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); +} + static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) { const float * xi = (const float *) cxi; block_q8_0 * dsti = (block_q8_0 *) cdsti; @@ -633,6 +678,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) { ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); + } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) { + ggml_cpy_i32_i32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); } else { GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__, ggml_type_name(src0->type), ggml_type_name(src1->type)); @@ -688,6 +735,8 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) { return (void*) cpy_f32_f16; } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) { return (void*) cpy_f32_f16; + } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) { + return (void*) cpy_i32_i32; } else { GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__, ggml_type_name(src0->type), ggml_type_name(src1->type)); diff --git a/model/models/models.go b/model/models/models.go index 73b4c53a..133e5176 100644 --- a/model/models/models.go +++ b/model/models/models.go @@ -7,4 +7,5 @@ import ( _ "github.com/ollama/ollama/model/models/llama4" _ "github.com/ollama/ollama/model/models/mistral3" _ "github.com/ollama/ollama/model/models/mllama" + _ "github.com/ollama/ollama/model/models/qwen25vl" ) diff --git a/model/models/qwen25vl/model.go b/model/models/qwen25vl/model.go new file mode 100644 index 00000000..9d243c30 --- /dev/null +++ b/model/models/qwen25vl/model.go @@ -0,0 +1,187 @@ +package qwen25vl + +import ( + "bytes" + "fmt" + "image" + "slices" + "sync" + + "github.com/ollama/ollama/fs" + "github.com/ollama/ollama/kvcache" + "github.com/ollama/ollama/ml" + "github.com/ollama/ollama/model" + "github.com/ollama/ollama/model/input" +) + +type Model struct { + model.Base + model.BytePairEncoding + + *TextModel + *VisionModel `gguf:"v,vision"` + + ImageProcessor +} + +// Implement MultimodalProcessor interface +var _ model.MultimodalProcessor = (*Model)(nil) + +func New(c fs.Config) (model.Model, error) { + m := &Model{ + BytePairEncoding: model.NewBytePairEncoding( + 
c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`), + &model.Vocabulary{ + Values: c.Strings("tokenizer.ggml.tokens"), + Types: c.Ints("tokenizer.ggml.token_type"), + Merges: c.Strings("tokenizer.ggml.merges"), + BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")), + AddBOS: c.Bool("tokenizer.ggml.add_bos_token", false), + EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")), + AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false), + EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")), + AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false), + }, + ), + TextModel: NewTextModel(c), + VisionModel: newVisionModel(c), + ImageProcessor: newImageProcessor(c), + } + + m.Cache = kvcache.NewCausalCache(m.TextModel.Shift) + + return m, nil +} + +func (m *Model) PixelValues(ctx ml.Context, multimodalData []byte) (ml.Tensor, *Grid, error) { + image, _, err := image.Decode(bytes.NewReader(multimodalData)) + if err != nil { + return nil, nil, err + } + + f32s, grid, err := m.ImageProcessor.ProcessImage(image) + if err != nil { + return nil, nil, err + } + + // Calculate tensor dimensions + patchDim := m.ImageProcessor.numChannels * m.ImageProcessor.temporalPatchSize * + m.ImageProcessor.patchSize * m.ImageProcessor.patchSize + numPatches := grid.Temporal * grid.Height * grid.Width + + pixelValues, err := ctx.Input().FromFloatSlice(f32s, patchDim, numPatches) + if err != nil { + return nil, nil, fmt.Errorf("failed to create tensor from image: %w", err) + } + + return pixelValues, grid, nil +} + +func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) { + if len(m.VisionModel.Layers) == 0 { + return nil, model.ErrNoVisionModel + } + + pixels, grid, err := m.PixelValues(ctx, multimodalData) + if err != nil { + return nil, err + } + + visionOutputs := m.VisionModel.Forward(ctx, pixels, grid) + return &chunks{Model: m, Tensor: visionOutputs}, nil +} + +type chunks struct { + *Model + ml.Tensor + + dataOnce sync.Once + data []float32 +} + +type chunk struct { + *chunks + s, n int +} + +func (r *chunk) floats() []float32 { + r.dataOnce.Do(func() { + temp := r.Backend().NewContext() + defer temp.Close() + temp.Forward(r.Tensor).Compute(r.Tensor) + r.data = r.Floats() + }) + + return r.data[r.s*r.Dim(0) : (r.s+r.n)*r.Dim(0)] +} + +// PostTokenize arranges Qwen-2.5-VL's inputs for the forward pass +func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) { + var result []input.Input + + var ( + imageToken int32 = 151655 + visionStartToken int32 = 151652 + visionEndToken int32 = 151653 + ) + + nImg := 0 + for _, inp := range inputs { + if inp.Multimodal == nil { + // If not a multimodal input, add it to the result unchanged + result = append(result, inp) + } else { + // Adding the 'Picture' prefix is a hack, at the time of writing there is no way to prefix + // the image tokens with a prompt, so we add a prefix here + nImg++ + pre, err := m.Encode(fmt.Sprintf(" Picture %d: ", nImg), true) + if err != nil { + return nil, fmt.Errorf("failed to encode image prompt: %w", err) + } + for i := range pre { + result = append(result, input.Input{Token: pre[i]}) + } + + // This is an image token with multimodal data + chunksData := inp.Multimodal.(*chunks) + patchesPerChunk := chunksData.Dim(1) + + // First add the vision start token + result = append(result, input.Input{Token: visionStartToken, SameBatch: patchesPerChunk + 2}) + + // Add the image token with the multimodal tensor data 
at the first position + // Create a chunk with proper s and n values + result = append(result, input.Input{ + Token: imageToken, + Multimodal: &chunk{chunks: chunksData, s: 0, n: patchesPerChunk}, + MultimodalHash: inp.MultimodalHash, + SameBatch: patchesPerChunk, + }) + + // Add the placeholder tokens for the remaining positions (tokensPerGrid-1) + result = append(result, slices.Repeat([]input.Input{{Token: imageToken}}, patchesPerChunk-1)...) + + result = append(result, input.Input{Token: visionEndToken}) + } + } + + return result, nil +} + +func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { + positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions)) + if err != nil { + return nil, err + } + + outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs)) + if err != nil { + return nil, err + } + + return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache) +} + +func init() { + model.Register("qwen25vl", New) +} diff --git a/model/models/qwen25vl/model_text.go b/model/models/qwen25vl/model_text.go new file mode 100644 index 00000000..6b062f8c --- /dev/null +++ b/model/models/qwen25vl/model_text.go @@ -0,0 +1,155 @@ +package qwen25vl + +import ( + "math" + + "github.com/ollama/ollama/fs" + "github.com/ollama/ollama/kvcache" + "github.com/ollama/ollama/ml" + "github.com/ollama/ollama/ml/nn" + "github.com/ollama/ollama/model/input" +) + +type TextOptions struct { + ctxLen, hiddenSize, numHeads, numKVHeads int + eps, ropeBase, ropeScale float32 + ropeDim, defaultContextLen uint32 +} + +type TextModel struct { + TokenEmbedding *nn.Embedding `gguf:"token_embd"` + Layers []Layer `gguf:"blk"` + OutputNorm *nn.RMSNorm `gguf:"output_norm"` + Output *nn.Linear `gguf:"output,alt:token_embd"` + + *TextOptions +} + +func NewTextModel(c fs.Config) *TextModel { + m := TextModel{ + Layers: make([]Layer, c.Uint("block_count")), + TextOptions: &TextOptions{ + ctxLen: int(c.Uint("context_length")), + hiddenSize: int(c.Uint("embedding_length")), + numHeads: int(c.Uint("attention.head_count")), + numKVHeads: int(c.Uint("attention.head_count_kv")), + eps: c.Float("attention.layer_norm_rms_epsilon"), + ropeBase: c.Float("rope.freq_base"), + ropeScale: c.Float("rope.freq_scale", 1), + ropeDim: c.Uint("rope.dimension_count", 128), + defaultContextLen: c.Uint("context_length", 128000), + }, + } + + return &m +} + +// SelfAttention implements the multi-head self-attention mechanism +// with separate projections for query, key, value and output transformations +type SelfAttention struct { + Query *nn.Linear `gguf:"attn_q"` + Key *nn.Linear `gguf:"attn_k"` + Value *nn.Linear `gguf:"attn_v"` + Output *nn.Linear `gguf:"attn_output"` +} + +func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor { + batchSize := hiddenState.Dim(1) + headDim := opts.hiddenSize / opts.numHeads + + q := sa.Query.Forward(ctx, hiddenState) + q = q.Reshape(ctx, headDim, opts.numHeads, batchSize) + q = q.RoPE(ctx, positionIDs, nil, opts.ropeDim, 2, opts.ropeBase, opts.ropeScale, ml.WithContextLen(opts.defaultContextLen)) + + k := sa.Key.Forward(ctx, hiddenState) + k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize) + k = k.RoPE(ctx, positionIDs, nil, opts.ropeDim, 2, opts.ropeBase, opts.ropeScale, ml.WithContextLen(opts.defaultContextLen)) + + v := sa.Value.Forward(ctx, hiddenState) + v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize) + + scaleFactor := 1.0 / 
math.Sqrt(float64(headDim)) + kqv := nn.Attention(ctx, q, k, v, scaleFactor, cache) + kqv = kqv.Reshape(ctx, opts.hiddenSize, batchSize) + + return sa.Output.Forward(ctx, kqv) +} + +// Shift applies rotary position embeddings to the key tensor for causal attention caching +func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) { + return key.RoPE(ctx, shift, nil, m.ropeDim, 2, m.ropeBase, m.ropeScale, ml.WithContextLen(m.defaultContextLen)), nil +} + +// MLP implements the feed-forward network component with SwiGLU activation +type MLP struct { + Up *nn.Linear `gguf:"ffn_up"` + Down *nn.Linear `gguf:"ffn_down"` + Gate *nn.Linear `gguf:"ffn_gate"` +} + +func (mlp *MLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *TextOptions) ml.Tensor { + // Apply SwiGLU activation gating + hiddenState = mlp.Gate.Forward(ctx, hiddenState).SILU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenState)) + // Project back to hidden dimension + return mlp.Down.Forward(ctx, hiddenState) +} + +// Layer represents a single transformer layer combining self-attention and feed-forward components +type Layer struct { + AttentionNorm *nn.RMSNorm `gguf:"attn_norm"` + SelfAttention *SelfAttention + MLPNorm *nn.RMSNorm `gguf:"ffn_norm"` + MLP *MLP +} + +func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor { + // Self-attention branch with residual connection + residual := hiddenState + + hiddenState = l.AttentionNorm.Forward(ctx, hiddenState, opts.eps) + hiddenState = l.SelfAttention.Forward(ctx, hiddenState, positionIDs, cache, opts) + + // In the final layer (outputs != nil), optimize by pruning to just the token positions + // we need logits for. + if outputs != nil { + hiddenState = hiddenState.Rows(ctx, outputs) + residual = residual.Rows(ctx, outputs) + } + + hiddenState = hiddenState.Add(ctx, residual) + // Feed-forward branch with residual connection + residual = hiddenState + hiddenState = l.MLPNorm.Forward(ctx, hiddenState, opts.eps) + hiddenState = l.MLP.Forward(ctx, hiddenState, opts) + return hiddenState.Add(ctx, residual) +} + +func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor, batch input.Batch, cache kvcache.Cache) (ml.Tensor, error) { + // Initial token embedding + hiddenStates := m.TokenEmbedding.Forward(ctx, inputs).Duplicate(ctx) + + for _, mi := range batch.Multimodal { + f32s := mi.Multimodal.(*chunk).floats() + img, err := ctx.Input().FromFloatSlice(f32s, len(f32s)/m.hiddenSize, m.hiddenSize) + if err != nil { + panic(err) + } + + ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1)))) + } + + // Process through transformer layers + for i, layer := range m.Layers { + cache.SetLayer(i) + + var lastLayerOutputs ml.Tensor + if i == len(m.Layers)-1 { + lastLayerOutputs = outputs + } + + hiddenStates = layer.Forward(ctx, hiddenStates, positions, lastLayerOutputs, cache, m.TextOptions) + } + + hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.eps) + return m.Output.Forward(ctx, hiddenStates), nil +} diff --git a/model/models/qwen25vl/model_vision.go b/model/models/qwen25vl/model_vision.go new file mode 100644 index 00000000..01eef392 --- /dev/null +++ b/model/models/qwen25vl/model_vision.go @@ -0,0 +1,391 @@ +package qwen25vl + +import ( + "fmt" + "math" + "slices" + + "github.com/ollama/ollama/fs" + "github.com/ollama/ollama/ml" + "github.com/ollama/ollama/ml/nn" +) + +// We only support batch size 
of 1 +var batchSize int = 1 + +func rotateHalf(ctx ml.Context, t ml.Tensor) ml.Tensor { + x1 := t.View(ctx, 0, t.Dim(0)/2, t.Stride(1), t.Dim(1), t.Stride(2), t.Dim(2), t.Stride(3), t.Dim(3)) + x2 := t.View(ctx, t.Stride(0)*t.Dim(0)/2, t.Dim(0)/2, t.Stride(1), t.Dim(1), t.Stride(2), t.Dim(2), t.Stride(3), t.Dim(3)).Contiguous(ctx) + return x2.Neg(ctx).Concat(ctx, x1, 0) +} + +func applyRotaryPositionalEmbedding(ctx ml.Context, t, cos, sin ml.Tensor) ml.Tensor { + return t.Mul(ctx, cos).Add(ctx, rotateHalf(ctx, t).Mul(ctx, sin)) +} + +func blockDiagonalMask(ctx ml.Context, seqLength int, bounds []int, numHeads int) ml.Tensor { + // Create a flat slice for the mask (all -inf initially to block all attention) + flat := make([]float32, seqLength*seqLength) + for i := range flat { + flat[i] = float32(math.Inf(-1)) // Negative infinity to block attention + } + + // Fill in the mask with zeros for tokens that CAN attend to each other + for i := 1; i < len(bounds); i++ { + start := bounds[i-1] + end := bounds[i] + + // Enable attention within this sequence block by setting values to 0 + for row := start; row < end; row++ { + for col := start; col < end; col++ { + idx := row*seqLength + col + flat[idx] = 0.0 // 0 allows attention, -inf blocks it + } + } + } + + mask, err := ctx.Input().FromFloatSlice(flat, seqLength, seqLength) + if err != nil { + panic(err) + } + // Reshape to match [seqLength, seqLength, 1] for broadcasting + mask = mask.Reshape(ctx, seqLength, seqLength, 1) + + return mask +} + +type VisionSelfAttention struct { + Query *nn.Linear `gguf:"attn_q"` + Key *nn.Linear `gguf:"attn_k"` + Value *nn.Linear `gguf:"attn_v"` + Output *nn.Linear `gguf:"attn_out"` +} + +func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenStates, cos, sin, mask ml.Tensor, opts *VisionModelOptions) ml.Tensor { + query := sa.Query.Forward(ctx, hiddenStates) + key := sa.Key.Forward(ctx, hiddenStates) + value := sa.Value.Forward(ctx, hiddenStates) + + query = query.Reshape(ctx, opts.headDim, opts.numHeads, query.Dim(1), batchSize) + key = key.Reshape(ctx, opts.headDim, opts.numHeads, key.Dim(1), batchSize) + value = value.Reshape(ctx, opts.headDim, opts.numHeads, value.Dim(1), batchSize) + + query = applyRotaryPositionalEmbedding(ctx, query, cos, sin) + key = applyRotaryPositionalEmbedding(ctx, key, cos, sin) + + // Scale factor for scaled dot-product attention + scale := 1.0 / math.Sqrt(float64(opts.headDim)) + + // Scaled dot-product attention + query = query.Permute(ctx, 0, 2, 1, 3) + key = key.Permute(ctx, 0, 2, 1, 3) + value = value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx) + kq := key.MulmatFullPrec(ctx, query) + kq = kq.Scale(ctx, scale) + if mask != nil { + kq = kq.Add(ctx, mask) + } + kq = kq.Softmax(ctx) + kqv := value.Mulmat(ctx, kq) + attention := kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx) + attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize) + + return sa.Output.Forward(ctx, attention) +} + +// VisionMLP implements the multi-layer perceptron +type VisionMLP struct { + Gate *nn.Linear `gguf:"ffn_gate"` + Up *nn.Linear `gguf:"ffn_up"` + Down *nn.Linear `gguf:"ffn_down"` +} + +func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *VisionModelOptions) ml.Tensor { + // Using activation as specified in config (likely GELU or SiLU/Swish) + gateOutput := mlp.Gate.Forward(ctx, hiddenStates) + upOutput := mlp.Up.Forward(ctx, hiddenStates) + hiddenStates = gateOutput.SILU(ctx).Mul(ctx, upOutput) + + return mlp.Down.Forward(ctx, hiddenStates) +} + +type 
VisionEncoderLayer struct { + Norm1 *nn.RMSNorm `gguf:"ln1"` + SelfAttention *VisionSelfAttention + Norm2 *nn.RMSNorm `gguf:"ln2"` + MLP *VisionMLP +} + +func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenStates, cos, sin, mask ml.Tensor, opts *VisionModelOptions) ml.Tensor { + residual := hiddenStates + hiddenStates = e.Norm1.Forward(ctx, hiddenStates, opts.eps) + hiddenStates = e.SelfAttention.Forward(ctx, hiddenStates, cos, sin, mask, opts) + hiddenStates = hiddenStates.Add(ctx, residual) + + residual = hiddenStates + hiddenStates = e.Norm2.Forward(ctx, hiddenStates, opts.eps) + hiddenStates = e.MLP.Forward(ctx, hiddenStates, opts) + return hiddenStates.Add(ctx, residual) +} + +// VisionModelOptions contains configuration options +type VisionModelOptions struct { + hiddenSize int + numHeads int + headDim int + patchSize int + numChannels int + eps float32 + ropeTheta float32 + spatialMergeSize int + windowSize int + fullAttnBlocks []int32 + temporalPatchSize int +} + +type PatchEmbedding struct { + PatchConv0 *nn.Conv2D `gguf:"patch_embd_0"` + PatchConv1 *nn.Conv2D `gguf:"patch_embd_1"` +} + +func (pe *PatchEmbedding) Forward(ctx ml.Context, pixelValues ml.Tensor, opts *VisionModelOptions) ml.Tensor { + numPatches := pixelValues.Shape()[1] + + // Reshape the input tensor to match the expected dimensions + pixelValues = pixelValues.Reshape(ctx, opts.patchSize*opts.patchSize, opts.temporalPatchSize, opts.numChannels, numPatches) + + // Permute the tensor to bring the temporal dimension to the front + pixelValues = pixelValues.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx) + + // Split the tensor into parts for the temporal convolutions + in0 := pixelValues.View(ctx, 0, 1, pixelValues.Stride(1), pixelValues.Dim(1), pixelValues.Stride(2), pixelValues.Dim(2), pixelValues.Stride(3), pixelValues.Dim(3)).Contiguous(ctx) + in0 = in0.Reshape(ctx, opts.patchSize, opts.patchSize, opts.numChannels, numPatches) + in1 := pixelValues.View(ctx, pixelValues.Stride(0), 1, pixelValues.Stride(1), pixelValues.Dim(1), pixelValues.Stride(2), pixelValues.Dim(2), pixelValues.Stride(3), pixelValues.Dim(3)).Contiguous(ctx) + in1 = in1.Reshape(ctx, opts.patchSize, opts.patchSize, opts.numChannels, numPatches) + + s0, s1 := opts.patchSize, opts.patchSize // Use full stride + p0, p1 := 0, 0 // padding + d0, d1 := 1, 1 // dilation + out0 := pe.PatchConv0.Forward(ctx, in0, s0, s1, p0, p1, d0, d1) + out1 := pe.PatchConv1.Forward(ctx, in1, s0, s1, p0, p1, d0, d1) + + // Add the outputs from the two temporal convolutions + out := out0.Add(ctx, out1) + + // Reshape the output tensor to match the expected dimensions + return out.Reshape(ctx, opts.hiddenSize, numPatches) +} + +// VisionPatchMerger implements patch merging for the Qwen vision model +type VisionPatchMerger struct { + LNQ *nn.RMSNorm `gguf:"ln_q"` + MLP0 *nn.Linear `gguf:"mlp.0"` + MLP2 *nn.Linear `gguf:"mlp.2"` +} + +// Forward computes patch merging for the vision model +func (pm *VisionPatchMerger) Forward(ctx ml.Context, visionOutputs ml.Tensor, opts *VisionModelOptions) ml.Tensor { + normalized := pm.LNQ.Forward(ctx, visionOutputs, opts.eps) + + hiddenSize := visionOutputs.Dim(0) * (opts.spatialMergeSize * opts.spatialMergeSize) + + // Reshape the normalized output to view the hidden size dimension + reshaped := normalized.Reshape(ctx, hiddenSize, normalized.Dim(1)/(opts.spatialMergeSize*opts.spatialMergeSize), batchSize) + hidden := pm.MLP0.Forward(ctx, reshaped) + activated := hidden.GELU(ctx) + + output := pm.MLP2.Forward(ctx, activated) + + return 
output +} + +// VisionModel implements the Qwen vision model +type VisionModel struct { + PatchEmbedding *PatchEmbedding + Layers []VisionEncoderLayer `gguf:"blk"` + PatchMerger *VisionPatchMerger `gguf:"merger"` + + *VisionModelOptions +} + +// Forward computes the vision model for an input tensor +func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor, grid *Grid) ml.Tensor { + // Extract patch embeddings + hiddenStates := m.PatchEmbedding.Forward(ctx, pixelValues, m.VisionModelOptions) + + positionEmbedding := m.PositionalEmbedding(ctx, grid) + + windowIndex, bounds := m.WindowIndex(ctx, grid) + + spatialMergeUnit := m.spatialMergeSize * m.spatialMergeSize + + hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0)*spatialMergeUnit, hiddenStates.Dim(1)/spatialMergeUnit) + hiddenStates = hiddenStates.Rows(ctx, windowIndex) + hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0)/spatialMergeUnit, hiddenStates.Dim(1)*spatialMergeUnit) + + positionEmbedding = positionEmbedding.Reshape(ctx, positionEmbedding.Dim(0)*spatialMergeUnit, positionEmbedding.Dim(1)/spatialMergeUnit) + positionEmbedding = positionEmbedding.Rows(ctx, windowIndex) + positionEmbedding = positionEmbedding.Reshape(ctx, positionEmbedding.Dim(0)/spatialMergeUnit, positionEmbedding.Dim(1)*spatialMergeUnit) + positionEmbedding = positionEmbedding.Concat(ctx, positionEmbedding, 0) + + cos, sin := positionEmbedding.Cos(ctx), positionEmbedding.Sin(ctx) + cos = cos.Reshape(ctx, cos.Dim(0), 1, cos.Dim(1)) + sin = sin.Reshape(ctx, sin.Dim(0), 1, sin.Dim(1)) + + mask := blockDiagonalMask(ctx, hiddenStates.Dim(1), bounds, m.VisionModelOptions.numHeads) + // Apply encoder layers + for i, layer := range m.Layers { + if slices.Contains(m.fullAttnBlocks, int32(i)) { + hiddenStates = layer.Forward(ctx, hiddenStates, cos, sin, nil, m.VisionModelOptions) + } else { + hiddenStates = layer.Forward( + ctx, + hiddenStates, + cos, + sin, + mask, + m.VisionModelOptions, + ) + } + } + + hiddenStates = m.PatchMerger.Forward(ctx, hiddenStates, m.VisionModelOptions) + reverseWindowIndex := windowIndex.Argsort(ctx) + return hiddenStates.Rows(ctx, reverseWindowIndex) +} + +// WindowIndex divides the grid into windows and returns: +// 1. A tensor containing flattened indices of all grid points organized by windows +// 2. A slice of boundaries that mark where each window's data begins and ends +// in the flattened representation, scaled by spatialMergeSize squared +// +// The boundaries slice always starts with 0 and contains cumulative ending +// positions for each window, allowing downstream processing to identify +// window boundaries in the tensor data. 
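+//
+// For example, with spatialMergeSize=2, two windows covering 4 and 2 merged
+// grid points produce bounds of [0, 16, 24]: each cumulative window length
+// is scaled by spatialMergeSize squared (4).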
+func (m *VisionModel) WindowIndex(ctx ml.Context, grid *Grid) (ml.Tensor, []int) { + vitMergerWindowSize := m.windowSize / m.spatialMergeSize / m.patchSize + + llmGridH := grid.Height / m.spatialMergeSize + llmGridW := grid.Width / m.spatialMergeSize + + // Calculate window parameters + numWindowsH := int(math.Ceil(float64(llmGridH) / float64(vitMergerWindowSize))) + numWindowsW := int(math.Ceil(float64(llmGridW) / float64(vitMergerWindowSize))) + + // Initialize index_new slice + var index []int32 + + // Initialize bounds with the first element as 0 + bounds := []int{0} + totalSeqLen := 0 + + // Process each window without padding + for wh := range numWindowsH { + for ww := range numWindowsW { + // Calculate window boundaries + hStart := wh * vitMergerWindowSize + wStart := ww * vitMergerWindowSize + hEnd := min(hStart+vitMergerWindowSize, llmGridH) + wEnd := min(wStart+vitMergerWindowSize, llmGridW) + + // Calculate sequence length for this window + seqLen := (hEnd - hStart) * (wEnd - wStart) + + // Collect indices for this window + for h := hStart; h < hEnd; h++ { + for w := wStart; w < wEnd; w++ { + index = append(index, int32(h*llmGridW+w)) + } + } + + totalSeqLen += seqLen + bounds = append(bounds, totalSeqLen*(m.spatialMergeSize*m.spatialMergeSize)+bounds[0]) + } + } + + t, err := ctx.Input().FromIntSlice(index, len(index)) + if err != nil { + panic(err) + } + + return t, bounds +} + +// PositionalEmbedding generates rotary position embeddings for attention mechanisms +func (m *VisionModel) PositionalEmbedding(ctx ml.Context, grid *Grid) ml.Tensor { + dim := m.headDim / 2 + freq := dim / 2 + theta := float64(m.ropeTheta) + merge := m.spatialMergeSize + + // Create frequency patterns for position encoding + maxGridSize := max(grid.Height, grid.Width) + freqVals := make([]float32, freq*maxGridSize) + for i := range maxGridSize { + for j := range freq { + freqVals[i*freq+j] = float32(i) / float32(math.Pow(theta, float64(j*2)/float64(dim))) + } + } + freqs, err := ctx.Input().FromFloatSlice(freqVals, freq, maxGridSize) + if err != nil { + panic(fmt.Errorf("failed to create tensor from frequencies: %w", err)) + } + + // Create position coordinates (y,x pairs) for the grid + // In PyTorch: Equivalent to generating position ids with torch.arange() + coords := make([]int32, 0, grid.Height*grid.Width*2) + for y := range grid.Height { + for x := range grid.Width { + coords = append(coords, int32(y), int32(x)) + } + } + pos, err := ctx.Input().FromIntSlice(coords, 2, grid.Width, grid.Height) + if err != nil { + panic(fmt.Errorf("failed to create tensor from positions: %w", err)) + } + + // Reshape and permute positions to match spatial merging pattern + pos = pos.Reshape(ctx, 2, grid.Width, merge, grid.Height/merge) + pos = pos.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx) + pos = pos.Reshape(ctx, 2, merge, merge, grid.Width/merge*grid.Height/merge) + pos = pos.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx) + pos = pos.Reshape(ctx, 2*merge*merge*grid.Width/merge*grid.Height/merge) + + // Use position indices to look up corresponding frequency values + positionalEmbedding := freqs.Rows(ctx, pos) + positionalEmbedding = positionalEmbedding.Reshape(ctx, positionalEmbedding.Dim(0)*2, positionalEmbedding.Dim(1)/2) + return positionalEmbedding +} + +// newVisionModel creates a new instance of the Qwen vision model +func newVisionModel(c fs.Config) *VisionModel { + patchSize := int(c.Uint("vision.patch_size", 14)) + hiddenSize := int(c.Uint("vision.embedding_length", 1280)) + numHeads := 
int(c.Uint("vision.attention.head_count", 16)) + numChannels := int(c.Uint("vision.num_channels", 3)) + eps := c.Float("vision.attention.layer_norm_epsilon", 1e-6) + ropeTheta := c.Float("vision.rope.freq_base", 10000.0) + spatialMergeSize := int(c.Uint("vision.spatial_merge_size", 2)) + windowSize := int(c.Uint("vision.window_size", 112)) + fullAttnBlocks := c.Ints("qwen25vl.vision.fullatt_block_indexes", []int32{7, 15, 23, 31}) + temporalPatchSize := int(c.Uint("vision.temporal_patch_size", 2)) + + model := &VisionModel{ + Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count", 32)), + VisionModelOptions: &VisionModelOptions{ + hiddenSize: hiddenSize, + numHeads: numHeads, + headDim: hiddenSize / numHeads, + patchSize: patchSize, + numChannels: numChannels, + eps: eps, + ropeTheta: ropeTheta, + spatialMergeSize: spatialMergeSize, + windowSize: windowSize, + temporalPatchSize: temporalPatchSize, + fullAttnBlocks: fullAttnBlocks, + }, + } + + return model +} diff --git a/model/models/qwen25vl/process_image.go b/model/models/qwen25vl/process_image.go new file mode 100644 index 00000000..dc91bdea --- /dev/null +++ b/model/models/qwen25vl/process_image.go @@ -0,0 +1,184 @@ +package qwen25vl + +import ( + "fmt" + "image" + "math" + + "github.com/ollama/ollama/fs" + "github.com/ollama/ollama/model/imageproc" +) + +// ImageProcessor contains configuration for the Qwen 2.5 VL image processing +type ImageProcessor struct { + numChannels int + patchSize int + temporalPatchSize int + mergeSize int + minPixels int + maxPixels int + factor int + rescaleFactor float32 + imageMean []float32 + imageStd []float32 +} + +// newImageProcessor creates a new image processor with default values +func newImageProcessor(c fs.Config) ImageProcessor { + patchSize := int(c.Uint("vision.patch_size", 14)) + mergeSize := int(c.Uint("vision.spatial_merge_size", 2)) + + return ImageProcessor{ + numChannels: int(c.Uint("vision.num_channels", 3)), // not set + patchSize: patchSize, + temporalPatchSize: 2, + mergeSize: mergeSize, + minPixels: 56 * 56, + maxPixels: int(c.Uint("vision.max_pixels", 28*28*1280)), // 1MP limit + factor: patchSize * mergeSize, + rescaleFactor: 1.0 / 255.0, + imageMean: imageproc.ClipDefaultMean[:], + imageStd: imageproc.ClipDefaultSTD[:], + } +} + +// SmartResize implements the smart resize algorithm +func (p *ImageProcessor) SmartResize(height, width int) (int, int) { + factor := p.factor + + if height < factor || width < factor { + panic(fmt.Sprintf("height:%d or width:%d must be larger than factor:%d", height, width, factor)) + } else if aspectRatio := max(height, width) / min(height, width); aspectRatio > 200 { + panic(fmt.Sprintf("absolute aspect ratio must be smaller than 200, got %v", aspectRatio)) + } + + round := func(x float64) int { return int(math.RoundToEven(x)) } + + hBar := round(float64(height)/float64(factor)) * factor + wBar := round(float64(width)/float64(factor)) * factor + + if hBar*wBar > p.maxPixels { + beta := math.Sqrt(float64(height*width) / float64(p.maxPixels)) + + hBar = int(math.Floor(float64(height)/beta/float64(factor))) * factor + wBar = int(math.Floor(float64(width)/beta/float64(factor))) * factor + } else if hBar*wBar < p.minPixels { + beta := math.Sqrt(float64(p.minPixels) / float64(height*width)) + + hBar = int(math.Ceil(float64(height)*beta/float64(factor))) * factor + wBar = int(math.Ceil(float64(width)*beta/float64(factor))) * factor + } + + return hBar, wBar +} + +type Grid struct { + Height int + Width int + Temporal int +} + +func (p 
*ImageProcessor) ProcessImage(img image.Image) ([]float32, *Grid, error) { + origWidth := img.Bounds().Dx() + origHeight := img.Bounds().Dy() + + // Calculate smart resize dimensions + resizedHeight, resizedWidth := p.SmartResize(origHeight, origWidth) + + // Resize image using existing functions + resizedImg := imageproc.Resize(img, image.Point{X: resizedWidth, Y: resizedHeight}, imageproc.ResizeBilinear) + + normalizedPixels := imageproc.Normalize( + resizedImg, + [3]float32{p.imageMean[0], p.imageMean[1], p.imageMean[2]}, + [3]float32{p.imageStd[0], p.imageStd[1], p.imageStd[2]}, + true, // rescale + true, // channelFirst + ) + + // Calculate grid dimensions + grid := &Grid{ + Height: resizedHeight / p.patchSize, + Width: resizedWidth / p.patchSize, + Temporal: 1, // For single images, temporal dimension is 1 + } + + patches, err := p.createPatches(normalizedPixels, resizedHeight, resizedWidth, grid) + if err != nil { + return nil, nil, fmt.Errorf("failed to create patches: %v", err) + } + + // Return patches and grid dimensions + return patches, grid, nil +} + +func (p *ImageProcessor) createPatches(pixels []float32, height, width int, grid *Grid) ([]float32, error) { + channels := p.numChannels + patchSize := p.patchSize + mergeSize := p.mergeSize + temporalPatchSize := p.temporalPatchSize + + // Calculate output dimensions + numPatches := grid.Temporal * grid.Height * grid.Width + patchDim := channels * temporalPatchSize * patchSize * patchSize + + result := make([]float32, numPatches*patchDim) + patchIndex := 0 + + // Single temporal frame handling (copies to all frames) + for range grid.Temporal { + for h := 0; h < grid.Height; h += mergeSize { + for w := 0; w < grid.Width; w += mergeSize { + // Handle the 2x2 merged patches + for mh := range mergeSize { + for mw := range mergeSize { + baseOffset := patchIndex * patchDim + + // Extract patch data for first temporal frame + for c := range channels { + channelOffset := baseOffset + (c * temporalPatchSize * patchSize * patchSize) + + for py := range patchSize { + for px := range patchSize { + // Calculate source pixel coordinates + y := (h+mh)*patchSize + py + x := (w+mw)*patchSize + px + + // Source index in input tensor (CHW format) + srcIdx := c*height*width + y*width + x + + // Destination index in first temporal frame + dstIdx := channelOffset + (py * patchSize) + px + + if srcIdx < len(pixels) && dstIdx < len(result) { + result[dstIdx] = pixels[srcIdx] + } + } + } + } + + // Copy first temporal frame to all other frames + if temporalPatchSize > 1 { + for c := range channels { + channelOffset := baseOffset + (c * temporalPatchSize * patchSize * patchSize) + firstFrameOffset := channelOffset + frameSize := patchSize * patchSize + + // Copy first frame to all other frames + for tp := 1; tp < temporalPatchSize; tp++ { + currentFrameOffset := channelOffset + (tp * frameSize) + copy(result[currentFrameOffset:currentFrameOffset+frameSize], + result[firstFrameOffset:firstFrameOffset+frameSize]) + } + } + } + + patchIndex++ + } + } + } + } + } + + return result, nil +} From ff80718e9c9a08dd10759cdee5c81db366e38368 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Wed, 14 May 2025 14:54:18 -0700 Subject: [PATCH 023/108] fix crash in old clients with quantization progress (#10710) Older clients assumed the digest was at least 19 characters long so increase the size of the dummy digest to avoid array out of bounds crashes. 
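For illustration, a minimal sketch of the kind of client-side display code
that makes this assumption (hypothetical, not necessarily the actual CLI
code; the slice bounds [7:19] are only an assumed example):

    // Renders a shortened digest for a progress bar. Slicing up to index 19
    // panics if the server reports a one-character placeholder digest like "0".
    func progressLabel(resp api.ProgressResponse) string {
        return fmt.Sprintf("pulling %s... %s", resp.Digest[7:19], resp.Status)
    }

Padding the placeholder digest out to 19 zeros keeps such clients working
without changing the API.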
--- server/create.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/create.go b/server/create.go index 41c8731c..68e003df 100644 --- a/server/create.go +++ b/server/create.go @@ -430,7 +430,7 @@ func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.Progr fnWrap := func(n uint64) { done := doneBytes.Add(n) progress := float32(done) / float32(totalBytes) - fn(api.ProgressResponse{Status: fmt.Sprintf("quantizing %s model to %s", ft, quantizeType), Digest: "0", Total: layer.Size, Completed: int64(progress * float32(layer.Size))}) + fn(api.ProgressResponse{Status: fmt.Sprintf("quantizing %s model to %s", ft, quantizeType), Digest: "0000000000000000000", Total: layer.Size, Completed: int64(progress * float32(layer.Size))}) } ftype, err := ggml.ParseFileType(quantizeType) if err != nil { From bd68d3ae50c67ba46ee94a584fa6d0386e4b8522 Mon Sep 17 00:00:00 2001 From: Bruce MacDonald Date: Wed, 14 May 2025 16:42:30 -0700 Subject: [PATCH 024/108] ggml: update qwen25vl vision size estimate (#10711) --- fs/ggml/ggml.go | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go index 514b6011..8c0a2ae5 100644 --- a/fs/ggml/ggml.go +++ b/fs/ggml/ggml.go @@ -6,7 +6,6 @@ import ( "fmt" "io" "log/slog" - "math" "slices" "strings" @@ -653,24 +652,15 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) { numPatches*numPatches*headCount) case "qwen25vl": maxPixels := uint64(llm.KV().Uint("vision.max_pixels", 28*28*1280)) - mergeSize := uint64(llm.KV().Uint("vision.spatial_merge_size", 2)) - temporalPatchSize := uint64(2) - // Calculate max possible patches based on max_pixels - maxHeight := uint64(math.Sqrt(float64(maxPixels))) - maxWidth := maxPixels / maxHeight - maxGridHeight := maxHeight / patchSize - maxGridWidth := maxWidth / patchSize - // Account for merged patches (2x2 grid) - numPatches := (maxGridHeight * maxGridWidth) / (mergeSize * mergeSize) + numPatches := maxPixels / (patchSize * patchSize) - // Calculate graph size based on typical operations in ProcessImage and createPatches graphSize = 4 * (maxPixels*numChannels + // Original image storage // Normalized pixels maxPixels*numChannels + - // Patches storage (numPatches * channels * temporalPatchSize * patchSize^2) - numPatches*numChannels*temporalPatchSize*patchSize*patchSize + - // Self-attention calculations (similar to other architectures) + // Patches storage (numPatches * channels * patchSize^2) + numPatches*numChannels*patchSize*patchSize + + // Self-attention calculations numPatches*numPatches*headCount + // Additional buffer for processing embeddingLength*numPatches) From 55760195e654992ca5f364aa191b24611b3b7531 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Thu, 15 May 2025 12:15:01 -0700 Subject: [PATCH 025/108] fix mllama conversion (#10716) cross attention Q and K projections needs to have their heads swapped, similar to non-cross attention Q and K tensors --- convert/convert_llama.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/convert/convert_llama.go b/convert/convert_llama.go index e491a9d8..43969749 100644 --- a/convert/convert_llama.go +++ b/convert/convert_llama.go @@ -139,7 +139,8 @@ func (p *llamaModel) Tensors(ts []Tensor) []*ggml.Tensor { } for _, t := range ts { - if strings.HasSuffix(t.Name(), "attn_q.weight") || strings.HasSuffix(t.Name(), "attn_k.weight") { + if strings.HasSuffix(t.Name(), "attn_q.weight") || strings.HasSuffix(t.Name(), "attn_k.weight") || + 
strings.HasSuffix(t.Name(), "attn_q_proj.weight") || strings.HasSuffix(t.Name(), "attn_k_proj.weight") { if !p.skipRepack { t.SetRepacker(p.repack) } @@ -181,9 +182,9 @@ func (p *llamaModel) repack(name string, data []float32, shape []uint64) ([]floa } var heads uint32 - if strings.HasSuffix(name, "attn_q.weight") { + if strings.HasSuffix(name, "attn_q.weight") || strings.HasSuffix(name, "attn_q_proj.weight") { heads = p.NumAttentionHeads - } else if strings.HasSuffix(name, "attn_k.weight") { + } else if strings.HasSuffix(name, "attn_k.weight") || strings.HasSuffix(name, "attn_k_proj.weight") { heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads) } else { return nil, fmt.Errorf("unknown tensor for repack: %s", name) From ef202789fad6b8a8ab51f4d2ff5450067e3d1f65 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Thu, 15 May 2025 13:44:44 -0700 Subject: [PATCH 026/108] fix pixel values padding (#10718) * panic if trying to pad 4d * fix pixel values padding --- ml/backend/ggml/ggml.go | 2 ++ model/models/mllama/model.go | 11 ++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index 1ba07983..2821ad11 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -915,6 +915,8 @@ func (t *Tensor) RMSNorm(ctx ml.Context, w ml.Tensor, eps float32) ml.Tensor { func (t *Tensor) Pad(ctx ml.Context, shape ...int) ml.Tensor { if len(shape) != 4 { panic("expected 4 dimensions") + } else if shape[3] != 0 { + panic("cuda does not support 4d tensors") } return &Tensor{ diff --git a/model/models/mllama/model.go b/model/models/mllama/model.go index 93b443ef..4d5bdd4a 100644 --- a/model/models/mllama/model.go +++ b/model/models/mllama/model.go @@ -3,6 +3,7 @@ package mllama import ( "bytes" "image" + "slices" "github.com/ollama/ollama/fs" "github.com/ollama/ollama/kvcache" @@ -73,13 +74,17 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er return nil, err } - pixelValues, err := ctx.Input().FromFloatSlice(f32s, m.imageSize, m.imageSize, m.numChannels, ratio.numTiles()) + if ratio.numTiles() < m.maxNumTiles { + // Pad tiles to maxNumTiles + f32s = slices.Grow(f32s, m.imageSize*m.imageSize*m.numChannels*m.maxNumTiles) + f32s = f32s[:m.imageSize*m.imageSize*m.numChannels*m.maxNumTiles] + } + + pixelValues, err := ctx.Input().FromFloatSlice(f32s, m.imageSize, m.imageSize, m.numChannels, m.maxNumTiles) if err != nil { return nil, err } - pixelValues = pixelValues.Pad(ctx, 0, 0, 0, m.ImageProcessor.maxNumTiles-ratio.numTiles()) - aspectRatio, err := ctx.Input().FromIntSlice([]int32{int32(ratio.rank)}, 1) if err != nil { return nil, err From 499ae7311fd26cb4e655ebea69712de3e242f629 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Fri, 9 May 2025 16:51:47 -0700 Subject: [PATCH 027/108] ollamarunner: Base cached tokens on current prompt When we restore a sequence from the cache, we split the prompt into the already used tokens (stored in the cache) and new tokens that need to be processed. Currently, the references to the used tokens are coming from the stored previous sequence. However, even though we know that the used tokens are semantically equivalent to the prefix of the prompt, tokens can contain pointers which are no longer valid. As a result, it is better to get the used tokens from the prompt, which has currently valid pointers. This doesn't currently have any impact because it isn't possible to reuse the pointers (which are tensors) anyways. However, it becomes an issue once we can. 
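In other words, the change only swaps where the reused prefix is taken from.
A sketch using the runner's variable names (see the diff below for the real
change):

    // numPast tokens at the start of prompt match what is already cached.
    slot.Inputs = prompt[:numPast] // keep references from the current prompt
    prompt = prompt[numPast:]      // only the new suffix still needs processing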
--- runner/llamarunner/cache.go | 2 +- runner/ollamarunner/cache.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/runner/llamarunner/cache.go b/runner/llamarunner/cache.go index 2e55b09d..2e273e69 100644 --- a/runner/llamarunner/cache.go +++ b/runner/llamarunner/cache.go @@ -104,8 +104,8 @@ func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCach slog.Debug("loading cache slot", "id", slot.Id, "cache", len(slot.Inputs), "prompt", len(prompt), "used", numPast, "remaining", len(prompt)-numPast) + slot.Inputs = prompt[:numPast] prompt = prompt[numPast:] - slot.Inputs = slot.Inputs[:numPast] return slot, prompt, nil } diff --git a/runner/ollamarunner/cache.go b/runner/ollamarunner/cache.go index 2138d798..43880a41 100644 --- a/runner/ollamarunner/cache.go +++ b/runner/ollamarunner/cache.go @@ -136,8 +136,8 @@ func (c *InputCache) LoadCacheSlot(prompt []input.Input) (*InputCacheSlot, []inp slog.Debug("loading cache slot", "id", slot.Id, "cache", len(slot.Inputs), "prompt", len(prompt), "used", numPast, "remaining", int32(len(prompt))-numPast) + slot.Inputs = prompt[:numPast] prompt = prompt[numPast:] - slot.Inputs = slot.Inputs[:numPast] return slot, prompt, nil } From 3c14461d5d2280723b3f961fb99ad128b3eee9af Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Mon, 5 May 2025 13:32:11 -0700 Subject: [PATCH 028/108] ollamarunner: Separate text and multimodal graphs For some multimodal models (such as gemma3), we create a single graph that generates the image embedding and then use this in the text model. The embedding tensor is completely opaque to the runner. However, this doesn't work if we need to use the embedding in multiple batches. This can arise if the embedding is larger than the batch size. In these cases (as with llama4), we would like to create views that are more appropriately sized. However, if we do this then the original source tensor is used in multiple graphs, which isn't allowed. To avoid that problem, models with this pattern compute the embedding tensor on first use and recreate the individual views. There is no longer a single vision and text graph. This codifies the pattern of separating vision and text graphs. The logic of computing tensors on demand is moved to the runner, so models no longer have to worry about this. It also gives the runner visibility into the multimodal tensors, which is important for memory management. --- model/input/input.go | 26 +++++-- model/model.go | 9 +-- model/models/gemma3/model.go | 14 ++-- model/models/gemma3/model_text.go | 2 +- model/models/llama4/model.go | 102 ++++++++++++++------------- model/models/llama4/model_text.go | 7 +- model/models/mistral3/model.go | 41 +++-------- model/models/mistral3/model_text.go | 18 +---- model/models/mllama/model.go | 8 ++- model/models/qwen25vl/model.go | 39 ++--------- model/models/qwen25vl/model_text.go | 7 +- runner/ollamarunner/cache_test.go | 17 ++--- runner/ollamarunner/multimodal.go | 103 ++++++++++++++++++++++++++++ runner/ollamarunner/runner.go | 34 ++++++--- 14 files changed, 241 insertions(+), 186 deletions(-) create mode 100644 runner/ollamarunner/multimodal.go diff --git a/model/input/input.go b/model/input/input.go index d66f52a0..bd9b53ec 100644 --- a/model/input/input.go +++ b/model/input/input.go @@ -2,16 +2,30 @@ package input import "github.com/ollama/ollama/ml" +// Multimodal is a multimodal embedding or a component of one. +// For example, it could be a row of an image that can be processed +// independently. 
+type Multimodal struct { + // Tensor is the embedding data. Implementations may chose what to + // store here or it may be nil if not needed. However, any ml.Tensor + // objects must be stored here and not in Data. + Tensor ml.Tensor + + // Data is implementation-specific opaque data, such as metadata on how + // to layout Tensor. It may be nil if not needed. It may also store larger + // objects such as complete images if they are to be processed later. + Data any +} + // Input represents one token in the input stream type Input struct { // Token is a single element of text. Token int32 - // Multimodal is opaque data representing a non-text - // element such as an image (or part of one if the image - // can be processed in pieces). It may be either together - // with Token or on its own. - Multimodal any + // Multimodal is represents a non-text element such as an + // image (or part of one if the image can be processed in pieces). + // It may be used either together with Token or on its own. + Multimodal []Multimodal // MultimodalHash is a unique representation of the data // stored in Multimodal, used for caching and comparing @@ -32,7 +46,7 @@ type Input struct { // Positions slice. type MultimodalIndex struct { Index int - Multimodal any + Multimodal []Multimodal } // Batch contains the inputs for a model forward pass diff --git a/model/model.go b/model/model.go index 7883b851..98381c90 100644 --- a/model/model.go +++ b/model/model.go @@ -40,12 +40,13 @@ type MultimodalProcessor interface { // EncodeMultimodal processes a single input (such as an image) and // generates an output (typically an embedding) that can be used by the model. // - // The return value is most typically an ml.Tensor, however, different - // type are possible, such as an object containing a tensor plus - // additional metadata, a slice of tensors or even just the original input. + // The return value is one or more tensors, each with optional model-specific + // opaque metadata. Typically, the tensors might be views into an embedding + // with each view representing a chunk of data that can be processed independently + // in different batches. // // The result may be cached by the runner. - EncodeMultimodal(ml.Context, []byte) (any, error) + EncodeMultimodal(ml.Context, []byte) ([]input.Multimodal, error) // PostTokenize is called after tokenization to allow the model to edit the // input stream to correctly arrange multimodal elements. 
diff --git a/model/models/gemma3/model.go b/model/models/gemma3/model.go index bf396b6a..d53eb6cc 100644 --- a/model/models/gemma3/model.go +++ b/model/models/gemma3/model.go @@ -82,7 +82,7 @@ func New(c fs.Config) (model.Model, error) { return &m, nil } -func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) { +func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) { if len(m.VisionModel.Layers) == 0 { return nil, model.ErrNoVisionModel } @@ -108,22 +108,22 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er visionOutputs := m.VisionModel.Forward(ctx, pixelValues) visionOutputs = m.MultiModalProjector.Forward(ctx, visionOutputs, m.imageSize, m.patchSize, m.VisionModel.eps) - return visionOutputs, nil + return []input.Multimodal{{Tensor: visionOutputs}}, nil } func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) { var result []input.Input for _, inp := range inputs { - if inp.Multimodal == nil { + if len(inp.Multimodal) == 0 { result = append(result, inp) } else { - inputMultimodal := inp.Multimodal.(ml.Tensor) + inputMultimodal := inp.Multimodal[0].Tensor result = append(result, - input.Input{Token: 108, SameBatch: inputMultimodal.Dim(1) + 3}, // "\n\n" - input.Input{Token: 255999}, // """ - input.Input{Multimodal: inputMultimodal, MultimodalHash: inp.MultimodalHash}, // image data is on the first placeholder + input.Input{Token: 108, SameBatch: inputMultimodal.Dim(1) + 3}, // "\n\n" + input.Input{Token: 255999}, // """ + input.Input{Multimodal: []input.Multimodal{{Tensor: inputMultimodal}}, MultimodalHash: inp.MultimodalHash}, // image data is on the first placeholder ) // add image token placeholders diff --git a/model/models/gemma3/model_text.go b/model/models/gemma3/model_text.go index 741818a2..a40614af 100644 --- a/model/models/gemma3/model_text.go +++ b/model/models/gemma3/model_text.go @@ -165,7 +165,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor // set image embeddings var except []int for _, image := range batch.Multimodal { - visionOutputs := image.Multimodal.(ml.Tensor) + visionOutputs := image.Multimodal[0].Tensor ctx.Forward(visionOutputs.Copy(ctx, hiddenState.View(ctx, image.Index*hiddenState.Stride(1), visionOutputs.Dim(0)*visionOutputs.Dim(1)))) for i := range visionOutputs.Dim(1) { diff --git a/model/models/llama4/model.go b/model/models/llama4/model.go index 798f0d16..c94aa72f 100644 --- a/model/models/llama4/model.go +++ b/model/models/llama4/model.go @@ -4,7 +4,6 @@ import ( "bytes" "image" "slices" - "sync" "github.com/ollama/ollama/fs" "github.com/ollama/ollama/kvcache" @@ -63,7 +62,7 @@ func New(c fs.Config) (model.Model, error) { return &m, nil } -func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) { +func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) { if len(m.VisionModel.Layers) < 1 { return nil, model.ErrNoVisionModel } @@ -103,70 +102,79 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er visionOutputs := m.VisionModel.Forward(ctx, pixelValues) visionOutputs = visionOutputs.Reshape(ctx, visionOutputs.Dim(0), visionOutputs.Dim(1)*visionOutputs.Dim(2)*visionOutputs.Dim(3)) projectedOutputs := m.Projector.Forward(ctx, visionOutputs) - return &chunks{Model: m, Tensor: projectedOutputs, aspectRatio: image.Point{ratioW, ratioH}}, nil + + var multimodal []input.Multimodal + aspectRatio := 
image.Point{ratioW, ratioH} + + var offset int + patchesPerChunk := projectedOutputs.Dim(1) + if aspectRatio.Y*aspectRatio.X > 1 { + patchesPerChunk = projectedOutputs.Dim(1) / (aspectRatio.X*aspectRatio.Y + 1) + + for range aspectRatio.Y { + for x := range aspectRatio.X { + view := projectedOutputs.View(ctx, projectedOutputs.Stride(1)*offset, + projectedOutputs.Dim(0), projectedOutputs.Stride(1), + patchesPerChunk) + var separator separator + if x < aspectRatio.X-1 { + separator.x = true // <|tile_x_separator|> + } else { + separator.y = true // <|tile_y_separator|> + } + multimodal = append(multimodal, input.Multimodal{Tensor: view, Data: &separator}) + offset += patchesPerChunk + } + } + } + + view := projectedOutputs.View(ctx, projectedOutputs.Stride(1)*offset, + projectedOutputs.Dim(0), projectedOutputs.Stride(1), + patchesPerChunk) + multimodal = append(multimodal, input.Multimodal{Tensor: view, Data: &separator{}}) + + return multimodal, nil } -type chunks struct { - *Model - ml.Tensor - aspectRatio image.Point - - dataOnce sync.Once - data []float32 -} - -type chunk struct { - *chunks - s, n int -} - -func (r *chunk) floats() []float32 { - r.dataOnce.Do(func() { - temp := r.Backend().NewContext() - defer temp.Close() - temp.Forward(r.Tensor).Compute(r.Tensor) - r.data = r.Floats() - }) - - return r.data[r.s*r.Dim(0) : (r.s+r.n)*r.Dim(0)] +type separator struct { + x bool + y bool } func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) { var result []input.Input for _, inp := range inputs { - if inp.Multimodal == nil { + if len(inp.Multimodal) == 0 { result = append(result, inp) continue } - t := inp.Multimodal.(*chunks) var imageInputs []input.Input imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_start|> - var offset int - patchesPerChunk := t.Dim(1) - if t.aspectRatio.Y*t.aspectRatio.X > 1 { - patchesPerChunk = t.Dim(1) / (t.aspectRatio.X*t.aspectRatio.Y + 1) + for i, mm := range inp.Multimodal { + patchesPerChunk := mm.Tensor.Dim(1) - for range t.aspectRatio.Y { - for x := range t.aspectRatio.X { - imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: &chunk{t, offset, patchesPerChunk}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|> - imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...) - if x < t.aspectRatio.X-1 { - imageInputs = append(imageInputs, input.Input{Token: 200084}) // <|tile_x_separator|> - } - offset += patchesPerChunk + if i < len(inp.Multimodal)-1 { + separator := mm.Data.(*separator) + + imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: []input.Multimodal{{Tensor: mm.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|> + imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...) 
+ + if separator.x { + imageInputs = append(imageInputs, input.Input{Token: 200084}) // <|tile_x_separator|> } - - imageInputs = append(imageInputs, input.Input{Token: 200085}) // <|tile_y_separator|> + if separator.y { + imageInputs = append(imageInputs, input.Input{Token: 200085}) // <|tile_y_separator|> + } + } else { + imageInputs = append(imageInputs, input.Input{Token: 200090}) // <|image|> + imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: []input.Multimodal{{Tensor: mm.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|> + imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...) + imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_end|> } } - imageInputs = append(imageInputs, input.Input{Token: 200090}) // <|image|> - imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: &chunk{t, offset, patchesPerChunk}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|> - imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...) - imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_end|> - result = append(result, imageInputs...) } diff --git a/model/models/llama4/model_text.go b/model/models/llama4/model_text.go index 3f9f578f..d98587bd 100644 --- a/model/models/llama4/model_text.go +++ b/model/models/llama4/model_text.go @@ -210,12 +210,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor hiddenStates := m.TokenEmbedding.Forward(ctx, inputs).Duplicate(ctx) for _, mi := range batch.Multimodal { - f32s := mi.Multimodal.(*chunk).floats() - img, err := ctx.Input().FromFloatSlice(f32s, len(f32s)/m.hiddenSize, m.hiddenSize) - if err != nil { - panic(err) - } - + img := mi.Multimodal[0].Tensor ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1)))) } diff --git a/model/models/mistral3/model.go b/model/models/mistral3/model.go index c9685244..b93882a9 100644 --- a/model/models/mistral3/model.go +++ b/model/models/mistral3/model.go @@ -4,7 +4,6 @@ import ( "bytes" "image" "slices" - "sync" "github.com/ollama/ollama/fs" "github.com/ollama/ollama/kvcache" @@ -105,7 +104,7 @@ func newMultiModalProjector(c fs.Config) *MultiModalProjector { } } -func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) { +func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) { if len(m.VisionModel.Layers) == 0 { return nil, model.ErrNoVisionModel } @@ -129,37 +128,14 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er features, size := m.MultiModalProjector.Forward(ctx, visionOutputs, size) // split into patches to be sent to the text transformer - parent := imageFeatures{tensor: features} - rows := make([]*imageRow, size.Y) + rows := make([]input.Multimodal, size.Y) for i := range rows { - rows[i] = &imageRow{parent: &parent, s: i, shape: []int{features.Dim(0), size.X}} + rows[i].Tensor = features.View(ctx, features.Stride(1)*size.X*i, features.Dim(0), features.Stride(1), size.X) } return rows, nil } -type imageFeatures struct { - tensor ml.Tensor - - dataOnce sync.Once - data []float32 -} - -type imageRow struct { - parent *imageFeatures - s int - shape []int -} - -func (r *imageRow) data() []float32 { - n := 1 - for _, s := range r.shape { - n *= s - } - - return r.parent.data[r.s*n : (r.s+1)*n] -} - 
// PostTokenize arranges Mistral 3's inputs for the forward pass // In Mistral 3 and Pixtral, the input patches are arranged as follows: // [IMG]...[IMG][IMG_BREAK][IMG]...[IMG][IMG_BREAK][IMG]...[IMG][IMG_END] @@ -168,15 +144,14 @@ func (r *imageRow) data() []float32 { func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) { var result []input.Input for _, inp := range inputs { - if inp.Multimodal == nil { + if len(inp.Multimodal) == 0 { result = append(result, inp) } else { - inputMultimodal := inp.Multimodal.([]*imageRow) - for i, row := range inputMultimodal { + for i, row := range inp.Multimodal { // [IMG] - result = append(result, input.Input{Token: 10, Multimodal: row, MultimodalHash: inp.MultimodalHash, SameBatch: row.shape[1]}) - result = append(result, slices.Repeat([]input.Input{{Token: 10}}, row.shape[1]-1)...) - if i == len(inputMultimodal)-1 { + result = append(result, input.Input{Token: 10, Multimodal: []input.Multimodal{{Tensor: row.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: row.Tensor.Dim(1)}) + result = append(result, slices.Repeat([]input.Input{{Token: 10}}, row.Tensor.Dim(1)-1)...) + if i == len(inp.Multimodal)-1 { // [IMG_END] result = append(result, input.Input{Token: 13}) } else { diff --git a/model/models/mistral3/model_text.go b/model/models/mistral3/model_text.go index 565b001a..17939800 100644 --- a/model/models/mistral3/model_text.go +++ b/model/models/mistral3/model_text.go @@ -9,7 +9,6 @@ import ( "github.com/ollama/ollama/kvcache" "github.com/ollama/ollama/ml" "github.com/ollama/ollama/ml/nn" - "github.com/ollama/ollama/model" "github.com/ollama/ollama/model/input" ) @@ -20,8 +19,6 @@ type TextOptions struct { } type TextModel struct { - model.Base - TokenEmbedding *nn.Embedding `gguf:"token_embd"` Layers []Layer `gguf:"blk"` OutputNorm *nn.RMSNorm `gguf:"output_norm"` @@ -109,20 +106,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor // image embeddings for _, image := range batch.Multimodal { - row := image.Multimodal.(*imageRow) - row.parent.dataOnce.Do(func() { - // use a new, throwaway context so the image tensor is not added to the graph - temp := m.Backend().NewContext() - temp.Forward(row.parent.tensor).Compute(row.parent.tensor) - row.parent.data = row.parent.tensor.Floats() - temp.Close() - }) - - imageFeature, err := ctx.Input().FromFloatSlice(row.data(), row.shape...) 
- if err != nil { - panic(err) - } - + imageFeature := image.Multimodal[0].Tensor ctx.Forward(imageFeature.Copy(ctx, hiddenState.View(ctx, image.Index*hiddenState.Stride(1), imageFeature.Dim(0)*imageFeature.Dim(1)))) } diff --git a/model/models/mllama/model.go b/model/models/mllama/model.go index 4d5bdd4a..15571d9c 100644 --- a/model/models/mllama/model.go +++ b/model/models/mllama/model.go @@ -59,7 +59,7 @@ func New(c fs.Config) (model.Model, error) { return &m, nil } -func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) { +func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) { if len(m.VisionModel.Transformer.Layers) == 0 || len(m.GlobalTransformer.Layers) == 0 { return nil, model.ErrNoVisionModel } @@ -92,7 +92,9 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er positionIDs := ctx.Arange(0, 1601, 1, ml.DTypeI32) crossAttentionStates := m.VisionModel.Forward(ctx, pixelValues, positionIDs, aspectRatio) - return m.Projector.Forward(ctx, crossAttentionStates), nil + projectedOutputs := m.Projector.Forward(ctx, crossAttentionStates) + + return []input.Multimodal{{Tensor: projectedOutputs}}, nil } func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) { @@ -108,7 +110,7 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) { func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { var crossAttentionStates ml.Tensor if len(batch.Multimodal) > 0 { - crossAttentionStates = batch.Multimodal[len(batch.Multimodal)-1].Multimodal.(ml.Tensor) + crossAttentionStates = batch.Multimodal[len(batch.Multimodal)-1].Multimodal[0].Tensor } positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions)) diff --git a/model/models/qwen25vl/model.go b/model/models/qwen25vl/model.go index 9d243c30..48655450 100644 --- a/model/models/qwen25vl/model.go +++ b/model/models/qwen25vl/model.go @@ -5,7 +5,6 @@ import ( "fmt" "image" "slices" - "sync" "github.com/ollama/ollama/fs" "github.com/ollama/ollama/kvcache" @@ -77,7 +76,7 @@ func (m *Model) PixelValues(ctx ml.Context, multimodalData []byte) (ml.Tensor, * return pixelValues, grid, nil } -func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) { +func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) { if len(m.VisionModel.Layers) == 0 { return nil, model.ErrNoVisionModel } @@ -88,31 +87,7 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er } visionOutputs := m.VisionModel.Forward(ctx, pixels, grid) - return &chunks{Model: m, Tensor: visionOutputs}, nil -} - -type chunks struct { - *Model - ml.Tensor - - dataOnce sync.Once - data []float32 -} - -type chunk struct { - *chunks - s, n int -} - -func (r *chunk) floats() []float32 { - r.dataOnce.Do(func() { - temp := r.Backend().NewContext() - defer temp.Close() - temp.Forward(r.Tensor).Compute(r.Tensor) - r.data = r.Floats() - }) - - return r.data[r.s*r.Dim(0) : (r.s+r.n)*r.Dim(0)] + return []input.Multimodal{{Tensor: visionOutputs}}, nil } // PostTokenize arranges Qwen-2.5-VL's inputs for the forward pass @@ -142,20 +117,16 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) { result = append(result, input.Input{Token: pre[i]}) } - // This is an image token with multimodal data - chunksData := inp.Multimodal.(*chunks) - patchesPerChunk := chunksData.Dim(1) + patchesPerChunk := 
inp.Multimodal[0].Tensor.Dim(1) // First add the vision start token - result = append(result, input.Input{Token: visionStartToken, SameBatch: patchesPerChunk + 2}) + result = append(result, input.Input{Token: visionStartToken, SameBatch: patchesPerChunk + 1}) // Add the image token with the multimodal tensor data at the first position - // Create a chunk with proper s and n values result = append(result, input.Input{ Token: imageToken, - Multimodal: &chunk{chunks: chunksData, s: 0, n: patchesPerChunk}, + Multimodal: inp.Multimodal, MultimodalHash: inp.MultimodalHash, - SameBatch: patchesPerChunk, }) // Add the placeholder tokens for the remaining positions (tokensPerGrid-1) diff --git a/model/models/qwen25vl/model_text.go b/model/models/qwen25vl/model_text.go index 6b062f8c..800fd961 100644 --- a/model/models/qwen25vl/model_text.go +++ b/model/models/qwen25vl/model_text.go @@ -129,12 +129,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor hiddenStates := m.TokenEmbedding.Forward(ctx, inputs).Duplicate(ctx) for _, mi := range batch.Multimodal { - f32s := mi.Multimodal.(*chunk).floats() - img, err := ctx.Input().FromFloatSlice(f32s, len(f32s)/m.hiddenSize, m.hiddenSize) - if err != nil { - panic(err) - } - + img := mi.Multimodal[0].Tensor ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1)))) } diff --git a/runner/ollamarunner/cache_test.go b/runner/ollamarunner/cache_test.go index 062b654c..6897b5e4 100644 --- a/runner/ollamarunner/cache_test.go +++ b/runner/ollamarunner/cache_test.go @@ -3,7 +3,6 @@ package ollamarunner import ( "errors" "fmt" - "image" "testing" "time" @@ -12,10 +11,6 @@ import ( ) func TestCountCommon(t *testing.T) { - imgA := image.NewRGBA(image.Rect(0, 0, 100, 100)) - imgB := image.NewRGBA(image.Rect(0, 0, 50, 50)) - imgC := image.NewRGBA(image.Rect(50, 50, 100, 100)) - tests := []struct { name string t1 []input.Input @@ -36,20 +31,20 @@ func TestCountCommon(t *testing.T) { }, { name: "Image Prefix", - t1: []input.Input{{Multimodal: imgA, MultimodalHash: 1}}, - t2: []input.Input{{Multimodal: imgA, MultimodalHash: 1}, {Multimodal: imgB, MultimodalHash: 2}, {Multimodal: imgC, MultimodalHash: 3}}, + t1: []input.Input{{MultimodalHash: 1}}, + t2: []input.Input{{MultimodalHash: 1}, {MultimodalHash: 2}, {MultimodalHash: 3}}, expected: 1, }, { name: "Mixed", - t1: []input.Input{{Token: 1}, {Multimodal: imgA, MultimodalHash: 1}}, - t2: []input.Input{{Token: 1}, {Multimodal: imgA, MultimodalHash: 1}, {Token: 5}}, + t1: []input.Input{{Token: 1}, {MultimodalHash: 1}}, + t2: []input.Input{{Token: 1}, {MultimodalHash: 1}, {Token: 5}}, expected: 2, }, { name: "Mixed, Same Length", - t1: []input.Input{{Token: 1}, {Multimodal: imgA, MultimodalHash: 1}}, - t2: []input.Input{{Token: 1}, {Multimodal: imgB, MultimodalHash: 2}}, + t1: []input.Input{{Token: 1}, {MultimodalHash: 1}}, + t2: []input.Input{{Token: 1}, {MultimodalHash: 2}}, expected: 1, }, { diff --git a/runner/ollamarunner/multimodal.go b/runner/ollamarunner/multimodal.go new file mode 100644 index 00000000..16d35921 --- /dev/null +++ b/runner/ollamarunner/multimodal.go @@ -0,0 +1,103 @@ +package ollamarunner + +import ( + "errors" + + "github.com/ollama/ollama/ml" + "github.com/ollama/ollama/model/input" +) + +// Tensors can't be used across multiple compute graphs. This is a problem +// if a single embedding is split across batches using views since all of +// the views will have the same source tensor. 
We also don't want to +// recompute the entire embedding for each batch. +// +// To avoid this, we compute all of the tensors for the embedding on the +// first use and then store the result in system memory. When we need +// additional tensors, we recreate them from the stored data. + +// multimodalEntry represents the embeddings of a single object (such +// as an image). +type multimodalEntry struct { + // mm is the original set of tensors created by EncodeMultimodal + mm []input.Multimodal + + // data is the computed result of mm. Nil if not yet computed + data [][]float32 +} + +// multimodalStore maps from an individual tensor (of which there +// may be many in a single multimodal object) to its parent embedding +type multimodalStore map[ml.Tensor]*multimodalEntry + +func newMultimodalStore() multimodalStore { + return make(multimodalStore) +} + +// addMultimodal stores an embedding for later use in a compute graph +func (m multimodalStore) addMultimodal(embedding []input.Multimodal) { + entry := &multimodalEntry{mm: embedding} + + for _, e := range embedding { + if e.Tensor != nil { + m[e.Tensor] = entry + } + } +} + +// getMultimodal takes a source set of tensors (which may contain a whole or +// parts of one or more images) and returns the equivalent that can be used in +// the current context +func (m multimodalStore) getMultimodal(backend ml.Backend, ctx ml.Context, in []input.Multimodal) ([]input.Multimodal, error) { + out := make([]input.Multimodal, len(in)) + for i := range out { + if in[i].Tensor != nil { + var err error + out[i].Tensor, err = m.getTensor(backend, ctx, in[i].Tensor) + if err != nil { + return nil, err + } + } + + out[i].Data = in[i].Data + } + + return out, nil +} + +func (m multimodalStore) getTensor(backend ml.Backend, ctx ml.Context, in ml.Tensor) (ml.Tensor, error) { + entry := m[in] + + if entry.data == nil { + computeCtx := backend.NewContext() + defer computeCtx.Close() + + var tensors []ml.Tensor + for _, t := range entry.mm { + if t.Tensor != nil { + tensors = append(tensors, t.Tensor) + } + } + + if len(tensors) == 0 { + return nil, nil + } + + computeCtx.Forward(tensors...).Compute(tensors...) + + entry.data = make([][]float32, len(entry.mm)) + for i, t := range entry.mm { + if t.Tensor != nil { + entry.data[i] = t.Tensor.Floats() + } + } + } + + for i, t := range entry.mm { + if in == t.Tensor { + return ctx.Input().FromFloatSlice(entry.data[i], t.Tensor.Shape()...) 
+ } + } + + return nil, errors.New("multimodal tensor not found") +} diff --git a/runner/ollamarunner/runner.go b/runner/ollamarunner/runner.go index 9a522223..4e203b7b 100644 --- a/runner/ollamarunner/runner.go +++ b/runner/ollamarunner/runner.go @@ -40,6 +40,9 @@ type Sequence struct { // multimodal embeddings ctxs []ml.Context + // mmStore holds multimodal embeddings to mange memory and enable splitting across batches + mmStore multimodalStore + // batch index iBatch int @@ -101,7 +104,7 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe startTime := time.Now() - inputs, ctxs, err := s.inputs(prompt, images) + inputs, ctxs, mmStore, err := s.inputs(prompt, images) if err != nil { return nil, fmt.Errorf("failed to process inputs: %w", err) } else if len(inputs) == 0 { @@ -156,6 +159,7 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe return &Sequence{ ctxs: ctxs, + mmStore: mmStore, inputs: inputs, numPromptInputs: len(inputs), startProcessingTime: startTime, @@ -174,9 +178,10 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe // inputs processes the prompt and images into a list of inputs // by splitting the prompt on [img-] tags, tokenizing text and // decoding images -func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, []ml.Context, error) { +func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, []ml.Context, multimodalStore, error) { var inputs []input.Input var ctxs []ml.Context + var mmStore multimodalStore var parts []string var matches [][]string @@ -187,6 +192,7 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, [ re := regexp.MustCompile(`\[img-(\d+)\]`) parts = re.Split(prompt, -1) matches = re.FindAllStringSubmatch(prompt, -1) + mmStore = newMultimodalStore() } else { parts = []string{prompt} } @@ -196,7 +202,7 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, [ // text - tokenize tokens, err := s.model.(model.TextProcessor).Encode(part, i == 0) if err != nil { - return nil, nil, err + return nil, nil, nil, err } for _, t := range tokens { @@ -216,7 +222,7 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, [ } if imageIndex < 0 { - return nil, nil, fmt.Errorf("invalid image index: %d", n) + return nil, nil, nil, fmt.Errorf("invalid image index: %d", n) } ctx := s.model.Backend().NewContext() @@ -224,13 +230,15 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, [ ctxs = append(ctxs, ctx) imageEmbeddings, err := multimodalProcessor.EncodeMultimodal(ctx, images[imageIndex].Data) if err != nil { - return nil, nil, err + return nil, nil, nil, err } s.multimodalHash.Reset() _, _ = s.multimodalHash.Write(images[imageIndex].Data) imageHash := s.multimodalHash.Sum64() + mmStore.addMultimodal(imageEmbeddings) + inputs = append(inputs, input.Input{Multimodal: imageEmbeddings, MultimodalHash: imageHash}) postTokenize = true } @@ -240,11 +248,11 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, [ var err error inputs, err = multimodalProcessor.PostTokenize(inputs) if err != nil { - return nil, nil, err + return nil, nil, nil, err } } - return inputs, ctxs, nil + return inputs, ctxs, mmStore, nil } type Server struct { @@ -363,6 +371,9 @@ func (s *Server) processBatch() error { } defer s.mu.Unlock() + ctx := s.model.Backend().NewContext() + defer ctx.Close() + var batchInputs []int32 
var batch input.Batch @@ -433,7 +444,11 @@ func (s *Server) processBatch() error { batchInputs = append(batchInputs, inp.Token) if inp.Multimodal != nil { - batch.Multimodal = append(batch.Multimodal, input.MultimodalIndex{Index: len(batchInputs) - 1, Multimodal: inp.Multimodal}) + mm, err := seq.mmStore.getMultimodal(s.model.Backend(), ctx, inp.Multimodal) + if err != nil { + return err + } + batch.Multimodal = append(batch.Multimodal, input.MultimodalIndex{Index: len(batchInputs) - 1, Multimodal: mm}) } batch.Positions = append(batch.Positions, int32(len(seq.cache.Inputs)+len(seq.pendingInputs))) @@ -459,9 +474,6 @@ func (s *Server) processBatch() error { return nil } - ctx := s.model.Backend().NewContext() - defer ctx.Close() - modelOutput, err := model.Forward(ctx, s.model, batchInputs, batch) if err != nil { return fmt.Errorf("failed to decode batch: %w", err) From fe623c2cf44e672dde4552985d9f758a9d09605d Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Mon, 7 Apr 2025 13:59:11 -0700 Subject: [PATCH 029/108] ollamarunner: Multi-modal worst case graph We currently preallocate compute graph memory for the worst case batch of text tokens. This adds support for doing the same for images. Note that image models are more complicated than text models in how they process their inputs so there may be cases where this approach isn't completely generic for all models. It covers all currently supported models though. --- runner/ollamarunner/multimodal.go | 31 ++++++++++---- runner/ollamarunner/runner.go | 71 ++++++++++++++++++++++++++++--- 2 files changed, 88 insertions(+), 14 deletions(-) diff --git a/runner/ollamarunner/multimodal.go b/runner/ollamarunner/multimodal.go index 16d35921..d78612fe 100644 --- a/runner/ollamarunner/multimodal.go +++ b/runner/ollamarunner/multimodal.go @@ -48,12 +48,12 @@ func (m multimodalStore) addMultimodal(embedding []input.Multimodal) { // getMultimodal takes a source set of tensors (which may contain a whole or // parts of one or more images) and returns the equivalent that can be used in // the current context -func (m multimodalStore) getMultimodal(backend ml.Backend, ctx ml.Context, in []input.Multimodal) ([]input.Multimodal, error) { +func (m multimodalStore) getMultimodal(backend ml.Backend, ctx ml.Context, in []input.Multimodal, reserve bool) ([]input.Multimodal, error) { out := make([]input.Multimodal, len(in)) for i := range out { if in[i].Tensor != nil { var err error - out[i].Tensor, err = m.getTensor(backend, ctx, in[i].Tensor) + out[i].Tensor, err = m.getTensor(backend, ctx, in[i].Tensor, reserve) if err != nil { return nil, err } @@ -65,7 +65,7 @@ func (m multimodalStore) getMultimodal(backend ml.Backend, ctx ml.Context, in [] return out, nil } -func (m multimodalStore) getTensor(backend ml.Backend, ctx ml.Context, in ml.Tensor) (ml.Tensor, error) { +func (m multimodalStore) getTensor(backend ml.Backend, ctx ml.Context, in ml.Tensor, reserve bool) (ml.Tensor, error) { entry := m[in] if entry.data == nil { @@ -83,19 +83,32 @@ func (m multimodalStore) getTensor(backend ml.Backend, ctx ml.Context, in ml.Ten return nil, nil } - computeCtx.Forward(tensors...).Compute(tensors...) - + computeCtx.Forward(tensors...) entry.data = make([][]float32, len(entry.mm)) - for i, t := range entry.mm { - if t.Tensor != nil { - entry.data[i] = t.Tensor.Floats() + + if !reserve { + computeCtx.Compute(tensors...) 
+ + for i, t := range entry.mm { + if t.Tensor != nil { + entry.data[i] = t.Tensor.Floats() + } + } + } else { + err := computeCtx.Reserve() + if err != nil { + return nil, err } } } for i, t := range entry.mm { if in == t.Tensor { - return ctx.Input().FromFloatSlice(entry.data[i], t.Tensor.Shape()...) + if !reserve { + return ctx.Input().FromFloatSlice(entry.data[i], t.Tensor.Shape()...) + } else { + return ctx.Input().Empty(t.Tensor.DType(), t.Tensor.Shape()...), nil + } } } diff --git a/runner/ollamarunner/runner.go b/runner/ollamarunner/runner.go index 4e203b7b..cd42d434 100644 --- a/runner/ollamarunner/runner.go +++ b/runner/ollamarunner/runner.go @@ -1,12 +1,14 @@ package ollamarunner import ( + "bytes" "context" "encoding/json" "errors" "flag" "fmt" "hash/maphash" + "image" "log" "log/slog" "net" @@ -20,6 +22,7 @@ import ( "time" "unicode/utf8" + "golang.org/x/image/bmp" "golang.org/x/sync/semaphore" "github.com/ollama/ollama/api" @@ -444,7 +447,7 @@ func (s *Server) processBatch() error { batchInputs = append(batchInputs, inp.Token) if inp.Multimodal != nil { - mm, err := seq.mmStore.getMultimodal(s.model.Backend(), ctx, inp.Multimodal) + mm, err := seq.mmStore.getMultimodal(s.model.Backend(), ctx, inp.Multimodal, false) if err != nil { return err } @@ -732,12 +735,71 @@ func (s *Server) reserveWorstCaseGraph() error { ctx := s.model.Backend().NewContext() defer ctx.Close() + var err error + inputs := make([]input.Input, s.batchSize) + mmStore := newMultimodalStore() + + // Multimodal strategy: + // - Encode a 2048x2048 image. This assumes that a single image of this + // size is sufficient to trigger the worst case. This is currently true + // because for existing models, only a single image fits in a batch. + // - Add the embedding to a full batch of tokens - this is necessary because + // the model may be looking for non-image data, such as tags. + // - Run PostTokenize to execute any transformations between generated + // embeddings and what the forward pass expects. + // - The result may now be larger than a batch (images may not fit in a + // single batch), so trim based on what will fit and must be grouped together. + // - Fill out the rest of the space with text tokens. 
+	if multimodalProcessor, ok := s.model.(model.MultimodalProcessor); ok {
+		mmCtx := s.model.Backend().NewContext()
+		defer mmCtx.Close()
+
+		img := image.NewGray(image.Rect(0, 0, 2048, 2048))
+		var buf bytes.Buffer
+		bmp.Encode(&buf, img)
+
+		if inputs[0].Multimodal, err = multimodalProcessor.EncodeMultimodal(mmCtx, buf.Bytes()); err == nil {
+			mmStore.addMultimodal(inputs[0].Multimodal)
+
+			inputs, err = multimodalProcessor.PostTokenize(inputs)
+			if err != nil {
+				return err
+			}
+
+			for i, inp := range inputs {
+				minBatch := 1 + inp.SameBatch
+				if minBatch > s.batchSize {
+					inputs = inputs[i:min(i+minBatch, len(inputs))]
+					break
+				} else if i+minBatch > s.batchSize {
+					inputs = inputs[:i]
+					break
+				}
+			}
+
+			if len(inputs) < s.batchSize {
+				newInputs := make([]input.Input, s.batchSize)
+				copy(newInputs, inputs)
+				inputs = newInputs
+			}
+		}
+	}
+
 	var batch input.Batch
 
-	inputs := make([]int32, s.batchSize)
+	batchInputs := make([]int32, len(inputs))
 	batch.Positions = make([]int32, len(inputs))
 	batch.Sequences = make([]int, len(inputs))
-	for i := range inputs {
+	for i, inp := range inputs {
+		batchInputs[i] = inp.Token
+		if inp.Multimodal != nil {
+			mm, err := mmStore.getMultimodal(s.model.Backend(), ctx, inp.Multimodal, true)
+			if err != nil {
+				return err
+			}
+			batch.Multimodal = append(batch.Multimodal, input.MultimodalIndex{Index: i, Multimodal: mm})
+		}
+
 		batch.Positions[i] = int32(i)
 	}
 
@@ -746,8 +808,7 @@ func (s *Server) reserveWorstCaseGraph() error {
 		batch.Outputs[i] = int32(i)
 	}
 
-	var err error
-	batch.Inputs, err = ctx.Input().FromIntSlice(inputs, len(inputs))
+	batch.Inputs, err = ctx.Input().FromIntSlice(batchInputs, len(batchInputs))
 	if err != nil {
 		return err
 	}

From feb8923adada675b19dc1bc20f39ed6cfb0b99da Mon Sep 17 00:00:00 2001
From: Bruce MacDonald
Date: Thu, 15 May 2025 15:45:52 -0700
Subject: [PATCH 030/108] cmd: add ellipses to truncated show metadata (#10717)

When a piece of information has been truncated in the show output, an
ellipsis is added to indicate that more data has not been displayed.
---
 cmd/cmd.go      | 50 ++++++++++++++++++++++++++++++++++++++++++-------
 cmd/cmd_test.go |  1 +
 2 files changed, 44 insertions(+), 7 deletions(-)

diff --git a/cmd/cmd.go b/cmd/cmd.go
index 0f8072f0..df9af354 100644
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -747,11 +747,38 @@ func showInfo(resp *api.ShowResponse, verbose bool, w io.Writer) error {
 			case float64:
 				v = fmt.Sprintf("%g", vData)
 			case []any:
-				n := 3
-				if len(vData) < n {
-					n = len(vData)
+				targetWidth := 10 // Small width where we are displaying the data in a column
+
+				var itemsToShow int
+				totalWidth := 1 // Start with 1 for opening bracket
+
+				// Find how many we can fit
+				for i := range vData {
+					itemStr := fmt.Sprintf("%v", vData[i])
+					width := runewidth.StringWidth(itemStr)
+
+					// Add separator width (", ") for all items except the first
+					if i > 0 {
+						width += 2
+					}
+
+					// Check if adding this item would exceed our width limit
+					if totalWidth+width > targetWidth && i > 0 {
+						break
+					}
+
+					totalWidth += width
+					itemsToShow++
+				}
+
+				// Format the output
+				if itemsToShow < len(vData) {
+					v = fmt.Sprintf("%v", vData[:itemsToShow])
+					v = strings.TrimSuffix(v, "]")
+					v += fmt.Sprintf(" ...+%d more]", len(vData)-itemsToShow)
+				} else {
+					v = fmt.Sprintf("%v", vData)
 				}
-				v = fmt.Sprintf("%v", vData[:n])
 			default:
 				v = fmt.Sprintf("%T", vData)
 			}
@@ -772,10 +799,19 @@ func showInfo(resp *api.ShowResponse, verbose bool, w io.Writer) error {
 	head := func(s string, n int) (rows [][]string) {
 		scanner := bufio.NewScanner(strings.NewReader(s))
- for scanner.Scan() && (len(rows) < n || n < 0) { - if text := scanner.Text(); text != "" { - rows = append(rows, []string{"", strings.TrimSpace(text)}) + count := 0 + for scanner.Scan() { + text := strings.TrimSpace(scanner.Text()) + if text == "" { + continue } + count++ + if n < 0 || count <= n { + rows = append(rows, []string{"", text}) + } + } + if n >= 0 && count > n { + rows = append(rows, []string{"", "..."}) } return } diff --git a/cmd/cmd_test.go b/cmd/cmd_test.go index eb2fb124..cf5fe7ca 100644 --- a/cmd/cmd_test.go +++ b/cmd/cmd_test.go @@ -225,6 +225,7 @@ Weigh anchor! System You are a pirate! Ahoy, matey! + ... ` if diff := cmp.Diff(expect, b.String()); diff != "" { From 27da2cddc514208f4e23539e00485b880e3e2191 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 15 May 2025 16:33:23 -0700 Subject: [PATCH 031/108] Fix lingering Q4_0 help reference (#10720) --- cmd/cmd.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/cmd.go b/cmd/cmd.go index df9af354..ad4be7f9 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -1318,7 +1318,7 @@ func NewCLI() *cobra.Command { } createCmd.Flags().StringP("file", "f", "", "Name of the Modelfile (default \"Modelfile\"") - createCmd.Flags().StringP("quantize", "q", "", "Quantize model to this level (e.g. q4_0)") + createCmd.Flags().StringP("quantize", "q", "", "Quantize model to this level (e.g. q4_K_M)") showCmd := &cobra.Command{ Use: "show MODEL", From 333e360422744e92275af2c1c2d5bc039ad97e8f Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Fri, 16 May 2025 13:40:23 -0700 Subject: [PATCH 032/108] model: handle multiple eos tokens (#10577) * get eos_token_id from generation_config.json * refactor * include both ids and strings in trace * comments * remove special case for gemma3 special vocab (#10743) --- convert/convert.go | 5 +- convert/tokenizer.go | 32 +++++ convert/tokenizer_test.go | 61 +++++++++ llama/llama.go | 4 +- .../{process_text.go => bytepairencoding.go} | 126 +----------------- ..._text_test.go => bytepairencoding_test.go} | 0 model/models/gemma2/model.go | 11 +- model/models/gemma3/model.go | 12 +- model/models/llama/model.go | 10 +- model/models/llama4/model.go | 10 +- model/models/mistral3/model.go | 18 +-- model/models/mllama/model.go | 10 +- model/models/qwen25vl/model.go | 11 +- .../{process_text_spm.go => sentencepiece.go} | 23 +--- ...text_spm_test.go => sentencepiece_test.go} | 0 model/textprocessor.go | 17 +++ model/vocabulary.go | 112 ++++++++++++++++ sample/samplers.go | 2 +- 18 files changed, 282 insertions(+), 182 deletions(-) rename model/{process_text.go => bytepairencoding.go} (66%) rename model/{process_text_test.go => bytepairencoding_test.go} (100%) rename model/{process_text_spm.go => sentencepiece.go} (89%) rename model/{process_text_spm_test.go => sentencepiece_test.go} (100%) create mode 100644 model/textprocessor.go create mode 100644 model/vocabulary.go diff --git a/convert/convert.go b/convert/convert.go index 309b0ce1..4a6df66c 100644 --- a/convert/convert.go +++ b/convert/convert.go @@ -53,8 +53,11 @@ func (ModelParameters) KV(t *Tokenizer) ggml.KV { } for _, sv := range t.SpecialVocabulary { - kv[fmt.Sprintf("tokenizer.ggml.%s_token_id", sv.Key())] = uint32(sv.ID) kv[fmt.Sprintf("tokenizer.ggml.add_%s_token", sv.Key())] = sv.AddToken + kv[fmt.Sprintf("tokenizer.ggml.%s_token_id", sv.Key())] = uint32(sv.ID) + if len(sv.IDs) > 0 { + kv[fmt.Sprintf("tokenizer.ggml.%s_token_ids", sv.Key())] = sv.IDs + } } return kv diff --git a/convert/tokenizer.go b/convert/tokenizer.go index 
74e2efed..bedcd4f8 100644 --- a/convert/tokenizer.go +++ b/convert/tokenizer.go @@ -110,6 +110,7 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error) } if f, err := fsys.Open("tokenizer_config.json"); errors.Is(err, os.ErrNotExist) { + // noop } else if err != nil { return nil, err } else { @@ -171,6 +172,34 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error) } } + if f, err := fsys.Open("generation_config.json"); errors.Is(err, os.ErrNotExist) { + } else if err != nil { + return nil, err + } else { + defer f.Close() + + var p map[string]json.RawMessage + if err := json.NewDecoder(f).Decode(&p); err != nil { + return nil, err + } + + for _, st := range specialTokenTypes { + if bts, ok := p[fmt.Sprintf("%s_token_id", st)]; ok { + var ids []int32 + if err := json.Unmarshal(bts, &ids); err != nil { + // value is not a list so the existing ID is used + continue + } + + if i := slices.IndexFunc(t.SpecialVocabulary, func(sv *SpecialVocabulary) bool { + return sv.Type == st + }); i >= 0 { + t.SpecialVocabulary[i].IDs = ids + } + } + } + } + return t, nil } @@ -280,6 +309,9 @@ type SpecialVocabulary struct { ID int Content string AddToken bool + + // IDs is populated by generation_config.json + IDs []int32 } func (sv SpecialVocabulary) Key() string { diff --git a/convert/tokenizer_test.go b/convert/tokenizer_test.go index c6ef9732..813096fd 100644 --- a/convert/tokenizer_test.go +++ b/convert/tokenizer_test.go @@ -247,6 +247,67 @@ func TestParseTokenizer(t *testing.T) { Pre: "default", }, }, + { + name: "generation config eos token ids", + fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{ + "tokenizer.json": strings.NewReader(`{ + "added_tokens": [ + { + "id": 0, + "content": "", + "special": true + }, + { + "id": 1, + "content": "", + "special": true + }, + { + "id": 2, + "content": "", + "special": true + }, + { + "id": 3, + "content": "", + "special": true + } + ], + "model": { + "vocab": { + "": 0, + "": 1, + "": 2, + "": 3 + } + } + }`), + "tokenizer_config.json": strings.NewReader(`{ + "add_bos_token": true, + "add_eos_token": false, + "bos_token": "", + "eos_token": "" + }`), + "generation_config.json": strings.NewReader(`{ + "bos_token_id": 0, + "eos_token_id": [1, 2, 3] + }`), + }), + specialTokenTypes: []string{"pad", "eos", "bos", "unk"}, + want: &Tokenizer{ + Vocabulary: &Vocabulary{ + Model: "gpt2", + Tokens: []string{"", "", "", ""}, + Scores: []float32{0, 1, 2, 3}, + Types: []int32{3, 3, 3, 3}, + }, + SpecialVocabulary: []*SpecialVocabulary{ + {Type: "eos", Content: "", ID: 1, IDs: []int32{1, 2, 3}, AddToken: false}, + {Type: "bos", Content: "", ID: 0, AddToken: true}, + }, + Pre: "default", + }, + }, } for _, tt := range cases { diff --git a/llama/llama.go b/llama/llama.go index 1251be3a..626ea13a 100644 --- a/llama/llama.go +++ b/llama/llama.go @@ -602,7 +602,7 @@ type Grammar struct { mu sync.Mutex } -func NewGrammar(grammar string, vocabIds []uint32, vocabValues []string, eogTokens []uint32) *Grammar { +func NewGrammar(grammar string, vocabIds []uint32, vocabValues []string, eogTokens []int32) *Grammar { cGrammar := C.CString(grammar) defer C.free(unsafe.Pointer(cGrammar)) @@ -622,7 +622,7 @@ func NewGrammar(grammar string, vocabIds []uint32, vocabValues []string, eogToke cEogTokens[i] = C.uint32_t(token) } - g := C.grammar_init(cGrammar, (*C.uint32_t)(unsafe.Pointer(&cTokens[0])), C.size_t(len(cTokens)), (**C.char)(unsafe.Pointer(&cPieces[0])), (*C.uint32_t)(unsafe.Pointer(&cEogTokens[0])), 
C.size_t(len(cEogTokens))) + g := C.grammar_init(cGrammar, unsafe.SliceData(cTokens), C.size_t(len(cTokens)), unsafe.SliceData(cPieces), unsafe.SliceData(cEogTokens), C.size_t(len(cEogTokens))) if g == nil { return nil } diff --git a/model/process_text.go b/model/bytepairencoding.go similarity index 66% rename from model/process_text.go rename to model/bytepairencoding.go index 7b87ccc3..6bb9a003 100644 --- a/model/process_text.go +++ b/model/bytepairencoding.go @@ -5,116 +5,13 @@ import ( "context" "iter" "log/slog" - "slices" "strings" - "sync" "github.com/dlclark/regexp2" heap "github.com/emirpasic/gods/v2/trees/binaryheap" "github.com/ollama/ollama/logutil" ) -type Special int32 - -const ( - SpecialBOS Special = iota - SpecialEOS -) - -const ( - TOKEN_TYPE_NORMAL = iota + 1 - TOKEN_TYPE_UNKNOWN - TOKEN_TYPE_CONTROL - TOKEN_TYPE_USER_DEFINED - TOKEN_TYPE_UNUSED - TOKEN_TYPE_BYTE -) - -type TextProcessor interface { - Encode(s string, addSpecial bool) ([]int32, error) - Decode([]int32) (string, error) - Is(int32, Special) bool - Vocabulary() *Vocabulary -} - -type Vocabulary struct { - Values []string - Types []int32 - Scores []float32 - Merges []string - - BOS, EOS, EOT int32 - AddBOS, AddEOS, AddEOT bool - - specialOnce sync.Once - special []string - - valuesOnce sync.Once - values map[string]int32 - - mergeOnce sync.Once - merge map[string]int32 -} - -func (v *Vocabulary) Is(id int32, special Special) bool { - switch special { - case SpecialBOS: - return id == v.BOS - case SpecialEOS: - return id == v.EOS || id == v.EOT - default: - return false - } -} - -func (v *Vocabulary) Encode(s string) int32 { - v.valuesOnce.Do(func() { - v.values = make(map[string]int32, len(v.Values)) - for i, value := range v.Values { - v.values[value] = int32(i) - } - }) - - if id, ok := v.values[s]; ok { - return id - } - - return -1 -} - -func (v *Vocabulary) Decode(id int32) string { - return v.Values[id] -} - -func (v *Vocabulary) SpecialVocabulary() []string { - v.specialOnce.Do(func() { - for i := range v.Values { - if slices.Contains([]int{105, 106}, i) { - v.special = append(v.special, v.Values[i]) - } else if v.Types[i] == TOKEN_TYPE_CONTROL { - v.special = append(v.special, v.Values[i]) - } - } - }) - - return v.special -} - -func (v *Vocabulary) Merge(left, right string) int { - v.mergeOnce.Do(func() { - v.merge = make(map[string]int32, len(v.Merges)) - for i, merge := range v.Merges { - v.merge[merge] = int32(i) - } - }) - - if id, ok := v.merge[left+" "+right]; ok { - return int(id) - } - - return -1 -} - type BytePairEncoding struct { pre *regexp2.Regexp vocab *Vocabulary @@ -304,27 +201,12 @@ func (bpe BytePairEncoding) Encode(s string, addSpecial bool) ([]int32, error) { } } + slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "string", s, "ids", ids) + if addSpecial && len(ids) > 0 { - if bpe.vocab.AddBOS { - if ids[0] == bpe.vocab.BOS { - slog.Warn("adding bos token to prompt which already has it", "id", bpe.vocab.BOS) - } - - slog.Debug("adding bos token to prompt", "id", bpe.vocab.BOS) - ids = append([]int32{bpe.vocab.BOS}, ids...) 
- } - - if bpe.vocab.AddEOS { - if ids[len(ids)-1] == bpe.vocab.EOS { - slog.Warn("adding eos token to prompt which already has it", "id", bpe.vocab.EOS) - } - - slog.Debug("adding eos token to prompt", "id", bpe.vocab.EOS) - ids = append(ids, bpe.vocab.EOS) - } + ids = bpe.vocab.addSpecials(ids) } - slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "ids", ids) return ids, nil } @@ -352,6 +234,6 @@ func (bpe BytePairEncoding) Decode(ids []int32) (string, error) { } } - slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "string", sb.String()) + slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "ids", ids, "string", sb.String()) return sb.String(), nil } diff --git a/model/process_text_test.go b/model/bytepairencoding_test.go similarity index 100% rename from model/process_text_test.go rename to model/bytepairencoding_test.go diff --git a/model/models/gemma2/model.go b/model/models/gemma2/model.go index 3156b006..a87534c5 100644 --- a/model/models/gemma2/model.go +++ b/model/models/gemma2/model.go @@ -43,10 +43,13 @@ func New(c fs.Config) (model.Model, error) { Values: c.Strings("tokenizer.ggml.tokens"), Scores: c.Floats("tokenizer.ggml.scores"), Types: c.Ints("tokenizer.ggml.token_type"), - BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")), - EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")), - // TODO: set EOT to EOS otherwise 0 will stop generation - EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")), + AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true), + BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))}, + AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false), + EOS: append( + []int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))}, + c.Ints("tokenizer.ggml.eos_token_ids")..., + ), }, ), Layers: make([]Layer, c.Uint("block_count")), diff --git a/model/models/gemma3/model.go b/model/models/gemma3/model.go index d53eb6cc..89d1788e 100644 --- a/model/models/gemma3/model.go +++ b/model/models/gemma3/model.go @@ -60,12 +60,16 @@ func New(c fs.Config) (model.Model, error) { Values: c.Strings("tokenizer.ggml.tokens"), Scores: c.Floats("tokenizer.ggml.scores"), Types: c.Ints("tokenizer.ggml.token_type"), - BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")), AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true), - EOS: int32(1), + BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))}, AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false), - EOT: int32(106), - AddEOT: c.Bool("tokenizer.ggml.add_eot_token", false), + EOS: append( + []int32{ + int32(c.Uint("tokenizer.ggml.eos_token_id")), + int32(c.Uint("tokenizer.ggml.eot_token_id", 106)), + }, + c.Ints("tokenizer.ggml.eos_token_ids")..., + ), }, ), ImageProcessor: newImageProcessor(c), diff --git a/model/models/llama/model.go b/model/models/llama/model.go index c75d7eb2..6e214f0f 100644 --- a/model/models/llama/model.go +++ b/model/models/llama/model.go @@ -43,13 +43,13 @@ func New(c fs.Config) (model.Model, error) { Values: c.Strings("tokenizer.ggml.tokens"), Types: c.Ints("tokenizer.ggml.token_type"), Merges: c.Strings("tokenizer.ggml.merges"), - BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")), AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true), - EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")), + BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))}, AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false), - // TODO: set EOT to EOS otherwise 0 will stop generation - EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")), - AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false), + EOS: append( + 
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))}, + c.Ints("tokenizer.ggml.eos_token_ids")..., + ), }, ), Layers: make([]Layer, c.Uint("block_count")), diff --git a/model/models/llama4/model.go b/model/models/llama4/model.go index c94aa72f..af5173a1 100644 --- a/model/models/llama4/model.go +++ b/model/models/llama4/model.go @@ -40,13 +40,13 @@ func New(c fs.Config) (model.Model, error) { Values: c.Strings("tokenizer.ggml.tokens"), Types: c.Ints("tokenizer.ggml.token_type"), Merges: c.Strings("tokenizer.ggml.merges"), - BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")), AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true), - EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")), + BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))}, AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false), - // TODO: set EOT to EOS otherwise 0 will stop generation - EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")), - AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false), + EOS: append( + []int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))}, + c.Ints("tokenizer.ggml.eos_token_ids")..., + ), }, ), ImageProcessor: newImageProcessor(c), diff --git a/model/models/mistral3/model.go b/model/models/mistral3/model.go index b93882a9..0d384b94 100644 --- a/model/models/mistral3/model.go +++ b/model/models/mistral3/model.go @@ -37,25 +37,25 @@ func New(c fs.Config) (model.Model, error) { } m := &Model{ - TextModel: textModel, - VisionModel: newVisionModel(c), - ImageProcessor: newImageProcessor(c), - MultiModalProjector: newMultiModalProjector(c), BytePairEncoding: model.NewBytePairEncoding( c.String("tokenizer.ggml.pretokenizer", `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`), &model.Vocabulary{ Values: c.Strings("tokenizer.ggml.tokens"), Types: c.Ints("tokenizer.ggml.token_type"), Merges: c.Strings("tokenizer.ggml.merges"), - BOS: int32(c.Uint("tokenizer.ggml.bos_token_id", 1)), AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true), - EOS: int32(c.Uint("tokenizer.ggml.eos_token_id", 2)), + BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))}, AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false), - // TODO: set EOT to EOS otherwise 0 will stop generation - EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")), - AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false), + EOS: append( + []int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))}, + c.Ints("tokenizer.ggml.eos_token_ids")..., + ), }, ), + TextModel: textModel, + VisionModel: newVisionModel(c), + ImageProcessor: newImageProcessor(c), + MultiModalProjector: newMultiModalProjector(c), } m.Cache = kvcache.NewCausalCache(m.TextModel.Shift) diff --git a/model/models/mllama/model.go b/model/models/mllama/model.go index 15571d9c..547e2cb3 100644 --- a/model/models/mllama/model.go +++ b/model/models/mllama/model.go @@ -38,13 +38,13 @@ func New(c fs.Config) (model.Model, error) { Values: c.Strings("tokenizer.ggml.tokens"), Types: c.Ints("tokenizer.ggml.token_type"), Merges: c.Strings("tokenizer.ggml.merges"), - BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")), AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true), - EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")), + BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))}, AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false), - // TODO: set EOT to EOS otherwise 0 will stop generation - EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")), - 
AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false), + EOS: append( + []int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))}, + c.Ints("tokenizer.ggml.eos_token_ids")..., + ), }, ), ImageProcessor: newImageProcessor(c), diff --git a/model/models/qwen25vl/model.go b/model/models/qwen25vl/model.go index 48655450..7de9b6eb 100644 --- a/model/models/qwen25vl/model.go +++ b/model/models/qwen25vl/model.go @@ -34,12 +34,13 @@ func New(c fs.Config) (model.Model, error) { Values: c.Strings("tokenizer.ggml.tokens"), Types: c.Ints("tokenizer.ggml.token_type"), Merges: c.Strings("tokenizer.ggml.merges"), - BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")), - AddBOS: c.Bool("tokenizer.ggml.add_bos_token", false), - EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")), + AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true), + BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))}, AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false), - EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")), - AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false), + EOS: append( + []int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))}, + c.Ints("tokenizer.ggml.eos_token_ids")..., + ), }, ), TextModel: NewTextModel(c), diff --git a/model/process_text_spm.go b/model/sentencepiece.go similarity index 89% rename from model/process_text_spm.go rename to model/sentencepiece.go index b1cff7d2..7d725f04 100644 --- a/model/process_text_spm.go +++ b/model/sentencepiece.go @@ -182,27 +182,12 @@ func (spm SentencePieceModel) Encode(s string, addSpecial bool) ([]int32, error) } } + slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "string", s, "ids", ids) + if addSpecial && len(ids) > 0 { - if spm.vocab.AddBOS { - if ids[0] == spm.vocab.BOS { - slog.Warn("adding bos token to prompt which already has it", "id", spm.vocab.BOS) - } - - slog.Debug("adding bos token to prompt", "id", spm.vocab.BOS) - ids = append([]int32{spm.vocab.BOS}, ids...) 
- } - - if spm.vocab.AddEOS { - if ids[len(ids)-1] == spm.vocab.EOS { - slog.Warn("adding eos token to prompt which already has it", "id", spm.vocab.EOS) - } - - slog.Debug("adding eos token to prompt", "id", spm.vocab.EOS) - ids = append(ids, spm.vocab.EOS) - } + ids = spm.vocab.addSpecials(ids) } - slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "ids", ids) return ids, nil } @@ -261,6 +246,6 @@ func (spm SentencePieceModel) Decode(ids []int32) (string, error) { } } - slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "string", sb.String()) + slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "ids", ids, "string", sb.String()) return sb.String(), nil } diff --git a/model/process_text_spm_test.go b/model/sentencepiece_test.go similarity index 100% rename from model/process_text_spm_test.go rename to model/sentencepiece_test.go diff --git a/model/textprocessor.go b/model/textprocessor.go new file mode 100644 index 00000000..4a36f235 --- /dev/null +++ b/model/textprocessor.go @@ -0,0 +1,17 @@ +package model + +const ( + TOKEN_TYPE_NORMAL = iota + 1 + TOKEN_TYPE_UNKNOWN + TOKEN_TYPE_CONTROL + TOKEN_TYPE_USER_DEFINED + TOKEN_TYPE_UNUSED + TOKEN_TYPE_BYTE +) + +type TextProcessor interface { + Encode(s string, addSpecial bool) ([]int32, error) + Decode([]int32) (string, error) + Is(int32, Special) bool + Vocabulary() *Vocabulary +} diff --git a/model/vocabulary.go b/model/vocabulary.go new file mode 100644 index 00000000..24adbaca --- /dev/null +++ b/model/vocabulary.go @@ -0,0 +1,112 @@ +package model + +import ( + "log/slog" + "slices" + "sync" +) + +type Special int32 + +const ( + SpecialBOS Special = iota + SpecialEOS +) + +type Vocabulary struct { + Values []string + Types []int32 + Scores []float32 + Merges []string + + BOS, EOS []int32 + AddBOS, AddEOS bool + + specialOnce sync.Once + special []string + + valuesOnce sync.Once + values map[string]int32 + + mergeOnce sync.Once + merge map[string]int32 +} + +func (v *Vocabulary) Is(id int32, special Special) bool { + switch special { + case SpecialBOS: + return slices.Contains(v.BOS, id) + case SpecialEOS: + return slices.Contains(v.EOS, id) + default: + return false + } +} + +func (v *Vocabulary) addSpecials(ids []int32) []int32 { + if v.AddBOS && len(v.BOS) > 0 { + if slices.Contains(v.BOS, ids[0]) { + slog.Warn("adding bos token to prompt which already has it", "id", v.BOS) + } + + slog.Debug("adding bos token to prompt", "id", v.BOS) + ids = append([]int32{v.BOS[0]}, ids...) 
+ } + + if v.AddEOS && len(v.EOS) > 0 { + if slices.Contains(v.BOS, ids[len(ids)-1]) { + slog.Warn("adding eos token to prompt which already has it", "id", v.EOS) + } + + slog.Debug("adding eos token to prompt", "id", v.EOS) + ids = append(ids, v.EOS[0]) + } + + return ids +} + +func (v *Vocabulary) Encode(s string) int32 { + v.valuesOnce.Do(func() { + v.values = make(map[string]int32, len(v.Values)) + for i, value := range v.Values { + v.values[value] = int32(i) + } + }) + + if id, ok := v.values[s]; ok { + return id + } + + return -1 +} + +func (v *Vocabulary) Decode(id int32) string { + return v.Values[id] +} + +func (v *Vocabulary) SpecialVocabulary() []string { + v.specialOnce.Do(func() { + for i := range v.Values { + if v.Types[i] == TOKEN_TYPE_CONTROL { + v.special = append(v.special, v.Values[i]) + } + } + }) + + return v.special +} + +func (v *Vocabulary) Merge(left, right string) int { + v.mergeOnce.Do(func() { + v.merge = make(map[string]int32, len(v.Merges)) + for i, merge := range v.Merges { + v.merge[merge] = int32(i) + } + }) + + if id, ok := v.merge[left+" "+right]; ok { + return int(id) + } + + return -1 +} diff --git a/sample/samplers.go b/sample/samplers.go index f0846c8d..d395650d 100644 --- a/sample/samplers.go +++ b/sample/samplers.go @@ -176,7 +176,7 @@ func NewGrammarSampler(model model.TextProcessor, grammarStr string) (*GrammarSa vocabIds[i] = uint32(i) } - grammar := llama.NewGrammar(grammarStr, vocabIds, pieces, []uint32{uint32(model.Vocabulary().EOS), uint32(model.Vocabulary().EOT)}) + grammar := llama.NewGrammar(grammarStr, vocabIds, pieces, model.Vocabulary().EOS) if grammar == nil { return nil, errors.New("sample: failed to initialize grammar") } From 7edfdd2f5f48a7be035cec23b4acd12f7c112e1c Mon Sep 17 00:00:00 2001 From: Ronald Wilson <57818133+ronxldwilson@users.noreply.github.com> Date: Mon, 19 May 2025 01:13:22 +0530 Subject: [PATCH 033/108] readme: add TinyNotepad to community integrations (#10763) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR adds Tiny Notepad, a lightweight, notepad-like interface to chat with local LLMs via Ollama. - It’s designed as a simple, distraction-free alternative. - The app supports basic note-taking, timestamped logs, and model parameter controls. - Built with Tkinter, it runs entirely offline and available via PyPI. Aims to provide a lightweight easy to run and install interface for ollama. --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 7cb61ac2..6a4815c1 100644 --- a/README.md +++ b/README.md @@ -405,6 +405,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Writeopia](https://github.com/Writeopia/Writeopia) (Text editor with integration with Ollama) - [AppFlowy](https://github.com/AppFlowy-IO/AppFlowy) (AI collaborative workspace with Ollama, cross-platform and self-hostable) - [Lumina](https://github.com/cushydigit/lumina.git) (A lightweight, minimal React.js frontend for interacting with Ollama servers) +- [Tiny Notepad](https://pypi.org/project/tiny-notepad) (A lightweight, notepad-like interface to chat with ollama available on PyPI) ### Cloud From a2cc8571c5b2b8f77a8a5e2f65cb7aaa56482dc4 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Tue, 13 May 2025 13:04:20 -0700 Subject: [PATCH 034/108] llm: Consistently track unassigned model data In some cases, if we fail to assign a piece of the model to a GPU then we lose track of this data. 
Although it doesn't change the memory allocation, it does affect the total size of the model reported by tools such as ollama ps (and also the percent offloaded). This makes it look like setting num_gpu isn't reflected in ollama ps, which isn't true but the offloading percent may appear to not change. Spreading the model across more GPUs will continue to impact the reported total size of the model. --- llm/memory.go | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/llm/memory.go b/llm/memory.go index b5a8dd5c..46472330 100644 --- a/llm/memory.go +++ b/llm/memory.go @@ -216,6 +216,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin if len(gpusWithSpace) > 0 { gpuZeroID = gpusWithSpace[0].i gpuAllocations[gpuZeroID] += gpuZeroOverhead + } else { + overflow += gpuZeroOverhead } // For all the layers, find where they can fit on the GPU(s) @@ -256,15 +258,17 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin } // Determine if we need to consider output then find where it fits - if memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) { - for j := len(gpusWithSpace); j > 0; j-- { - g := gpusWithSpace[layerCount%j] - used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload) - if g.g.FreeMemory > overhead+used+memoryLayerOutput { - gpuAllocations[g.i] += memoryLayerOutput - layerCounts[g.i]++ - layerCount++ - break + if memoryLayerOutput > 0 { + if opts.NumGPU < 0 || layerCount < opts.NumGPU { + for j := len(gpusWithSpace); j > 0; j-- { + g := gpusWithSpace[layerCount%j] + used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload) + if g.g.FreeMemory > overhead+used+memoryLayerOutput { + gpuAllocations[g.i] += memoryLayerOutput + layerCounts[g.i]++ + layerCount++ + break + } } } From d75557747357bfb3afd441a0cc207ec944bd3a18 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Tue, 13 May 2025 11:36:52 -0700 Subject: [PATCH 035/108] llm: Estimate projector memory correctly for Ollama engine The Llama engine always places vision projectors on the first GPU if one exists. However, the Ollama engine groups it with the output layer, which means the projector is only offloaded if all other layers are offloaded. The memory estimation code always assumes the former layout - this changes it to use the correct layout based on the engine. This addresses two impacts of the current behavior: - In multi-GPU setups, we can crash with OOM errors when we try to allocate memory on a full GPU while another still has space. - If the vision projector is large, it may prevent us from offloading anything when we could have fit some of the text layers. 
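As a rough illustration of the accounting difference (simplified pseudocode only,
not part of the change itself; it reuses the variable names introduced in the diff
below):

    // llama engine: the projector is always charged to the first GPU
    gpuZeroOverhead += llamaEngineProjectorWeights

    // ollama engine: the projector is charged together with the output layer,
    // so it is only offloaded once all other layers fit
    memoryLastLayer := memoryLayerOutput + ollamaEngineProjectorWeights + ollamaEngineProjectorGraph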
--- llm/memory.go | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/llm/memory.go b/llm/memory.go index 46472330..32774d28 100644 --- a/llm/memory.go +++ b/llm/memory.go @@ -85,8 +85,11 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin var graphOffload uint64 // Projectors loaded into GPU0 only - var projectorWeights uint64 - var projectorGraph uint64 + var llamaEngineProjectorWeights uint64 + + // Projectors loaded with output layer + var ollamaEngineProjectorWeights uint64 + var ollamaEngineProjectorGraph uint64 // Conditional output size on GPU 0 var memoryLayerOutput uint64 @@ -111,14 +114,14 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList) for _, projector := range projectors { - weight := projectorMemoryRequirements(projector) - projectorWeights += weight + llamaEngineProjectorWeights += projectorMemoryRequirements(projector) // multimodal models require at least 2048 context opts.NumCtx = max(opts.NumCtx, 2048) } - if projectorWeights == 0 && projectorGraph == 0 { - projectorWeights, projectorGraph = f.VisionGraphSize() + if llamaEngineProjectorWeights == 0 { + ollamaEngineProjectorWeights, ollamaEngineProjectorGraph = f.VisionGraphSize() + opts.NumCtx = max(opts.NumCtx, 2048) } layers := f.Tensors().GroupLayers() @@ -163,6 +166,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin graphFullOffload = graphPartialOffload } + // Output layer handled at the end if we have space if layer, ok := layers["output_norm"]; ok { memoryLayerOutput += layer.Size() } @@ -172,8 +176,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin memoryLayerOutput += layer.Size() } - // Output layer handled at the end if we have space - gpuZeroOverhead := projectorWeights + projectorGraph + gpuZeroOverhead := llamaEngineProjectorWeights // Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer var layerCount int @@ -258,13 +261,14 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin } // Determine if we need to consider output then find where it fits - if memoryLayerOutput > 0 { + memoryLastLayer := memoryLayerOutput + ollamaEngineProjectorWeights + ollamaEngineProjectorGraph + if memoryLastLayer > 0 { if opts.NumGPU < 0 || layerCount < opts.NumGPU { for j := len(gpusWithSpace); j > 0; j-- { g := gpusWithSpace[layerCount%j] used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload) - if g.g.FreeMemory > overhead+used+memoryLayerOutput { - gpuAllocations[g.i] += memoryLayerOutput + if g.g.FreeMemory > overhead+used+memoryLastLayer { + gpuAllocations[g.i] += memoryLastLayer layerCounts[g.i]++ layerCount++ break @@ -274,7 +278,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin if layerCount < int(f.KV().BlockCount())+1 { fullyLoaded = false - overflow += memoryLayerOutput + overflow += memoryLastLayer } } @@ -332,8 +336,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin memoryLayerOutput: memoryLayerOutput, graphFullOffload: graphFullOffload, graphPartialOffload: graphPartialOffload, - projectorWeights: projectorWeights, - projectorGraph: projectorGraph, + projectorWeights: llamaEngineProjectorWeights + ollamaEngineProjectorWeights, + projectorGraph: 
ollamaEngineProjectorGraph,
 	}
 
 	if gpus[0].Library == "cpu" {

From 94ab428e3f77fdd9d9c833b369bb40980c65049a Mon Sep 17 00:00:00 2001
From: Jesse Gross
Date: Thu, 17 Apr 2025 13:42:40 -0700
Subject: [PATCH 036/108] ggml: Separate tensor load from backend creation

Currently, when the backend is created, the tensors are loaded at the
same time, which is a slow operation. This separates them into two
steps:
 - Create backend, including enumerating tensors and memory allocation
 - Loading tensor data

This allows more flexibility in managing model loading.
---
 convert/convert_test.go       |   4 +-
 fs/ggml/ggml.go               |  14 +--
 fs/ggml/gguf_test.go          |   2 +-
 llm/memory.go                 |   2 +-
 llm/server.go                 |   2 +-
 ml/backend.go                 |  14 ++-
 ml/backend/ggml/ggml.go       | 163 +++++++++++++++++++---------------
 model/model.go                |  12 +--
 runner/ollamarunner/runner.go |  13 ++-
 server/create.go              |  12 +--
 server/images.go              |   2 +-
 server/model.go               |   2 +-
 server/quantization_test.go   |   4 +-
 13 files changed, 131 insertions(+), 115 deletions(-)

diff --git a/convert/convert_test.go b/convert/convert_test.go
index b9db6fa1..105fbb3d 100644
--- a/convert/convert_test.go
+++ b/convert/convert_test.go
@@ -47,7 +47,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, ggml.KV, ggml.Tensors) {
 	}
 	t.Cleanup(func() { r.Close() })
 
-	m, _, err := ggml.Decode(r, -1)
+	m, err := ggml.Decode(r, -1)
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -332,7 +332,7 @@ func TestConvertAdapter(t *testing.T) {
 	}
 	defer r.Close()
 
-	m, _, err := ggml.Decode(r, -1)
+	m, err := ggml.Decode(r, -1)
 	if err != nil {
 		t.Fatal(err)
 	}
diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go
index 8c0a2ae5..aa85aec2 100644
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@@ -15,6 +15,7 @@ import (
 type GGML struct {
 	container
 	model
+	Length int64
 }
 
 type model interface {
@@ -386,12 +387,12 @@ func DetectContentType(b []byte) string {
 //
 // It collects array values for arrays with a size less than or equal to
 // maxArraySize. If the maxArraySize is negative, all arrays are collected.
-func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) { +func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) { rs = bufioutil.NewBufferedSeeker(rs, 32<<10) var magic uint32 if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil { - return nil, 0, err + return nil, err } var c container @@ -401,24 +402,25 @@ func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) { case FILE_MAGIC_GGUF_BE: c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize} default: - return nil, 0, errors.New("invalid file magic") + return nil, errors.New("invalid file magic") } model, err := c.Decode(rs) if err != nil { - return nil, 0, err + return nil, err } offset, err := rs.Seek(0, io.SeekCurrent) if err != nil { - return nil, 0, err + return nil, err } // final model type return &GGML{ container: c, model: model, - }, offset, nil + Length: offset, + }, nil } func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) (kv []uint64, partialOffload, fullOffload uint64) { diff --git a/fs/ggml/gguf_test.go b/fs/ggml/gguf_test.go index 10d3b684..0e071800 100644 --- a/fs/ggml/gguf_test.go +++ b/fs/ggml/gguf_test.go @@ -35,7 +35,7 @@ func TestWriteGGUF(t *testing.T) { } defer r.Close() - ff, _, err := Decode(r, 0) + ff, err := Decode(r, 0) if err != nil { t.Fatal(err) } diff --git a/llm/memory.go b/llm/memory.go index 32774d28..e9ed1738 100644 --- a/llm/memory.go +++ b/llm/memory.go @@ -423,7 +423,7 @@ func projectorMemoryRequirements(filename string) (weights uint64) { } defer file.Close() - ggml, _, err := ggml.Decode(file, 1024) + ggml, err := ggml.Decode(file, 1024) if err != nil { return 0 } diff --git a/llm/server.go b/llm/server.go index c07315fa..4abb569f 100644 --- a/llm/server.go +++ b/llm/server.go @@ -121,7 +121,7 @@ func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) { } defer f.Close() - ggml, _, err := ggml.Decode(f, maxArraySize) + ggml, err := ggml.Decode(f, maxArraySize) return ggml, err } diff --git a/ml/backend.go b/ml/backend.go index cb32d818..2599d774 100644 --- a/ml/backend.go +++ b/ml/backend.go @@ -6,7 +6,6 @@ import ( "encoding/binary" "fmt" "math" - "os" "slices" "strconv" "strings" @@ -15,6 +14,7 @@ import ( ) type Backend interface { + Load(ctx context.Context, progress func(float32)) error Config() fs.Config Get(name string) Tensor NewContext() Context @@ -52,10 +52,6 @@ type CacheConfig struct { // BackendParams controls how the backend loads and executes models type BackendParams struct { - // Progress is a callback function that allows reporting percentage completion - // of model loading - Progress func(float32) - // NumThreads sets the number of threads to use if running on the CPU NumThreads int @@ -72,9 +68,9 @@ type BackendParams struct { FlashAttention bool } -var backends = make(map[string]func(context.Context, *os.File, BackendParams) (Backend, error)) +var backends = make(map[string]func(string, BackendParams) (Backend, error)) -func RegisterBackend(name string, f func(context.Context, *os.File, BackendParams) (Backend, error)) { +func RegisterBackend(name string, f func(string, BackendParams) (Backend, error)) { if _, ok := backends[name]; ok { panic("backend: backend already registered") } @@ -82,9 +78,9 @@ func RegisterBackend(name string, f func(context.Context, *os.File, BackendParam backends[name] = f } -func NewBackend(ctx context.Context, f *os.File, params BackendParams) (Backend, error) { +func NewBackend(modelPath string, params BackendParams) (Backend, 
error) { if backend, ok := backends["ggml"]; ok { - return backend(ctx, f, params) + return backend(modelPath, params) } return nil, fmt.Errorf("unsupported backend") diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index 2821ad11..f0b26b2f 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -44,8 +44,15 @@ func devices() []*C.struct_ggml_backend_device { } type Backend struct { + // modelPath is the location of the model data + modelPath string + meta *fsggml.GGML + // tensorLoadTargets maps from the name of the tensor in the file + // to the name that is used by the model definition + tensorLoadTargets map[string][]string + sched *C.struct_ggml_backend_sched schedBackends []*C.struct_ggml_backend schedBufts []*C.struct_ggml_backend_buffer_type @@ -64,8 +71,14 @@ type Backend struct { maxGraphNodes int } -func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, error) { - meta, n, err := fsggml.Decode(r, -1) +func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { + r, err := os.Open(modelPath) + if err != nil { + return nil, err + } + defer r.Close() + + meta, err := fsggml.Decode(r, -1) if err != nil { return nil, err } @@ -307,73 +320,6 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, } } - var doneBytes atomic.Uint64 - totalBytes := uint64(n) - meta.Tensors().Offset - - g, ctx := errgroup.WithContext(ctx) - g.SetLimit(runtime.GOMAXPROCS(0)) - for _, t := range meta.Tensors().Items() { - t := t - g.Go(func() error { - tts := make([]*C.struct_ggml_tensor, max(1, len(targets[t.Name]))) - for i := range tts { - target := targets[t.Name][i] - if target == "" { - target = t.Name - } - - tt, ok := tensors[target] - if !ok { - return fmt.Errorf("unassigned tensor: %s", t.Name) - } - - tts[i] = tt - } - - // Create a new FD for each goroutine so that each FD is read sequentially, rather than - // seeking around within an FD shared between all goroutines. 
- file, err := os.Open(r.Name()) - if err != nil { - slog.Warn("file open error", "file", r.Name(), "error", err) - return err - } - defer file.Close() - sr := io.NewSectionReader(file, int64(meta.Tensors().Offset+t.Offset), int64(t.Size())) - bts := make([]byte, 128*format.KibiByte) - - var s uint64 - for s < t.Size() { - // Stop if either the parent context has been canceled or if any of the other tensors returned an error - if err := ctx.Err(); err != nil { - return err - } - - n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Size()-s))]) - if err != nil { - slog.Warn("file read error", "file", r.Name(), "error", err) - return err - } - - for _, tt := range tts { - C.ggml_backend_tensor_set(tt, unsafe.Pointer(&bts[0]), C.size_t(s), C.size_t(n)) - } - - s += uint64(n) - - if params.Progress != nil { - done := doneBytes.Add(uint64(n)) - params.Progress(float32(done) / float32(totalBytes)) - } - } - - return nil - }) - } - - if err := g.Wait(); err != nil { - return nil, err - } - // map devices to backend buffer types so new tensors can be assigned to the correct device deviceBufferTypes := make(map[*C.struct_ggml_backend_device]*C.struct_ggml_backend_buffer_type) @@ -397,9 +343,11 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, maxGraphNodes := max(8192, len(meta.Tensors().Items())*5) return &Backend{ - flashAttention: params.FlashAttention, - meta: meta, - tensors: tensors, + modelPath: modelPath, + flashAttention: params.FlashAttention, + meta: meta, + tensorLoadTargets: targets, + tensors: tensors, sched: C.ggml_backend_sched_new( (*C.ggml_backend_t)(unsafe.Pointer(&schedBackends[0])), (*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&schedBufts[0])), @@ -426,6 +374,77 @@ func init() { ml.RegisterBackend("ggml", New) } +func (b *Backend) Load(ctx context.Context, progress func(float32)) error { + var doneBytes atomic.Uint64 + totalBytes := uint64(b.meta.Length) - b.meta.Tensors().Offset + + g, ctx := errgroup.WithContext(ctx) + g.SetLimit(runtime.GOMAXPROCS(0)) + for _, t := range b.meta.Tensors().Items() { + t := t + g.Go(func() error { + tts := make([]*C.struct_ggml_tensor, max(1, len(b.tensorLoadTargets[t.Name]))) + for i := range tts { + target := b.tensorLoadTargets[t.Name][i] + if target == "" { + target = t.Name + } + + tt, ok := b.tensors[target] + if !ok { + return fmt.Errorf("unassigned tensor: %s", t.Name) + } + + tts[i] = tt + } + + // Create a new FD for each goroutine so that each FD is read sequentially, rather than + // seeking around within an FD shared between all goroutines. 
+ file, err := os.Open(b.modelPath) + if err != nil { + slog.Warn("file open error", "file", b.modelPath, "error", err) + return err + } + defer file.Close() + sr := io.NewSectionReader(file, int64(b.meta.Tensors().Offset+t.Offset), int64(t.Size())) + bts := make([]byte, 128*format.KibiByte) + + var s uint64 + for s < t.Size() { + // Stop if either the parent context has been canceled or if any of the other tensors returned an error + if err := ctx.Err(); err != nil { + return err + } + + n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Size()-s))]) + if err != nil { + slog.Warn("file read error", "file", b.modelPath, "error", err) + return err + } + + for _, tt := range tts { + C.ggml_backend_tensor_set(tt, unsafe.Pointer(&bts[0]), C.size_t(s), C.size_t(n)) + } + + s += uint64(n) + + if progress != nil { + done := doneBytes.Add(uint64(n)) + progress(float32(done) / float32(totalBytes)) + } + } + + return nil + }) + } + + if err := g.Wait(); err != nil { + return err + } + + return nil +} + func (b *Backend) Config() fs.Config { return b.meta.KV() } diff --git a/model/model.go b/model/model.go index 98381c90..39b68db1 100644 --- a/model/model.go +++ b/model/model.go @@ -98,14 +98,8 @@ func Register(name string, f func(fs.Config) (Model, error)) { } // New initializes a new model instance with the provided configuration based on the metadata in the model file -func New(ctx context.Context, modelPath string, params ml.BackendParams) (Model, error) { - r, err := os.Open(modelPath) - if err != nil { - return nil, err - } - defer r.Close() - - b, err := ml.NewBackend(ctx, r, params) +func New(modelPath string, params ml.BackendParams) (Model, error) { + b, err := ml.NewBackend(modelPath, params) if err != nil { return nil, err } @@ -134,7 +128,7 @@ func NewTextProcessor(s string) (TextProcessor, error) { return nil, err } defer r.Close() - meta, _, err := fsggml.Decode(r, -1) + meta, err := fsggml.Decode(r, -1) if err != nil { return nil, err } diff --git a/runner/ollamarunner/runner.go b/runner/ollamarunner/runner.go index cd42d434..a488a104 100644 --- a/runner/ollamarunner/runner.go +++ b/runner/ollamarunner/runner.go @@ -845,7 +845,7 @@ func (s *Server) loadModel( multiUserCache bool, ) { var err error - s.model, err = model.New(ctx, mpath, params) + s.model, err = model.New(mpath, params) if err != nil { panic(err) } @@ -874,6 +874,14 @@ func (s *Server) loadModel( panic(err) } + err = s.model.Backend().Load(ctx, + func(progress float32) { + s.progress = progress + }) + if err != nil { + panic(err) + } + s.status = llm.ServerStatusReady s.ready.Done() } @@ -928,9 +936,6 @@ func Execute(args []string) error { } params := ml.BackendParams{ - Progress: func(progress float32) { - server.progress = progress - }, NumThreads: *threads, NumGPULayers: *numGPULayers, MainGPU: *mainGPU, diff --git a/server/create.go b/server/create.go index 68e003df..82856c4d 100644 --- a/server/create.go +++ b/server/create.go @@ -295,7 +295,7 @@ func convertFromSafetensors(files map[string]string, baseLayers []*layerGGML, is } defer bin.Close() - f, _, err := ggml.Decode(bin, -1) + f, err := ggml.Decode(bin, -1) if err != nil { return nil, err } @@ -467,7 +467,7 @@ func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.Progr return nil, err } - f, _, err := ggml.Decode(temp, 1024) + f, err := ggml.Decode(temp, 1024) if err != nil { slog.Error(fmt.Sprintf("error decoding ggml: %s\n", err)) return nil, err @@ -508,7 +508,7 @@ func ggufLayers(digest string, fn func(resp api.ProgressResponse)) 
([]*layerGGML var offset int64 for offset < stat.Size() { - f, n, err := ggml.Decode(blob, 1024) + f, err := ggml.Decode(blob, 1024) if errors.Is(err, io.EOF) { break } else if err != nil { @@ -523,7 +523,7 @@ func ggufLayers(digest string, fn func(resp api.ProgressResponse)) ([]*layerGGML } var layer Layer - if digest != "" && n == stat.Size() && offset == 0 { + if digest != "" && f.Length == stat.Size() && offset == 0 { layer, err = NewLayerFromLayer(digest, mediatype, blob.Name()) if err != nil { slog.Debug("could not create new layer from layer", "error", err) @@ -533,14 +533,14 @@ func ggufLayers(digest string, fn func(resp api.ProgressResponse)) ([]*layerGGML // Fallback to creating layer from file copy (either NewLayerFromLayer failed, or digest empty/n != stat.Size()) if layer.Digest == "" { - layer, err = NewLayer(io.NewSectionReader(blob, offset, n), mediatype) + layer, err = NewLayer(io.NewSectionReader(blob, offset, f.Length), mediatype) if err != nil { return nil, err } } layers = append(layers, &layerGGML{layer, f}) - offset = n + offset = f.Length } return detectChatTemplate(layers) diff --git a/server/images.go b/server/images.go index 352f10f2..a69e2a9f 100644 --- a/server/images.go +++ b/server/images.go @@ -75,7 +75,7 @@ func (m *Model) Capabilities() []model.Capability { if err == nil { defer r.Close() - f, _, err := ggml.Decode(r, 1024) + f, err := ggml.Decode(r, 1024) if err == nil { if _, ok := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]; ok { capabilities = append(capabilities, model.CapabilityEmbedding) diff --git a/server/model.go b/server/model.go index 2149ff85..6b5439a4 100644 --- a/server/model.go +++ b/server/model.go @@ -64,7 +64,7 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe } defer blob.Close() - f, _, err := ggml.Decode(blob, -1) + f, err := ggml.Decode(blob, -1) if err != nil { return nil, err } diff --git a/server/quantization_test.go b/server/quantization_test.go index 495297df..4f717c2c 100644 --- a/server/quantization_test.go +++ b/server/quantization_test.go @@ -271,7 +271,7 @@ func TestQuantizeModel(t *testing.T) { t.Fatal(err.Error()) } defer fp.Close() - meta, _, err := fsggml.Decode(fp, -1) + meta, err := fsggml.Decode(fp, -1) if err != nil { t.Fatal(err.Error()) } @@ -303,7 +303,7 @@ func TestQuantizeModel(t *testing.T) { t.Fatalf("failed to load the quantized model %s: %s", tmp.Name(), err) } defer fpNew.Close() - newMeta, _, err := fsggml.Decode(fpNew, -1) + newMeta, err := fsggml.Decode(fpNew, -1) if err != nil { t.Fatalf("failed to load the quantized model %s: %s", tmp.Name(), err) } From 1a0cfd080a2d3e65519c241b7561bf5aa49468ff Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 19 May 2025 13:54:54 -0700 Subject: [PATCH 037/108] avoid kv truncation during create (#10761) --- server/create.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/create.go b/server/create.go index 82856c4d..9488de35 100644 --- a/server/create.go +++ b/server/create.go @@ -508,7 +508,7 @@ func ggufLayers(digest string, fn func(resp api.ProgressResponse)) ([]*layerGGML var offset int64 for offset < stat.Size() { - f, err := ggml.Decode(blob, 1024) + f, err := ggml.Decode(blob, -1) if errors.Is(err, io.EOF) { break } else if err != nil { From 3fe74fba42b8d496a1ab3e8298bdc9b8ffb0f336 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Mon, 19 May 2025 11:40:44 -0700 Subject: [PATCH 038/108] llm: Use first layer as memory buffer in estimation This is a partial revert of 0478d44 "Fixed 
over vram allcation dure to small initial layer sizes." Previously we used the size of the first layer as an extra reserved amount of space to buffer our memory estimates. The above commit changed this to use the largest layer. However, this had performance impacts on more models than the original commit was trying to fix. There is just a heuristic without an ideal solution so this goes back to the historic behavior. Fixes: #10765, #10756, #10752, #10726 --- llm/memory.go | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/llm/memory.go b/llm/memory.go index e9ed1738..05b3b2fd 100644 --- a/llm/memory.go +++ b/llm/memory.go @@ -1,12 +1,9 @@ package llm import ( - "cmp" "fmt" "log/slog" - "maps" "os" - "slices" "strconv" "strings" @@ -125,10 +122,12 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin } layers := f.Tensors().GroupLayers() - // add one layer (chosing the max layer) worth of memory as a buffer - layerSize = slices.MaxFunc(slices.Collect(maps.Values(layers)), func(a, b ggml.Layer) int { - return cmp.Compare(a.Size(), b.Size()) - }).Size() + // add one layer worth of memory as a buffer + if blk0, ok := layers["blk.0"]; ok { + layerSize = blk0.Size() + } else { + slog.Warn("model missing blk.0 layer size") + } var kvct string if envconfig.FlashAttention() && From ff180c3466e7f3ee21658465958c9ece6de2d5c0 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Mon, 19 May 2025 15:06:35 -0700 Subject: [PATCH 039/108] fix llama and mistral3 models (#10774) * fix llama model * fix mistral3.1 model do not set default vision layers --- model/models/llama/model.go | 18 +++++++----------- model/models/mistral3/model.go | 7 +------ model/models/mistral3/model_text.go | 18 ++++-------------- model/models/mistral3/model_vision.go | 2 +- 4 files changed, 13 insertions(+), 32 deletions(-) diff --git a/model/models/llama/model.go b/model/models/llama/model.go index 6e214f0f..9b85f1c7 100644 --- a/model/models/llama/model.go +++ b/model/models/llama/model.go @@ -1,9 +1,8 @@ package llama import ( - "fmt" + "cmp" "math" - "strings" "github.com/ollama/ollama/fs" "github.com/ollama/ollama/kvcache" @@ -14,9 +13,9 @@ import ( ) type Options struct { - hiddenSize, numHeads, numKVHeads int - eps, ropeBase, ropeScale float32 - ropeDim uint32 + hiddenSize, numHeads, numKVHeads, headDim int + eps, ropeBase, ropeScale float32 + ropeDim uint32 } type Model struct { @@ -32,10 +31,6 @@ type Model struct { } func New(c fs.Config) (model.Model, error) { - if !strings.EqualFold(c.String("tokenizer.ggml.model"), "gpt2") { - return nil, fmt.Errorf("tokenizer %s not yet supported", c.String("tokenizer.ggml.model")) - } - m := Model{ BytePairEncoding: model.NewBytePairEncoding( c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`), @@ -57,6 +52,7 @@ func New(c fs.Config) (model.Model, error) { hiddenSize: int(c.Uint("embedding_length")), numHeads: int(c.Uint("attention.head_count")), numKVHeads: int(c.Uint("attention.head_count_kv")), + headDim: int(c.Uint("attention.key_length")), eps: c.Float("attention.layer_norm_rms_epsilon"), ropeBase: c.Float("rope.freq_base"), ropeScale: c.Float("rope.freq_scale", 1), @@ -79,7 +75,7 @@ type SelfAttention struct { func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor { batchSize := hiddenState.Dim(1) - headDim := opts.hiddenSize / opts.numHeads + 
headDim := cmp.Or(opts.headDim, opts.hiddenSize/opts.numHeads) ropeType := uint32(0) q := sa.Query.Forward(ctx, hiddenState) @@ -95,7 +91,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten scaleFactor := 1.0 / math.Sqrt(float64(headDim)) kqv := nn.Attention(ctx, q, k, v, scaleFactor, cache) - kqv = kqv.Reshape(ctx, opts.hiddenSize, batchSize) + kqv = kqv.Reshape(ctx, headDim*opts.numHeads, batchSize) return sa.Output.Forward(ctx, kqv) } diff --git a/model/models/mistral3/model.go b/model/models/mistral3/model.go index 0d384b94..dd01a587 100644 --- a/model/models/mistral3/model.go +++ b/model/models/mistral3/model.go @@ -31,11 +31,6 @@ var _ model.MultimodalProcessor = (*Model)(nil) var _ model.TextProcessor = (*Model)(nil) func New(c fs.Config) (model.Model, error) { - textModel, err := NewTextModel(c) - if err != nil { - return nil, err - } - m := &Model{ BytePairEncoding: model.NewBytePairEncoding( c.String("tokenizer.ggml.pretokenizer", `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`), @@ -52,7 +47,7 @@ func New(c fs.Config) (model.Model, error) { ), }, ), - TextModel: textModel, + TextModel: newTextModel(c), VisionModel: newVisionModel(c), ImageProcessor: newImageProcessor(c), MultiModalProjector: newMultiModalProjector(c), diff --git a/model/models/mistral3/model_text.go b/model/models/mistral3/model_text.go index 17939800..57e2a40a 100644 --- a/model/models/mistral3/model_text.go +++ b/model/models/mistral3/model_text.go @@ -1,9 +1,8 @@ package mistral3 import ( - "fmt" + "cmp" "math" - "strings" "github.com/ollama/ollama/fs" "github.com/ollama/ollama/kvcache" @@ -37,10 +36,7 @@ type SelfAttention struct { func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor { batchSize := hiddenState.Dim(1) ropeType := uint32(0) - headDim := opts.headDim - if headDim == 0 { - headDim = opts.hiddenSize / opts.numHeads - } + headDim := cmp.Or(opts.headDim, opts.hiddenSize/opts.numHeads) q := sa.Query.Forward(ctx, hiddenState) q = q.Reshape(ctx, headDim, opts.numHeads, batchSize) @@ -125,12 +121,8 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor return m.Output.Forward(ctx, hiddenState) } -func NewTextModel(c fs.Config) (*TextModel, error) { - if !strings.EqualFold(c.String("tokenizer.ggml.model"), "gpt2") { - return nil, fmt.Errorf("tokenizer %s not yet supported", c.String("tokenizer.ggml.model")) - } - - textModel := &TextModel{ +func newTextModel(c fs.Config) *TextModel { + return &TextModel{ Layers: make([]Layer, c.Uint("block_count")), TextOptions: &TextOptions{ hiddenSize: int(c.Uint("embedding_length")), @@ -143,6 +135,4 @@ func NewTextModel(c fs.Config) (*TextModel, error) { ropeDim: c.Uint("rope.dimension_count"), }, } - - return textModel, nil } diff --git a/model/models/mistral3/model_vision.go b/model/models/mistral3/model_vision.go index 469dc40c..24541004 100644 --- a/model/models/mistral3/model_vision.go +++ b/model/models/mistral3/model_vision.go @@ -170,7 +170,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor { func newVisionModel(c fs.Config) *VisionModel { return &VisionModel{ - Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count", 24)), + Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count")), VisionModelOptions: 
&VisionModelOptions{ hiddenSize: int(c.Uint("vision.embedding_length", 1024)), numHeads: int(c.Uint("vision.attention.head_count", 16)), From e6a800ca11cb52b24fa5afc5245ed1277811fbe9 Mon Sep 17 00:00:00 2001 From: DarkCaster Date: Tue, 20 May 2025 20:41:15 +0300 Subject: [PATCH 040/108] llama: fix incorrect initialization of C.struct_common_sampler_cparams.penalty_present (#10779) --- llama/llama.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama/llama.go b/llama/llama.go index 626ea13a..adee6f63 100644 --- a/llama/llama.go +++ b/llama/llama.go @@ -544,7 +544,7 @@ func NewSamplingContext(model *Model, params SamplingParams) (*SamplingContext, cparams.penalty_last_n = C.int32_t(params.RepeatLastN) cparams.penalty_repeat = C.float(params.PenaltyRepeat) cparams.penalty_freq = C.float(params.PenaltyFreq) - cparams.penalty_present = C.float(params.PenaltyFreq) + cparams.penalty_present = C.float(params.PenaltyPresent) cparams.seed = C.uint32_t(params.Seed) grammar := C.CString(params.Grammar) From 9ed8bf14cb885509281d63731cda16637a7e0bd2 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Tue, 20 May 2025 15:51:08 -0700 Subject: [PATCH 041/108] ml: add more rope options (#10775) --- ml/backend.go | 16 -------------- ml/backend/ggml/ggml.go | 26 ++++++----------------- ml/nn/fast/rope.go | 21 ++++++++++++++++++ ml/nn/rope/rope.go | 33 +++++++++++++++++++++++++++++ model/models/gemma2/model.go | 9 ++++---- model/models/gemma3/model_text.go | 9 ++++---- model/models/llama/model.go | 17 ++++++++------- model/models/llama4/model_text.go | 8 ++++--- model/models/mistral3/model_text.go | 16 +++++++------- model/models/mllama/model_text.go | 13 ++++++------ model/models/qwen25vl/model_text.go | 31 ++++++++++++++------------- 11 files changed, 116 insertions(+), 83 deletions(-) create mode 100644 ml/nn/fast/rope.go create mode 100644 ml/nn/rope/rope.go diff --git a/ml/backend.go b/ml/backend.go index 2599d774..70497e6e 100644 --- a/ml/backend.go +++ b/ml/backend.go @@ -115,21 +115,6 @@ type Context interface { Layer(int) Context } -// RopeOptions contains optional parameters for RoPE function -type RopeOptions struct { - OriginalContextLen uint32 -} - -// RopeOption defines a function that modifies RopeOpts -type RopeOption func(*RopeOptions) - -// WithContextLen sets a custom context length -func WithContextLen(len uint32) RopeOption { - return func(opts *RopeOptions) { - opts.OriginalContextLen = len - } -} - type Tensor interface { Dim(n int) int Stride(n int) int @@ -155,7 +140,6 @@ type Tensor interface { AvgPool2D(ctx Context, k, s int, p float32) Tensor Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor - RoPE(ctx Context, positionIDs, ropeFactors Tensor, dim, ropeType uint32, base, scale float32, options ...RopeOption) Tensor IM2Col(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor Sin(ctx Context) Tensor diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index f0b26b2f..3ae50ace 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -30,6 +30,7 @@ import ( "github.com/ollama/ollama/logutil" "github.com/ollama/ollama/ml" ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src" + "github.com/ollama/ollama/ml/nn/rope" "golang.org/x/sync/errgroup" ) @@ -1074,28 +1075,15 @@ func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor { } } -const ( - ropeTypeNorm C.int = 0 - ropeTypeNeox C.int = 2 - ropeTypeMrope C.int = 8 - ropeTypeVision C.int = 24 -) - -func (t *Tensor) RoPE(ctx ml.Context, positionIDs, 
ropeFactors ml.Tensor, ropeDim, ropeType uint32, ropeBase, ropeScale float32, options ...ml.RopeOption) ml.Tensor { +func (t *Tensor) RoPE(ctx ml.Context, positions ml.Tensor, ropeDim int, ropeBase, ropeScale float32, options ...func(*rope.Options)) ml.Tensor { // Default options - opts := &ml.RopeOptions{ - OriginalContextLen: 131072, - } + opts := &rope.Options{OriginalContextLength: 131072, Factors: &Tensor{}} // Apply any provided options for _, option := range options { option(opts) } - if ropeFactors == nil { - ropeFactors = &Tensor{b: t.b} - } - dequant := t.t if C.ggml_is_quantized(t.t._type) { dequant = C.ggml_cast(ctx.(*Context).ctx, t.t, C.GGML_TYPE_F32) @@ -1106,11 +1094,11 @@ func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDi t: C.ggml_rope_ext( ctx.(*Context).ctx, dequant, - positionIDs.(*Tensor).t, - ropeFactors.(*Tensor).t, + positions.(*Tensor).t, + opts.Factors.(*Tensor).t, C.int(ropeDim), - C.int(ropeType), - C.int(opts.OriginalContextLen), + C.int(opts.Type), + C.int(opts.OriginalContextLength), C.float(ropeBase), C.float(ropeScale), C.float(0.0), diff --git a/ml/nn/fast/rope.go b/ml/nn/fast/rope.go new file mode 100644 index 00000000..b45938eb --- /dev/null +++ b/ml/nn/fast/rope.go @@ -0,0 +1,21 @@ +// fast provides implementations of fast (fused) operations for increased performance. +package fast + +import ( + "github.com/ollama/ollama/ml" + "github.com/ollama/ollama/ml/nn/rope" +) + +// fastRoPE is an interface for tensors that support fast rotary positional embedding. +type fastRoPE interface { + RoPE(ctx ml.Context, positionIDs ml.Tensor, dim int, base, scale float32, options ...func(*rope.Options)) ml.Tensor +} + +// RoPE applies rotary positional embedding to tensor `t`. +func RoPE(ctx ml.Context, t, positions ml.Tensor, dim int, base, scale float32, options ...func(*rope.Options)) ml.Tensor { + if t, ok := t.(fastRoPE); ok { + return t.RoPE(ctx, positions, dim, base, scale, options...) 
+ } + + panic("RoPE not implemented for this tensor type") +} diff --git a/ml/nn/rope/rope.go b/ml/nn/rope/rope.go new file mode 100644 index 00000000..b0c00a5b --- /dev/null +++ b/ml/nn/rope/rope.go @@ -0,0 +1,33 @@ +package rope + +import "github.com/ollama/ollama/ml" + +// Options contains optional parameters for RoPE function +type Options struct { + OriginalContextLength int + Type int + Factors ml.Tensor +} + +// WithOriginalContextLength sets a custom context length +func WithOriginalContextLength(n int) func(*Options) { + return func(opts *Options) { + opts.OriginalContextLength = n + } +} + +// WithType sets RoPE type to NeoX +func WithTypeNeoX() func(*Options) { + return func(opts *Options) { + opts.Type = 2 + } +} + +// WithFactors sets custom rope factors +func WithFactors(factors ml.Tensor) func(*Options) { + return func(opts *Options) { + if factors != nil { + opts.Factors = factors + } + } +} diff --git a/model/models/gemma2/model.go b/model/models/gemma2/model.go index a87534c5..3c5a7ea5 100644 --- a/model/models/gemma2/model.go +++ b/model/models/gemma2/model.go @@ -7,6 +7,8 @@ import ( "github.com/ollama/ollama/kvcache" "github.com/ollama/ollama/ml" "github.com/ollama/ollama/ml/nn" + "github.com/ollama/ollama/ml/nn/fast" + "github.com/ollama/ollama/ml/nn/rope" "github.com/ollama/ollama/model" "github.com/ollama/ollama/model/input" ) @@ -83,11 +85,10 @@ type SelfAttention struct { func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor { batchSize := hiddenState.Dim(1) - ropeType := uint32(2) q := sa.Query.Forward(ctx, hiddenState) q = q.Reshape(ctx, opts.attnKeyLen, opts.numHeads, batchSize) - q = q.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, opts.ropeBase, opts.ropeScale) + q = fast.RoPE(ctx, q, positionIDs, opts.attnKeyLen, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX()) if opts.largeModelScaling { q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.hiddenSize/opts.numHeads))) @@ -97,7 +98,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten k := sa.Key.Forward(ctx, hiddenState) k = k.Reshape(ctx, opts.attnKeyLen, opts.numKVHeads, batchSize) - k = k.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, opts.ropeBase, opts.ropeScale) + k = fast.RoPE(ctx, k, positionIDs, opts.attnKeyLen, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX()) v := sa.Value.Forward(ctx, hiddenState) v = v.Reshape(ctx, opts.attnValLen, opts.numKVHeads, batchSize) @@ -127,7 +128,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten } func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) { - return key.RoPE(ctx, shift, nil, uint32(m.Options.attnKeyLen), uint32(2), m.Options.ropeBase, m.Options.ropeScale), nil + return fast.RoPE(ctx, key, shift, m.Options.attnKeyLen, m.Options.ropeBase, m.Options.ropeScale, rope.WithTypeNeoX()), nil } type MLP struct { diff --git a/model/models/gemma3/model_text.go b/model/models/gemma3/model_text.go index a40614af..70d7797e 100644 --- a/model/models/gemma3/model_text.go +++ b/model/models/gemma3/model_text.go @@ -7,6 +7,8 @@ import ( "github.com/ollama/ollama/kvcache" "github.com/ollama/ollama/ml" "github.com/ollama/ollama/ml/nn" + "github.com/ollama/ollama/ml/nn/fast" + "github.com/ollama/ollama/ml/nn/rope" "github.com/ollama/ollama/model/input" ) @@ -73,7 +75,6 @@ type TextSelfAttention struct { func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, 
hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextConfig) ml.Tensor { batchSize := hiddenState.Dim(1) - ropeType := uint32(2) ropeBase := opts.ropeLocalBase if (layer+1)%gemmaGlobalCacheCount == 0 { @@ -83,7 +84,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, pos q := sa.Query.Forward(ctx, hiddenState) q = q.Reshape(ctx, opts.attnKeyLen, opts.numHeads, batchSize) q = sa.QueryNorm.Forward(ctx, q, opts.eps) - q = q.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, ropeBase, opts.ropeScale) + q = fast.RoPE(ctx, q, positionIDs, opts.attnKeyLen, ropeBase, opts.ropeScale, rope.WithTypeNeoX()) if opts.largeModelScaling { q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.hiddenSize/opts.numHeads))) @@ -94,7 +95,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, pos k := sa.Key.Forward(ctx, hiddenState) k = k.Reshape(ctx, opts.attnKeyLen, opts.numKVHeads, batchSize) k = sa.KeyNorm.Forward(ctx, k, opts.eps) - k = k.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, ropeBase, opts.ropeScale) + k = fast.RoPE(ctx, k, positionIDs, opts.attnKeyLen, ropeBase, opts.ropeScale, rope.WithTypeNeoX()) v := sa.Value.Forward(ctx, hiddenState) v = v.Reshape(ctx, opts.attnValLen, opts.numKVHeads, batchSize) @@ -112,7 +113,7 @@ func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.T ropeBase = m.TextConfig.ropeGlobalBase } - return key.RoPE(ctx, shift, nil, uint32(m.TextConfig.attnKeyLen), uint32(2), ropeBase, m.TextConfig.ropeScale), nil + return fast.RoPE(ctx, key, shift, m.TextConfig.attnKeyLen, ropeBase, m.TextConfig.ropeScale, rope.WithTypeNeoX()), nil } type TextMLP struct { diff --git a/model/models/llama/model.go b/model/models/llama/model.go index 9b85f1c7..7b475512 100644 --- a/model/models/llama/model.go +++ b/model/models/llama/model.go @@ -8,14 +8,16 @@ import ( "github.com/ollama/ollama/kvcache" "github.com/ollama/ollama/ml" "github.com/ollama/ollama/ml/nn" + "github.com/ollama/ollama/ml/nn/fast" + "github.com/ollama/ollama/ml/nn/rope" "github.com/ollama/ollama/model" "github.com/ollama/ollama/model/input" ) type Options struct { - hiddenSize, numHeads, numKVHeads, headDim int - eps, ropeBase, ropeScale float32 - ropeDim uint32 + hiddenSize, numHeads, numKVHeads int + headDim, ropeDim int + eps, ropeBase, ropeScale float32 } type Model struct { @@ -53,10 +55,10 @@ func New(c fs.Config) (model.Model, error) { numHeads: int(c.Uint("attention.head_count")), numKVHeads: int(c.Uint("attention.head_count_kv")), headDim: int(c.Uint("attention.key_length")), + ropeDim: int(c.Uint("rope.dimension_count")), eps: c.Float("attention.layer_norm_rms_epsilon"), ropeBase: c.Float("rope.freq_base"), ropeScale: c.Float("rope.freq_scale", 1), - ropeDim: c.Uint("rope.dimension_count"), }, } @@ -76,15 +78,14 @@ type SelfAttention struct { func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor { batchSize := hiddenState.Dim(1) headDim := cmp.Or(opts.headDim, opts.hiddenSize/opts.numHeads) - ropeType := uint32(0) q := sa.Query.Forward(ctx, hiddenState) q = q.Reshape(ctx, headDim, opts.numHeads, batchSize) - q = q.RoPE(ctx, positionIDs, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale) + q = fast.RoPE(ctx, q, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors)) k := sa.Key.Forward(ctx, hiddenState) k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize) - k = 
k.RoPE(ctx, positionIDs, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale) + k = fast.RoPE(ctx, k, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors)) v := sa.Value.Forward(ctx, hiddenState) v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize) @@ -97,7 +98,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten } func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) { - return key.RoPE(ctx, shift, m.Layers[layer].SelfAttention.RopeFactors, uint32(0), m.ropeDim, m.ropeBase, m.ropeScale), nil + return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale, rope.WithFactors(m.Layers[layer].SelfAttention.RopeFactors)), nil } type MLP struct { diff --git a/model/models/llama4/model_text.go b/model/models/llama4/model_text.go index d98587bd..829c805c 100644 --- a/model/models/llama4/model_text.go +++ b/model/models/llama4/model_text.go @@ -8,6 +8,8 @@ import ( "github.com/ollama/ollama/kvcache" "github.com/ollama/ollama/ml" "github.com/ollama/ollama/ml/nn" + "github.com/ollama/ollama/ml/nn/fast" + "github.com/ollama/ollama/ml/nn/rope" "github.com/ollama/ollama/model/input" ) @@ -31,8 +33,8 @@ func (sa *TextAttention) Forward(ctx ml.Context, hiddenStates, positions, attent value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize) if useRope { - query = query.RoPE(ctx, positions, sa.RopeFactors, uint32(opts.ropeDim), uint32(0), opts.ropeBase, opts.ropeScale) - key = key.RoPE(ctx, positions, sa.RopeFactors, uint32(opts.ropeDim), uint32(0), opts.ropeBase, opts.ropeScale) + query = fast.RoPE(ctx, query, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors)) + key = fast.RoPE(ctx, key, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors)) } if opts.useQKNorm { @@ -250,5 +252,5 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor } func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) { - return key.RoPE(ctx, shift, m.Layers[layer].Attention.RopeFactors, uint32(0), uint32(m.ropeDim), m.ropeBase, m.ropeScale), nil + return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale, rope.WithFactors(m.Layers[layer].Attention.RopeFactors)), nil } diff --git a/model/models/mistral3/model_text.go b/model/models/mistral3/model_text.go index 57e2a40a..19c36f9f 100644 --- a/model/models/mistral3/model_text.go +++ b/model/models/mistral3/model_text.go @@ -8,13 +8,14 @@ import ( "github.com/ollama/ollama/kvcache" "github.com/ollama/ollama/ml" "github.com/ollama/ollama/ml/nn" + "github.com/ollama/ollama/ml/nn/fast" "github.com/ollama/ollama/model/input" ) type TextOptions struct { - hiddenSize, numHeads, numKVHeads, headDim int - eps, ropeBase, ropeScale float32 - ropeDim uint32 + hiddenSize, numHeads, numKVHeads int + headDim, ropeDim int + eps, ropeBase, ropeScale float32 } type TextModel struct { @@ -35,16 +36,15 @@ type SelfAttention struct { func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor { batchSize := hiddenState.Dim(1) - ropeType := uint32(0) headDim := cmp.Or(opts.headDim, opts.hiddenSize/opts.numHeads) q := sa.Query.Forward(ctx, hiddenState) q = q.Reshape(ctx, headDim, opts.numHeads, batchSize) - q = q.RoPE(ctx, positionIDs, nil, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale) + q = fast.RoPE(ctx, q, positionIDs, opts.ropeDim, 
opts.ropeBase, opts.ropeScale) k := sa.Key.Forward(ctx, hiddenState) k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize) - k = k.RoPE(ctx, positionIDs, nil, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale) + k = fast.RoPE(ctx, k, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale) v := sa.Value.Forward(ctx, hiddenState) v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize) @@ -55,7 +55,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten } func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) { - return key.RoPE(ctx, shift, nil, uint32(0), m.ropeDim, m.ropeBase, m.ropeScale), nil + return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale), nil } type MLP struct { @@ -129,10 +129,10 @@ func newTextModel(c fs.Config) *TextModel { numHeads: int(c.Uint("attention.head_count")), numKVHeads: int(c.Uint("attention.head_count_kv")), headDim: int(c.Uint("attention.key_length")), + ropeDim: int(c.Uint("rope.dimension_count")), eps: c.Float("attention.layer_norm_rms_epsilon"), ropeBase: c.Float("rope.freq_base"), ropeScale: c.Float("rope.freq_scale", 1), - ropeDim: c.Uint("rope.dimension_count"), }, } } diff --git a/model/models/mllama/model_text.go b/model/models/mllama/model_text.go index 9bd414af..47a518ce 100644 --- a/model/models/mllama/model_text.go +++ b/model/models/mllama/model_text.go @@ -8,6 +8,8 @@ import ( "github.com/ollama/ollama/kvcache" "github.com/ollama/ollama/ml" "github.com/ollama/ollama/ml/nn" + "github.com/ollama/ollama/ml/nn/fast" + "github.com/ollama/ollama/ml/nn/rope" ) type TextSelfAttention struct { @@ -21,15 +23,14 @@ type TextSelfAttention struct { func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor { batchSize := hiddenState.Dim(1) headDim := opts.hiddenSize / opts.numHeads - ropeType := uint32(0) query := sa.Query.Forward(ctx, hiddenState) query = query.Reshape(ctx, headDim, opts.numHeads, batchSize) - query = query.RoPE(ctx, positions, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale) + query = fast.RoPE(ctx, query, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors)) key := sa.Key.Forward(ctx, hiddenState) key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize) - key = key.RoPE(ctx, positions, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale) + key = fast.RoPE(ctx, key, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors)) value := sa.Value.Forward(ctx, hiddenState) value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize) @@ -44,7 +45,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.T func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) { // This will only get called for layers in the cache, which are just the self attention layers if sa, ok := m.Transformer.Layers[layer].(*TextSelfAttentionDecoderLayer); ok { - return key.RoPE(ctx, shift, sa.SelfAttention.RopeFactors, m.ropeDim, uint32(0), m.ropeBase, m.ropeScale), nil + return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale, rope.WithFactors(sa.SelfAttention.RopeFactors)), nil } return key, nil @@ -199,8 +200,8 @@ func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs, type TextModelOptions struct { hiddenSize, numHeads, numKVHeads int + ropeDim int eps, ropeBase, ropeScale float32 
- ropeDim uint32 crossAttentionLayers []int32 } @@ -240,10 +241,10 @@ func newTextModel(c fs.Config) *TextModel { hiddenSize: int(c.Uint("embedding_length")), numHeads: int(c.Uint("attention.head_count")), numKVHeads: int(c.Uint("attention.head_count_kv")), + ropeDim: int(c.Uint("rope.dimension_count")), eps: c.Float("attention.layer_norm_rms_epsilon"), ropeBase: c.Float("rope.freq_base"), ropeScale: c.Float("rope.freq_scale", 1), - ropeDim: c.Uint("rope.dimension_count"), crossAttentionLayers: c.Ints("attention.cross_attention_layers"), }, } diff --git a/model/models/qwen25vl/model_text.go b/model/models/qwen25vl/model_text.go index 800fd961..4b6bc166 100644 --- a/model/models/qwen25vl/model_text.go +++ b/model/models/qwen25vl/model_text.go @@ -7,13 +7,15 @@ import ( "github.com/ollama/ollama/kvcache" "github.com/ollama/ollama/ml" "github.com/ollama/ollama/ml/nn" + "github.com/ollama/ollama/ml/nn/fast" + "github.com/ollama/ollama/ml/nn/rope" "github.com/ollama/ollama/model/input" ) type TextOptions struct { - ctxLen, hiddenSize, numHeads, numKVHeads int - eps, ropeBase, ropeScale float32 - ropeDim, defaultContextLen uint32 + hiddenSize, numHeads, numKVHeads int + ropeDim, originalContextLength int + eps, ropeBase, ropeScale float32 } type TextModel struct { @@ -29,15 +31,14 @@ func NewTextModel(c fs.Config) *TextModel { m := TextModel{ Layers: make([]Layer, c.Uint("block_count")), TextOptions: &TextOptions{ - ctxLen: int(c.Uint("context_length")), - hiddenSize: int(c.Uint("embedding_length")), - numHeads: int(c.Uint("attention.head_count")), - numKVHeads: int(c.Uint("attention.head_count_kv")), - eps: c.Float("attention.layer_norm_rms_epsilon"), - ropeBase: c.Float("rope.freq_base"), - ropeScale: c.Float("rope.freq_scale", 1), - ropeDim: c.Uint("rope.dimension_count", 128), - defaultContextLen: c.Uint("context_length", 128000), + hiddenSize: int(c.Uint("embedding_length")), + numHeads: int(c.Uint("attention.head_count")), + numKVHeads: int(c.Uint("attention.head_count_kv")), + ropeDim: int(c.Uint("rope.dimension_count", 128)), + originalContextLength: int(c.Uint("context_length", 128000)), + eps: c.Float("attention.layer_norm_rms_epsilon"), + ropeBase: c.Float("rope.freq_base"), + ropeScale: c.Float("rope.freq_scale", 1), }, } @@ -59,11 +60,11 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten q := sa.Query.Forward(ctx, hiddenState) q = q.Reshape(ctx, headDim, opts.numHeads, batchSize) - q = q.RoPE(ctx, positionIDs, nil, opts.ropeDim, 2, opts.ropeBase, opts.ropeScale, ml.WithContextLen(opts.defaultContextLen)) + q = fast.RoPE(ctx, q, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithOriginalContextLength(opts.originalContextLength), rope.WithTypeNeoX()) k := sa.Key.Forward(ctx, hiddenState) k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize) - k = k.RoPE(ctx, positionIDs, nil, opts.ropeDim, 2, opts.ropeBase, opts.ropeScale, ml.WithContextLen(opts.defaultContextLen)) + k = fast.RoPE(ctx, k, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithOriginalContextLength(opts.originalContextLength), rope.WithTypeNeoX()) v := sa.Value.Forward(ctx, hiddenState) v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize) @@ -77,7 +78,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten // Shift applies rotary position embeddings to the key tensor for causal attention caching func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) { - return key.RoPE(ctx, shift, nil, 
m.ropeDim, 2, m.ropeBase, m.ropeScale, ml.WithContextLen(m.defaultContextLen)), nil + return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale, rope.WithOriginalContextLength(m.originalContextLength), rope.WithTypeNeoX()), nil } // MLP implements the feed-forward network component with SwiGLU activation From 69b2fe9282323a57cd3557bed9b598b465d1b3a6 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 21 May 2025 09:39:20 -0700 Subject: [PATCH 042/108] fix: qwen25vl assign samebatch in multimodal input (#10789) setting samebatch on the vision start token is problematic because it will be shared with other inputs that also use images. this will cause the input to be cached and the runner will not see SameBatch. SameBatch will also be incorrect since it may be for a different image. assigning samebatch to the input tokens resolves this by ensure it's assigned correctly to inputs corresponding to the image. not setting same batch correctly may cause panics during inference since images are no longer guaranteed to be in the same batch. --- model/models/qwen25vl/model.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/model/models/qwen25vl/model.go b/model/models/qwen25vl/model.go index 7de9b6eb..32cca560 100644 --- a/model/models/qwen25vl/model.go +++ b/model/models/qwen25vl/model.go @@ -121,13 +121,14 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) { patchesPerChunk := inp.Multimodal[0].Tensor.Dim(1) // First add the vision start token - result = append(result, input.Input{Token: visionStartToken, SameBatch: patchesPerChunk + 1}) + result = append(result, input.Input{Token: visionStartToken}) // Add the image token with the multimodal tensor data at the first position result = append(result, input.Input{ Token: imageToken, Multimodal: inp.Multimodal, MultimodalHash: inp.MultimodalHash, + SameBatch: patchesPerChunk, }) // Add the placeholder tokens for the remaining positions (tokensPerGrid-1) From 375839ea2d05a056d02f934f02e953b41f1d444d Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 21 May 2025 09:39:38 -0700 Subject: [PATCH 043/108] chore: disable debug in binary libraries (#10788) --- CMakeLists.txt | 2 ++ ml/backend/ggml/ggml/src/ggml-cpu/cpu.go | 2 +- ml/backend/ggml/ggml/src/ggml-metal/metal.go | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5343d877..ce41288b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -51,6 +51,8 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/include include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu/amx) +add_compiler_definitions(NDEBUG) + set(GGML_CPU ON) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src) set_property(TARGET ggml PROPERTY EXCLUDE_FROM_ALL TRUE) diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/cpu.go b/ml/backend/ggml/ggml/src/ggml-cpu/cpu.go index 895d093c..895b7f6e 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/cpu.go +++ b/ml/backend/ggml/ggml/src/ggml-cpu/cpu.go @@ -3,7 +3,7 @@ package cpu // #cgo CFLAGS: -O3 -Wno-implicit-function-declaration // #cgo CXXFLAGS: -std=c++17 // #cgo CPPFLAGS: -I${SRCDIR}/amx -I${SRCDIR}/llamafile -I${SRCDIR}/.. 
-I${SRCDIR}/../../include -// #cgo CPPFLAGS: -DGGML_USE_LLAMAFILE +// #cgo CPPFLAGS: -DNDEBUG -DGGML_USE_LLAMAFILE // #cgo linux CPPFLAGS: -D_GNU_SOURCE // #cgo darwin,arm64 CPPFLAGS: -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64 // #cgo darwin,arm64 LDFLAGS: -framework Accelerate diff --git a/ml/backend/ggml/ggml/src/ggml-metal/metal.go b/ml/backend/ggml/ggml/src/ggml-metal/metal.go index eb65dfde..0ee017dd 100644 --- a/ml/backend/ggml/ggml/src/ggml-metal/metal.go +++ b/ml/backend/ggml/ggml/src/ggml-metal/metal.go @@ -4,6 +4,6 @@ package metal //go:generate sh -c "{ echo // Code generated by 'go generate'. DO NOT EDIT.; sed -e '/__embed_ggml-common.h__/r ../ggml-common.h' -e '/__embed_ggml-common.h__/d' -e '/#include \"ggml-metal-impl.h\"/r ggml-metal-impl.h' -e '/#include \"ggml-metal-impl.h\"/d' ggml-metal.metal; } >ggml-metal-embed.metal" -// #cgo CPPFLAGS: -DGGML_METAL_EMBED_LIBRARY -I.. -I../../include +// #cgo CPPFLAGS: -DGGML_METAL_NDEBUG -DGGML_METAL_EMBED_LIBRARY -I.. -I../../include // #cgo LDFLAGS: -framework Metal -framework MetalKit import "C" From 139f84cf21f8d8107f69c1404f17a8840c6d67d0 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 21 May 2025 09:52:52 -0700 Subject: [PATCH 044/108] fix cmakelists (#10804) this fixes an issue introduced in #10788 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ce41288b..c005d014 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -51,7 +51,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/include include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu/amx) -add_compiler_definitions(NDEBUG) +add_compile_definitions(NDEBUG) set(GGML_CPU ON) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src) From e0ed984cde1f6191e38ac2d7f4415ffd619a631f Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 21 May 2025 10:21:07 -0700 Subject: [PATCH 045/108] feat: qwen3 dense and sparse models (#10708) * feat: qwen3 dense * feat: qwen3moe * fix llama4 moe --- ml/backend.go | 3 + ml/backend/ggml/ggml.go | 14 ++ model/models/llama4/model_text.go | 2 +- model/models/models.go | 1 + model/models/qwen3/model.go | 239 ++++++++++++++++++++++++++++++ 5 files changed, 258 insertions(+), 1 deletion(-) create mode 100644 model/models/qwen3/model.go diff --git a/ml/backend.go b/ml/backend.go index 70497e6e..3c417ef9 100644 --- a/ml/backend.go +++ b/ml/backend.go @@ -128,6 +128,8 @@ type Tensor interface { Neg(ctx Context) Tensor Add(ctx Context, t2 Tensor) Tensor Mul(ctx Context, t2 Tensor) Tensor + Div(ctx Context, t2 Tensor) Tensor + Mulmat(ctx Context, t2 Tensor) Tensor MulmatFullPrec(ctx Context, t2 Tensor) Tensor MulmatID(ctx Context, t2, ids Tensor) Tensor @@ -136,6 +138,7 @@ type Tensor interface { LayerNorm(ctx Context, weight, bias Tensor, eps float32) Tensor RMSNorm(ctx Context, weight Tensor, eps float32) Tensor Scale(ctx Context, s float64) Tensor + SumRows(ctx Context) Tensor AvgPool2D(ctx Context, k, s int, p float32) Tensor Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index 3ae50ace..44e3b61b 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -887,6 +887,13 @@ func (t *Tensor) Mul(ctx ml.Context, t2 ml.Tensor) ml.Tensor { } } +func (t *Tensor) Div(ctx ml.Context, t2 ml.Tensor) ml.Tensor { + 
return &Tensor{ + b: t.b, + t: C.ggml_div(ctx.(*Context).ctx, t.t, t2.(*Tensor).t), + } +} + func (t *Tensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor { return &Tensor{ b: t.b, @@ -1004,6 +1011,13 @@ func (t *Tensor) Scale(ctx ml.Context, s float64) ml.Tensor { } } +func (t *Tensor) SumRows(ctx ml.Context) ml.Tensor { + return &Tensor{ + b: t.b, + t: C.ggml_sum_rows(ctx.(*Context).ctx, t.t), + } +} + func (t *Tensor) Softmax(ctx ml.Context) ml.Tensor { return &Tensor{ b: t.b, diff --git a/model/models/llama4/model_text.go b/model/models/llama4/model_text.go index 829c805c..ff9f5e20 100644 --- a/model/models/llama4/model_text.go +++ b/model/models/llama4/model_text.go @@ -82,7 +82,7 @@ func (e *TextExperts) Forward(ctx ml.Context, hiddenStates, routerLogits ml.Tens nextStates := downStates.View(ctx, 0, hiddenStates.Dim(0), downStates.Stride(2), hiddenStates.Dim(2)) for i := 1; i < opts.numExpertsUsed; i++ { - nextStates.Add(ctx, downStates.View(ctx, i*downStates.Stride(1), hiddenStates.Dim(0), downStates.Stride(2), hiddenStates.Dim(2))) + nextStates = nextStates.Add(ctx, downStates.View(ctx, i*downStates.Stride(1), hiddenStates.Dim(0), downStates.Stride(2), hiddenStates.Dim(2))) } return nextStates diff --git a/model/models/models.go b/model/models/models.go index 133e5176..fd935f30 100644 --- a/model/models/models.go +++ b/model/models/models.go @@ -8,4 +8,5 @@ import ( _ "github.com/ollama/ollama/model/models/mistral3" _ "github.com/ollama/ollama/model/models/mllama" _ "github.com/ollama/ollama/model/models/qwen25vl" + _ "github.com/ollama/ollama/model/models/qwen3" ) diff --git a/model/models/qwen3/model.go b/model/models/qwen3/model.go new file mode 100644 index 00000000..44c32f9e --- /dev/null +++ b/model/models/qwen3/model.go @@ -0,0 +1,239 @@ +package qwen3 + +import ( + "cmp" + "math" + + "github.com/ollama/ollama/fs" + "github.com/ollama/ollama/kvcache" + "github.com/ollama/ollama/ml" + "github.com/ollama/ollama/ml/nn" + "github.com/ollama/ollama/ml/nn/fast" + "github.com/ollama/ollama/ml/nn/rope" + "github.com/ollama/ollama/model" + "github.com/ollama/ollama/model/input" +) + +type Options struct { + hiddenSize, numHeads, numKVHeads int + eps float32 + ropeBase, ropeScale float32 + + keyLength, valueLength int + + numExperts, numExpertsUsed int + normTopKProb bool +} + +func (o Options) headDim() int { + return cmp.Or(o.keyLength, o.valueLength, o.hiddenSize/o.numHeads) +} + +type Attention struct { + QueryNorm *nn.RMSNorm `gguf:"attn_q_norm"` + Query *nn.Linear `gguf:"attn_q"` + KeyNorm *nn.RMSNorm `gguf:"attn_k_norm"` + Key *nn.Linear `gguf:"attn_k"` + Value *nn.Linear `gguf:"attn_v"` + Output *nn.Linear `gguf:"attn_output"` +} + +func (sa *Attention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor { + batchSize := hiddenStates.Dim(1) + + query := sa.Query.Forward(ctx, hiddenStates) + key := sa.Key.Forward(ctx, hiddenStates) + value := sa.Value.Forward(ctx, hiddenStates) + + query = query.Reshape(ctx, opts.headDim(), opts.numHeads, batchSize) + key = key.Reshape(ctx, opts.headDim(), opts.numKVHeads, batchSize) + value = value.Reshape(ctx, opts.headDim(), opts.numKVHeads, batchSize) + + query = sa.QueryNorm.Forward(ctx, query, opts.eps) + key = sa.KeyNorm.Forward(ctx, key, opts.eps) + + query = fast.RoPE(ctx, query, positions, opts.headDim(), opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX()) + key = fast.RoPE(ctx, key, positions, opts.headDim(), opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX()) + + attention := 
nn.Attention(ctx, query, key, value, 1./math.Sqrt(float64(opts.headDim())), cache) + attention = attention.Reshape(ctx, attention.Dim(0)*attention.Dim(1), batchSize) + return sa.Output.Forward(ctx, attention) +} + +type MLP interface { + Forward(ml.Context, ml.Tensor, *Options) ml.Tensor +} + +type sparse struct { + Router *nn.Linear `gguf:"ffn_gate_inp"` + Gate ml.Tensor `gguf:"ffn_gate_exps.weight"` + Up ml.Tensor `gguf:"ffn_up_exps.weight"` + Down ml.Tensor `gguf:"ffn_down_exps.weight"` +} + +func (mlp *sparse) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *Options) ml.Tensor { + hiddenDim, sequenceLength, batchSize := hiddenStates.Dim(0), hiddenStates.Dim(1), hiddenStates.Dim(2) + hiddenStates = hiddenStates.Reshape(ctx, hiddenDim, sequenceLength*batchSize) + routerLogits := mlp.Router.Forward(ctx, hiddenStates) + + routingWeights := routerLogits.Softmax(ctx) + selectedExperts := routingWeights.TopK(ctx, opts.numExpertsUsed) + routingWeights = routingWeights.Reshape(ctx, 1, opts.numExperts, hiddenStates.Dim(1)).Rows(ctx, selectedExperts) + if opts.normTopKProb { + routingWeights = routingWeights.Reshape(ctx, opts.numExpertsUsed, hiddenStates.Dim(1)) + routingWeights = routingWeights.Div(ctx, routingWeights.SumRows(ctx)) + routingWeights = routingWeights.Reshape(ctx, 1, opts.numExpertsUsed, hiddenStates.Dim(1)) + } + + hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0), 1, hiddenStates.Dim(1)) + + upStates := mlp.Up.MulmatID(ctx, hiddenStates, selectedExperts) + + hiddenStates = mlp.Gate.MulmatID(ctx, hiddenStates, selectedExperts) + hiddenStates = hiddenStates.SILU(ctx) + hiddenStates = hiddenStates.Mul(ctx, upStates) + + experts := mlp.Down.MulmatID(ctx, hiddenStates, selectedExperts) + experts = experts.Mul(ctx, routingWeights) + + nextStates := experts.View(ctx, 0, experts.Dim(0), experts.Stride(2), experts.Dim(2)) + for i := 1; i < opts.numExpertsUsed; i++ { + nextStates = nextStates.Add(ctx, experts.View(ctx, i*experts.Stride(1), experts.Dim(0), experts.Stride(2), experts.Dim(2))) + } + + return nextStates +} + +type dense struct { + Gate *nn.Linear `gguf:"ffn_gate"` + Up *nn.Linear `gguf:"ffn_up"` + Down *nn.Linear `gguf:"ffn_down"` +} + +func (mlp *dense) Forward(ctx ml.Context, hiddenStates ml.Tensor, _ *Options) ml.Tensor { + hiddenStates = mlp.Gate.Forward(ctx, hiddenStates).SILU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenStates)) + return mlp.Down.Forward(ctx, hiddenStates) +} + +type Layer struct { + AttentionNorm *nn.RMSNorm `gguf:"attn_norm"` + *Attention + + MLPNorm *nn.RMSNorm `gguf:"ffn_norm"` + MLP +} + +func (d *Layer) Forward(ctx ml.Context, hiddenStates, positions, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor { + residual := hiddenStates + hiddenStates = d.AttentionNorm.Forward(ctx, hiddenStates, opts.eps) + hiddenStates = d.Attention.Forward(ctx, hiddenStates, positions, cache, opts) + + if outputs != nil { + hiddenStates = hiddenStates.Rows(ctx, outputs) + residual = residual.Rows(ctx, outputs) + } + + hiddenStates = hiddenStates.Add(ctx, residual) + + residual = hiddenStates + hiddenStates = d.MLPNorm.Forward(ctx, hiddenStates, opts.eps) + hiddenStates = d.MLP.Forward(ctx, hiddenStates, opts) + return hiddenStates.Add(ctx, residual) +} + +type Model struct { + model.Base + model.BytePairEncoding + + TokenEmbedding *nn.Embedding `gguf:"token_embd"` + OutputNorm *nn.RMSNorm `gguf:"output_norm"` + Output *nn.Linear `gguf:"output,alt:token_embd"` + + Layers []Layer `gguf:"blk"` + + *Options +} + +// Forward implements model.Model. 
+func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { + positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions)) + if err != nil { + return nil, err + } + + hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs) + + for i, layer := range m.Layers { + m.Cache.SetLayer(i) + + var outputs ml.Tensor + if i == len(m.Layers)-1 { + outputs, err = ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs)) + if err != nil { + return nil, err + } + } + + hiddenStates = layer.Forward(ctx, hiddenStates, positions, outputs, m.Cache, m.Options) + } + + hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.eps) + return m.Output.Forward(ctx, hiddenStates), nil +} + +func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) { + return fast.RoPE(ctx, key, shift, m.headDim(), m.ropeBase, m.ropeScale, rope.WithTypeNeoX()), nil +} + +var _ model.Model = (*Model)(nil) + +func New(c fs.Config) (model.Model, error) { + layers := make([]Layer, c.Uint("block_count")) + for i := range layers { + if c.String("general.architecture") == "qwen3moe" { + layers[i].MLP = &sparse{} + } else { + layers[i].MLP = &dense{} + } + } + + m := Model{ + BytePairEncoding: model.NewBytePairEncoding( + `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`, + &model.Vocabulary{ + Values: c.Strings("tokenizer.ggml.tokens"), + Types: c.Ints("tokenizer.ggml.token_type"), + Merges: c.Strings("tokenizer.ggml.merges"), + AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true), + BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))}, + AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false), + EOS: append( + []int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))}, + c.Ints("tokenizer.ggml.eos_token_ids")..., + ), + }, + ), + Layers: layers, + Options: &Options{ + hiddenSize: int(c.Uint("embedding_length")), + numHeads: int(c.Uint("attention.head_count")), + numKVHeads: int(c.Uint("attention.head_count_kv")), + keyLength: int(c.Uint("attention.key_length")), + valueLength: int(c.Uint("attention.value_length")), + eps: c.Float("attention.layer_norm_rms_epsilon"), + ropeBase: c.Float("rope.freq_base"), + ropeScale: c.Float("rope.freq_scale", 1), + numExperts: int(c.Uint("expert_count")), + numExpertsUsed: int(c.Uint("expert_used_count")), + normTopKProb: c.Bool("norm_top_k_prob", true), + }, + } + + m.Cache = kvcache.NewCausalCache(m.Shift) + return &m, nil +} + +func init() { + model.Register("qwen3", New) + model.Register("qwen3moe", New) +} From c890011322fbdd325ef9f16e425fe1f5213a24fe Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 21 May 2025 10:21:24 -0700 Subject: [PATCH 046/108] feat: port qwen2 model (#10782) --- model/models/llama/model.go | 47 +++++----- model/models/models.go | 1 + model/models/qwen2/model.go | 170 ++++++++++++++++++++++++++++++++++++ 3 files changed, 194 insertions(+), 24 deletions(-) create mode 100644 model/models/qwen2/model.go diff --git a/model/models/llama/model.go b/model/models/llama/model.go index 7b475512..507f1ebc 100644 --- a/model/models/llama/model.go +++ b/model/models/llama/model.go @@ -75,30 +75,31 @@ type SelfAttention struct { RopeFactors ml.Tensor `gguf:"rope_freqs.weight"` } -func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor { +func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor { 
batchSize := hiddenState.Dim(1) headDim := cmp.Or(opts.headDim, opts.hiddenSize/opts.numHeads) + ropeDim := cmp.Or(opts.ropeDim, headDim) - q := sa.Query.Forward(ctx, hiddenState) - q = q.Reshape(ctx, headDim, opts.numHeads, batchSize) - q = fast.RoPE(ctx, q, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors)) + query := sa.Query.Forward(ctx, hiddenState) + query = query.Reshape(ctx, headDim, opts.numHeads, batchSize) - k := sa.Key.Forward(ctx, hiddenState) - k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize) - k = fast.RoPE(ctx, k, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors)) + key := sa.Key.Forward(ctx, hiddenState) + key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize) - v := sa.Value.Forward(ctx, hiddenState) - v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize) + value := sa.Value.Forward(ctx, hiddenState) + value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize) - scaleFactor := 1.0 / math.Sqrt(float64(headDim)) - kqv := nn.Attention(ctx, q, k, v, scaleFactor, cache) - kqv = kqv.Reshape(ctx, headDim*opts.numHeads, batchSize) + query = fast.RoPE(ctx, query, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors)) + key = fast.RoPE(ctx, key, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors)) - return sa.Output.Forward(ctx, kqv) + attention := nn.Attention(ctx, query, key, value, 1.0/math.Sqrt(float64(headDim)), cache) + attention = attention.Reshape(ctx, headDim*opts.numHeads, batchSize) + return sa.Output.Forward(ctx, attention) } func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) { - return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale, rope.WithFactors(m.Layers[layer].SelfAttention.RopeFactors)), nil + ropeDim := cmp.Or(m.ropeDim, m.hiddenSize/m.numHeads) + return fast.RoPE(ctx, key, shift, ropeDim, m.ropeBase, m.ropeScale, rope.WithFactors(m.Layers[layer].SelfAttention.RopeFactors)), nil } type MLP struct { @@ -119,11 +120,11 @@ type Layer struct { MLP *MLP } -func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor { +func (l *Layer) Forward(ctx ml.Context, hiddenState, positions, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor { residual := hiddenState hiddenState = l.AttentionNorm.Forward(ctx, hiddenState, opts.eps) - hiddenState = l.SelfAttention.Forward(ctx, hiddenState, positionIDs, cache, opts) + hiddenState = l.SelfAttention.Forward(ctx, hiddenState, positions, cache, opts) // In the final layer (outputs != nil), optimize by pruning to just the token positions // we need logits for. 
@@ -146,22 +147,20 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { return nil, err } - outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs)) - if err != nil { - return nil, err - } - hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs) for i, layer := range m.Layers { m.Cache.SetLayer(i) - var lastLayerOutputs ml.Tensor + var outputs ml.Tensor if i == len(m.Layers)-1 { - lastLayerOutputs = outputs + outputs, err = ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs)) + if err != nil { + return nil, err + } } - hiddenState = layer.Forward(ctx, hiddenState, positions, lastLayerOutputs, m.Cache, m.Options) + hiddenState = layer.Forward(ctx, hiddenState, positions, outputs, m.Cache, m.Options) } hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps) diff --git a/model/models/models.go b/model/models/models.go index fd935f30..5471ce89 100644 --- a/model/models/models.go +++ b/model/models/models.go @@ -7,6 +7,7 @@ import ( _ "github.com/ollama/ollama/model/models/llama4" _ "github.com/ollama/ollama/model/models/mistral3" _ "github.com/ollama/ollama/model/models/mllama" + _ "github.com/ollama/ollama/model/models/qwen2" _ "github.com/ollama/ollama/model/models/qwen25vl" _ "github.com/ollama/ollama/model/models/qwen3" ) diff --git a/model/models/qwen2/model.go b/model/models/qwen2/model.go new file mode 100644 index 00000000..3c3d81aa --- /dev/null +++ b/model/models/qwen2/model.go @@ -0,0 +1,170 @@ +package qwen2 + +import ( + "cmp" + "math" + + "github.com/ollama/ollama/fs" + "github.com/ollama/ollama/kvcache" + "github.com/ollama/ollama/ml" + "github.com/ollama/ollama/ml/nn" + "github.com/ollama/ollama/ml/nn/fast" + "github.com/ollama/ollama/ml/nn/rope" + "github.com/ollama/ollama/model" + "github.com/ollama/ollama/model/input" +) + +type Options struct { + hiddenSize, numHeads, numKVHeads int + headDim, ropeDim int + eps, ropeBase, ropeScale float32 +} + +type Attention struct { + Query *nn.Linear `gguf:"attn_q"` + Key *nn.Linear `gguf:"attn_k"` + Value *nn.Linear `gguf:"attn_v"` + Output *nn.Linear `gguf:"attn_output"` +} + +func (attn Attention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor { + batchSize := hiddenStates.Dim(1) + headDim := cmp.Or(opts.headDim, opts.hiddenSize/opts.numHeads) + ropeDim := cmp.Or(opts.ropeDim, headDim) + + query := attn.Query.Forward(ctx, hiddenStates) + query = query.Reshape(ctx, headDim, opts.numHeads, batchSize) + + key := attn.Key.Forward(ctx, hiddenStates) + key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize) + + value := attn.Value.Forward(ctx, hiddenStates) + value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize) + + query = fast.RoPE(ctx, query, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX()) + key = fast.RoPE(ctx, key, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX()) + + attention := nn.Attention(ctx, query, key, value, 1.0/math.Sqrt(float64(headDim)), cache) + attention = attention.Reshape(ctx, headDim*opts.numHeads, batchSize) + + return attn.Output.Forward(ctx, attention) +} + +type MLP struct { + Gate *nn.Linear `gguf:"ffn_gate"` + Up *nn.Linear `gguf:"ffn_up"` + Down *nn.Linear `gguf:"ffn_down"` +} + +func (mlp MLP) Forward(ctx ml.Context, hiddenStates ml.Tensor) ml.Tensor { + hiddenStates = mlp.Gate.Forward(ctx, hiddenStates).SILU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenStates)) + return mlp.Down.Forward(ctx, hiddenStates) +} + +type DecoderLayer struct 
{ + AttentionNorm *nn.RMSNorm `gguf:"attn_norm"` + Attention *Attention + MLPNorm *nn.RMSNorm `gguf:"ffn_norm"` + MLP *MLP +} + +func (d DecoderLayer) Forward(ctx ml.Context, hiddenStates, positions, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor { + residual := hiddenStates + + hiddenStates = d.AttentionNorm.Forward(ctx, hiddenStates, opts.eps) + hiddenStates = d.Attention.Forward(ctx, hiddenStates, positions, cache, opts) + if outputs != nil { + hiddenStates = hiddenStates.Rows(ctx, outputs) + residual = residual.Rows(ctx, outputs) + } + + hiddenStates = hiddenStates.Add(ctx, residual) + residual = hiddenStates + + hiddenStates = d.MLPNorm.Forward(ctx, hiddenStates, opts.eps) + hiddenStates = d.MLP.Forward(ctx, hiddenStates) + return hiddenStates.Add(ctx, residual) +} + +type Model struct { + model.Base + model.BytePairEncoding + + TokenEmbedding *nn.Embedding `gguf:"token_embd"` + Layers []DecoderLayer `gguf:"blk"` + OutputNorm *nn.RMSNorm `gguf:"output_norm"` + Output *nn.Linear `gguf:"output,alt:token_embd"` + + Options +} + +// Forward implements model.Model. +func (m Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { + positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions)) + if err != nil { + return nil, err + } + + hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs) + + for i, layer := range m.Layers { + m.Cache.SetLayer(i) + + var outputs ml.Tensor + if i == len(m.Layers)-1 { + outputs, err = ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs)) + if err != nil { + return nil, err + } + } + + hiddenStates = layer.Forward(ctx, hiddenStates, positions, outputs, m.Cache, &m.Options) + } + + hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.eps) + hiddenStates = m.Output.Forward(ctx, hiddenStates) + return hiddenStates, nil +} + +func (m Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) { + ropeDim := cmp.Or(m.ropeDim, m.hiddenSize/m.numHeads) + return fast.RoPE(ctx, key, shift, ropeDim, m.ropeBase, m.ropeScale, rope.WithTypeNeoX()), nil +} + +func New(c fs.Config) (model.Model, error) { + m := Model{ + Layers: make([]DecoderLayer, c.Uint("block_count")), + BytePairEncoding: model.NewBytePairEncoding( + c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`), + &model.Vocabulary{ + Values: c.Strings("tokenizer.ggml.tokens"), + Types: c.Ints("tokenizer.ggml.token_type"), + Merges: c.Strings("tokenizer.ggml.merges"), + AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true), + BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))}, + AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false), + EOS: append( + []int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))}, + c.Ints("tokenizer.ggml.eos_token_ids")..., + ), + }, + ), + Options: Options{ + hiddenSize: int(c.Uint("embedding_length")), + numHeads: int(c.Uint("attention.head_count")), + numKVHeads: int(c.Uint("attention.head_count_kv")), + headDim: int(c.Uint("attention.key_length")), + ropeDim: int(c.Uint("rope.dimension_count")), + ropeBase: c.Float("rope.freq_base"), + ropeScale: c.Float("rope.freq_scale", 1), + eps: c.Float("attention.layer_norm_rms_epsilon"), + }, + } + + m.Cache = kvcache.NewCausalCache(m.Shift) + return &m, nil +} + +func init() { + model.Register("qwen2", New) +} From 7359b0270767d1ebda33598d738a4263a4238b3a Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Wed, 21 May 2025 10:46:56 -0700 Subject: 
[PATCH 047/108] win: detect background upgrade in progress (#10785) Give the user a helpful error instead of showing connection refused errors. --- cmd/cmd.go | 4 ++-- cmd/start_windows.go | 50 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 51 insertions(+), 3 deletions(-) diff --git a/cmd/cmd.go b/cmd/cmd.go index ad4be7f9..b9047529 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -1236,11 +1236,11 @@ func checkServerHeartbeat(cmd *cobra.Command, _ []string) error { return err } if err := client.Heartbeat(cmd.Context()); err != nil { - if !strings.Contains(err.Error(), " refused") { + if !(strings.Contains(err.Error(), " refused") || strings.Contains(err.Error(), "could not connect")) { return err } if err := startApp(cmd.Context(), client); err != nil { - return errors.New("could not connect to ollama app, is it running?") + return fmt.Errorf("ollama server not responding - %w", err) } } return nil diff --git a/cmd/start_windows.go b/cmd/start_windows.go index 5bca2433..bcc51057 100644 --- a/cmd/start_windows.go +++ b/cmd/start_windows.go @@ -4,17 +4,27 @@ import ( "context" "errors" "fmt" + "log/slog" "os" "os/exec" + "path" "path/filepath" "strings" "syscall" + "unsafe" "github.com/ollama/ollama/api" + "golang.org/x/sys/windows" +) + +const ( + Installer = "OllamaSetup.exe" ) func startApp(ctx context.Context, client *api.Client) error { - // log.Printf("XXX Attempting to find and start ollama app") + if len(isProcRunning(Installer)) > 0 { + return fmt.Errorf("upgrade in progress...") + } AppName := "ollama app.exe" exe, err := os.Executable() if err != nil { @@ -56,3 +66,41 @@ func startApp(ctx context.Context, client *api.Client) error { } return waitForServer(ctx, client) } + +func isProcRunning(procName string) []uint32 { + pids := make([]uint32, 2048) + var ret uint32 + if err := windows.EnumProcesses(pids, &ret); err != nil || ret == 0 { + slog.Debug("failed to check for running installers", "error", err) + return nil + } + pids = pids[:ret] + var matches []uint32 + for _, pid := range pids { + if pid == 0 { + continue + } + hProcess, err := windows.OpenProcess(windows.PROCESS_QUERY_INFORMATION|windows.PROCESS_VM_READ, false, pid) + if err != nil { + continue + } + defer windows.CloseHandle(hProcess) + var module windows.Handle + var cbNeeded uint32 + cb := (uint32)(unsafe.Sizeof(module)) + if err := windows.EnumProcessModules(hProcess, &module, cb, &cbNeeded); err != nil { + continue + } + var sz uint32 = 1024 * 8 + moduleName := make([]uint16, sz) + cb = uint32(len(moduleName)) * (uint32)(unsafe.Sizeof(uint16(0))) + if err := windows.GetModuleBaseName(hProcess, module, &moduleName[0], cb); err != nil && err != syscall.ERROR_INSUFFICIENT_BUFFER { + continue + } + exeFile := path.Base(strings.ToLower(syscall.UTF16ToString(moduleName))) + if strings.EqualFold(exeFile, procName) { + matches = append(matches, pid) + } + } + return matches +} From 61aeaf7e813bf307f1b28480c7ee2aed639b28f7 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 21 May 2025 13:55:31 -0700 Subject: [PATCH 048/108] remove support for multiple ggufs in a single file (#10722) * remove support for multiple ggufs in a single file this was an attempt to make it easier to import multimodal models into ollama. 
this was rarely used and error prone so remove it * fix: create fused model from blob --- server/create.go | 51 ++++++++++++++---------------------------------- 1 file changed, 15 insertions(+), 36 deletions(-) diff --git a/server/create.go b/server/create.go index 9488de35..bd970876 100644 --- a/server/create.go +++ b/server/create.go @@ -501,48 +501,27 @@ func ggufLayers(digest string, fn func(resp api.ProgressResponse)) ([]*layerGGML return nil, errOnlyGGUFSupported } - stat, err := blob.Stat() + f, err := ggml.Decode(blob, -1) if err != nil { return nil, err } - var offset int64 - for offset < stat.Size() { - f, err := ggml.Decode(blob, -1) - if errors.Is(err, io.EOF) { - break - } else if err != nil { - return nil, err - } - - mediatype := "application/vnd.ollama.image.model" - if f.KV().Kind() == "adapter" { - mediatype = "application/vnd.ollama.image.adapter" - } else if _, ok := f.KV()[fmt.Sprintf("%s.vision.block_count", f.KV().Architecture())]; ok || f.KV().Kind() == "projector" { - mediatype = "application/vnd.ollama.image.projector" - } - - var layer Layer - if digest != "" && f.Length == stat.Size() && offset == 0 { - layer, err = NewLayerFromLayer(digest, mediatype, blob.Name()) - if err != nil { - slog.Debug("could not create new layer from layer", "error", err) - return nil, err - } - } - - // Fallback to creating layer from file copy (either NewLayerFromLayer failed, or digest empty/n != stat.Size()) - if layer.Digest == "" { - layer, err = NewLayer(io.NewSectionReader(blob, offset, f.Length), mediatype) - if err != nil { - return nil, err - } - } - - layers = append(layers, &layerGGML{layer, f}) - offset = f.Length + mediatype := "application/vnd.ollama.image.model" + if f.KV().Kind() == "adapter" { + mediatype = "application/vnd.ollama.image.adapter" + } else if (f.KV().Uint("block_count") == 0 && f.KV().Uint("vision.block_count") > 0) || f.KV().Kind() == "projector" { + // if a model has vision.block_count but not block_count, it is a standalone vision model + mediatype = "application/vnd.ollama.image.projector" } + layer, err := NewLayerFromLayer(digest, mediatype, blob.Name()) + if err != nil { + slog.Debug("could not create new layer from layer", "error", err) + return nil, err + } + + layers = append(layers, &layerGGML{layer, f}) + return detectChatTemplate(layers) } From fdd4d479a3cbaff1a7fe849e38a145652f87d611 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 22 May 2025 09:12:32 -0700 Subject: [PATCH 049/108] integration: add qwen2.5-vl (#10815) Replace the older llava model with qwen2.5 for vision tests Skip split-batch test on small VRAM systems to avoid excessive test time --- integration/llm_image_test.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/integration/llm_image_test.go b/integration/llm_image_test.go index b9726c8f..bbd031a9 100644 --- a/integration/llm_image_test.go +++ b/integration/llm_image_test.go @@ -19,7 +19,7 @@ func TestVisionModels(t *testing.T) { } testCases := []testCase{ { - model: "llava:7b", + model: "qwen2.5vl", }, { model: "llama3.2-vision", @@ -60,6 +60,7 @@ func TestVisionModels(t *testing.T) { } func TestIntegrationSplitBatch(t *testing.T) { + skipUnderMinVRAM(t, 6) image, err := base64.StdEncoding.DecodeString(imageEncoding) require.NoError(t, err) req := api.GenerateRequest{ From fbe6ae285a23baddb14c5bbce26d4fcb837503e4 Mon Sep 17 00:00:00 2001 From: Bruce MacDonald Date: Thu, 22 May 2025 10:48:08 -0700 Subject: [PATCH 050/108] server: improve tensor quantization fallback logic (#10806) Fall back 
to alternative quantization types when a tensor's dimensions aren't divisible by the block size required for the original desired quantization type. If retried quantization types fail, the system ultimately falls back to F16 (half-precision floating point) which has a block size of 1 and can handle any tensor dimension. --- server/quantization.go | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/server/quantization.go b/server/quantization.go index adfc948e..e57e8a4d 100644 --- a/server/quantization.go +++ b/server/quantization.go @@ -120,14 +120,30 @@ func getTensorNewType(kv fsggml.KV, qs *quantizeState, newType fsggml.TensorType if newType.IsQuantized() { nx := shape[0] - ny := uint64(1) - if len(shape) > 1 { - ny = shape[1] - } qk_k := newType.BlockSize() + + // Check if first dimension is divisible by block size if nx%qk_k != 0 { - slog.Warn(fmt.Sprintf("tensor cols %d x %d are not divisible by %d, required for %s. Falling back to quantization %s", nx, ny, qk_k, newType.String(), fsggml.TensorTypeF16.String())) - newType = fsggml.TensorTypeF16 + // Store the original type for logging + originalType := newType + + // Select appropriate fallback based on original type + switch newType { + case fsggml.TensorTypeQ4_K: + newType = fsggml.TensorTypeQ5_0 + case fsggml.TensorTypeQ5_K: + newType = fsggml.TensorTypeQ5_1 + case fsggml.TensorTypeQ6_K: + newType = fsggml.TensorTypeQ8_0 + } + + // Final check - if still incompatible, fall back to F16 + if nx%newType.BlockSize() != 0 { + newType = fsggml.TensorTypeF16 + } + + slog.Warn(fmt.Sprintf("tensor cols %d are not divisible by %d, required for %s - using fallback quantization %s", + nx, qk_k, originalType.String(), newType.String())) } } return newType From adff143bcda0c7ab4ca3a85dc3db5a81552368c7 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Thu, 22 May 2025 11:30:49 -0700 Subject: [PATCH 051/108] fix: mllama quality (#10807) * fix mllama convert - transform attn_gate and ffn_gate - swap attention heads for vision models * fix mllama the mlp gate which was applied in the wrong place --- convert/convert_mllama.go | 65 +++++++++++++++++++---------- model/models/mllama/model_vision.go | 22 ++-------- 2 files changed, 45 insertions(+), 42 deletions(-) diff --git a/convert/convert_mllama.go b/convert/convert_mllama.go index 12478be7..69d7f588 100644 --- a/convert/convert_mllama.go +++ b/convert/convert_mllama.go @@ -94,7 +94,9 @@ func (m *mllamaModel) Tensors(ts []Tensor) []*ggml.Tensor { var out []*ggml.Tensor var text []Tensor for _, t := range ts { - if t.Name() == "v.position_embd.gate" { + if !strings.HasPrefix(t.Name(), "v.") && !strings.HasPrefix(t.Name(), "mm.") { + text = append(text, t) + } else if t.Name() == "v.position_embd.gate" { for _, name := range []string{"v.position_embd.gate", "v.tile_position_embd.gate"} { tt := t.Clone() tt.SetRepacker(m.repack(name)) @@ -105,23 +107,21 @@ func (m *mllamaModel) Tensors(ts []Tensor) []*ggml.Tensor { WriterTo: tt, }) } - } else if t.Name() == "v.pre_tile_position_embd.gate" || t.Name() == "v.post_tile_position_embd.gate" { - t.SetRepacker(m.repack(t.Name())) - out = append(out, &ggml.Tensor{ - Name: t.Name(), - Kind: t.Kind(), - Shape: t.Shape(), - WriterTo: t, - }) - } else if strings.HasPrefix(t.Name(), "v.") || strings.HasPrefix(t.Name(), "mm.") { - out = append(out, &ggml.Tensor{ - Name: t.Name(), - Kind: t.Kind(), - Shape: t.Shape(), - WriterTo: t, - }) } else { - text = append(text, t) + if t.Name() == "v.pre_tile_position_embd.gate" || 
t.Name() == "v.post_tile_position_embd.gate" { + t.SetRepacker(m.repack(t.Name())) + } else if strings.HasSuffix(t.Name(), "attn_q.weight") || strings.HasSuffix(t.Name(), "attn_k.weight") { + t.SetRepacker(m.repack(t.Name())) + } else if strings.HasSuffix(t.Name(), "attn_gate") || strings.HasSuffix(t.Name(), "ffn_gate") { + t.SetRepacker(m.repack(t.Name())) + } + + out = append(out, &ggml.Tensor{ + Name: t.Name(), + Kind: t.Kind(), + Shape: t.Shape(), + WriterTo: t, + }) } } @@ -137,16 +137,35 @@ func (m *mllamaModel) repack(name string) Repacker { var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data)) - t, err = tensor.Tanh(t) - if err != nil { - return nil, err - } + if strings.HasSuffix(name, "attn_q.weight") || strings.HasSuffix(name, "attn_k.weight") { + heads := m.VisionModel.AttentionHeads + if err := t.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil { + return nil, err + } - if name == "v.position_embd.gate" { - t, err = tensor.Sub(float32(1), t) + if err := t.T(0, 2, 1, 3); err != nil { + return nil, err + } + + if err := t.Reshape(dims...); err != nil { + return nil, err + } + + if err := t.Transpose(); err != nil { + return nil, err + } + } else { + t, err = tensor.Tanh(t) if err != nil { return nil, err } + + if name == "v.position_embd.gate" { + t, err = tensor.Sub(float32(1), t) + if err != nil { + return nil, err + } + } } t = tensor.Materialize(t) diff --git a/model/models/mllama/model_vision.go b/model/models/mllama/model_vision.go index 77ea5373..2d424947 100644 --- a/model/models/mllama/model_vision.go +++ b/model/models/mllama/model_vision.go @@ -16,8 +16,6 @@ type VisionSelfAttention struct { Key *nn.Linear `gguf:"attn_k"` Value *nn.Linear `gguf:"attn_v"` Output *nn.Linear `gguf:"attn_output"` - - Gate ml.Tensor `gguf:"attn_gate"` } func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor { @@ -25,27 +23,16 @@ func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, op query := sa.Query.Forward(ctx, hiddenState) query = query.Reshape(ctx, headDim, opts.numHeads, query.Dim(1), batchSize) - query = query.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx) key := sa.Key.Forward(ctx, hiddenState) key = key.Reshape(ctx, headDim, opts.numHeads, key.Dim(1), batchSize) - key = key.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx) value := sa.Value.Forward(ctx, hiddenState) value = value.Reshape(ctx, headDim, opts.numHeads, value.Dim(1), batchSize) - value = value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx) - scores := key.Mulmat(ctx, query) - scores = scores.Scale(ctx, 1.0/math.Sqrt(float64(headDim))) - scores = scores.Softmax(ctx) - - attention := value.Mulmat(ctx, scores) - attention = attention.Reshape(ctx, headDim, attention.Dim(1), opts.numHeads, batchSize) - attention = attention.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx) + attention := nn.Attention(ctx, query, key, value, 1./math.Sqrt(float64(headDim)), nil) attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize) - - hiddenState = sa.Output.Forward(ctx, attention) - return hiddenState + return sa.Output.Forward(ctx, attention) } type VisionMLP struct { @@ -76,21 +63,18 @@ func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts // self attention hiddenState = e.AttentionNorm.Forward(ctx, hiddenState, opts.eps) hiddenState = e.SelfAttention.Forward(ctx, hiddenState, opts) - if e.AttentionGate != nil { hiddenState = hiddenState.Mul(ctx, 
e.AttentionGate) } hiddenState = hiddenState.Add(ctx, residual) residual = hiddenState - // feed forward hiddenState = e.MLPNorm.Forward(ctx, hiddenState, opts.eps) hiddenState = e.MLP.Forward(ctx, hiddenState, opts) - hiddenState = hiddenState.Add(ctx, residual) if e.MLPGate != nil { hiddenState = hiddenState.Mul(ctx, e.MLPGate) } - + hiddenState = hiddenState.Add(ctx, residual) return hiddenState } From d950ff12c09c07a1cda7242373071fb9e7af9ddc Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 22 May 2025 14:31:36 -0700 Subject: [PATCH 052/108] sched: fix runner leak during reloading unload (#10819) When the same model is being reloaded rapidly with client connections being canceled before the model finishes loading, the queued unload event could cause a leak of runners by deleting a different runner from the loaded list. --- server/sched.go | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/server/sched.go b/server/sched.go index 3fc54e55..612e4702 100644 --- a/server/sched.go +++ b/server/sched.go @@ -387,6 +387,17 @@ func (s *Scheduler) processCompleted(ctx context.Context) { s.loadedMu.Unlock() runner.refMu.Unlock() slog.Debug("duplicate expired event, ignoring", "runner", runner) + } else if runner.pid != runnerToUnload.pid { + // If the pids do not match, we likely had multiple load + // failures for the same model in quick succession due to + // request context canceled and are draining the queue of + // events. Ensure the orphaned runner is properly shut down, but + // do not delete the mismatched loaded runner, or wait for VRAM + // convergence. + slog.Debug("orphaned runner shutting down", "orphan", runner, "loaded", runnerToUnload) + runner.unload() + s.loadedMu.Unlock() + runner.refMu.Unlock() } else { slog.Debug("starting background wait for VRAM recovery", "runner", runner) finished := runner.waitForVRAMRecovery() From 6db8a3771c29d070ef165cca0d7e8dbda3fc341e Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Fri, 16 May 2025 14:05:08 -0700 Subject: [PATCH 053/108] ggml: Report graph memory for failed allocations GGML has a function to report the allocated size of a backend buffer. However, this returns 0 if we tried to allocate a buffer and it failed. For memory management purposes, it's important to know how much we were trying to allocate. This extends the API to report attempted sizes for all buffers and whether it succeeeded. 
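For illustration only (not part of this change), a caller could walk the per-backend results after a reservation attempt and log how much each allocation tried to reserve, whether or not it succeeded. The scheduler and measure graph here are assumed to have been created elsewhere:

    #include <stdbool.h>
    #include <stdio.h>
    #include "ggml-backend.h"

    // Sketch: report what each backend attempted to allocate for a measure
    // graph. With this patch, the attempted size is available even when the
    // underlying buffer allocation failed.
    static void log_reserve_attempt(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
        bool ok = ggml_backend_sched_reserve(sched, graph);
        for (int i = 0; i < ggml_backend_sched_get_n_backends(sched); i++) {
            ggml_backend_t backend = ggml_backend_sched_get_backend(sched, i);
            struct ggml_backend_buffer_status status =
                ggml_backend_sched_get_attempted_buffer_size(sched, backend);
            fprintf(stderr, "%s: attempted %zu bytes (%s)\n",
                ggml_backend_name(backend), status.size,
                status.allocated ? "allocated" : "failed");
        }
        if (!ok) {
            // reservation failed, but the sizes above still reflect the attempt
        }
    }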
--- ...16-graph-memory-reporting-on-failure.patch | 156 ++++++++++++++++++ ml/backend/ggml/ggml/include/ggml-alloc.h | 6 + ml/backend/ggml/ggml/include/ggml-backend.h | 6 + ml/backend/ggml/ggml/src/ggml-alloc.c | 38 ++++- ml/backend/ggml/ggml/src/ggml-backend.cpp | 10 ++ 5 files changed, 212 insertions(+), 4 deletions(-) create mode 100644 llama/patches/0016-graph-memory-reporting-on-failure.patch diff --git a/llama/patches/0016-graph-memory-reporting-on-failure.patch b/llama/patches/0016-graph-memory-reporting-on-failure.patch new file mode 100644 index 00000000..92188224 --- /dev/null +++ b/llama/patches/0016-graph-memory-reporting-on-failure.patch @@ -0,0 +1,156 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Jesse Gross +Date: Fri, 18 Apr 2025 15:58:19 -0700 +Subject: [PATCH] graph memory reporting on failure + +--- + ggml/include/ggml-alloc.h | 6 ++++++ + ggml/include/ggml-backend.h | 6 ++++++ + ggml/src/ggml-alloc.c | 38 +++++++++++++++++++++++++++++++++---- + ggml/src/ggml-backend.cpp | 10 ++++++++++ + 4 files changed, 56 insertions(+), 4 deletions(-) + +diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h +index 2cb150fd..781b1e10 100644 +--- a/ggml/include/ggml-alloc.h ++++ b/ggml/include/ggml-alloc.h +@@ -66,6 +66,12 @@ GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph + + GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id); + ++struct ggml_allocr_buffer_status { ++ size_t size; ++ bool allocated; ++}; ++GGML_API struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id); ++ + // Utils + // Create a buffer and allocate all the tensors in a ggml_context + GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft); +diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h +index 778927f6..74e46716 100644 +--- a/ggml/include/ggml-backend.h ++++ b/ggml/include/ggml-backend.h +@@ -304,6 +304,12 @@ extern "C" { + + GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend); + ++ struct ggml_backend_buffer_status { ++ size_t size; ++ bool allocated; ++ }; ++ GGML_API struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend); ++ + GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend); + GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node); + +diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c +index 5fd379f6..04812990 100644 +--- a/ggml/src/ggml-alloc.c ++++ b/ggml/src/ggml-alloc.c +@@ -364,6 +364,7 @@ struct node_alloc { + struct ggml_gallocr { + ggml_backend_buffer_type_t * bufts; // [n_buffers] + ggml_backend_buffer_t * buffers; // [n_buffers] ++ size_t *buffer_sizes; // [n_buffers] + struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers] + int n_buffers; + +@@ -387,6 +388,9 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs + galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t)); + GGML_ASSERT(galloc->buffers != NULL); + ++ galloc->buffer_sizes = calloc(n_bufs, sizeof(size_t)); ++ GGML_ASSERT(galloc->buffer_sizes != NULL); ++ + galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *)); + 
GGML_ASSERT(galloc->buf_tallocs != NULL); + +@@ -453,6 +457,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) { + ggml_hash_set_free(&galloc->hash_set); + free(galloc->hash_values); + free(galloc->bufts); ++ free(galloc->buffer_sizes); + free(galloc->buffers); + free(galloc->buf_tallocs); + free(galloc->node_allocs); +@@ -748,6 +753,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c + } + } + ++ bool success = true; ++ + // reallocate buffers if needed + for (int i = 0; i < galloc->n_buffers; i++) { + // if the buffer type is used multiple times, we reuse the same buffer +@@ -769,15 +776,20 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c + + ggml_backend_buffer_free(galloc->buffers[i]); + galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size); +- if (galloc->buffers[i] == NULL) { ++ if (galloc->buffers[i]) { ++ galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]); ++ ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE); ++ } else { + GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size); +- return false; ++ galloc->buffer_sizes[i] = new_size; ++ success = false; + } +- ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE); ++ } else { ++ galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]); + } + } + +- return true; ++ return success; + } + + bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) { +@@ -934,6 +946,24 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) { + return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]); + } + ++struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) { ++ GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers); ++ ++ for (int i = 0; i < buffer_id; i++) { ++ if (galloc->buf_tallocs[i] == galloc->buf_tallocs[buffer_id]) { ++ // This buffer is the same as a previous one due to the same buffer type being used multiple times ++ // (See above.) However, we need a different check because multiple buffers might be NULL in our ++ // case and we still want to know the attempted size. 
++ ++ struct ggml_allocr_buffer_status status = {0, true}; ++ return status; ++ } ++ } ++ ++ struct ggml_allocr_buffer_status status = {galloc->buffer_sizes[buffer_id], galloc->buffers[buffer_id] != NULL}; ++ return status; ++} ++ + // utils + + static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) { +diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp +index 0ce73a99..be335e8c 100644 +--- a/ggml/src/ggml-backend.cpp ++++ b/ggml/src/ggml-backend.cpp +@@ -1629,6 +1629,16 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe + return ggml_gallocr_get_buffer_size(sched->galloc, backend_index); + } + ++struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) { ++ int backend_index = ggml_backend_sched_backend_id(sched, backend); ++ GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); ++ ++ struct ggml_allocr_buffer_status allocr_status = ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index); ++ struct ggml_backend_buffer_status status = {allocr_status.size, allocr_status.allocated}; ++ ++ return status; ++} ++ + void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) { + int backend_index = ggml_backend_sched_backend_id(sched, backend); + GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); diff --git a/ml/backend/ggml/ggml/include/ggml-alloc.h b/ml/backend/ggml/ggml/include/ggml-alloc.h index 2cb150fd..781b1e10 100644 --- a/ml/backend/ggml/ggml/include/ggml-alloc.h +++ b/ml/backend/ggml/ggml/include/ggml-alloc.h @@ -66,6 +66,12 @@ GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id); +struct ggml_allocr_buffer_status { + size_t size; + bool allocated; +}; +GGML_API struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id); + // Utils // Create a buffer and allocate all the tensors in a ggml_context GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft); diff --git a/ml/backend/ggml/ggml/include/ggml-backend.h b/ml/backend/ggml/ggml/include/ggml-backend.h index 778927f6..74e46716 100644 --- a/ml/backend/ggml/ggml/include/ggml-backend.h +++ b/ml/backend/ggml/ggml/include/ggml-backend.h @@ -304,6 +304,12 @@ extern "C" { GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend); + struct ggml_backend_buffer_status { + size_t size; + bool allocated; + }; + GGML_API struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend); + GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend); GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node); diff --git a/ml/backend/ggml/ggml/src/ggml-alloc.c b/ml/backend/ggml/ggml/src/ggml-alloc.c index 5fd379f6..04812990 100644 --- a/ml/backend/ggml/ggml/src/ggml-alloc.c +++ b/ml/backend/ggml/ggml/src/ggml-alloc.c @@ -364,6 +364,7 @@ struct node_alloc { struct ggml_gallocr { ggml_backend_buffer_type_t * bufts; // [n_buffers] ggml_backend_buffer_t * buffers; // [n_buffers] + size_t *buffer_sizes; // [n_buffers] struct 
ggml_dyn_tallocr ** buf_tallocs; // [n_buffers] int n_buffers; @@ -387,6 +388,9 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t)); GGML_ASSERT(galloc->buffers != NULL); + galloc->buffer_sizes = calloc(n_bufs, sizeof(size_t)); + GGML_ASSERT(galloc->buffer_sizes != NULL); + galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *)); GGML_ASSERT(galloc->buf_tallocs != NULL); @@ -453,6 +457,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) { ggml_hash_set_free(&galloc->hash_set); free(galloc->hash_values); free(galloc->bufts); + free(galloc->buffer_sizes); free(galloc->buffers); free(galloc->buf_tallocs); free(galloc->node_allocs); @@ -748,6 +753,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c } } + bool success = true; + // reallocate buffers if needed for (int i = 0; i < galloc->n_buffers; i++) { // if the buffer type is used multiple times, we reuse the same buffer @@ -769,15 +776,20 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c ggml_backend_buffer_free(galloc->buffers[i]); galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size); - if (galloc->buffers[i] == NULL) { + if (galloc->buffers[i]) { + galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]); + ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE); + } else { GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size); - return false; + galloc->buffer_sizes[i] = new_size; + success = false; } - ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE); + } else { + galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]); } } - return true; + return success; } bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) { @@ -934,6 +946,24 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) { return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]); } +struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) { + GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers); + + for (int i = 0; i < buffer_id; i++) { + if (galloc->buf_tallocs[i] == galloc->buf_tallocs[buffer_id]) { + // This buffer is the same as a previous one due to the same buffer type being used multiple times + // (See above.) However, we need a different check because multiple buffers might be NULL in our + // case and we still want to know the attempted size. 
+ + struct ggml_allocr_buffer_status status = {0, true}; + return status; + } + } + + struct ggml_allocr_buffer_status status = {galloc->buffer_sizes[buffer_id], galloc->buffers[buffer_id] != NULL}; + return status; +} + // utils static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) { diff --git a/ml/backend/ggml/ggml/src/ggml-backend.cpp b/ml/backend/ggml/ggml/src/ggml-backend.cpp index 0ce73a99..be335e8c 100644 --- a/ml/backend/ggml/ggml/src/ggml-backend.cpp +++ b/ml/backend/ggml/ggml/src/ggml-backend.cpp @@ -1629,6 +1629,16 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe return ggml_gallocr_get_buffer_size(sched->galloc, backend_index); } +struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) { + int backend_index = ggml_backend_sched_backend_id(sched, backend); + GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); + + struct ggml_allocr_buffer_status allocr_status = ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index); + struct ggml_backend_buffer_status status = {allocr_status.size, allocr_status.allocated}; + + return status; +} + void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) { int backend_index = ggml_backend_sched_backend_id(sched, backend); GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); From 73d6a82cce18f84ff5c67148783224cf25b30b32 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Thu, 17 Apr 2025 11:00:25 -0700 Subject: [PATCH 054/108] ollamarunner: Memory usage reporting This provides granular information about the backend memory allocations required by the runner: - Per backend - Per layer - Weights, cache and graph - Allocation status This can be used for debugging and validating memory estimates. --- kvcache/causal_test.go | 2 +- ml/backend.go | 84 ++++++++++++++- ml/backend/ggml/ggml.go | 163 ++++++++++++++++++++---------- runner/ollamarunner/multimodal.go | 5 +- runner/ollamarunner/runner.go | 48 +++++---- 5 files changed, 224 insertions(+), 78 deletions(-) diff --git a/kvcache/causal_test.go b/kvcache/causal_test.go index 79698708..820d496d 100644 --- a/kvcache/causal_test.go +++ b/kvcache/causal_test.go @@ -508,7 +508,7 @@ func (c *testContext) Forward(...ml.Tensor) ml.Context { return c } func (c *testContext) Compute(...ml.Tensor) {} -func (c *testContext) Reserve() error { return nil } +func (c *testContext) Reserve() {} func (c *testContext) MaxGraphNodes() int { return 10 diff --git a/ml/backend.go b/ml/backend.go index 3c417ef9..7c9b9e31 100644 --- a/ml/backend.go +++ b/ml/backend.go @@ -15,6 +15,10 @@ import ( type Backend interface { Load(ctx context.Context, progress func(float32)) error + + // BackendMemory returns the memory allocations that were made for this model + BackendMemory() BackendMemory + Config() fs.Config Get(name string) Tensor NewContext() Context @@ -68,6 +72,84 @@ type BackendParams struct { FlashAttention bool } +// ErrNoMem is returned when panicing due to insufficient memory. It includes +// the attempted memory allocation. 
+type ErrNoMem struct { + BackendMemory +} + +func (e ErrNoMem) Error() string { + return fmt.Sprintf("insufficient memory - required allocations: %+v", e.BackendMemory) +} + +type AllocationStatus int + +const ( + // Unallocated memory - have not yet attempted to allocate + Unallocated AllocationStatus = iota + + // Failed memory - tried to allocate the memory and did not succeed + Failed + + // Allocated memory = tried and succeeded to allocate memory + Allocated +) + +// Memory is the size of an allocation and whether it was successful. +type Memory struct { + Size uint64 + Status AllocationStatus +} + +func (m Memory) String() string { + s := fmt.Sprint(m.Size) + + switch m.Status { + case Unallocated: + s += "U" + case Failed: + s += "F" + case Allocated: + s += "A" + } + + return s +} + +// DeviceMemory provides a breakdown of the memory needed +// per device, such as a CPU or GPU. +type DeviceMemory struct { + // Name is the name of the device as labeled by the backend. It + // may not be persistent across instances of the runner. + Name string + + // Weights is the per-layer memory needed for the model weights. + Weights []Memory + + // Cache is the per-layer memory needed for the KV cache. + Cache []Memory + + // Graph is the size of the compute graph. It is not per-layer. + Graph Memory +} + +// BackendMemory provides the amount of memory required to load the model +// per device based on the BackendParams. In some cases, not all required +// allocations will be known at this point. However, the size of the most recent +// allocation is guaranteed to be provided so that if it failed, the caller can +// accommodate that to make forward progress. +type BackendMemory struct { + // InputsWeights are always located on the CPU and cannot be moved + InputWeights Memory + + // CPU model components are located in system memory. This does not + // include unified memory allocated through the GPU. + CPU DeviceMemory + + // GPU model components are located on one or more GPUs. + GPUs []DeviceMemory +} + var backends = make(map[string]func(string, BackendParams) (Backend, error)) func RegisterBackend(name string, f func(string, BackendParams) (Backend, error)) { @@ -102,7 +184,7 @@ type Context interface { // graph, simply preallocates memory. Typically called with a // worst case graph to ensure all resources are available for // for future inference. 
- Reserve() error + Reserve() MaxGraphNodes() int Close() diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index 44e3b61b..496ba8a6 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -10,7 +10,6 @@ import "C" import ( "context" - "errors" "fmt" "io" "log/slog" @@ -66,6 +65,12 @@ type Backend struct { // layers is the backend used for repeating layers layers map[int]*C.struct_ggml_backend_buffer_type + // requiredMemory is the cumulative memory allocations needed by the backend + requiredMemory *ml.BackendMemory + + // btDeviceMemory maps from a buffer type to the memory allocations associated with that device + btDeviceMemory map[*C.struct_ggml_backend_buffer_type]*ml.DeviceMemory + flashAttention bool // maxGraphNodes is the maximum allowed number of graph nodes in this scheduler @@ -94,6 +99,9 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { "num_key_values", len(meta.KV()), ) + var requiredMemory ml.BackendMemory + btDeviceMemory := make(map[*C.struct_ggml_backend_buffer_type]*ml.DeviceMemory) + type deviceBufferType struct { d *C.struct_ggml_backend_device bts []*C.struct_ggml_backend_buffer_type @@ -114,6 +122,8 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { } } + blocks := int(meta.KV().BlockCount()) + // create list of buffer types for the cpu cpuDeviceBufferType := deviceBufferType{d: C.ggml_backend_dev_by_type(C.GGML_BACKEND_DEVICE_TYPE_CPU)} for _, d := range append(accels, append(gpus, cpus...)...) { @@ -121,17 +131,27 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { case C.GGML_BACKEND_DEVICE_TYPE_CPU, C.GGML_BACKEND_DEVICE_TYPE_ACCEL: cpuDeviceBufferType.bts = append(cpuDeviceBufferType.bts, C.ggml_backend_dev_buffer_type(d)) + btDeviceMemory[C.ggml_backend_dev_buffer_type(d)] = &requiredMemory.CPU } } + requiredMemory.CPU.Name = C.GoString(C.ggml_backend_dev_name(cpuDeviceBufferType.d)) + requiredMemory.CPU.Weights = make([]ml.Memory, blocks+1) + requiredMemory.CPU.Cache = make([]ml.Memory, blocks+1) + // create list of buffer types for each gpu var gpuDeviceBufferTypes []deviceBufferType - for _, d := range gpus { + requiredMemory.GPUs = make([]ml.DeviceMemory, len(gpus)) + for i, d := range gpus { bt := C.ggml_backend_dev_buffer_type(d) gpuDeviceBufferTypes = append(gpuDeviceBufferTypes, deviceBufferType{ d: d, bts: append([]*C.struct_ggml_backend_buffer_type{bt}, cpuDeviceBufferType.bts...), }) + btDeviceMemory[bt] = &requiredMemory.GPUs[i] + requiredMemory.GPUs[i].Name = C.GoString(C.ggml_backend_dev_name(d)) + requiredMemory.GPUs[i].Weights = make([]ml.Memory, blocks+1) + requiredMemory.GPUs[i].Cache = make([]ml.Memory, blocks+1) } useDefaultSplit := true @@ -170,8 +190,6 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { // inputs always use cpu input := cpuDeviceBufferType - blocks := int(meta.KV().BlockCount()) - // define a range of gpu layers. 
anything outside of this range is assigned to the cpu gpuRangeStart := max(0, blocks-params.NumGPULayers) gpuRangeStop := min(gpuRangeStart+params.NumGPULayers, blocks+1) @@ -212,7 +230,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { // contexts are shared by tensors of the same buffer type ctxs := make(map[*C.struct_ggml_backend_buffer_type]*C.struct_ggml_context) - createTensor := func(t tensor, bts []*C.struct_ggml_backend_buffer_type) *C.struct_ggml_tensor { + createTensor := func(t tensor, bts []*C.struct_ggml_backend_buffer_type, layer int) *C.struct_ggml_tensor { for _, bt := range bts { if _, ok := ctxs[bt]; !ok { ctxs[bt] = C.ggml_init(C.struct_ggml_init_params{ @@ -238,6 +256,16 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { C.ggml_set_name(tt, cname) slog.Log(context.TODO(), logutil.LevelTrace, "created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt))) + + size := pad(C.ggml_backend_buft_get_alloc_size(bt, tt), C.ggml_backend_buft_get_alignment(bt)) + if layer == -1 { + // Assume that InputWeights can be allocated - they're always in system memory and can't be moved in any case + requiredMemory.InputWeights.Status = ml.Allocated + requiredMemory.InputWeights.Size += uint64(size) + } else { + btDeviceMemory[bt].Weights[layer].Size += uint64(size) + } + //nolint:staticcheck // TODO: check if buffer type supports this tensor return tt } @@ -259,22 +287,22 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { for _, t := range meta.Tensors().Items() { switch { case contains(t.Name, "position_embd", "token_embd", "token_norm_embd", "token_types"): - createTensor(tensor{source: t}, input.bts) + createTensor(tensor{source: t}, input.bts, -1) if _, ok := meta.Tensors().GroupLayers()["output"]; !ok && t.Name == "token_embd.weight" { - createTensor(tensor{source: t, target: "output.weight"}, output.bts) + createTensor(tensor{source: t, target: "output.weight"}, output.bts, blocks) } case contains(t.Name, "cls", "output", "output_norm"): - createTensor(tensor{source: t}, output.bts) + createTensor(tensor{source: t}, output.bts, blocks) case strings.HasPrefix(t.Name, "v.") || strings.HasPrefix(t.Name, "mm."): // TODO: assign vision tensors to the gpu if possible - createTensor(tensor{source: t}, output.bts) + createTensor(tensor{source: t}, output.bts, blocks) case contains(t.Name, "rope_freqs", "rope_factors_long", "rope_factors_short"): // these tensors should be repeated per layer for i, layer := range layers { createTensor(tensor{ source: t, target: "blk." + strconv.Itoa(i) + "." 
+ t.Name, - }, layer.bts) + }, layer.bts, i) } default: layerIndex := -1 @@ -285,10 +313,10 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { } if layerIndex >= 0 { - createTensor(tensor{source: t}, layers[layerIndex].bts) + createTensor(tensor{source: t}, layers[layerIndex].bts, layerIndex) } else { // load all other tensors on the cpu - createTensor(tensor{source: t}, input.bts) + createTensor(tensor{source: t}, input.bts, -1) } } } @@ -301,8 +329,18 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { } b := C.ggml_backend_alloc_ctx_tensors_from_buft(c, bt) + for i := range btDeviceMemory[bt].Weights { + if btDeviceMemory[bt].Weights[i].Size != 0 { + if b != nil { + btDeviceMemory[bt].Weights[i].Status = ml.Allocated + } else { + btDeviceMemory[bt].Weights[i].Status = ml.Failed + } + } + } + if b == nil { - return nil, fmt.Errorf("unable to allocate memory from device %v for model weights", C.GoString(C.ggml_backend_buft_name(bt))) + panic(ml.ErrNoMem{BackendMemory: requiredMemory}) } C.ggml_backend_buffer_set_usage(b, C.GGML_BACKEND_BUFFER_USAGE_WEIGHTS) @@ -367,7 +405,9 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { } return m }(), - maxGraphNodes: maxGraphNodes, + requiredMemory: &requiredMemory, + btDeviceMemory: btDeviceMemory, + maxGraphNodes: maxGraphNodes, }, nil } @@ -446,6 +486,10 @@ func (b *Backend) Load(ctx context.Context, progress func(float32)) error { return nil } +func (b *Backend) BackendMemory() ml.BackendMemory { + return *b.requiredMemory +} + func (b *Backend) Config() fs.Config { return b.meta.KV() } @@ -477,6 +521,7 @@ func (b *Backend) NewContextSize(n int) ml.Context { no_alloc: true, }), allocatedBuffers: &allocatedBuffers, + layer: -1, } } @@ -503,6 +548,9 @@ type Context struct { // maxGraphNodes is the maximum allowed number of graph nodes in this context maxGraphNodes int + + // layer is the graph layer that this context is allocating for - assumed to be cache + layer int } func (c *Context) Input() ml.Context { @@ -513,6 +561,7 @@ func (c *Context) Input() ml.Context { buft: c.b.input, allocatedBuffers: c.allocatedBuffers, maxGraphNodes: c.maxGraphNodes, + layer: -1, } } @@ -527,6 +576,7 @@ func (c *Context) Layer(i int) ml.Context { buft: buft, allocatedBuffers: c.allocatedBuffers, maxGraphNodes: c.maxGraphNodes, + layer: i, } } @@ -564,22 +614,34 @@ func (c *Context) Compute(tensors ...ml.Tensor) { } } -func (c *Context) Reserve() error { - if !C.ggml_backend_sched_reserve(c.b.sched, c.graph) { - C.ggml_backend_sched_reset(c.b.sched) - return errors.New("failed to reserve graph") - } +func (c *Context) Reserve() { + reserved := C.ggml_backend_sched_reserve(c.b.sched, c.graph) slog.Debug("compute graph", "nodes", C.ggml_graph_n_nodes(c.graph), "splits", C.ggml_backend_sched_get_n_splits(c.b.sched)) - for i := range c.b.schedBackends { - size := C.ggml_backend_sched_get_buffer_size(c.b.sched, c.b.schedBackends[i]) - slog.Info("compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])), "buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])), - "size", format.HumanBytes2(uint64(size))) + + // Reserve may get called multiple times for different graphs - we just want the last run, which will contain the max allocations + for _, bt := range c.b.schedBufts { + c.b.btDeviceMemory[bt].Graph = ml.Memory{} } - C.ggml_backend_sched_reset(c.b.sched) + for i := range c.b.schedBackends { + bufferStatus := 
C.ggml_backend_sched_get_attempted_buffer_size(c.b.sched, c.b.schedBackends[i]) - return nil + graph := &c.b.btDeviceMemory[c.b.schedBufts[i]].Graph + graph.Size += uint64(bufferStatus.size) + if bufferStatus.allocated && graph.Status != ml.Failed { + graph.Status = ml.Allocated + } else { + graph.Status = ml.Failed + } + + slog.Info("compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])), "buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])), + "size", format.HumanBytes2(uint64(bufferStatus.size))) + } + + if !reserved { + panic(ml.ErrNoMem{BackendMemory: *c.b.requiredMemory}) + } } func (c *Context) MaxGraphNodes() int { @@ -599,7 +661,7 @@ func pad(length, pad C.size_t) C.size_t { return ((length + pad - 1) / pad) * pad } -func (c *Context) newTensor(dtype ml.DType, shape []int) (ml.Tensor, error) { +func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor { if c.buft == nil { panic("set Input or Layer before creating tensors") } @@ -622,7 +684,7 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) (ml.Tensor, error) { if len(shape) < 1 || shape[0] == 0 { var shape C.int64_t = 0 - return &Tensor{b: c.b, t: C.ggml_new_tensor(c.ctx, cdtype, 1, &shape)}, nil + return &Tensor{b: c.b, t: C.ggml_new_tensor(c.ctx, cdtype, 1, &shape)} } else if len(shape) > 4 { panic("unsupported number of dimensions") } @@ -635,31 +697,34 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) (ml.Tensor, error) { t := C.ggml_new_tensor(c.ctx, cdtype, C.int(len(shape)), shapeToGGML(shape)) size := pad(C.ggml_backend_buft_get_alloc_size(c.buft, t), C.ggml_backend_buft_get_alignment(c.buft)) - b := C.ggml_backend_buft_alloc_buffer(c.buft, size) - if b == nil { - return nil, fmt.Errorf("unable to allocate %v from device %v for new tensor", format.HumanBytes2(uint64(size)), C.GoString(C.ggml_backend_buft_name(c.buft))) - } - *c.allocatedBuffers = append(*c.allocatedBuffers, b) + b := C.ggml_backend_buft_alloc_buffer(c.buft, size) + if c.layer >= 0 { + cache := &c.b.btDeviceMemory[c.buft].Cache[c.layer] + + cache.Size += uint64(size) + if b != nil { + cache.Status = ml.Allocated + } else { + cache.Status = ml.Failed + } + } + + if b == nil { + panic(ml.ErrNoMem{BackendMemory: *c.b.requiredMemory}) + } + + *c.allocatedBuffers = append(*c.allocatedBuffers, b) C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b)) - return &Tensor{b: c.b, t: t}, nil + return &Tensor{b: c.b, t: t} } func (c *Context) Empty(dtype ml.DType, shape ...int) ml.Tensor { - t, err := c.newTensor(dtype, shape) - if err != nil { - panic(err) - } - - return t + return c.newTensor(dtype, shape) } func (c *Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor { - t, err := c.newTensor(dtype, shape) - if err != nil { - panic(err) - } - + t := c.newTensor(dtype, shape) C.ggml_set_zero(t.(*Tensor).t) return t } @@ -687,10 +752,7 @@ func (c *Context) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) { return nil, err } - t, err := c.newTensor(ml.DTypeF32, shape) - if err != nil { - return nil, err - } + t := c.newTensor(ml.DTypeF32, shape) if len(s) > 0 { C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t)) @@ -704,10 +766,7 @@ func (c *Context) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) { return nil, err } - t, err := c.newTensor(ml.DTypeI32, shape) - if err != nil { - return nil, err - } + t := c.newTensor(ml.DTypeI32, shape) if len(s) > 0 { C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 
0, C.ggml_nbytes(t.(*Tensor).t)) diff --git a/runner/ollamarunner/multimodal.go b/runner/ollamarunner/multimodal.go index d78612fe..dbe6bba1 100644 --- a/runner/ollamarunner/multimodal.go +++ b/runner/ollamarunner/multimodal.go @@ -95,10 +95,7 @@ func (m multimodalStore) getTensor(backend ml.Backend, ctx ml.Context, in ml.Ten } } } else { - err := computeCtx.Reserve() - if err != nil { - return nil, err - } + computeCtx.Reserve() } } diff --git a/runner/ollamarunner/runner.go b/runner/ollamarunner/runner.go index a488a104..99bee106 100644 --- a/runner/ollamarunner/runner.go +++ b/runner/ollamarunner/runner.go @@ -826,16 +826,12 @@ func (s *Server) reserveWorstCaseGraph() error { return err } - err = ctx.Forward(t).Reserve() - if err != nil { - return err - } + ctx.Forward(t).Reserve() return nil } -func (s *Server) loadModel( - ctx context.Context, +func (s *Server) initModel( mpath string, params ml.BackendParams, lpath multiLPath, @@ -843,21 +839,21 @@ func (s *Server) loadModel( kvCacheType string, kvSize int, multiUserCache bool, -) { +) error { var err error s.model, err = model.New(mpath, params) if err != nil { - panic(err) + return err } // TODO(jessegross): LoRA loading if lpath.String() != "" { - panic("loras are not yet implemented") + return errors.New("loras are not yet implemented") } s.cache, err = NewInputCache(s.model, kvCacheType, int32(kvSize), parallel, s.batchSize, multiUserCache) if err != nil { - panic(err) + return err } if !s.cache.enabled && parallel > 1 { @@ -869,11 +865,25 @@ func (s *Server) loadModel( s.seqs = make([]*Sequence, s.parallel) s.seqsSem = semaphore.NewWeighted(int64(s.parallel)) - err = s.reserveWorstCaseGraph() + return s.reserveWorstCaseGraph() +} + +func (s *Server) load( + ctx context.Context, + mpath string, + params ml.BackendParams, + lpath multiLPath, + parallel int, + kvCacheType string, + kvSize int, + multiUserCache bool) { + err := s.initModel(mpath, params, lpath, parallel, kvCacheType, kvSize, multiUserCache) if err != nil { panic(err) } + slog.Debug("memory", "allocated", s.model.Backend().BackendMemory()) + err = s.model.Backend().Load(ctx, func(progress float32) { s.progress = progress @@ -921,9 +931,14 @@ func Execute(args []string) error { status: llm.ServerStatusLoadingModel, } + server.cond = sync.NewCond(&server.mu) + server.ready.Add(1) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + // TODO(jessegross): Parameters that need to be implemented: // no-mmap - // mlock var tensorSplitFloats []float32 if *tensorSplit != "" { @@ -943,14 +958,7 @@ func Execute(args []string) error { FlashAttention: *flashAttention, } - server.ready.Add(1) - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - go server.loadModel(ctx, *mpath, params, lpaths, *parallel, *kvCacheType, *kvSize, *multiUserCache) - - server.cond = sync.NewCond(&server.mu) - + go server.load(ctx, *mpath, params, lpaths, *parallel, *kvCacheType, *kvSize, *multiUserCache) go server.run(ctx) addr := "127.0.0.1:" + strconv.Itoa(*port) From 1f371ea92f7ebe4edd208b6732753473b2c4d0cd Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Mon, 19 May 2025 10:43:56 -0700 Subject: [PATCH 055/108] ml: Panic rather than return error on tensor allocation failure FromFloatSlice and FromIntSlice return an error if the shape doesn't match the passed data or if memory can't be allocated. Since these are inputs, the memory being allocated is system memory rather than VRAM. In many cases, the caller can't really handle the error and panics. 
Empty and Zeros directly panic if they can't allocate memory. This makes things consistent by panicing for the first two cases, removing a fair amount of error handling code. This is also consistent with how Go typically handles these situations. --- kvcache/causal.go | 26 ++++++---------------- kvcache/causal_test.go | 18 ++++++++-------- ml/backend.go | 4 ++-- ml/backend/ggml/ggml.go | 31 +++++++++------------------ model/model.go | 6 +----- model/models/gemma2/model.go | 11 ++-------- model/models/gemma3/model.go | 16 +++----------- model/models/llama/model.go | 10 ++------- model/models/llama4/model.go | 22 ++++--------------- model/models/llama4/model_text.go | 6 +----- model/models/llama4/model_vision.go | 5 +---- model/models/mistral3/model.go | 16 +++----------- model/models/mistral3/model_vision.go | 16 +++----------- model/models/mllama/model.go | 22 ++++--------------- model/models/qwen2/model.go | 10 ++------- model/models/qwen25vl/model.go | 16 +++----------- model/models/qwen25vl/model_vision.go | 22 +++++-------------- model/models/qwen3/model.go | 10 ++------- runner/ollamarunner/multimodal.go | 2 +- runner/ollamarunner/runner.go | 8 +++---- 20 files changed, 68 insertions(+), 209 deletions(-) diff --git a/kvcache/causal.go b/kvcache/causal.go index 9bc1d5da..f6bacaaf 100644 --- a/kvcache/causal.go +++ b/kvcache/causal.go @@ -211,10 +211,9 @@ func (c *Causal) StartForward(ctx ml.Context, batch input.Batch, reserve bool) e c.curCellRange.max = len(c.cells) - 1 } - var err error - c.curMask, err = c.buildMask(ctx) + c.curMask = c.buildMask(ctx) - return err + return nil } func newRange() cellRange { @@ -297,7 +296,7 @@ func roundUp(length, pad int) int { // Builds a mask of history x batch indicating whether for each token in the batch the // token in the history should apply. This is based on both the sequence and causality (the // position of the history is not ahead of the token in the batch). -func (c *Causal) buildMask(ctx ml.Context) (ml.Tensor, error) { +func (c *Causal) buildMask(ctx ml.Context) ml.Tensor { // Align and pad the two dimensions as required by the backend batchSize := roundUp(c.curBatchSize, c.config.MaskBatchPadding) @@ -325,10 +324,7 @@ func (c *Causal) buildMask(ctx ml.Context) (ml.Tensor, error) { mask[i] = float32(math.Inf(-1)) } - maskTensor, err := ctx.Input().FromFloatSlice(mask, length, batchSize) - if err != nil { - return nil, err - } + maskTensor := ctx.Input().FromFloatSlice(mask, length, batchSize) if c.config.MaskDType != ml.DTypeF32 { out := ctx.Input().Empty(c.config.MaskDType, maskTensor.Shape()...) 
@@ -336,7 +332,7 @@ func (c *Causal) buildMask(ctx ml.Context) (ml.Tensor, error) { maskTensor = out } - return maskTensor, nil + return maskTensor } func (c *Causal) moveCells(ctx ml.Context, src, dst, length int) { @@ -491,12 +487,7 @@ func (c *Causal) SetCausal(ctx ml.Context, opts CausalOptions) { if !slices.Equal(c.opts.Except, opts.Except) { c.opts = opts if ctx != nil { - var err error - c.curMask, err = c.buildMask(ctx) - if err != nil { - // This error should never occur because we have previously built a mask with the same shape - panic(fmt.Errorf("SetCausal: %w", err)) - } + c.curMask = c.buildMask(ctx) } } } @@ -652,10 +643,7 @@ func (c *Causal) shift(seq int, beginIndex, offset int32) error { } } - kShift, err := ctx.Input().FromIntSlice(offsets, len(offsets)) - if err != nil { - return err - } + kShift := ctx.Input().FromIntSlice(offsets, len(offsets)) for i, key := range c.keys { if key == nil { diff --git a/kvcache/causal_test.go b/kvcache/causal_test.go index 820d496d..5b1dbe86 100644 --- a/kvcache/causal_test.go +++ b/kvcache/causal_test.go @@ -344,7 +344,7 @@ func testCache(t *testing.T, backend ml.Backend, cache Cache, tests []testCase) } cache.SetLayer(0) - tensor, _ := context.FromFloatSlice(test.in, test.inShape...) + tensor := context.FromFloatSlice(test.in, test.inShape...) cache.Put(context, tensor, tensor) out, _, mask := cache.Get(context) @@ -386,7 +386,7 @@ func TestCanResume(t *testing.T) { } cache.SetLayer(0) - tensor, _ := context.FromFloatSlice([]float32{1, 2, 3, 4}, 1, 1, 4) + tensor := context.FromFloatSlice([]float32{1, 2, 3, 4}, 1, 1, 4) cache.Put(context, tensor, tensor) // with window size 4, nothing has slid out of the window yet @@ -413,7 +413,7 @@ func TestCanResume(t *testing.T) { } cache.SetLayer(0) - tensor, _ = context.FromFloatSlice([]float32{5, 6}, 1, 1, 2) + tensor = context.FromFloatSlice([]float32{5, 6}, 1, 1, 2) cache.Put(context, tensor, tensor) // only the latest position has overlapping windows @@ -470,24 +470,24 @@ func (c *testContext) Zeros(dtype ml.DType, shape ...int) ml.Tensor { return c.Empty(dtype, shape...) } -func (c *testContext) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) { +func (c *testContext) FromFloatSlice(s []float32, shape ...int) ml.Tensor { t := c.Empty(ml.DTypeF32, shape...).(*testTensor) copy(t.data, s) - return t, nil + return t } -func (c *testContext) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) { +func (c *testContext) FromIntSlice(s []int32, shape ...int) ml.Tensor { f := make([]float32, len(s)) for i := range f { f[i] = float32(s[i]) } - out, _ := c.FromFloatSlice(f, shape...) + out := c.FromFloatSlice(f, shape...) 
out.(*testTensor).dtype = ml.DTypeI32 - return out, nil + return out } func (c *testContext) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor { @@ -496,7 +496,7 @@ func (c *testContext) Arange(start, stop, step float32, dtype ml.DType) ml.Tenso s = append(s, i) } - out, _ := c.FromFloatSlice(s, len(s)) + out := c.FromFloatSlice(s, len(s)) out.(*testTensor).dtype = dtype return out } diff --git a/ml/backend.go b/ml/backend.go index 7c9b9e31..6beb7d2b 100644 --- a/ml/backend.go +++ b/ml/backend.go @@ -171,8 +171,8 @@ func NewBackend(modelPath string, params BackendParams) (Backend, error) { type Context interface { Empty(dtype DType, shape ...int) Tensor Zeros(dtype DType, shape ...int) Tensor - FromFloatSlice(s []float32, shape ...int) (Tensor, error) - FromIntSlice(s []int32, shape ...int) (Tensor, error) + FromFloatSlice(s []float32, shape ...int) Tensor + FromIntSlice(s []int32, shape ...int) Tensor // Arange creates a 1D tensor with values within an interval (start, stop] increased by step. Arange(start, stop, step float32, dtype DType) Tensor diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index 496ba8a6..76172ae1 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -729,11 +729,11 @@ func (c *Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor { return t } -func checkShape[S ~[]E, E any](s S, shape ...int) error { +func checkShape[S ~[]E, E any](s S, shape ...int) { n := len(s) if n == 0 { - return nil + return } for _, v := range shape { @@ -741,16 +741,12 @@ func checkShape[S ~[]E, E any](s S, shape ...int) error { } if n != 1 { - return fmt.Errorf("invalid shape: %v", shape) + panic(fmt.Errorf("invalid shape: %v", shape)) } - - return nil } -func (c *Context) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) { - if err := checkShape(s, shape...); err != nil { - return nil, err - } +func (c *Context) FromFloatSlice(s []float32, shape ...int) ml.Tensor { + checkShape(s, shape...) t := c.newTensor(ml.DTypeF32, shape) @@ -758,13 +754,11 @@ func (c *Context) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) { C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t)) } - return t, nil + return t } -func (c *Context) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) { - if err := checkShape(s, shape...); err != nil { - return nil, err - } +func (c *Context) FromIntSlice(s []int32, shape ...int) ml.Tensor { + checkShape(s, shape...) 
t := c.newTensor(ml.DTypeI32, shape) @@ -772,7 +766,7 @@ func (c *Context) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) { C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t)) } - return t, nil + return t } func (c Context) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor { @@ -790,12 +784,7 @@ func (c Context) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor { arange = append(arange, int32(i)) } - t, err := c.Input().FromIntSlice(arange, len(arange)) - if err != nil { - panic(err) - } - - return t + return c.Input().FromIntSlice(arange, len(arange)) default: panic("unsupported dtype for arange") } diff --git a/model/model.go b/model/model.go index 39b68db1..25097e01 100644 --- a/model/model.go +++ b/model/model.go @@ -287,11 +287,7 @@ func Forward(ctx ml.Context, m Model, inputs []int32, batch input.Batch) (ml.Ten return nil, errors.New("batch size cannot be less than 1") } - var err error - batch.Inputs, err = ctx.Input().FromIntSlice(inputs, len(inputs)) - if err != nil { - return nil, err - } + batch.Inputs = ctx.Input().FromIntSlice(inputs, len(inputs)) cache := m.Config().Cache if cache != nil { diff --git a/model/models/gemma2/model.go b/model/models/gemma2/model.go index 3c5a7ea5..e621d03a 100644 --- a/model/models/gemma2/model.go +++ b/model/models/gemma2/model.go @@ -175,15 +175,8 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Ten } func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { - positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions)) - if err != nil { - return nil, err - } - - outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs)) - if err != nil { - return nil, err - } + positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions)) + outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs)) hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs) hiddenState = hiddenState.Scale(ctx, math.Sqrt(float64(m.Options.hiddenSize))) diff --git a/model/models/gemma3/model.go b/model/models/gemma3/model.go index 89d1788e..53bf8275 100644 --- a/model/models/gemma3/model.go +++ b/model/models/gemma3/model.go @@ -101,14 +101,11 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input return nil, err } - pixelValues, err := ctx.Input().FromFloatSlice(f32s, + pixelValues := ctx.Input().FromFloatSlice(f32s, m.ImageProcessor.imageSize, m.ImageProcessor.imageSize, m.ImageProcessor.numChannels, ) - if err != nil { - return nil, err - } visionOutputs := m.VisionModel.Forward(ctx, pixelValues) visionOutputs = m.MultiModalProjector.Forward(ctx, visionOutputs, m.imageSize, m.patchSize, m.VisionModel.eps) @@ -144,15 +141,8 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) { } func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { - positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions)) - if err != nil { - return nil, err - } - - outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs)) - if err != nil { - return nil, err - } + positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions)) + outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs)) return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil } diff --git a/model/models/llama/model.go b/model/models/llama/model.go index 507f1ebc..3cf782d0 100644 
--- a/model/models/llama/model.go +++ b/model/models/llama/model.go @@ -142,10 +142,7 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positions, outputs ml.Tenso } func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { - positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions)) - if err != nil { - return nil, err - } + positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions)) hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs) @@ -154,10 +151,7 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { var outputs ml.Tensor if i == len(m.Layers)-1 { - outputs, err = ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs)) - if err != nil { - return nil, err - } + outputs = ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs)) } hiddenState = layer.Forward(ctx, hiddenState, positions, outputs, m.Cache, m.Options) diff --git a/model/models/llama4/model.go b/model/models/llama4/model.go index af5173a1..8084760b 100644 --- a/model/models/llama4/model.go +++ b/model/models/llama4/model.go @@ -77,10 +77,7 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input return nil, err } - tilesLocal, err := ctx.Input().FromFloatSlice(pixelsLocal, size.X, size.Y, m.numChannels) - if err != nil { - return nil, err - } + tilesLocal := ctx.Input().FromFloatSlice(pixelsLocal, size.X, size.Y, m.numChannels) ratioW, ratioH := size.X/m.imageSize, size.Y/m.imageSize @@ -91,11 +88,7 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input pixelValues := tilesLocal if len(pixelsGlobal) > 0 { - tilesGlobal, err := ctx.Input().FromFloatSlice(pixelsGlobal, m.imageSize, m.imageSize, m.numChannels) - if err != nil { - return nil, err - } - + tilesGlobal := ctx.Input().FromFloatSlice(pixelsGlobal, m.imageSize, m.imageSize, m.numChannels) pixelValues = pixelValues.Concat(ctx, tilesGlobal, 3) } @@ -182,15 +175,8 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) { } func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { - positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions)) - if err != nil { - return nil, err - } - - outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs)) - if err != nil { - return nil, err - } + positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions)) + outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs)) return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil } diff --git a/model/models/llama4/model_text.go b/model/models/llama4/model_text.go index ff9f5e20..27935f40 100644 --- a/model/models/llama4/model_text.go +++ b/model/models/llama4/model_text.go @@ -223,11 +223,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor scales[i] = float32(math.Log(math.Floor(((float64(p)+1.0)/float64(m.attentionFloorScale))+1.0))*m.attentionScale + 1.0) } - var err error - attentionScales, err = ctx.Input().FromFloatSlice(scales, 1, 1, len(scales)) - if err != nil { - panic(err) - } + attentionScales = ctx.Input().FromFloatSlice(scales, 1, 1, len(scales)) } for i, layer := range m.Layers { diff --git a/model/models/llama4/model_vision.go b/model/models/llama4/model_vision.go index e6b1afef..dc6f82b8 100644 --- a/model/models/llama4/model_vision.go +++ b/model/models/llama4/model_vision.go @@ -245,10 +245,7 @@ func (m *VisionModel) rotaryEmbedding(ctx 
ml.Context) (ml.Tensor, ml.Tensor) { } } - ropeFreqs, err := ctx.Input().FromFloatSlice(freqs, freqDim/2, numPatches, 2) - if err != nil { - panic(err) - } + ropeFreqs := ctx.Input().FromFloatSlice(freqs, freqDim/2, numPatches, 2) ropeFreqs = ropeFreqs.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx) ropeFreqs = ropeFreqs.Reshape(ctx, freqDim, 1, numPatches) diff --git a/model/models/mistral3/model.go b/model/models/mistral3/model.go index dd01a587..9d662fc1 100644 --- a/model/models/mistral3/model.go +++ b/model/models/mistral3/model.go @@ -114,10 +114,7 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input return nil, err } - pixelValues, err := ctx.Input().FromFloatSlice(f32s, size.X, size.Y, m.ImageProcessor.numChannels) - if err != nil { - return nil, err - } + pixelValues := ctx.Input().FromFloatSlice(f32s, size.X, size.Y, m.ImageProcessor.numChannels) visionOutputs := m.VisionModel.Forward(ctx, pixelValues) features, size := m.MultiModalProjector.Forward(ctx, visionOutputs, size) @@ -161,15 +158,8 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) { } func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { - positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions)) - if err != nil { - return nil, err - } - - outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs)) - if err != nil { - return nil, err - } + positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions)) + outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs)) return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil } diff --git a/model/models/mistral3/model_vision.go b/model/models/mistral3/model_vision.go index 24541004..65bdcff2 100644 --- a/model/models/mistral3/model_vision.go +++ b/model/models/mistral3/model_vision.go @@ -110,15 +110,8 @@ func (m *VisionModel) positionalEmbedding(ctx ml.Context, positionIDs ml.Tensor) } } - h, err := ctx.Input().FromFloatSlice(frequenciesHeight, maxPatchesPerSide, frequencies/2) - if err != nil { - panic(err) - } - - w, err := ctx.Input().FromFloatSlice(frequenciesWidth, maxPatchesPerSide, frequencies/2) - if err != nil { - panic(err) - } + h := ctx.Input().FromFloatSlice(frequenciesHeight, maxPatchesPerSide, frequencies/2) + w := ctx.Input().FromFloatSlice(frequenciesWidth, maxPatchesPerSide, frequencies/2) h = h.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx) w = w.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx) @@ -151,10 +144,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor { } } - positionIDs, err := ctx.Input().FromIntSlice(positions, len(positions)) - if err != nil { - panic(err) - } + positionIDs := ctx.Input().FromIntSlice(positions, len(positions)) positionEmbedding := m.positionalEmbedding(ctx, positionIDs) cos, sin := positionEmbedding.Cos(ctx), positionEmbedding.Sin(ctx) diff --git a/model/models/mllama/model.go b/model/models/mllama/model.go index 547e2cb3..45cb3e02 100644 --- a/model/models/mllama/model.go +++ b/model/models/mllama/model.go @@ -80,15 +80,8 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input f32s = f32s[:m.imageSize*m.imageSize*m.numChannels*m.maxNumTiles] } - pixelValues, err := ctx.Input().FromFloatSlice(f32s, m.imageSize, m.imageSize, m.numChannels, m.maxNumTiles) - if err != nil { - return nil, err - } - - aspectRatio, err := ctx.Input().FromIntSlice([]int32{int32(ratio.rank)}, 1) - if err != nil { - return nil, err - } + 
pixelValues := ctx.Input().FromFloatSlice(f32s, m.imageSize, m.imageSize, m.numChannels, m.maxNumTiles) + aspectRatio := ctx.Input().FromIntSlice([]int32{int32(ratio.rank)}, 1) positionIDs := ctx.Arange(0, 1601, 1, ml.DTypeI32) crossAttentionStates := m.VisionModel.Forward(ctx, pixelValues, positionIDs, aspectRatio) @@ -113,15 +106,8 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { crossAttentionStates = batch.Multimodal[len(batch.Multimodal)-1].Multimodal[0].Tensor } - positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions)) - if err != nil { - return nil, err - } - - outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs)) - if err != nil { - return nil, err - } + positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions)) + outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs)) // TODO: attention mask, cross attention mask return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache)), nil diff --git a/model/models/qwen2/model.go b/model/models/qwen2/model.go index 3c3d81aa..42338d0d 100644 --- a/model/models/qwen2/model.go +++ b/model/models/qwen2/model.go @@ -100,10 +100,7 @@ type Model struct { // Forward implements model.Model. func (m Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { - positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions)) - if err != nil { - return nil, err - } + positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions)) hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs) @@ -112,10 +109,7 @@ func (m Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { var outputs ml.Tensor if i == len(m.Layers)-1 { - outputs, err = ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs)) - if err != nil { - return nil, err - } + outputs = ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs)) } hiddenStates = layer.Forward(ctx, hiddenStates, positions, outputs, m.Cache, &m.Options) diff --git a/model/models/qwen25vl/model.go b/model/models/qwen25vl/model.go index 32cca560..ee38cad9 100644 --- a/model/models/qwen25vl/model.go +++ b/model/models/qwen25vl/model.go @@ -69,10 +69,7 @@ func (m *Model) PixelValues(ctx ml.Context, multimodalData []byte) (ml.Tensor, * m.ImageProcessor.patchSize * m.ImageProcessor.patchSize numPatches := grid.Temporal * grid.Height * grid.Width - pixelValues, err := ctx.Input().FromFloatSlice(f32s, patchDim, numPatches) - if err != nil { - return nil, nil, fmt.Errorf("failed to create tensor from image: %w", err) - } + pixelValues := ctx.Input().FromFloatSlice(f32s, patchDim, numPatches) return pixelValues, grid, nil } @@ -142,15 +139,8 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) { } func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { - positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions)) - if err != nil { - return nil, err - } - - outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs)) - if err != nil { - return nil, err - } + positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions)) + outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs)) return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache) } diff --git a/model/models/qwen25vl/model_vision.go b/model/models/qwen25vl/model_vision.go index 01eef392..4d7afaa1 100644 
--- a/model/models/qwen25vl/model_vision.go +++ b/model/models/qwen25vl/model_vision.go @@ -1,7 +1,6 @@ package qwen25vl import ( - "fmt" "math" "slices" @@ -44,10 +43,8 @@ func blockDiagonalMask(ctx ml.Context, seqLength int, bounds []int, numHeads int } } - mask, err := ctx.Input().FromFloatSlice(flat, seqLength, seqLength) - if err != nil { - panic(err) - } + mask := ctx.Input().FromFloatSlice(flat, seqLength, seqLength) + // Reshape to match [seqLength, seqLength, 1] for broadcasting mask = mask.Reshape(ctx, seqLength, seqLength, 1) @@ -303,10 +300,7 @@ func (m *VisionModel) WindowIndex(ctx ml.Context, grid *Grid) (ml.Tensor, []int) } } - t, err := ctx.Input().FromIntSlice(index, len(index)) - if err != nil { - panic(err) - } + t := ctx.Input().FromIntSlice(index, len(index)) return t, bounds } @@ -326,10 +320,7 @@ func (m *VisionModel) PositionalEmbedding(ctx ml.Context, grid *Grid) ml.Tensor freqVals[i*freq+j] = float32(i) / float32(math.Pow(theta, float64(j*2)/float64(dim))) } } - freqs, err := ctx.Input().FromFloatSlice(freqVals, freq, maxGridSize) - if err != nil { - panic(fmt.Errorf("failed to create tensor from frequencies: %w", err)) - } + freqs := ctx.Input().FromFloatSlice(freqVals, freq, maxGridSize) // Create position coordinates (y,x pairs) for the grid // In PyTorch: Equivalent to generating position ids with torch.arange() @@ -339,10 +330,7 @@ func (m *VisionModel) PositionalEmbedding(ctx ml.Context, grid *Grid) ml.Tensor coords = append(coords, int32(y), int32(x)) } } - pos, err := ctx.Input().FromIntSlice(coords, 2, grid.Width, grid.Height) - if err != nil { - panic(fmt.Errorf("failed to create tensor from positions: %w", err)) - } + pos := ctx.Input().FromIntSlice(coords, 2, grid.Width, grid.Height) // Reshape and permute positions to match spatial merging pattern pos = pos.Reshape(ctx, 2, grid.Width, merge, grid.Height/merge) diff --git a/model/models/qwen3/model.go b/model/models/qwen3/model.go index 44c32f9e..1930da7e 100644 --- a/model/models/qwen3/model.go +++ b/model/models/qwen3/model.go @@ -156,10 +156,7 @@ type Model struct { // Forward implements model.Model. func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { - positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions)) - if err != nil { - return nil, err - } + positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions)) hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs) @@ -168,10 +165,7 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { var outputs ml.Tensor if i == len(m.Layers)-1 { - outputs, err = ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs)) - if err != nil { - return nil, err - } + outputs = ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs)) } hiddenStates = layer.Forward(ctx, hiddenStates, positions, outputs, m.Cache, m.Options) diff --git a/runner/ollamarunner/multimodal.go b/runner/ollamarunner/multimodal.go index dbe6bba1..fbdc7d72 100644 --- a/runner/ollamarunner/multimodal.go +++ b/runner/ollamarunner/multimodal.go @@ -102,7 +102,7 @@ func (m multimodalStore) getTensor(backend ml.Backend, ctx ml.Context, in ml.Ten for i, t := range entry.mm { if in == t.Tensor { if !reserve { - return ctx.Input().FromFloatSlice(entry.data[i], t.Tensor.Shape()...) 
+ return ctx.Input().FromFloatSlice(entry.data[i], t.Tensor.Shape()...), nil } else { return ctx.Input().Empty(t.Tensor.DType(), t.Tensor.Shape()...), nil } diff --git a/runner/ollamarunner/runner.go b/runner/ollamarunner/runner.go index 99bee106..a7a889f1 100644 --- a/runner/ollamarunner/runner.go +++ b/runner/ollamarunner/runner.go @@ -808,10 +808,7 @@ func (s *Server) reserveWorstCaseGraph() error { batch.Outputs[i] = int32(i) } - batch.Inputs, err = ctx.Input().FromIntSlice(batchInputs, len(batchInputs)) - if err != nil { - return err - } + batch.Inputs = ctx.Input().FromIntSlice(batchInputs, len(batchInputs)) cache := s.model.Config().Cache if cache != nil { @@ -876,7 +873,8 @@ func (s *Server) load( parallel int, kvCacheType string, kvSize int, - multiUserCache bool) { + multiUserCache bool, +) { err := s.initModel(mpath, params, lpath, parallel, kvCacheType, kvSize, multiUserCache) if err != nil { panic(err) From 884d26093c80491a3fe07f606fc04851dc317199 Mon Sep 17 00:00:00 2001 From: Parth Sareen Date: Thu, 22 May 2025 18:53:31 -0700 Subject: [PATCH 056/108] llama: add minimum memory for grammar (#10820) --- llama/llama.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama/llama.go b/llama/llama.go index adee6f63..0dc64e57 100644 --- a/llama/llama.go +++ b/llama/llama.go @@ -580,7 +580,7 @@ func SchemaToGrammar(schema []byte) []byte { defer C.free(unsafe.Pointer(cStr)) // Allocate buffer for grammar based on schema length but with upper bound - maxLen := min(1024*1024, len(schema)*4) + maxLen := max(32768, min(1024*1024, len(schema)*4)) buf := make([]byte, maxLen) // Call C function to convert schema to grammar From e8b981fa5d7c1875ec0c290068bcfe3b4662f5c4 Mon Sep 17 00:00:00 2001 From: Parth Sareen Date: Fri, 23 May 2025 14:19:31 -0700 Subject: [PATCH 057/108] tools: refactor tool call parsing and enable streaming (#10415) --- server/model.go | 124 ---- server/model_test.go | 179 ----- server/routes.go | 68 +- .../testdata}/command-r-plus.gotmpl | 0 .../testdata}/command-r-plus.out | 0 .../testdata}/firefunction.gotmpl | 0 .../tools => tools/testdata}/firefunction.out | 0 .../testdata}/llama3-groq-tool-use.gotmpl | 0 .../testdata}/llama3-groq-tool-use.out | 0 tools/testdata/llama3.2.gotmpl | 44 ++ tools/testdata/llama3.2.out | 24 + .../tools => tools/testdata}/messages.json | 0 .../tools => tools/testdata}/mistral.gotmpl | 0 .../tools => tools/testdata}/mistral.out | 0 .../tools => tools/testdata}/nemotron.gotmpl | 0 .../tools => tools/testdata}/nemotron.out | 0 tools/testdata/qwen2.5.gotmpl | 51 ++ tools/testdata/qwen2.5.out | 31 + tools/testdata/qwen3.gotmpl | 50 ++ tools/testdata/qwen3.out | 31 + .../tools => tools/testdata}/tools.json | 0 .../tools => tools/testdata}/xlam.gotmpl | 0 .../tools => tools/testdata}/xlam.out | 0 tools/tools.go | 271 ++++++++ tools/tools_test.go | 644 ++++++++++++++++++ tools/tools_utils.go | 227 ++++++ tools/tools_utils_test.go | 464 +++++++++++++ 27 files changed, 1868 insertions(+), 340 deletions(-) delete mode 100644 server/model_test.go rename {server/testdata/tools => tools/testdata}/command-r-plus.gotmpl (100%) rename {server/testdata/tools => tools/testdata}/command-r-plus.out (100%) rename {server/testdata/tools => tools/testdata}/firefunction.gotmpl (100%) rename {server/testdata/tools => tools/testdata}/firefunction.out (100%) rename {server/testdata/tools => tools/testdata}/llama3-groq-tool-use.gotmpl (100%) rename {server/testdata/tools => tools/testdata}/llama3-groq-tool-use.out (100%) create mode 100644 
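The new grammar-buffer sizing above is just a clamp: the buffer grows with the schema (four bytes per schema byte) but never drops below a 32 KiB floor or above a 1 MiB ceiling. A small illustration, with made-up schema sizes:

package main

import "fmt"

// grammarBufSize applies the same clamp as SchemaToGrammar:
// 4x the schema length, floored at 32 KiB and capped at 1 MiB.
func grammarBufSize(schemaLen int) int {
	return max(32768, min(1024*1024, schemaLen*4))
}

func main() {
	for _, n := range []int{100, 10_000, 500_000} {
		fmt.Printf("schema %7d bytes -> buffer %9d bytes\n", n, grammarBufSize(n))
	}
}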
tools/testdata/llama3.2.gotmpl create mode 100644 tools/testdata/llama3.2.out rename {server/testdata/tools => tools/testdata}/messages.json (100%) rename {server/testdata/tools => tools/testdata}/mistral.gotmpl (100%) rename {server/testdata/tools => tools/testdata}/mistral.out (100%) rename {server/testdata/tools => tools/testdata}/nemotron.gotmpl (100%) rename {server/testdata/tools => tools/testdata}/nemotron.out (100%) create mode 100644 tools/testdata/qwen2.5.gotmpl create mode 100644 tools/testdata/qwen2.5.out create mode 100644 tools/testdata/qwen3.gotmpl create mode 100644 tools/testdata/qwen3.out rename {server/testdata/tools => tools/testdata}/tools.json (100%) rename {server/testdata/tools => tools/testdata}/xlam.gotmpl (100%) rename {server/testdata/tools => tools/testdata}/xlam.out (100%) create mode 100644 tools/tools.go create mode 100644 tools/tools_test.go create mode 100644 tools/tools_utils.go create mode 100644 tools/tools_utils_test.go diff --git a/server/model.go b/server/model.go index 6b5439a4..401547e4 100644 --- a/server/model.go +++ b/server/model.go @@ -10,9 +10,6 @@ import ( "log/slog" "net/http" "os" - "slices" - "strings" - "text/template/parse" "github.com/ollama/ollama/api" "github.com/ollama/ollama/fs/ggml" @@ -128,124 +125,3 @@ func detectContentType(r io.Reader) (string, error) { return "unknown", nil } - -func parseObjects(s string) []map[string]any { - var objs []map[string]any - for offset := 0; offset < len(s); { - var obj map[string]any - decoder := json.NewDecoder(strings.NewReader(s[offset:])) - if err := decoder.Decode(&obj); errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) { - break - } else if syntax := &(json.SyntaxError{}); errors.As(err, &syntax) { - // skip over any syntax errors - offset += int(syntax.Offset) - } else if unmarshalType := &(json.UnmarshalTypeError{}); errors.As(err, &unmarshalType) { - // skip over any unmarshalable types - offset += int(unmarshalType.Offset) - } else if err != nil { - return nil - } else { - offset += int(decoder.InputOffset()) - objs = append(objs, obj) - } - } - - return objs -} - -// parseToolCalls attempts to parse a JSON string into a slice of ToolCalls. 
-// mxyng: this only really works if the input contains tool calls in some JSON format -func (m *Model) parseToolCalls(s string) ([]api.ToolCall, bool) { - // create a subtree from the node that ranges over .ToolCalls - tmpl := m.Template.Subtree(func(n parse.Node) bool { - if t, ok := n.(*parse.RangeNode); ok { - return slices.Contains(template.Identifiers(t.Pipe), "ToolCalls") - } - - return false - }) - - if tmpl == nil { - return nil, false - } - - var b bytes.Buffer - if err := tmpl.Execute(&b, map[string][]api.ToolCall{ - "ToolCalls": { - { - Function: api.ToolCallFunction{ - Name: "@@name@@", - Arguments: api.ToolCallFunctionArguments{ - "@@argument@@": 1, - }, - }, - }, - }, - }); err != nil { - return nil, false - } - - templateObjects := parseObjects(b.String()) - if len(templateObjects) == 0 { - return nil, false - } - - // find the keys that correspond to the name and arguments fields - var name, arguments string - for k, v := range templateObjects[0] { - switch v.(type) { - case string: - name = k - case map[string]any: - arguments = k - } - } - - if name == "" || arguments == "" { - return nil, false - } - - responseObjects := parseObjects(s) - if len(responseObjects) == 0 { - return nil, false - } - - // collect all nested objects - var collect func(any) []map[string]any - collect = func(obj any) (all []map[string]any) { - switch o := obj.(type) { - case map[string]any: - all = append(all, o) - for _, v := range o { - all = append(all, collect(v)...) - } - case []any: - for _, v := range o { - all = append(all, collect(v)...) - } - } - - return all - } - - var objs []map[string]any - for _, p := range responseObjects { - objs = append(objs, collect(p)...) - } - - var toolCalls []api.ToolCall - for _, kv := range objs { - n, nok := kv[name].(string) - a, aok := kv[arguments].(map[string]any) - if nok && aok { - toolCalls = append(toolCalls, api.ToolCall{ - Function: api.ToolCallFunction{ - Name: n, - Arguments: a, - }, - }) - } - } - - return toolCalls, len(toolCalls) > 0 -} diff --git a/server/model_test.go b/server/model_test.go deleted file mode 100644 index e5c2f2bb..00000000 --- a/server/model_test.go +++ /dev/null @@ -1,179 +0,0 @@ -package server - -import ( - "bytes" - "encoding/json" - "fmt" - "os" - "path/filepath" - "testing" - - "github.com/google/go-cmp/cmp" - - "github.com/ollama/ollama/api" - "github.com/ollama/ollama/template" -) - -func readFile(t *testing.T, base, name string) *bytes.Buffer { - t.Helper() - - bts, err := os.ReadFile(filepath.Join(base, name)) - if err != nil { - t.Fatal(err) - } - - return bytes.NewBuffer(bts) -} - -func TestExecuteWithTools(t *testing.T) { - p := filepath.Join("testdata", "tools") - cases := []struct { - model string - output string - ok bool - }{ - {"mistral", `[TOOL_CALLS] [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, true}, - {"mistral", `[TOOL_CALLS] [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}] - -The temperature in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.`, true}, - {"mistral", `[TOOL_CALLS] [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}},{"name": 
"get_current_weather", "arguments": {"format":"celsius","location":"To }]`, false}, - {"mistral", `I'm not aware of that information. However, I can suggest searching for the weather using the "get_current_weather" function: - - [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, true}, - {"mistral", " The weather in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.", false}, - {"command-r-plus", "Action: ```json" + ` -[ - { - "tool_name": "get_current_weather", - "parameters": { - "format": "fahrenheit", - "location": "San Francisco, CA" - } - }, - { - "tool_name": "get_current_weather", - "parameters": { - "format": "celsius", - "location": "Toronto, Canada" - } - } -] -` + "```", true}, - {"command-r-plus", " The weather in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.", false}, - {"firefunction", ` functools[{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, true}, - {"firefunction", " The weather in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.", false}, - {"llama3-groq-tool-use", ` -{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} -{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}} -`, true}, - {"xlam", `{"tool_calls": [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]}`, true}, - {"nemotron", `{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]} `, true}, - } - - var tools []api.Tool - if err := json.Unmarshal(readFile(t, p, "tools.json").Bytes(), &tools); err != nil { - t.Fatal(err) - } - - var messages []api.Message - if err := json.Unmarshal(readFile(t, p, "messages.json").Bytes(), &messages); err != nil { - t.Fatal(err) - } - - calls := []api.ToolCall{ - { - Function: api.ToolCallFunction{ - Name: "get_current_weather", - Arguments: api.ToolCallFunctionArguments{ - "format": "fahrenheit", - "location": "San Francisco, CA", - }, - }, - }, - { - Function: api.ToolCallFunction{ - Name: "get_current_weather", - Arguments: api.ToolCallFunctionArguments{ - "format": "celsius", - "location": "Toronto, Canada", - }, - }, - }, - } - - for _, tt := range cases { - t.Run(tt.model, func(t *testing.T) { - tmpl, err := template.Parse(readFile(t, p, fmt.Sprintf("%s.gotmpl", tt.model)).String()) - if err != nil { - t.Fatal(err) - } - - t.Run("template", func(t *testing.T) { - var actual bytes.Buffer - if err := tmpl.Execute(&actual, template.Values{Tools: tools, Messages: messages}); err != nil { - t.Fatal(err) - } - - if diff := cmp.Diff(actual.String(), readFile(t, p, fmt.Sprintf("%s.out", tt.model)).String()); diff != "" { - t.Errorf("mismatch (-got +want):\n%s", diff) - } - }) - - t.Run("parse", func(t *testing.T) { - m := &Model{Template: tmpl} - actual, ok := m.parseToolCalls(tt.output) - if ok != tt.ok { - t.Fatalf("expected %t, got %t", tt.ok, ok) - } - - if tt.ok { - if diff := cmp.Diff(actual, calls); diff != "" { - t.Errorf("mismatch (-got +want):\n%s", diff) - } - } - }) - }) 
- } -} - -func TestParseObjects(t *testing.T) { - tests := []struct { - input string - want []map[string]any - }{ - { - input: `[{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, - want: []map[string]any{ - {"name": "get_current_weather", "arguments": map[string]any{"format": "fahrenheit", "location": "San Francisco, CA"}}, - {"name": "get_current_weather", "arguments": map[string]any{"format": "celsius", "location": "Toronto, Canada"}}, - }, - }, - { - input: `{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} `, - want: []map[string]any{ - {"name": "get_current_weather", "arguments": map[string]any{"format": "fahrenheit", "location": "San Francisco, CA"}}, - }, - }, - { - input: `{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, ON"}} `, - want: []map[string]any{ - {"name": "get_current_weather", "arguments": map[string]any{"format": "fahrenheit", "location": "San Francisco, CA"}}, - {"name": "get_current_weather", "arguments": map[string]any{"format": "celsius", "location": "Toronto, ON"}}, - }, - }, - { - input: `{"name": "get_current_weather", "arguments": `, - want: nil, - }, - } - - for _, tc := range tests { - t.Run(tc.input, func(t *testing.T) { - got := parseObjects(tc.input) - - if diff := cmp.Diff(got, tc.want); diff != "" { - t.Errorf("mismatch (-got +want):\n%s", diff) - } - }) - } -} diff --git a/server/routes.go b/server/routes.go index d0b8f487..42e8cdd1 100644 --- a/server/routes.go +++ b/server/routes.go @@ -38,6 +38,7 @@ import ( "github.com/ollama/ollama/server/internal/client/ollama" "github.com/ollama/ollama/server/internal/registry" "github.com/ollama/ollama/template" + "github.com/ollama/ollama/tools" "github.com/ollama/ollama/types/errtypes" "github.com/ollama/ollama/types/model" "github.com/ollama/ollama/version" @@ -1482,11 +1483,20 @@ func (s *Server) ChatHandler(c *gin.Context) { return } + var toolParser *tools.Parser + if len(req.Tools) > 0 { + toolParser, err = tools.NewParser(m.Template.Template) + if err != nil { + slog.Error("failed to create tool parser", "error", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + } + ch := make(chan any) go func() { defer close(ch) - var sb strings.Builder - var toolCallIndex int = 0 + if err := r.Completion(c.Request.Context(), llm.CompletionRequest{ Prompt: prompt, Images: images, @@ -1512,37 +1522,21 @@ func (s *Server) ChatHandler(c *gin.Context) { res.LoadDuration = checkpointLoaded.Sub(checkpointStart) } - // TODO: tool call checking and filtering should be moved outside of this callback once streaming - // however this was a simple change for now without reworking streaming logic of this (and other) - // handlers - if req.Stream != nil && !*req.Stream || len(req.Tools) == 0 { - ch <- res - return - } - - // Streaming tool calls: - // If tools are recognized, use a flag to track the sending of a tool downstream - // This ensures that content is cleared from the message on the last chunk sent - sb.WriteString(r.Content) - if toolCalls, ok := m.parseToolCalls(sb.String()); ok { - res.Message.ToolCalls = toolCalls - for i := range toolCalls { - toolCalls[i].Function.Index = toolCallIndex - toolCallIndex++ + if len(req.Tools) > 0 { + toolCalls, 
content := toolParser.Add(r.Content) + if len(content) > 0 { + res.Message.Content = content + } else if len(toolCalls) > 0 { + res.Message.ToolCalls = toolCalls + res.Message.Content = "" + } else { + if r.Done { + ch <- res + } + return } - res.Message.Content = "" - sb.Reset() - ch <- res - return - } - - if r.Done { - // Send any remaining content if no tool calls were detected - if toolCallIndex == 0 { - res.Message.Content = sb.String() - } - ch <- res } + ch <- res }); err != nil { ch <- gin.H{"error": err.Error()} } @@ -1551,11 +1545,15 @@ func (s *Server) ChatHandler(c *gin.Context) { if req.Stream != nil && !*req.Stream { var resp api.ChatResponse var sb strings.Builder + var toolCalls []api.ToolCall for rr := range ch { switch t := rr.(type) { case api.ChatResponse: sb.WriteString(t.Message.Content) resp = t + if len(req.Tools) > 0 { + toolCalls = append(toolCalls, t.Message.ToolCalls...) + } case gin.H: msg, ok := t["error"].(string) if !ok { @@ -1571,12 +1569,8 @@ func (s *Server) ChatHandler(c *gin.Context) { } resp.Message.Content = sb.String() - - if len(req.Tools) > 0 { - if toolCalls, ok := m.parseToolCalls(sb.String()); ok { - resp.Message.ToolCalls = toolCalls - resp.Message.Content = "" - } + if len(toolCalls) > 0 { + resp.Message.ToolCalls = toolCalls } c.JSON(http.StatusOK, resp) diff --git a/server/testdata/tools/command-r-plus.gotmpl b/tools/testdata/command-r-plus.gotmpl similarity index 100% rename from server/testdata/tools/command-r-plus.gotmpl rename to tools/testdata/command-r-plus.gotmpl diff --git a/server/testdata/tools/command-r-plus.out b/tools/testdata/command-r-plus.out similarity index 100% rename from server/testdata/tools/command-r-plus.out rename to tools/testdata/command-r-plus.out diff --git a/server/testdata/tools/firefunction.gotmpl b/tools/testdata/firefunction.gotmpl similarity index 100% rename from server/testdata/tools/firefunction.gotmpl rename to tools/testdata/firefunction.gotmpl diff --git a/server/testdata/tools/firefunction.out b/tools/testdata/firefunction.out similarity index 100% rename from server/testdata/tools/firefunction.out rename to tools/testdata/firefunction.out diff --git a/server/testdata/tools/llama3-groq-tool-use.gotmpl b/tools/testdata/llama3-groq-tool-use.gotmpl similarity index 100% rename from server/testdata/tools/llama3-groq-tool-use.gotmpl rename to tools/testdata/llama3-groq-tool-use.gotmpl diff --git a/server/testdata/tools/llama3-groq-tool-use.out b/tools/testdata/llama3-groq-tool-use.out similarity index 100% rename from server/testdata/tools/llama3-groq-tool-use.out rename to tools/testdata/llama3-groq-tool-use.out diff --git a/tools/testdata/llama3.2.gotmpl b/tools/testdata/llama3.2.gotmpl new file mode 100644 index 00000000..b132423e --- /dev/null +++ b/tools/testdata/llama3.2.gotmpl @@ -0,0 +1,44 @@ +<|start_header_id|>system<|end_header_id|> + +Cutting Knowledge Date: December 2023 + +{{ if .System }}{{ .System }} +{{- end }} +{{- if .Tools }}When you receive a tool call response, use the output to format an answer to the orginal user question. + +You are a helpful assistant with tool calling capabilities. +{{- end }}<|eot_id|> +{{- range $i, $_ := .Messages }} +{{- $last := eq (len (slice $.Messages $i)) 1 }} +{{- if eq .Role "user" }}<|start_header_id|>user<|end_header_id|> +{{- if and $.Tools $last }} + +Given the following functions, please respond with a JSON for a function call with its proper arguments that best answers the given prompt. 
+ +Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}. Do not use variables. + +{{ range $.Tools }} +{{- . }} +{{ end }} +{{ .Content }}<|eot_id|> +{{- else }} + +{{ .Content }}<|eot_id|> +{{- end }}{{ if $last }}<|start_header_id|>assistant<|end_header_id|> + +{{ end }} +{{- else if eq .Role "assistant" }}<|start_header_id|>assistant<|end_header_id|> +{{- if .ToolCalls }} +{{ range .ToolCalls }} +{"name": "{{ .Function.Name }}", "parameters": {{ .Function.Arguments }}}{{ end }} +{{- else }} + +{{ .Content }} +{{- end }}{{ if not $last }}<|eot_id|>{{ end }} +{{- else if eq .Role "tool" }}<|start_header_id|>ipython<|end_header_id|> + +{{ .Content }}<|eot_id|>{{ if $last }}<|start_header_id|>assistant<|end_header_id|> + +{{ end }} +{{- end }} +{{- end }} \ No newline at end of file diff --git a/tools/testdata/llama3.2.out b/tools/testdata/llama3.2.out new file mode 100644 index 00000000..a27c6eaf --- /dev/null +++ b/tools/testdata/llama3.2.out @@ -0,0 +1,24 @@ +<|start_header_id|>system<|end_header_id|> + +Cutting Knowledge Date: December 2023 + +You are a knowledgeable assistant. You can answer questions and perform tasks.When you receive a tool call response, use the output to format an answer to the orginal user question. + +You are a helpful assistant with tool calling capabilities.<|eot_id|><|start_header_id|>user<|end_header_id|> + +What's the weather like today in Paris?<|eot_id|><|start_header_id|>assistant<|end_header_id|> + +{"name": "get_current_weather", "parameters": {"format":"celsius","location":"Paris, France"}}<|eot_id|><|start_header_id|>ipython<|end_header_id|> + +22<|eot_id|><|start_header_id|>assistant<|end_header_id|> + +The current temperature in Paris, France is 22 degrees Celsius.<|eot_id|><|start_header_id|>user<|end_header_id|> + +Given the following functions, please respond with a JSON for a function call with its proper arguments that best answers the given prompt. + +Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}. Do not use variables. + +{"type":"function","function":{"name":"get_current_weather","description":"Get the current weather","parameters":{"type":"object","required":["location","format"],"properties":{"format":{"type":"string","description":"The temperature unit to use. Infer this from the user's location.","enum":["celsius","fahrenheit"]},"location":{"type":"string","description":"The city and state, e.g. 
San Francisco, CA"}}}}} + +What's the weather like today in San Francisco and Toronto?<|eot_id|><|start_header_id|>assistant<|end_header_id|> + diff --git a/server/testdata/tools/messages.json b/tools/testdata/messages.json similarity index 100% rename from server/testdata/tools/messages.json rename to tools/testdata/messages.json diff --git a/server/testdata/tools/mistral.gotmpl b/tools/testdata/mistral.gotmpl similarity index 100% rename from server/testdata/tools/mistral.gotmpl rename to tools/testdata/mistral.gotmpl diff --git a/server/testdata/tools/mistral.out b/tools/testdata/mistral.out similarity index 100% rename from server/testdata/tools/mistral.out rename to tools/testdata/mistral.out diff --git a/server/testdata/tools/nemotron.gotmpl b/tools/testdata/nemotron.gotmpl similarity index 100% rename from server/testdata/tools/nemotron.gotmpl rename to tools/testdata/nemotron.gotmpl diff --git a/server/testdata/tools/nemotron.out b/tools/testdata/nemotron.out similarity index 100% rename from server/testdata/tools/nemotron.out rename to tools/testdata/nemotron.out diff --git a/tools/testdata/qwen2.5.gotmpl b/tools/testdata/qwen2.5.gotmpl new file mode 100644 index 00000000..cbd7302c --- /dev/null +++ b/tools/testdata/qwen2.5.gotmpl @@ -0,0 +1,51 @@ +{{- if .Suffix }}<|fim_prefix|>{{ .Prompt }}<|fim_suffix|>{{ .Suffix }}<|fim_middle|> +{{- else if .Messages }} +{{- if or .System .Tools }}<|im_start|>system +{{- if .System }} +{{ .System }} +{{- end }} +{{- if .Tools }} + +# Tools + +You may call one or more functions to assist with the user query. + +You are provided with function signatures within XML tags: + +{{- range .Tools }} +{"type": "function", "function": {{ .Function }}} +{{- end }} + + +For each function call, return a json object with function name and arguments within XML tags: + +{"name": , "arguments": } + +{{- end }}<|im_end|> +{{ end }} +{{- range $i, $_ := .Messages }} +{{- $last := eq (len (slice $.Messages $i)) 1 -}} +{{- if eq .Role "user" }}<|im_start|>user +{{ .Content }}<|im_end|> +{{ else if eq .Role "assistant" }}<|im_start|>assistant +{{ if .Content }}{{ .Content }} +{{- else if .ToolCalls }} +{{ range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}} +{{ end }} +{{- end }}{{ if not $last }}<|im_end|> +{{ end }} +{{- else if eq .Role "tool" }}<|im_start|>user + +{{ .Content }} +<|im_end|> +{{ end }} +{{- if and (ne .Role "assistant") $last }}<|im_start|>assistant +{{ end }} +{{- end }} +{{- else }} +{{- if .System }}<|im_start|>system +{{ .System }}<|im_end|> +{{ end }}{{ if .Prompt }}<|im_start|>user +{{ .Prompt }}<|im_end|> +{{ end }}<|im_start|>assistant +{{ end }}{{ .Response }}{{ if .Response }}<|im_end|>{{ end }} \ No newline at end of file diff --git a/tools/testdata/qwen2.5.out b/tools/testdata/qwen2.5.out new file mode 100644 index 00000000..76bfbfa9 --- /dev/null +++ b/tools/testdata/qwen2.5.out @@ -0,0 +1,31 @@ +<|im_start|>system +You are a knowledgeable assistant. You can answer questions and perform tasks. + +# Tools + +You may call one or more functions to assist with the user query. + +You are provided with function signatures within XML tags: + +{"type": "function", "function": {"name":"get_current_weather","description":"Get the current weather","parameters":{"type":"object","required":["location","format"],"properties":{"format":{"type":"string","description":"The temperature unit to use. 
Infer this from the user's location.","enum":["celsius","fahrenheit"]},"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"}}}}} + + +For each function call, return a json object with function name and arguments within XML tags: + +{"name": , "arguments": } +<|im_end|> +<|im_start|>user +What's the weather like today in Paris?<|im_end|> +<|im_start|>assistant + +{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Paris, France"}} +<|im_end|> +<|im_start|>user + +22 +<|im_end|> +<|im_start|>assistant +The current temperature in Paris, France is 22 degrees Celsius.<|im_end|> +<|im_start|>user +What's the weather like today in San Francisco and Toronto?<|im_end|> +<|im_start|>assistant diff --git a/tools/testdata/qwen3.gotmpl b/tools/testdata/qwen3.gotmpl new file mode 100644 index 00000000..26f6656f --- /dev/null +++ b/tools/testdata/qwen3.gotmpl @@ -0,0 +1,50 @@ +{{- if .Messages }} +{{- if or .System .Tools }}<|im_start|>system +{{- if .System }} +{{ .System }} +{{- end }} +{{- if .Tools }} + +# Tools + +You may call one or more functions to assist with the user query. + +You are provided with function signatures within XML tags: + +{{- range .Tools }} +{"type": "function", "function": {{ .Function }}} +{{- end }} + + +For each function call, return a json object with function name and arguments within XML tags: + +{"name": , "arguments": } + +{{- end }}<|im_end|> +{{ end }} +{{- range $i, $_ := .Messages }} +{{- $last := eq (len (slice $.Messages $i)) 1 -}} +{{- if eq .Role "user" }}<|im_start|>user +{{ .Content }}<|im_end|> +{{ else if eq .Role "assistant" }}<|im_start|>assistant +{{ if .Content }}{{ .Content }} +{{- else if .ToolCalls }} +{{ range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}} +{{ end }} +{{- end }}{{ if not $last }}<|im_end|> +{{ end }} +{{- else if eq .Role "tool" }}<|im_start|>user + +{{ .Content }} +<|im_end|> +{{ end }} +{{- if and (ne .Role "assistant") $last }}<|im_start|>assistant +{{ end }} +{{- end }} +{{- else }} +{{- if .System }}<|im_start|>system +{{ .System }}<|im_end|> +{{ end }}{{ if .Prompt }}<|im_start|>user +{{ .Prompt }}<|im_end|> +{{ end }}<|im_start|>assistant +{{ end }}{{ .Response }}{{ if .Response }}<|im_end|>{{ end }} \ No newline at end of file diff --git a/tools/testdata/qwen3.out b/tools/testdata/qwen3.out new file mode 100644 index 00000000..76bfbfa9 --- /dev/null +++ b/tools/testdata/qwen3.out @@ -0,0 +1,31 @@ +<|im_start|>system +You are a knowledgeable assistant. You can answer questions and perform tasks. + +# Tools + +You may call one or more functions to assist with the user query. + +You are provided with function signatures within XML tags: + +{"type": "function", "function": {"name":"get_current_weather","description":"Get the current weather","parameters":{"type":"object","required":["location","format"],"properties":{"format":{"type":"string","description":"The temperature unit to use. Infer this from the user's location.","enum":["celsius","fahrenheit"]},"location":{"type":"string","description":"The city and state, e.g. 
San Francisco, CA"}}}}} + + +For each function call, return a json object with function name and arguments within XML tags: + +{"name": , "arguments": } +<|im_end|> +<|im_start|>user +What's the weather like today in Paris?<|im_end|> +<|im_start|>assistant + +{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Paris, France"}} +<|im_end|> +<|im_start|>user + +22 +<|im_end|> +<|im_start|>assistant +The current temperature in Paris, France is 22 degrees Celsius.<|im_end|> +<|im_start|>user +What's the weather like today in San Francisco and Toronto?<|im_end|> +<|im_start|>assistant diff --git a/server/testdata/tools/tools.json b/tools/testdata/tools.json similarity index 100% rename from server/testdata/tools/tools.json rename to tools/testdata/tools.json diff --git a/server/testdata/tools/xlam.gotmpl b/tools/testdata/xlam.gotmpl similarity index 100% rename from server/testdata/tools/xlam.gotmpl rename to tools/testdata/xlam.gotmpl diff --git a/server/testdata/tools/xlam.out b/tools/testdata/xlam.out similarity index 100% rename from server/testdata/tools/xlam.out rename to tools/testdata/xlam.out diff --git a/tools/tools.go b/tools/tools.go new file mode 100644 index 00000000..509ca90a --- /dev/null +++ b/tools/tools.go @@ -0,0 +1,271 @@ +package tools + +import ( + "encoding/json" + "errors" + "log/slog" + "strings" + gotmpl "text/template" + + "github.com/ollama/ollama/api" + "github.com/ollama/ollama/template" +) + +var ( + errInvalidToolCall = errors.New("invalid tool call format") + errAccumulateMore = errors.New("need to accumulate more content") +) + +type Parser struct { + parseLeadingJSON bool + prefix string + prefixFound bool + tmpl gotmpl.Template + sb strings.Builder + index int + name string + arguments string + done bool +} + +// parseJSONToolCalls attempts to parse a JSON string into a slice of ToolCalls. +// +// Parameters: +// - s: The string to parse +// - name: The field name from template that identifies the tool call name +// - arguments: The field name from template that identifies the tool call arguments +// +// Returns: +// - []api.ToolCall: The parsed tool calls if successful +// - error: ErrAccumulateMore if braces unbalanced, ErrInvalidToolCall if invalid, or nil if successful +func parseJSONToolCalls(s string, name, arguments string, prefix string) ([]api.ToolCall, error) { + // Check for balanced braces before attempting to parse + braceCount := 0 + squareCount := 0 + startIndex := -1 + var rawToolCalls []string + s = strings.TrimSpace(s) + + // Only track these if we don't have a prefix as it will be cut off from the prefix. Also track in the parseLeadingJSON case. 
+ trackSquareBrackets := prefix == "" || !strings.HasSuffix(prefix, "[") || strings.HasPrefix(s, "[") + for i, c := range s { + switch c { + case '{': + braceCount++ + if startIndex == -1 { + startIndex = i + } + case '}': + braceCount-- + if braceCount == 0 { + rawToolCalls = append(rawToolCalls, s[startIndex:i+1]) + startIndex = -1 + } + case '[': + if trackSquareBrackets { + squareCount++ + } + case ']': + if trackSquareBrackets { + squareCount-- + } + } + + // Negative means we have an extra closing brace/bracket + if braceCount < 0 || squareCount < 0 { + return nil, errInvalidToolCall + } + } + + // If braces/brackets aren't balanced, need more input + if braceCount > 0 || squareCount > 0 { + return nil, errAccumulateMore + } + + t := strings.TrimSpace(s) + if len(t) == 0 { + return nil, errAccumulateMore + } + // If the input is a single square bracket, it's not a valid tool call + if t[0] == '[' && len(t) == 1 { + return nil, errAccumulateMore + } + + // Attempt full unmarshal of the JSON + var toolCalls []api.ToolCall + for _, rawToolCall := range rawToolCalls { + var resp map[string]any + if err := json.Unmarshal([]byte(rawToolCall), &resp); err != nil { + continue + } + + // Collect nested objects that could contain tool calls + objs := collect(resp) + if len(objs) == 0 { + continue + } + + // Extract tool calls from objects + for _, kv := range objs { + n, nok := kv[name].(string) + a, aok := kv[arguments].(map[string]any) + if nok && aok { + toolCalls = append(toolCalls, api.ToolCall{ + Function: api.ToolCallFunction{ + Name: n, + Arguments: a, + }, + }) + } else { + slog.Debug("No valid tool call found in object.", "object", kv) + } + } + } + + // Valid JSON, no tool calls found + if len(toolCalls) == 0 { + slog.Debug("No valid tool calls found in any raw tool calls.", "rawToolCalls", rawToolCalls) + return nil, errInvalidToolCall + } + + return toolCalls, nil +} + +// checkPrefix processes a string to find and handle a prefix pattern. +// +// Returns: +// - The processed string with prefix removed if found +// - error: ErrAccumulateMore if prefix is incomplete, or nil if successful +func (p *Parser) checkPrefix(s string) (string, error) { + original := s + if strings.ContainsRune(s, '\n') { + s = strings.ReplaceAll(s, "\n", " ") + } + + if s == "" || p.prefix == "" { + return s, nil + } + + // Check for prefix at start of string + if cut, hasPrefix := strings.CutPrefix(s, p.prefix); hasPrefix { + // Found prefix at start - accumulate for potential tool + p.prefixFound = true + return cut, nil + } + + // Check if prefix overlaps end of string + if idx := suffixOverlap(s, p.prefix); idx != -1 { + // Return everything except overlapping portion + p.sb.Reset() + p.sb.WriteString(s[idx:]) + return original[:idx], errAccumulateMore + } + + // Check if prefix appears in middle of string + if idx := strings.Index(s, p.prefix); idx != -1 { + // Save remainder starting at prefix for next pass + p.sb.Reset() + p.sb.WriteString(strings.TrimSpace(s[idx:])) + // Return everything before prefix + return original[:idx], errAccumulateMore + } + + // No partial prefix found + return s, nil +} + +// Add processes a string input to parse tool calls and content. +// It handles prefix detection and JSON parsing to extract tool calls. 
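Taken together with NewParser below, the external contract is: build a Parser from the model's chat template once, then feed it raw completion chunks and forward whatever it returns, exactly as server/routes.go does above. A hedged usage sketch; the inline template and the chunk boundaries are invented for illustration and are not one of the testdata templates, so treat it as the shape of the API rather than a verified program:

package main

import (
	"fmt"
	gotmpl "text/template"

	"github.com/ollama/ollama/tools"
)

func main() {
	// A toy mistral-style template; real callers pass m.Template.Template.
	tmpl := gotmpl.Must(gotmpl.New("chat").Parse(
		`{{ if .ToolCalls }}[TOOL_CALLS] [{{ range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}{{ end }}]{{ end }}`))

	parser, err := tools.NewParser(tmpl)
	if err != nil {
		panic(err)
	}

	// Feed model output chunk by chunk, as the completion callback does:
	// the first chunk is unbalanced, so it is buffered; the second completes
	// the call and is returned as a parsed tool call with empty content.
	chunks := []string{
		`[TOOL_CALLS] [{"name": "get_current_weather", `,
		`"arguments": {"location": "Paris, France"}}]`,
	}
	for _, chunk := range chunks {
		calls, content := parser.Add(chunk)
		fmt.Printf("tool calls: %v, passthrough content: %q\n", calls, content)
	}
}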
+// +// Returns: +// - tools: Any parsed tool calls +// - content: Non-tool call content +func (p *Parser) Add(s string) (tools []api.ToolCall, content string) { + if strings.TrimSpace(s) == "" { + return nil, s + } + if p.done { + if p.index == 0 { + // Return original string if no tool calls found at start + return nil, s + } + // Return empty if no tool calls found after start + return nil, "" + } + p.sb.WriteString(s) + s = p.sb.String() + + // Check for prefix pattern in input + s, err := p.checkPrefix(s) + if err != nil { + // Need more input to complete prefix + return nil, s + } + + // Exit if prefix exists in template, greedy parsing is off, and prefix not found + if !p.parseLeadingJSON && !p.prefixFound { + p.sb.Reset() + return nil, s + } + + toolCalls, err := parseJSONToolCalls(s, p.name, p.arguments, p.prefix) + if err != nil { + if errors.Is(err, errAccumulateMore) { + return nil, "" + } + p.sb.Reset() + // Do not try parsing leading JSON if JSON not found + p.parseLeadingJSON = false + if p.prefix == "" { + p.done = true + } + if p.index != 0 && p.prefix == "" { + return nil, "" + } + if p.prefixFound { + // Drop tokens since prefix was found + return nil, "" + } + return nil, s + } + + for _, tc := range toolCalls { + tc.Function.Index = p.index + p.index++ + } + + p.sb.Reset() + return toolCalls, "" +} + +// NewParser creates a new tool call parser from a template. It extracts the tool call format, +// prefix, and field names from the template to use for parsing tool calls from model output. +// +// Returns an error if the template does not contain valid tool call formatting. +func NewParser(templateToProcess *gotmpl.Template) (*Parser, error) { + parsed, err := template.Parse(templateToProcess.Root.String()) + if err != nil { + return nil, err + } + + tt, err := toolTemplate(parsed) + if err != nil { + return nil, err + } + + tp := toolPrefix(templateToProcess) + + name, arguments, err := extractToolArgs(tt) + if err != nil { + return nil, err + } + + return &Parser{ + tmpl: *tt, + sb: strings.Builder{}, + prefix: tp, + parseLeadingJSON: true, + name: name, + arguments: arguments, + }, nil +} diff --git a/tools/tools_test.go b/tools/tools_test.go new file mode 100644 index 00000000..1ae3bff8 --- /dev/null +++ b/tools/tools_test.go @@ -0,0 +1,644 @@ +package tools + +import ( + "bytes" + "encoding/json" + "fmt" + "os" + "path/filepath" + "strings" + "testing" + + "github.com/google/go-cmp/cmp" + + "github.com/ollama/ollama/api" + "github.com/ollama/ollama/template" +) + +func readFile(t *testing.T, base, name string) *bytes.Buffer { + t.Helper() + + bts, err := os.ReadFile(filepath.Join(base, name)) + if err != nil { + t.Fatal(err) + } + + return bytes.NewBuffer(bts) +} + +func TestParseJSONToolCalls(t *testing.T) { + tests := []struct { + name string + input string + nameField string + argsField string + wantToolCalls []api.ToolCall + wantErr error + prefix string + }{ + { + name: "valid single tool call", + input: `{"name": "test_tool", "arguments": {"arg1": "value1"}}`, + nameField: "name", + argsField: "arguments", + wantToolCalls: []api.ToolCall{ + { + Function: api.ToolCallFunction{ + Name: "test_tool", + Arguments: map[string]any{ + "arg1": "value1", + }, + }, + }, + }, + wantErr: nil, + prefix: "", + }, + { + name: "incomplete JSON", + input: `{"name": "test_tool", "arguments": {"arg1": `, + nameField: "name", + argsField: "arguments", + wantToolCalls: nil, + wantErr: errAccumulateMore, + prefix: "", + }, + { + name: "invalid JSON", + input: `not json at all`, + 
nameField: "name", + argsField: "arguments", + wantToolCalls: nil, + wantErr: errInvalidToolCall, + prefix: "", + }, + { + name: "missing required fields", + input: `{"other": "field"}`, + nameField: "name", + argsField: "arguments", + wantToolCalls: nil, + wantErr: errInvalidToolCall, + prefix: "", + }, + { + name: "multiple tool calls in array", + input: `[ + {"name": "tool1", "arguments": {"arg1": 1}}, + {"name": "tool2", "arguments": {"arg2": "value"}} + ]`, + nameField: "name", + argsField: "arguments", + wantToolCalls: []api.ToolCall{ + { + Function: api.ToolCallFunction{ + Name: "tool1", + Arguments: map[string]any{ + "arg1": float64(1), + }, + }, + }, + { + Function: api.ToolCallFunction{ + Name: "tool2", + Arguments: map[string]any{ + "arg2": "value", + }, + }, + }, + }, + wantErr: nil, + prefix: "", + }, + { + name: "multiple tool calls without array", + input: ` + {"name": "tool1", "arguments": {"arg1": 1}}, + {"name": "tool2", "arguments": {"arg2": "value"}} + `, + nameField: "name", + argsField: "arguments", + wantToolCalls: []api.ToolCall{ + { + Function: api.ToolCallFunction{ + Name: "tool1", + Arguments: map[string]any{ + "arg1": float64(1), + }, + }, + }, + { + Function: api.ToolCallFunction{ + Name: "tool2", + Arguments: map[string]any{ + "arg2": "value", + }, + }, + }, + }, + wantErr: nil, + prefix: "", + }, + { + name: "multiple tool calls with text after", + input: ` + {"name": "tool1", "arguments": {"arg1": 1}} text + {"name": "tool2", "arguments": {"arg2": "value"}} text + `, + nameField: "name", + argsField: "arguments", + wantToolCalls: []api.ToolCall{ + { + Function: api.ToolCallFunction{ + Name: "tool1", + Arguments: map[string]any{ + "arg1": float64(1), + }, + }, + }, + { + Function: api.ToolCallFunction{ + Name: "tool2", + Arguments: map[string]any{ + "arg2": "value", + }, + }, + }, + }, + wantErr: nil, + prefix: "", + }, + { + name: "second tool call in array", + input: ` + , {"name": "tool2", "arguments": {"arg2": "value"}} + `, + nameField: "name", + argsField: "arguments", + wantToolCalls: []api.ToolCall{ + { + Function: api.ToolCallFunction{ + Name: "tool2", + Arguments: map[string]any{ + "arg2": "value", + }, + }, + }, + }, + wantErr: nil, + prefix: "", + }, + // a bad JSON would not return any tool calls or content as it would always accumulate more + { + name: "unbalanced square brackets", + input: `[{"name": "tool1", "arguments": {"arg1": [1, 2}]`, + nameField: "name", + argsField: "arguments", + wantToolCalls: nil, + wantErr: errAccumulateMore, + prefix: "", + }, + { + name: "incomplete square brackets", + input: `[{"name": "tool1", "arguments": {"arg1": [1, 2, 3`, + nameField: "name", + argsField: "arguments", + wantToolCalls: nil, + wantErr: errAccumulateMore, + prefix: "", + }, + { + name: "nested arrays in arguments", + input: `{"name": "tool1", "arguments": {"arg1": [1, 2, ["nested", "array"]]}}`, + nameField: "name", + argsField: "arguments", + wantToolCalls: []api.ToolCall{ + { + Function: api.ToolCallFunction{ + Name: "tool1", + Arguments: map[string]any{ + "arg1": []any{float64(1), float64(2), []any{"nested", "array"}}, + }, + }, + }, + }, + wantErr: nil, + prefix: "", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + gotCalls, err := parseJSONToolCalls(tt.input, tt.nameField, tt.argsField, tt.prefix) + + if err != tt.wantErr { + t.Errorf("parseJSONToolCalls() error = %v, want %v", err, tt.wantErr) + } + + if len(gotCalls) != 0 && tt.wantErr != nil { + t.Errorf("parseJSONToolCalls() valid = %v, want %v", 
len(gotCalls) == 0, tt.wantErr == nil) + } + + if diff := cmp.Diff(gotCalls, tt.wantToolCalls); diff != "" { + t.Errorf("parseJSONToolCalls() tool calls mismatch (-got +want):\n%s", diff) + } + }) + } +} + +func TestParseToolCalls(t *testing.T) { + p := filepath.Join("testdata") + t1 := api.ToolCall{ + Function: api.ToolCallFunction{ + Name: "get_current_weather", + Arguments: api.ToolCallFunctionArguments{ + "format": "fahrenheit", + "location": "San Francisco, CA", + }, + }, + } + t2 := api.ToolCall{ + Function: api.ToolCallFunction{ + Name: "get_current_weather", + Arguments: api.ToolCallFunctionArguments{ + "format": "celsius", + "location": "Toronto, Canada", + }, + }, + } + + cases := []struct { + name string + model string + output string + expectedToolCall []api.ToolCall + expectedTokens string + }{ + { + name: "mistral malformed json with tool calls prefix", + model: "mistral", + output: `[TOOL_CALLS] [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_curren}]`, + expectedToolCall: []api.ToolCall{t1}, + expectedTokens: "", + }, + { + name: "mistral multiple tool calls without prefix", + model: "mistral", + output: `[{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}} ]`, + expectedToolCall: []api.ToolCall{t1, t2}, + expectedTokens: "", + }, + { + name: "mistral tool calls with text between no prefix", + model: "mistral", + output: `[{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}] + model outputs more tokens here and then [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, + expectedToolCall: []api.ToolCall{t1, t2}, + expectedTokens: `model outputs more tokens here and then [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, + }, + { + name: "mistral valid json with tool calls prefix", + model: "mistral", + output: `[TOOL_CALLS] [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, + expectedToolCall: []api.ToolCall{t1, t2}, + expectedTokens: "", + }, + { + name: "mistral multiple tool calls with text between and prefix", + model: "mistral", + output: `[TOOL_CALLS] [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}] + model outputs more tokens here and then [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, + expectedToolCall: []api.ToolCall{t1, t2, t1, t2}, + expectedTokens: "", + }, + { + name: "mistral incomplete json with tool calls prefix", + model: "mistral", + output: `[TOOL_CALLS] [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, `, + 
expectedToolCall: []api.ToolCall{}, + expectedTokens: "", + }, + { + name: "mistral invalid tool call with explanatory text no prefix", + model: "mistral", + output: `I'm not aware of that information. However, I can suggest searching for the weather using the "get_current_weather" function: + + [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, + expectedToolCall: []api.ToolCall{}, + expectedTokens: `I'm not aware of that information. However, I can suggest searching for the weather using the "get_current_weather" function: [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, + }, + { + name: "mistral tool calls without prefix", + model: "mistral", + output: `[{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, + expectedToolCall: []api.ToolCall{t1, t2}, + expectedTokens: "", + }, + { + name: "command r plus tool calls with json block format", + model: "command-r-plus", + output: "Action: ```json" + ` + [ + { + "tool_name": "get_current_weather", + "parameters": { + "format": "fahrenheit", + "location": "San Francisco, CA" + } + }, + { + "tool_name": "get_current_weather", + "parameters": { + "format": "celsius", + "location": "Toronto, Canada" + } + } + ] + ` + "```", + expectedToolCall: []api.ToolCall{t1, t2}, + expectedTokens: "", + }, + { + name: "firefunction tool calls with functools prefix", + model: "firefunction", + output: ` functools[{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, + expectedToolCall: []api.ToolCall{t1, t2}, + expectedTokens: "", + }, + { + name: "llama3 groq single tool call with xml tags", + model: "llama3-groq-tool-use", + output: ` + {"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} + `, + expectedToolCall: []api.ToolCall{t1}, + expectedTokens: "", + }, + { + name: "xlam tool calls with wrapper object", + model: "xlam", + output: `{"tool_calls": [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]}`, + expectedToolCall: []api.ToolCall{t1, t2}, + expectedTokens: "", + }, + { + name: "qwen2.5 single tool call with prefix", + model: "qwen2.5", + output: `{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}}`, + expectedToolCall: []api.ToolCall{t1}, + expectedTokens: "", + }, + { + name: "qwen2.5 multiple tool calls with and without prefix", + model: "qwen2.5", + output: `{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}`, + expectedToolCall: []api.ToolCall{t1, t1, t2}, + expectedTokens: "", + }, + { + name: "qwen2.5 plain text response no tool calls", + model: "qwen2.5", + output: "The 
weather in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.", + expectedToolCall: []api.ToolCall{}, + expectedTokens: "The weather in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.", + }, + { + name: "qwen2.5 tool calls with trailing text", + model: "qwen2.5", + output: `[{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}}, {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}] some tokens after call`, + expectedToolCall: []api.ToolCall{t1, t2}, + expectedTokens: "some tokens after call", + }, + { + name: "qwen2.5 tool calls with initial text", + model: "qwen2.5", + output: `some tokens before call [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}}, {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, + expectedToolCall: []api.ToolCall{}, + expectedTokens: `some tokens before call [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}}, {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, + }, + { + name: "qwen2.5 tool calls with prefix and trailing text", + model: "qwen2.5", + output: ` [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}}, {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}] some tokens after call`, + expectedToolCall: []api.ToolCall{t1, t2}, + expectedTokens: "", + }, + { + name: "qwen2.5 tool calls with prefix and initial text", + model: "qwen2.5", + output: `some tokens before call [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}}, {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}] `, + expectedToolCall: []api.ToolCall{t1, t2}, + expectedTokens: "some tokens before call", + }, + { + name: "qwen2.5 tool calls without and with prefix", + model: "qwen2.5", + output: `{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}`, + expectedToolCall: []api.ToolCall{t1, t2}, + expectedTokens: "", + }, + { + name: "qwen2.5 tool calls without and with prefix and text between", + model: "qwen2.5", + output: `{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} some tokens between {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}} some tokens after call`, + expectedToolCall: []api.ToolCall{t1, t2}, + expectedTokens: "some tokens between", + }, + { + name: "qwen2.5 tool calls without prefix and invalid tool call with other tokens", + model: "qwen2.5", + output: `hi [{"options": "foo"}]`, + expectedToolCall: []api.ToolCall{}, + expectedTokens: `hi [{"options": "foo"}]`, + }, + { + name: "qwen2.5 tool calls with prefix and invalid tool call", + model: "qwen2.5", + output: ` [{"options": "foo"}] `, + expectedToolCall: []api.ToolCall{}, + expectedTokens: ``, + }, + { + name: "qwen3 tool call with think prefix and tool prefix (sent as a single token)", + model: "qwen3", + output: `Okay, let me think what tool we should use...{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}}`, + expectedToolCall: []api.ToolCall{t1}, + 
expectedTokens: "Okay, let me think what tool we should use...", + }, + { + name: "qwen3 tool call with think prefix, tool prefix, and whitespace (sent as separate tokens)", + model: "qwen3", + output: `Okay, let me think what tool we should use... { "name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} `, + expectedToolCall: []api.ToolCall{t1}, + expectedTokens: "Okay, let me think what tool we should use...", + }, + { + name: "qwen3 empty think prefix without tool prefix and invalid tool call", + model: "qwen3", + output: ` {"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} `, + expectedToolCall: []api.ToolCall{}, + expectedTokens: ` {"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} `, + }, + { + name: "qwen3 empty think prefix with tool prefix and valid tool call", + model: "qwen3", + output: `{ "name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} `, + expectedToolCall: []api.ToolCall{t1}, + expectedTokens: ``, + }, + { + name: "qwen3 invalid tool call with fake tool prefix (single rune suffix match)", + model: "qwen3", + output: `< fakeout {"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} `, + expectedToolCall: []api.ToolCall{}, + expectedTokens: `< fakeout {"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} `, + }, + { + name: "qwen3 invalid tool call with partial tool prefix (multiple rune suffix match)", + model: "qwen3", + output: ``, + expectedToolCall: []api.ToolCall{}, + expectedTokens: ``, + }, + { + name: "qwen3 invalid tool call with malformed tool prefix", + model: "qwen3", + output: ``, + expectedToolCall: []api.ToolCall{}, + expectedTokens: ``, + }, + { + name: "model with prefix in template, no prefix in output", + model: "qwen2.5", + output: `[{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, + expectedToolCall: []api.ToolCall{t1, t2}, + expectedTokens: "", + }, + { + name: "model with prefix in template, prefix in output", + model: "qwen2.5", + output: `[{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, + expectedToolCall: []api.ToolCall{t1, t2}, + expectedTokens: "", + }, + { + name: "model without prefix in template, no prefix in output", + model: "llama3.2", + output: `[{"name": "get_current_weather", "parameters": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "parameters": {"format":"celsius","location":"Toronto, Canada"}}]`, + expectedToolCall: []api.ToolCall{t1, t2}, + expectedTokens: "", + }, + { + name: "model without prefix in template, no prefix in output, single tool call", + model: "llama3.2", + output: `{"name": "get_current_weather", "parameters": {"format":"fahrenheit","location":"San Francisco, CA"}}`, + expectedToolCall: []api.ToolCall{t1}, + expectedTokens: "", + }, + { + name: "model without prefix in template, prefix in output", + model: "llama3.2", + output: ` [{"name": "get_current_weather", "parameters": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "parameters": 
{"format":"celsius","location":"Toronto, Canada"}}]`, + expectedToolCall: []api.ToolCall{}, + expectedTokens: ` [{"name": "get_current_weather", "parameters": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "parameters": {"format":"celsius","location":"Toronto, Canada"}}]`, + }, + { + name: "model with prefix in template, no prefix in output, tokens before", + model: "qwen2.5", + output: `some tokens before [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, + expectedToolCall: []api.ToolCall{}, + expectedTokens: `some tokens before [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, + }, + { + name: "model with prefix in template, prefix in output, tokens after", + model: "qwen2.5", + output: `[{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}] some tokens after`, + expectedToolCall: []api.ToolCall{t1, t2}, + expectedTokens: "", + }, + { + name: "model without prefix in template, no prefix in output, tokens after", + model: "llama3.2", + output: `[{"name": "get_current_weather", "parameters": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "parameters": {"format":"celsius","location":"Toronto, Canada"}}] some tokens after`, + expectedToolCall: []api.ToolCall{t1, t2}, + expectedTokens: "", + }, + { + name: "model without prefix in template, no prefix in output, tokens before", + model: "llama3.2", + output: `some tokens before [{"name": "get_current_weather", "parameters": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "parameters": {"format":"celsius","location":"Toronto, Canada"}}]`, + expectedToolCall: []api.ToolCall{}, + expectedTokens: `some tokens before [{"name": "get_current_weather", "parameters": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "parameters": {"format":"celsius","location":"Toronto, Canada"}}]`, + }, + { + name: "model without prefix in template, prefix in output, tokens after", + model: "llama3.2", + output: ` [{"name": "get_current_weather", "parameters": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "parameters": {"format":"celsius","location":"Toronto, Canada"}}] some tokens after`, + expectedToolCall: []api.ToolCall{}, + expectedTokens: ` [{"name": "get_current_weather", "parameters": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "parameters": {"format":"celsius","location":"Toronto, Canada"}}] some tokens after`, + }, + } + + var tools []api.Tool + if err := json.Unmarshal(readFile(t, p, "tools.json").Bytes(), &tools); err != nil { + t.Fatal(err) + } + + var messages []api.Message + if err := json.Unmarshal(readFile(t, p, "messages.json").Bytes(), &messages); err != nil { + t.Fatal(err) + } + + for _, tt := range cases { + t.Run(tt.name, func(t *testing.T) { + tmpl, err := template.Parse(readFile(t, p, fmt.Sprintf("%s.gotmpl", tt.model)).String()) + if err != nil { + t.Fatal(err) + } + + t.Run("template", func(t *testing.T) { + actual := &bytes.Buffer{} // Create new buffer for each 
test + if err := tmpl.Execute(actual, template.Values{Tools: tools, Messages: messages}); err != nil { + t.Fatal(err) + } + + if diff := cmp.Diff(actual.String(), readFile(t, p, fmt.Sprintf("%s.out", tt.model)).String()); diff != "" { + t.Errorf("mismatch (-got +want):\n%s", diff) + } + }) + + t.Run("parse", func(t *testing.T) { + tp, err := NewParser(tmpl.Template) + if err != nil { + t.Fatal(err) + } + got := []api.ToolCall{} + var gotTokens strings.Builder + + tokens := strings.Fields(tt.output) + for _, tok := range tokens { + s := " " + tok + + toolCalls, content := tp.Add(s) + if len(content) > 0 { + gotTokens.WriteString(content) + } else if len(toolCalls) > 0 { + got = append(got, toolCalls...) + } + } + + // Compare tool calls if we expect any + if diff := cmp.Diff(got, tt.expectedToolCall); diff != "" { + t.Errorf("tool calls mismatch (-got +want):\n%s", diff) + } + + // Compare tokens if we expect any + stripped := strings.TrimSpace(gotTokens.String()) + if diff := cmp.Diff(stripped, tt.expectedTokens); diff != "" { + t.Log("actualTokens", stripped, "expectedTokens", tt.expectedTokens) + t.Errorf("tokens mismatch (-got +want):\n%s", diff) + } + }) + }) + } +} diff --git a/tools/tools_utils.go b/tools/tools_utils.go new file mode 100644 index 00000000..48531b78 --- /dev/null +++ b/tools/tools_utils.go @@ -0,0 +1,227 @@ +package tools + +import ( + "bytes" + "encoding/json" + "errors" + "log/slog" + "slices" + "strings" + gotmpl "text/template" + "text/template/parse" + + "github.com/ollama/ollama/api" + "github.com/ollama/ollama/template" +) + +// extractToolCallsFormat traverses a template AST to find text that follows a ".ToolCalls" condition. +// It walks the template nodes looking for if-statements containing ".ToolCalls" and extracts any +// immediate text nodes that follow. This is used to identify tool call prefixes and formatting. 
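+//
+// For example, an illustrative template fragment such as
+//
+//	{{ if .ToolCalls }}[TOOL_CALLS] {{ end }}
+//
+// yields the text "[TOOL_CALLS] " (including the trailing space), which
+// toolPrefix later trims and uses as the prefix to watch for in model output.
+// The exact prefix is model-specific and comes from the model's template.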
+// +// Returns: +// - string: The extracted text following the first ".ToolCalls" condition found +// - bool: Whether a ".ToolCalls" condition was found in the template +func extractToolCallsFormat(tmpl *gotmpl.Template) (string, bool) { + if tmpl == nil || tmpl.Tree == nil { + slog.Debug("template or tree is nil") + return "", false + } + + var result string + var found bool + + var walk func(nodes []parse.Node) + walk = func(nodes []parse.Node) { + for _, node := range nodes { + if found { + return + } + + switch n := node.(type) { + case *parse.IfNode: + if isToolCallsNode(n) { + // Collect immediate TextNode(s) at start of IfNode's list + var sb strings.Builder + for _, innerNode := range n.List.Nodes { + if tn, ok := innerNode.(*parse.TextNode); ok { + sb.Write(tn.Text) + } else { + // Stop at first non-text node + break + } + } + result = sb.String() + found = true + return + } + // Recurse into child nodes + walk(n.List.Nodes) + if n.ElseList != nil { + walk(n.ElseList.Nodes) + } + case *parse.ListNode: + walk(n.Nodes) + case *parse.RangeNode: + walk(n.List.Nodes) + if n.ElseList != nil { + walk(n.ElseList.Nodes) + } + case *parse.WithNode: + walk(n.List.Nodes) + if n.ElseList != nil { + walk(n.ElseList.Nodes) + } + default: + // Continue to next node + continue + } + } + } + + walk(tmpl.Tree.Root.Nodes) + return result, found +} + +// isToolCallsNode detects if a node's condition includes ".ToolCalls" +func isToolCallsNode(n *parse.IfNode) bool { + for _, cmd := range n.Pipe.Cmds { + for _, arg := range cmd.Args { + if field, ok := arg.(*parse.FieldNode); ok { + if slices.Contains(field.Ident, "ToolCalls") { + return true + } + } + } + } + return false +} + +func toolPrefix(tmpl *gotmpl.Template) string { + tokenText, ok := extractToolCallsFormat(tmpl) + if !ok { + return "" + } + tokenText = strings.TrimSpace(tokenText) + tokenText = strings.ReplaceAll(tokenText, "\r", "") + tokenText = strings.ReplaceAll(tokenText, "\n", " ") + + return tokenText +} + +// toolTemplate creates a subtree from the node that ranges over .ToolCalls +// +// Returns: +// - *gotmpl.Template: The subtree containing the .ToolCalls range +// - error: Error if parsing failed +func toolTemplate(t *template.Template) (*gotmpl.Template, error) { + tmpl := t.Subtree(func(n parse.Node) bool { + if t, ok := n.(*parse.RangeNode); ok { + return slices.Contains(template.Identifiers(t.Pipe), "ToolCalls") + } + + return false + }) + + if tmpl == nil { + return nil, errors.New("failed to find tool template") + } + + return tmpl, nil +} + +// suffixOverlap returns the index in s where the longest suffix overlap with prefix begins +// +// Returns: +// - int: The starting index in s where the suffix overlap begins +func suffixOverlap(s, prefix string) int { + max := min(len(prefix), len(s)) + for i := max; i > 0; i-- { + if strings.HasSuffix(s, prefix[:i]) { + return len(s) - i + } + } + return -1 +} + +// extractToolArgs executes a template with a known tool call format to extract the name and arguments +// +// Returns: +// - string: The name of the tool call +// - string: The arguments of the tool call +// - error: Error if parsing failed +func extractToolArgs(tmpl *gotmpl.Template) (name, arguments string, err error) { + var b bytes.Buffer + if err := tmpl.Execute(&b, map[string][]api.ToolCall{ + "ToolCalls": { + { + Function: api.ToolCallFunction{ + Name: "@@name@@", + Arguments: api.ToolCallFunctionArguments{ + "@@argument@@": 1, + }, + }, + }, + }, + }); err != nil { + return "", "", err + } + + var obj any + err = 
json.Unmarshal(b.Bytes(), &obj) + if err != nil { + return "", "", err + } + + var objs []map[string]any + switch v := obj.(type) { + case map[string]any: + objs = []map[string]any{v} + case []map[string]any: + objs = v + case []any: + objs = collect(v) + } + if len(objs) == 0 { + return "", "", errors.New("no template objects found") + } + + // find the keys that correspond to the name and arguments fields + for k, v := range objs[0] { + switch v.(type) { + case string: + name = k + case map[string]any: + arguments = k + } + } + + if name == "" || arguments == "" { + slog.Debug("missing required fields in tool call template", "name", name, "arguments", arguments) + return "", "", errors.New("missing required fields in tool call template") + } + + return name, arguments, nil +} + +// collect recursively traverses an object to collect all nested maps +// +// Returns: +// - []map[string]any: A slice of all nested maps found in the object +func collect(obj any) []map[string]any { + var all []map[string]any + switch o := obj.(type) { + case map[string]any: + all = append(all, o) + for _, v := range o { + all = append(all, collect(v)...) + } + case []any: + for _, v := range o { + all = append(all, collect(v)...) + } + default: + return nil + } + + return all +} diff --git a/tools/tools_utils_test.go b/tools/tools_utils_test.go new file mode 100644 index 00000000..769183b7 --- /dev/null +++ b/tools/tools_utils_test.go @@ -0,0 +1,464 @@ +package tools + +import ( + "testing" + gotmpl "text/template" + + "github.com/ollama/ollama/template" +) + +func TestExtractToolCallsFormat(t *testing.T) { + cases := []struct { + name string + template string + want string + found bool + }{ + { + name: "nil template", + template: "", + want: "", + found: false, + }, + { + name: "basic tool call with text", + template: "{{if .ToolCalls}}Hello world{{end}}", + want: "Hello world", + found: true, + }, + { + name: "tool call with json format", + template: "{{if .ToolCalls}}```json\n{{end}}", + want: "```json\n", + found: true, + }, + { + name: "tool call in range", + template: "{{range .ToolCalls}}tool: {{.}}{{end}}", + want: "", + found: false, + }, + { + name: "tool call with multiple text nodes", + template: "{{if .ToolCalls}}First text{{if .Something}}inner{{end}}Second text{{end}}", + want: "First text", + found: true, + }, + { + name: "nested if without tool calls", + template: "{{if .Something}}{{if .OtherThing}}text{{end}}{{end}}", + want: "", + found: false, + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + tmpl, err := gotmpl.New("test").Parse(tc.template) + if err != nil && tc.template != "" { + t.Fatalf("failed to parse template: %v", err) + } + + got, found := extractToolCallsFormat(tmpl) + if got != tc.want { + t.Errorf("got text %q, want %q", got, tc.want) + } + if found != tc.found { + t.Errorf("got found %v, want %v", found, tc.found) + } + }) + } +} + +func TestToolPrefix(t *testing.T) { + cases := []struct { + name string + template string + want string + }{ + { + name: "basic tool call with action prefix", + template: "{{if .ToolCalls}}Action: ```json{{end}}", + want: "Action: ```json", + }, + { + name: "incomplete functools bracket", + template: "{{if .ToolCalls}}functools[{{end}}", + want: "functools[", + }, + { + name: "tool call with angle brackets", + template: "{{if .ToolCalls}}Hello, world! {{end}}", + want: "Hello, world! 
", + }, + { + name: "multiple tool call formats", + template: "{{if .ToolCalls}}[tool_call] {{end}}", + want: "[tool_call] ", + }, + { + name: "single angle bracket tool call", + template: "{{if .ToolCalls}}{{end}}", + want: "", + }, + { + name: "incomplete angle bracket after tool call", + template: "{{if .ToolCalls}}[tool_call] <{{end}}", + want: "[tool_call] <", + }, + { + name: "angle bracket prefix with tool call", + template: "{{if .ToolCalls}}> {{end}}", + want: "> ", + }, + { + name: "uppercase tool call with incomplete bracket", + template: "{{if .ToolCalls}}[TOOL_CALL] [{{end}}", + want: "[TOOL_CALL] [", + }, + { + name: "uppercase tool call with adjacent bracket", + template: "{{if .ToolCalls}}[TOOL_CALL][{{end}}", + want: "[TOOL_CALL][", + }, + { + name: "tool call with pipe delimiters", + template: "{{if .ToolCalls}}<|tool_call|>{{end}}", + want: "<|tool_call|>", + }, + { + name: "tool with no prefix", + template: "{{if .ToolCalls}}{{end}}", + want: "", + }, + } + + for _, tt := range cases { + t.Run(tt.name, func(t *testing.T) { + tmpl, err := gotmpl.New("test").Parse(tt.template) + if err != nil { + t.Fatalf("failed to parse template: %v", err) + } + got := toolPrefix(tmpl) + if got != tt.want { + t.Errorf("ToolToken(%q) = %q; want %q", tt.template, got, tt.want) + } + }) + } +} + +func TestToolTemplate(t *testing.T) { + cases := []struct { + name string + template string + want bool + }{ + { + name: "basic tool call range", + template: "{{range .ToolCalls}}test{{end}}", + want: true, + }, + { + name: "no tool calls", + template: "{{range .Other}}test{{end}}", + want: false, + }, + { + name: "nested tool calls", + template: "{{range .Outer}}{{range .ToolCalls}}test{{end}}{{end}}", + want: true, + }, + { + name: "empty template", + template: "", + want: false, + }, + { + name: "tool calls in if statement", + template: "{{if .ToolCalls}}test{{end}}", + want: false, + }, + } + + for _, tt := range cases { + t.Run(tt.name, func(t *testing.T) { + tmpl, err := gotmpl.New("test").Parse(tt.template) + if err != nil { + t.Fatalf("failed to parse template: %v", err) + } + + parsed, err := template.Parse(tmpl.Root.String()) + if err != nil { + t.Fatalf("failed to parse template: %v", err) + } + + _, err = toolTemplate(parsed) + if err != nil && tt.want { + t.Errorf("toolTemplate() = %v; want %v", err, tt.want) + } + }) + } +} + +func TestSuffixOverlap(t *testing.T) { + cases := []struct { + name string + s string + d string + want int + }{ + { + name: "no overlap", + s: "hello world", + d: "", + want: -1, + }, + { + name: "full overlap", + s: "", + d: "", + want: 0, + }, + { + name: "partial overlap", + s: "text ", + d: "", + want: 5, + }, + { + name: "delimiter longer than string", + s: "", + d: "", + want: -1, + }, + { + name: "empty string", + s: "", + d: "", + want: -1, + }, + { + name: "empty delimiter", + s: "", + d: "", + want: -1, + }, + { + name: "single char overlap", + s: "test<", + d: "", + want: 4, + }, + { + name: "partial tool call", + s: "hello ", + want: 6, + }, + } + + for _, tt := range cases { + t.Run(tt.name, func(t *testing.T) { + got := suffixOverlap(tt.s, tt.d) + if got != tt.want { + t.Errorf("suffixOverlap(%q, %q) = %d; want %d", tt.s, tt.d, got, tt.want) + } + }) + } +} + +func TestExtractToolArgs(t *testing.T) { + cases := []struct { + name string + template string + want string + ok bool + }{ + { + name: "basic tool call with text after", + template: `{{if .ToolCalls}}tool response{{end}}`, + want: "tool response", + ok: true, + }, + { + name: "tool call 
with mixed content after", + template: `{{if .ToolCalls}}{{.Something}}{{end}}`, + want: "", + ok: true, + }, + { + name: "tool call with no text after", + template: `{{if .ToolCalls}}{{.Something}}{{end}}`, + want: "", + ok: true, + }, + { + name: "nested tool call", + template: `{{if .Something}}{{if .ToolCalls}}[TOOL_CALL]{{end}}{{end}}`, + want: "[TOOL_CALL]", + ok: true, + }, + { + name: "no tool calls", + template: `{{if .Something}}no tools here{{end}}`, + want: "", + ok: false, + }, + { + name: "empty template", + template: ``, + want: "", + ok: false, + }, + { + name: "multiple tool calls sections", + template: `{{if .ToolCalls}}first{{end}}{{if .ToolCalls}}second{{end}}`, + want: "first", + ok: true, + }, + { + name: "range over tool calls", + template: `{{if .ToolCalls}}{{range .ToolCalls}}tool{{end}}{{end}}`, + want: "", + ok: true, + }, + { + name: "tool calls with pipe delimiters", + template: `{{if .ToolCalls}}<|tool|>{{end}}`, + want: "<|tool|>", + ok: true, + }, + { + name: "tool calls with nested template", + template: `{{if .ToolCalls}}{{template "tool" .}}{{end}}`, + want: "", + ok: true, + }, + { + name: "tool calls with whitespace variations", + template: `{{if .ToolCalls}} tool {{end}}`, + want: " tool ", + ok: true, + }, + } + + for _, tt := range cases { + t.Run(tt.name, func(t *testing.T) { + tmpl, err := gotmpl.New("test").Parse(tt.template) + if err != nil { + t.Fatalf("failed to parse template: %v", err) + } + + got, ok := extractToolCallsFormat(tmpl) + if got != tt.want { + t.Errorf("TextAfterToolCalls() got = %q, want %q", got, tt.want) + } + if ok != tt.ok { + t.Errorf("TextAfterToolCalls() ok = %v, want %v", ok, tt.ok) + } + }) + } +} + +func TestCollect(t *testing.T) { + cases := []struct { + name string + obj any + want []map[string]any + }{ + { + name: "simple map", + obj: map[string]any{ + "key": "value", + }, + want: []map[string]any{ + {"key": "value"}, + }, + }, + { + name: "nested map", + obj: map[string]any{ + "outer": map[string]any{ + "inner": "value", + }, + }, + want: []map[string]any{ + {"outer": map[string]any{"inner": "value"}}, + {"inner": "value"}, + }, + }, + { + name: "array of maps", + obj: []any{ + map[string]any{"key1": "val1"}, + map[string]any{"key2": "val2"}, + }, + want: []map[string]any{ + {"key1": "val1"}, + {"key2": "val2"}, + }, + }, + { + name: "deeply nested", + obj: map[string]any{ + "l1": map[string]any{ + "l2": map[string]any{ + "l3": "value", + }, + }, + }, + want: []map[string]any{ + {"l1": map[string]any{"l2": map[string]any{"l3": "value"}}}, + {"l2": map[string]any{"l3": "value"}}, + {"l3": "value"}, + }, + }, + { + name: "non-map value", + obj: "string", + want: nil, + }, + } + + for _, tt := range cases { + t.Run(tt.name, func(t *testing.T) { + got := collect(tt.obj) + if len(got) != len(tt.want) { + t.Errorf("collect() got %d maps, want %d", len(got), len(tt.want)) + return + } + + // Compare each map in the result + for i := range tt.want { + if !mapsEqual(got[i], tt.want[i]) { + t.Errorf("collect() map[%d] = %v, want %v", i, got[i], tt.want[i]) + } + } + }) + } +} + +// mapsEqual compares two maps for deep equality +func mapsEqual(m1, m2 map[string]any) bool { + if len(m1) != len(m2) { + return false + } + for k, v1 := range m1 { + v2, ok := m2[k] + if !ok { + return false + } + switch val1 := v1.(type) { + case map[string]any: + val2, ok := v2.(map[string]any) + if !ok || !mapsEqual(val1, val2) { + return false + } + default: + if v1 != v2 { + return false + } + } + } + return true +} From 
f18e0cb5508450bd14db5ec8015709d2c4ab820f Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Fri, 23 May 2025 15:37:32 -0700 Subject: [PATCH 058/108] ml: Improve slog formatting for BackendMemory --- ml/backend.go | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/ml/backend.go b/ml/backend.go index 6beb7d2b..65f16948 100644 --- a/ml/backend.go +++ b/ml/backend.go @@ -5,6 +5,7 @@ import ( "context" "encoding/binary" "fmt" + "log/slog" "math" "slices" "strconv" @@ -133,6 +134,27 @@ type DeviceMemory struct { Graph Memory } +func memoryPresent(mem []Memory) bool { + return slices.ContainsFunc(mem, func(m Memory) bool { return m.Size != 0 }) +} + +func (m DeviceMemory) LogValue() slog.Value { + var attrs []slog.Attr + if memoryPresent(m.Weights) { + attrs = append(attrs, slog.Any("Weights", m.Weights)) + } + + if memoryPresent(m.Cache) { + attrs = append(attrs, slog.Any("Cache", m.Cache)) + } + + if m.Graph.Size != 0 { + attrs = append(attrs, slog.Any("Graph", m.Graph)) + } + + return slog.GroupValue(attrs...) +} + // BackendMemory provides the amount of memory required to load the model // per device based on the BackendParams. In some cases, not all required // allocations will be known at this point. However, the size of the most recent @@ -150,6 +172,20 @@ type BackendMemory struct { GPUs []DeviceMemory } +func (m BackendMemory) LogValue() slog.Value { + var attrs []slog.Attr + if m.InputWeights.Size != 0 { + attrs = append(attrs, slog.Any("InputWeights", m.InputWeights)) + } + + attrs = append(attrs, slog.Any(m.CPU.Name, m.CPU)) + for _, g := range m.GPUs { + attrs = append(attrs, slog.Any(g.Name, g)) + } + + return slog.GroupValue(attrs...) +} + var backends = make(map[string]func(string, BackendParams) (Backend, error)) func RegisterBackend(name string, f func(string, BackendParams) (Backend, error)) { From eda472df1bd420517ca05c59ba0096e8b518fb69 Mon Sep 17 00:00:00 2001 From: frob Date: Sat, 24 May 2025 22:17:04 +0200 Subject: [PATCH 059/108] server: add hint to the error message when model path access fails (#10843) --- server/modelpath.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/modelpath.go b/server/modelpath.go index d1bcae93..af82b8b3 100644 --- a/server/modelpath.go +++ b/server/modelpath.go @@ -116,7 +116,7 @@ func (mp ModelPath) BaseURL() *url.URL { func GetManifestPath() (string, error) { path := filepath.Join(envconfig.Models(), "manifests") if err := os.MkdirAll(path, 0o755); err != nil { - return "", err + return "", fmt.Errorf("%w: ensure path elements are traversable", err) } return path, nil @@ -139,7 +139,7 @@ func GetBlobsPath(digest string) (string, error) { } if err := os.MkdirAll(dirPath, 0o755); err != nil { - return "", err + return "", fmt.Errorf("%w: ensure path elements are traversable", err) } return path, nil From 6623898198fece95e6c5cef7ad55d9d1ecd1da1f Mon Sep 17 00:00:00 2001 From: frob Date: Sat, 24 May 2025 22:17:26 +0200 Subject: [PATCH 060/108] docs: remove unsupported quantizations (#10842) --- docs/import.md | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/docs/import.md b/docs/import.md index 01fea542..df06ce4b 100644 --- a/docs/import.md +++ b/docs/import.md @@ -132,22 +132,12 @@ success ### Supported Quantizations -- `q4_0` -- `q4_1` -- `q5_0` -- `q5_1` - `q8_0` #### K-means Quantizations -- `q3_K_S` -- `q3_K_M` -- `q3_K_L` - `q4_K_S` - `q4_K_M` -- `q5_K_S` -- `q5_K_M` -- `q6_K` ## Sharing your model on ollama.com From 2307fc2bcd8e9067ed5575d7570b69fff9cc4f38 Mon 
Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Sat, 24 May 2025 13:17:53 -0700 Subject: [PATCH 061/108] tests: drop llama3.2-vision embedding tests (#10837) --- integration/testdata/embed.json | 1 - 1 file changed, 1 deletion(-) diff --git a/integration/testdata/embed.json b/integration/testdata/embed.json index a6ef65aa..eea33bdf 100644 --- a/integration/testdata/embed.json +++ b/integration/testdata/embed.json @@ -15,7 +15,6 @@ "falcon:latest": [-0.145434, 0.551458, 0.579686, 0.056737, 0.122334, -0.098554, 0.453500, 0.152636, 0.006995, 0.353793, -0.057853, 0.141001, -0.251169, 0.123486, -0.386756, 0.380073, 0.113592, 0.246952, 0.221665, 0.222362, 0.110978, -0.156950, -0.095453, -0.446011, 0.380302, -0.181973, -0.355711, 0.083788, 0.529327, 0.396823, -0.235883, 0.463392, -0.453481, 0.111083, -0.109795, 0.096437, 0.184561, -0.006152, -0.404893, -0.505164, -0.512443, -0.248063, 0.264552, 0.211518, -0.408603, -0.177284, -0.358847, 0.055804, 0.302788, -0.161492, -0.030894, -0.017345, -32.815334, -0.083117, -0.250543, 0.163018, 0.198646, -0.159876, -0.472978, -0.341143, -0.634069, 0.191124, -0.064307, 0.090823, -0.458379, 0.471753, -0.101117, 0.290473, 0.348141, 0.077560, 0.459839, 0.713714, -0.385911, 0.357724, -0.028543, 0.215573, 0.420039, -0.160133, 0.720215, 0.377099, 0.286783, 0.130918, -78.763008, 0.235381, 0.492549, 0.421542, 0.292254, -0.320543, 0.125360, 0.286009, -0.266907, -0.161205, 0.468458, 0.075248, -0.607871, -0.498977, -0.248155, 0.137575, 0.217211, 0.167880, 0.721968, 0.437299, -0.006375, -0.114837, -0.217905, 0.325793, 0.548343, -0.304491, 0.050522, -0.296999, -0.015102, -0.066116, 0.326557, -0.683608, 0.848642, 0.139344, -0.386741, 0.095375, 0.137059, -0.524803, -0.308266, 0.207935, 0.910517, -0.047032, 0.016144, -0.282897, 0.030230, -0.118328, -0.216214, -0.303300, 0.196013, -0.501000, 0.369998, -0.028593, 0.309059, 0.496592, -0.330565, -0.581806, 0.117767, 0.187372, -0.144573, -0.227210, 0.181474, -0.181403, 0.144057, 0.269330, 0.046298, -0.699166, 0.538736, 0.174926, -0.561055, -0.088680, -0.007410, 0.730400, -0.471273, -0.197240, -0.027064, -0.109930, 0.149596, -0.431747, -0.033780, 0.269468, 0.410048, -0.164335, 0.079866, -0.167682, -0.260156, 0.352913, 0.227972, -0.491290, -0.305480, -0.233817, 0.048377, -0.192452, 76.728951, -0.184723, -0.035661, 0.021605, -0.363392, -0.380687, -0.095017, -0.363713, -0.331915, -0.450960, -0.384010, 98.522270, -0.376337, 0.332839, -0.049045, 0.044364, -0.076163, -0.176768, 0.273568, 0.060393, 0.105446, 0.286757, -0.061164, -0.454434, 0.415012, -0.140746, -0.040113, -0.565394, -0.060276, -0.234516, 0.312736, -0.069791, 0.352157, 0.496571, -0.248295, 0.169454, 0.254744, -0.125044, 0.095099, 0.158710, -0.174195, 0.307172, 0.600827, 0.102560, 0.584701, 0.191773, -0.210491, 0.224133, -0.133474, 0.060811, -0.040503, -0.486995, -0.257410, 0.131609, 0.188029, -0.003900, -0.226279, 0.014537, -0.214499, -0.285907, -0.500149, 0.220990, -0.061304, -0.449743, 0.320928, 0.395599, -0.405548, 0.214390, 0.296444, 0.175481, 0.196306, -0.181649, -0.175221, 0.562277, -0.083596, -0.135095, -0.226197, 0.060174, 0.425703, -0.263677, -0.442736, -0.848780, 0.140745, 0.158711, -0.246688, 0.513777, -0.133425, 0.150256, -0.165640, -0.204918, -0.390736, -0.400310, 0.078730, 0.042572, 0.127664, 0.146947, -0.031153, -0.076668, 0.239822, -0.114565, 0.338568, 0.057678, 0.276141, -0.158014, -0.047334, 0.813145, 0.096568, -0.204652, 0.069818, -0.162866, 0.474288, 0.010297, -0.042806, 0.708678, 0.266313, 0.264811, 0.178802, 0.090678, -0.153934, -0.245041, -0.249606, 
-0.606384, 1.116905, -0.250234, -0.134183, 0.109237, 0.162539, 0.071664, -0.028506, 0.256520, -0.270116, 0.370725, 0.072267, 0.595097, -0.444489, -0.306348, 0.375055, 0.052390, 0.689186, 0.141866, -0.022982, -0.028208, 0.692657, 0.168249, -0.289483, -0.068967, -0.145280, -0.667039, 0.606857, 0.572388, -0.461176, -0.083282, 0.357757, 0.263421, -0.338740, 0.014035, 0.136918, -0.027329, 0.405532, 0.123614, 0.228298, -0.604249, -0.259012, 0.191833, -0.188005, 0.052887, -0.036500, 0.051033, -0.017828, -0.013952, -0.076532, -0.322442, 0.034422, -0.158676, -0.380441, 0.068665, 37.625778, -1.467730, -0.198724, -0.366973, 0.094293, -0.008216, -0.019456, 0.045300, 0.241962, 0.206565, 0.200205, -0.170838, -0.239032, -0.092126, 0.364009, 0.078817, 0.044788, 0.307262, -0.396150, 0.345320, -0.246881, -0.092816, -0.252418, -0.180483, -0.254689, 0.004093, -0.200098, 0.210746, 0.029551, 0.245714, -0.063405, -0.003852, -0.331452, 0.648124, -0.156812, 0.173493, -0.454535, -0.054677, 0.309534, -0.462142, 0.451003, -0.153110, 0.369131, 0.301209, 0.173910, -0.131474, -0.194955, -0.717491, -0.016214, 0.543446, 0.239048, -0.154631, 0.153298, -0.721207, 0.015260, -0.547927, -0.272240, -1.158423, -0.387883, 0.179015, 0.290848, -0.178274, -0.537356, 0.123705, 0.337805, -0.196415, -0.124260, 0.274265, -0.027145, -0.112975, 0.271123, 0.200173, 0.309503, 0.368623, -0.130785, 0.145833, -0.102706, -0.194807, -0.355944, 0.233473, -0.422869, -0.249461, 0.275593, 0.261122, -0.041622, 0.321849, 0.351184, -0.731175, -0.455722, 0.376331, -0.073822, -0.400349, 0.297684, -0.254106, -0.179405, -0.740918, 0.396933, -0.624790, -0.245724, -0.758950, -0.761523, 0.171997, 0.582886, 0.122688, 0.454277, 0.393659, -0.125691, 0.526288, 0.245751, 0.236661, -0.268370, 0.259706, 0.471990, 0.070545, 0.262786, -0.195632, -0.373156, 0.233073, -0.319479, -0.180365, -0.356844, 0.501312, -0.404738, -0.253011, 0.381910, -0.230624, 0.322234, 0.322426, -0.406919, 0.278409, 0.189779, -0.477614, 0.398916, 0.282304, -0.097129, -0.206756, 0.078960, -0.140931, -0.327043, -0.043203, -0.238777, 0.054524, 0.345428, 0.379250, -0.028527, -0.346552, 0.135533, -0.324663, 0.097073, 0.405679, 0.144856, 0.203372, -0.135059, -0.262663, -0.038000, 0.389909, -0.832866, -0.061717, 0.439580, -0.620867, 0.043219, -0.032121, 0.448644, -0.445477, 0.479233, -0.349839, 0.376309, 0.034940, -0.093427, -0.018762, 0.237700, -0.175609, 0.521672, 0.212652, -0.147365, -0.611912, 0.433886, -0.218107, 0.044754, 0.199969, -0.084820, -0.180790, 0.020706, -0.192604, 0.911501, -0.205838, -0.328527, -0.468358, -0.292061, -0.165838, -0.239964, 0.134404, -0.085968, 0.593265, -0.035909, 0.166329, -0.038361, -0.180849, 0.134523, -0.347261, -0.383150, 0.203595, -0.527209, 0.655002, 0.036892, -0.271854, -0.722635, -0.183820, 0.041028, 0.534431, -0.321399, -0.418089, 0.205068, 0.079445, -0.313515, 0.291001, 0.376464, -0.024004, 0.494628, 0.302450, 0.075265, -0.267347, 0.131573, -0.295871, 0.160672, -0.115069, 0.890312, -0.041931, 0.229802, 0.140957, -0.566682, 0.494251, 0.228695, 0.119960, -0.250476, 0.542480, 0.438343, -0.089255, 0.539449, -0.477021, 0.099634, -0.404685, 0.306461, -0.043212, 0.233839, 0.048056, -0.186367, 0.199987, 0.047861, -0.289694, -0.537992, -0.030393, -0.089514, 0.143944, 0.128985, -0.247290, -0.148287, -0.007974, 0.302635, -0.353074, 0.140356, 0.267783, -0.438974, 0.037830, 0.136492, 0.285023, -0.161996, -0.577940, 0.466486, 0.099931, -0.207497, 0.478511, -0.477054, 0.378640, 0.114531, -0.454839, 0.205362, -0.264606, -0.345706, -0.495824, 0.015153, 0.063968, 0.570447, 
0.097192, -0.171304, 0.976958, -0.167555, 0.102738, -0.436587, -0.416574, 0.246487, -0.708184, 0.023123, 0.282953, -0.282671, -0.100751, 0.016980, 0.458428, 0.210551, 0.728111, -0.064927, -0.518779, 0.023455, -0.523534, -0.352629, -0.204977, -0.031492, 0.492380, 0.037194, 0.275549, 0.049998, -0.042273, 0.024827, -0.391516, -0.194958, -0.433102, -0.268930, -0.385709, 0.035979, -0.236354, 0.036609, -0.106188, -0.051624, 0.874279, -0.019348, 0.500591, -0.124548, 0.324765, -0.367921, -0.222806, 0.540817, -0.078370, 0.043867, 0.420149, -0.059615, -0.440197, -0.087696, -0.292653, 0.111633, 0.670683, 0.455422, 0.189336, -0.042270, -0.260230, 0.051045, -0.335735, -0.506230, -0.358802, -0.266131, -0.203469, 0.257531, -0.079949, -0.371272, -0.413529, 0.099179, 0.181491, 0.150417, 0.225561, -0.346218, 0.386289, -0.052629, 0.298301, 0.905319, 0.350629, 0.162920, -0.197367, 0.032053, 0.451291, 0.156002, 0.093688, -0.061183, -0.513632, 0.197035, -0.259313, 0.299369, -0.044861, -0.141547, -0.285648, 0.202531, 0.323211, -0.128848, 0.146046, -0.771048, 0.315781, 0.132139, -0.224002, -0.046500, 0.392186, 0.015927, -0.539539, -0.217051, -0.165655, -0.112836, -0.694149, 0.339086, -0.414199, -0.237005, -0.950271, -0.279200, -0.627571, 0.069664, 0.370548, -0.062389, 0.586872, 0.213491, 0.115808, -0.041785, 0.070284, 0.475451, 0.620675, 0.153083, 0.570752, 0.077146, -0.593140, -0.395619, -0.169177, -0.077108, 0.033796, 0.110675, -0.144388, -0.484813, 0.101560, 0.118790, 0.585920, -0.051071, 0.522129, -0.096738, 0.039210, 0.130928, -0.165276, 0.021268, -0.138029, 0.452325, 0.340640, -0.240428, 0.037821, -0.356228, 0.230344, -0.362754, -0.077494, 0.106675, -0.458798, -0.373419, 0.031627, -0.154582, 0.372776, -0.041724, 0.265408, 0.351702, -0.294240, 0.822546, 0.024259, 0.029190, 0.116972, -0.298376, -0.022028, 0.036926, -0.000887, 0.198864, -0.196297, -0.319581, 0.104491, -0.003430, 0.157759, 0.118287, 0.417707, 0.329480, 0.477700, 0.300714, -0.037461, -0.082834, -0.691333, 0.119406, 0.333651, -0.015797, -0.241354, 0.322229, -0.316640, 0.336315, -0.333795, 0.152463, -0.063831, -0.093148, -0.190586, 0.866467, 0.206135, -0.333121, -0.334417, 0.427795, 0.433384, 0.030511, -0.081860, -0.327655, 0.321278, 0.308877, -0.146296, 0.250392, 0.352685, -0.261779, -0.391084, 0.343821, -0.623625, -0.232097, 0.730628, -0.009076, -0.278967, -0.024484, 0.126125, -0.272171, 0.395506, 0.385768, -0.485491, 0.060047, -0.462739, -0.056647, -0.384519, -0.158643, -0.362551, 0.475326, -0.029004, -0.547719, 0.048068, 0.214930, -0.050708, -0.737851, 0.249612, -0.273211, 0.547692, -0.119554, -0.489587, -0.183487, -0.422387, -0.021538, -0.357277, 0.442413, 0.742838, -0.306633, -0.417351, 0.665328, 0.195444, -0.137114, -0.503417, 0.106885, 0.313572, -0.132145, 0.361419, 0.209603, -0.503253, -0.481884, 0.218992, -0.205752, -0.239932, -0.071110, 0.198684, 0.044104, 0.170742, 0.087633, 0.306695, 0.128361, -0.694840, -0.178137, -0.062519, 0.269586, -0.420635, 0.441342, -0.307177, -0.113335, 0.545861, 0.245667, -0.061284, 0.463628, 0.188273, -0.425786, -0.599940, -0.498917, -0.097796, 0.110285, 0.832950, -0.219720, 0.157499, 0.090185, 0.919885, -0.057676, 0.432318, -0.276865, 0.225659, -1.100891, 0.191885, -0.538571, -0.070240, 0.178796, -0.850664, -0.281632, 0.101154, 0.292607, -0.697670, 0.065382, 0.243841, -0.446409, -0.747985, 0.421588, 0.134048, 0.014038, 0.012480, 0.093830, 0.065262, -0.028793, -0.460956, 0.001223, -0.435688, -0.216765, 0.092236, -0.205991, -0.346419, -0.150538, -0.115917, -0.034806, 0.121649, -0.155723, 0.509418, 0.161882, 
-0.247912, -1.040411, 0.113688, -0.232234, 0.083722, -0.356446, 0.171696, -0.551805, 0.644128, -0.361454, -0.069536, 0.344402, -0.144846, 0.593874, 0.046123, 0.071060, 0.044171, 0.109494, 0.162099, 0.016161, 0.160383, -0.657956, 0.426369, 0.430661, 0.167817, 0.371546, 0.038148, 0.175910, 0.308253, -0.379426, 0.308540, 0.028549, -0.238940, -0.216455, 0.583075, -0.325968, 0.251503, 0.521912, 0.405485, -0.058058, -0.085051, 0.486884, 0.177756, 0.437350, -0.531478, -0.309321, -0.053716, -0.577053, -0.135426, 0.037255, -0.151116, -0.249868, 0.044155, -0.143193, -0.256241, -0.094862, -0.071529, -0.179807, 0.177920, 0.322324, -0.038461, 0.146140, -0.170251, -0.189690, 0.621307, 0.123934, 0.030048, -0.024102, 0.172103, 0.412975, -0.478826, -0.629470, 0.323721, -0.306728, -0.006467, 0.321414, 0.400523, -0.001823, 0.011803, -0.120583, 0.601702, 0.185840, 0.700491, 0.149545, 0.067835, 0.470991, 0.664574, -0.552964, -0.316984, -0.231059, -0.300164, -0.558253, -0.371432, -0.427449, 0.150378, 0.205915, 0.007517, 0.848489, -0.270204, 0.645367, -0.190823, 0.118916, -0.048641, -0.555063, 0.180961, 0.193901, 0.520572, -0.087776, 0.935664, -0.117911, 0.409115, -0.214746, -0.062976, -0.582564, -0.081324, -0.010914, 0.697559, 0.827275, -0.194161, -0.525253, 0.123935, 0.110752, -0.243882, -0.464774, 0.015078, -0.350227, -0.113486, 0.011749, 0.153878, 0.241713, 0.309567, 0.159847, 0.382380, -0.121834, 0.159451, -0.704771, 0.006205, 0.008906, 0.346081, -0.160726, 0.367822, -0.023612, -0.010030, 0.374448, -0.019864, -0.096305, -0.143692, 0.101554, 0.203875, -0.122353, -0.456235, 0.368918, -0.026021, 0.096329, -0.418561, -0.096026, -0.323244, -0.289808, -0.177779, -0.380370, -0.277077, 0.225156, 0.373779, 0.756219, -0.338806, -1.064188, 0.086767, 0.112865, 0.410580, -0.160529, 0.685520, 0.343053, 0.015070, 0.034452, 0.050372, 0.006650, 0.317398, 0.658390, -0.431091, 0.506827, -0.100293, -0.258241, -0.098443, 0.440216, -0.351024, -0.425316, 0.190989, -0.198299, 0.598305, 0.592949, -0.761293, 0.291733, 0.298414, -0.596820, -0.056470, 0.506640, 0.614545, -0.038101, 0.304769, 0.047714, -0.178597, 0.606069, -0.020600, -0.324800, -0.517904, -0.244066, -0.781475, -0.347456, -0.177254, 0.442952, -0.502207, -0.313296, -0.221216, -0.158749, -0.000308, 0.210433, -0.293535, 0.143593, -0.187607, 0.410221, -0.101963, 0.038983, 0.216233, 0.109033, 0.071577, -0.274980, -0.519410, 0.748628, 0.132990, 0.017301, -0.318737, 0.462538, -0.219474, -0.257446, 0.106117, -1.033327, 0.555366, 0.314015, -0.099034, -0.101012, 0.316906, 0.525460, -0.067212, -0.234236, -0.594703, -0.301416, 0.049783, 0.360627, -0.340874, -0.108111, -0.030016, -0.068959, 0.419880, 0.079512, 0.136864, -0.870394, 0.190759, 0.264447, -0.055466, -0.274267, 0.294311, 0.145651, 0.181500, 0.068148, -0.079297, 0.116602, 0.398743, -0.596057, -0.283995, -0.631194, 0.272831, 0.028005, -0.269716, 0.821832, -0.264261, 0.296154, 0.000392, 0.268086, 0.256604, 0.438840, -0.025024, 0.229269, 0.372718, -0.111645, 0.275149, 0.074714, -0.149943, -0.677116, -0.423984, -0.055012, 0.807868, -0.793449, -0.288004, -0.217445, -0.417038, 0.048146, 0.271043, -0.240449, 0.003111, 0.200560, 0.515866, -0.112648, 0.480673, -0.153305, 0.162125, -0.360713, -0.281660, -0.433440, 0.062965, -0.287887, 0.376615, -0.010636, -0.070215, -0.413233, -0.407751, -0.569790, 0.620695, -0.445826, -0.266545, -0.143776, 0.556380, -0.358060, 0.091419, 0.154067, -0.424598, 0.678844, 0.586404, -0.293496, -0.565051, 0.211552, -0.060263, -0.154836, 0.118186, 0.659808, 0.048121, 0.028627, 0.310143, -0.108596, 
0.233567, -0.028822, 0.211269, -0.069226, 0.083919, 0.067399, 0.551407, 0.423780, -0.168638, -0.212640, -0.304896, -0.148888, 0.044441, -0.146128, -0.221889, 0.151668, -0.279946, -0.454766, -0.137937, 0.329202, 0.392256, -0.123300, -0.013104, 0.020592, -0.154003, -0.141471, 0.245494, 0.342663, -0.524716, -0.256356, 0.096822, 0.377818, 0.058905, 0.199189, 0.192921, 0.046306, -0.310740, 0.375515, 0.113114, 0.067435, -0.171354, -0.187886, 0.098279, -0.443635, -0.418073, -0.076781, -0.216082, -0.459061, 0.174019, 0.092634, 0.095597, 0.229115, -0.063759, 0.166816, 0.224667, 0.389135, 0.201666, 0.280591, -0.505003, -0.199995, 0.164115, -0.018507, 0.216536, -0.228796, -0.591178, -0.108026, -0.061647, -0.155654, 0.063727, -0.290692, 0.071313, 0.164409, 0.025392, 0.333114, 0.058157, 0.341020, -0.008189, -0.657713, -0.469880, 0.380310, -0.422738, 0.146785, 0.138773, -0.181386, -0.080411, 0.223552, 0.362902, -0.642274, -0.216622, 0.244858, -0.053043, -0.255551, 0.420415, -0.162121, 0.178924, 0.692984, -0.513166, -0.353268, -0.566853, 0.654280, -0.097715, 0.617112, 0.737402, -0.252336, -0.919382, -0.002810, -0.243296, -0.697557, -0.152351, -0.020530, -0.044934, -0.028899, -0.191319, -0.194357, -0.176970, 0.400549, 0.291772, -0.242575, 0.424298, 0.457313, -0.631985, -0.030793, 0.099553, 0.011074, 0.058982, 0.193177, 0.192687, -0.396982, -0.226565, -0.427594, 0.366640, 0.581517, -0.384248, -0.604846, 0.314109, 0.230639, -0.521342, 0.201063, 0.692748, 0.048733, 0.597786, 0.063185, 0.225998, -0.180704, 0.329972, 0.150493, -0.042691, -0.159609, -0.309584, -0.275296, 0.984232, -0.030626, 0.040339, -0.253096, 0.007542, -0.286120, -0.222316, 0.158408, 0.383668, 0.429613, -0.571129, -0.086979, -0.774914, 0.035450, 0.399168, 0.141715, 0.261576, 0.757710, 0.092665, -0.444902, 0.364449, -0.026400, 0.486443, -0.191074, 0.267344, 0.391305, 0.019776, -0.162476, 0.011955, -0.169685, 0.360746, -0.057660, 4.487291, -0.752153, 0.426690, -0.514092, -0.135662, 0.128860, -0.352554, 0.339426, 0.187895, -0.105337, -0.067870, -0.561997, 0.384668, 0.139748, -0.076339, -0.383534, 0.193489, 0.217438, 0.310760, -0.471129, 0.408550, -0.783918, 0.924570, 0.544810, 0.173359, 0.325069, 0.118672, 0.157661, 0.495307, 0.043571, 0.420747, 0.461203, 0.030940, 0.079866, 0.202492, -0.143818, 0.388617, 0.284566, 0.085482, -0.424856, 0.169660, 5.002161, 0.496126, 0.368753, -0.291746, 0.049863, 0.052142, 0.562981, 0.643301, -0.097461, -0.153821, -0.237668, -0.369237, 0.234187, 0.218032, -0.617891, -0.360784, 0.184486, 0.035056, -0.343886, 0.100110, 0.015195, -0.149124, 0.577815, 0.609786, -0.190856, 0.157805, -0.111564, 0.056562, -0.264586, -0.228257, 0.143420, -0.409625, -0.407802, -0.193288, -0.178765, 0.457437, -0.001436, 0.413320, 0.456115, 0.493243, -0.467128, -0.516562, -0.063919, 0.772535, -0.142793, -0.273115, -0.326627, -0.252157, 0.386950, 0.000773, 0.080755, 0.169150, 0.207458, -0.113956, -0.269044, -0.411961, 0.196148, 0.279117, 0.476607, 0.002481, -0.446647, -0.131562, -0.290507, -0.031059, 0.522627, 0.374723, -0.189284, -0.022486, 0.121929, -0.004285, 0.320933, 0.433629, -0.872581, 0.510533, -0.047597, -0.043823, -0.172759, 0.340392, 0.585445, -0.344459, 0.179457, 0.052765, -0.247181, -0.092283, 0.166314, -0.082850, 0.624757, 0.408819, 0.517236, 0.592212, 0.266402, 0.063619, -0.332631, 0.093216, -0.002696, 0.496564, 0.017645, -0.081635, -0.045906, 0.131908, -0.360834, -0.074638, 0.884801, -0.112605, 0.152780, 0.241896, 0.178782, 0.302635, 0.096668, -0.569904, -0.235891, 0.697626, -0.153779, 0.083589, 0.118558, 0.148341, 0.138840, 
-0.483483, 0.039876, -0.215070, 0.378216, 0.477800, -0.376276, -0.107364, -0.056474, 0.634841, -0.829850, -0.132656, 0.166909, 0.284389, -0.751289, -0.661793, 0.215448, 0.434656, 0.023345, -0.133867, 0.439496, -0.496212, -0.117264, -0.017485, -0.372388, 1.472339, -0.363553, -0.193867, 0.281123, 0.351186, 0.103060, -0.298733, 0.369226, 0.295971, -0.239668, -1.152384, 0.448451, 0.275586, -0.237399, 0.174515, 0.195369, 0.043802, -0.264791, 0.013253, 0.040941, 0.027880, -0.142671, 0.074643, -0.553351, 0.129982, -0.100195, 0.180619, 0.071297, -0.371206, -0.193323, -0.468507, -0.805926, 0.184589, 0.080032, 0.084080, 0.275789, 0.192904, -0.078385, 0.290711, -0.027608, 0.440038, -0.587812, -0.004581, 0.162749, -0.407841, 0.368317, -0.112203, 0.299965, 0.172529, -0.076716, 0.450040, -0.303669, 0.169699, 0.115787, -0.083244, 0.287229, -0.098873, 0.045379, -0.142650, 0.225791, 0.117558, 0.331856, -0.449384, 0.030248, 0.661376, -0.147196, 0.219943, 0.449944, 0.153362, 0.580988, -0.014734, 0.219984, -0.031598, -0.087134, 0.624181, 0.321511, 0.159627, 0.065644, 0.052018, -0.334922, -0.194216, -0.214867, 0.373563, 0.569381, 0.320083, -0.218419, 0.392195, -0.208969, -0.043425, 0.549654, -0.405465, -0.393805, 0.037566, -0.388948, -0.224721, -0.294911, 0.308722, -0.081107, 0.685822, -0.048916, -0.165033, 0.142796, -0.269439, -0.222129, -0.724417, -0.286839, 0.608340, 0.327775, -58.517979, -0.544269, -0.001866, -0.332386, 0.199435, 0.252479, -0.223065, -0.405478, -0.249875, -0.104632, -0.554372, 0.442492, 0.092624, -1.184105, 0.306272, 0.094480, 0.180830, -0.188470, 0.199765, -0.158313, 0.219714, 0.490681, -0.341416, -0.366322, -0.456658, -0.166933, -0.581015, -0.068403, -0.253523, -0.303116, 0.084059, 0.248953, 0.151603, 0.113430, 0.826785, -0.997589, -0.021810, -0.476477, 0.258189, 0.772523, 0.136442, -0.239684, 0.186631, 0.024477, -0.110479, -0.279112, 0.227488, 0.287025, -0.114060, -0.046808, 0.235135, -0.282328, 0.259069, -0.037700, 0.412282, 0.450889, -0.564557, 0.362352, -0.249231, -0.283077, 0.596018, 0.210083, -0.160349, -0.145695, 0.279737, 0.432935, -0.060723, 0.205922, -0.077641, -0.005025, 0.032035, -0.147291, -0.089267, 0.022549, 0.096784, 0.316596, -0.335415, -0.154303, 9.210744, 0.439763, -0.474908, 0.547985, 0.333362, -0.083074, -0.113624, -0.358873, -0.001362, 0.342030, 0.487156, -0.212562, 0.381375, 0.187135, -0.076945, -0.455437, -0.128274, -0.070635, -0.241333, -0.100881, -0.181602, -0.364256, -0.003553, 0.078503, -0.229145, -0.102308, 0.197712, 0.027701, -0.779926, 0.536838, 0.599103, 0.012468, -0.057213, 0.015195, 0.180001, 0.458210, -0.035604, -0.381750, -0.162652, 0.062134, -0.018506, 0.322445, 0.430900, 0.494851, -0.001246, -0.063209, 0.027377, 0.572648, 0.501041, -0.330562, 0.005837, -0.250195, -0.197133, -0.322219, -0.241606, 0.777974, -0.214089, -0.517721, 0.054999, 0.079865, -0.392691, 0.436211, 0.170465, 0.012087, -0.570184, 0.342888, -0.248057, -0.206725, -0.031366, -0.455187, -0.014615, 0.157380, 0.836096, -0.526227, 0.272770, -0.238688, -0.423359, 0.184629, -0.558110, 0.024373, -0.151099, 0.561669, 0.598668, 0.297249, 0.027630, -0.355015, 0.242124, 0.076283, 0.281585, 0.539186, 0.105147, 0.421329, 0.097530, 0.683345, -0.052615, -0.135516, -0.139486, -0.133028, -0.413589, 0.058511, 0.174991, 0.026137, -0.128775, 0.032074, 0.079121, -0.097758, -0.100383, -0.140296, 0.653418, -0.458114, 0.115476, 0.015241, -0.171333, -0.002086, -0.215181, 0.295451, 0.152256, -0.074834, -0.073559, 0.242919, -0.361564, -0.105575, -0.216026, 0.285056, 0.216080, -0.017842, -0.203024, 0.165294, 
-0.153601, -0.077933, -0.115630, 0.081592, 0.049974, -0.676193, 0.314703, -0.164646, -0.030029, -0.488286, -0.197653, -0.107439, 0.057860, -0.215854, 1.589793, 0.217210, 0.187354, -0.106198, -0.019009, -0.011475, -0.016074, -0.082618, -0.361909, -0.087632, -0.064170, 0.235486, 0.430535, 0.326502, -0.158178, 0.298049, -0.182466, 0.113623, -0.318923, 0.001735, 0.340390, 0.232367, 0.124780, -0.329235, -0.258752, 0.232064, 0.057419, 0.071112, -0.393735, 0.243256, 0.444519, 0.086050, 0.239353, 0.491001, 0.005872, 0.332629, 0.355181, 0.453798, -0.406120, -0.000941, -0.504079, 0.147405, 0.410417, -0.263093, -0.082063, -0.330414, -0.024870, 0.603955, 0.011611, 0.247248, -0.437377, 0.374511, -0.386487, -0.052682, -0.704873, 0.137623, -0.302090, 0.611321, 0.347271, -0.128144, -0.070892, 0.555710, 0.235903, 0.620517, 0.356831, -0.088181, -0.160007, 0.040110, -0.402164, -0.132727, -0.304703, -0.037726, 0.249084, 0.463306, 0.410124, 0.206482, 0.084820, -0.448904, 0.441546, 0.493072, 0.177585, -0.202363, -0.028921, 0.279302, 0.206562, 0.210913, -0.330171, -0.465549, 0.120795, 0.345071, 0.735387, 0.512654, -0.197911, 0.433159, 0.039721, 0.779497, -0.224703, 0.258248, -0.097690, -0.123420, -0.614652, -0.402285, 0.541440, 0.066245, 0.084928, 0.117902, -0.200090, 0.050084, 0.132697, -0.505208, 0.336530, 0.370033, -0.026980, -0.182485, 0.137949, 0.097208, -0.373811, -0.432984, 0.365751, 0.116326, 0.786149, -0.336528, -0.143944, 0.276964, -0.337612, -0.537772, -0.135477, -0.157744, 0.480776, -0.016617, 0.178157, 0.447851, -0.460623, -0.082251, -0.066533, -0.221613, 0.263356, 0.364406, 0.033944, -0.270242, 0.301467, -0.181274, -0.774562, -0.336110, 0.013128, 0.379040, -0.491963, 0.510146, 0.115073, 0.120280, 0.541961, 0.340013, -0.030580, 0.080096, 0.215101, 0.109936, -0.498924, -0.311070, 0.359271, 0.007063, -0.440929, 0.494422, -0.285638, 0.636720, -0.141487, 0.515842, -0.552986, -0.224828, 0.077687, -0.618073, -0.191511, -0.243853, 0.518603, -0.099494, 0.009308, -0.480245, 0.537420, 0.192950, -0.265831, 0.051783, 0.308917, 0.155403, 0.018711, 0.211687, 0.407065, 0.442201, 0.136252, 0.084571, 0.095548, -0.225721, -0.000452, 0.138476, -0.149119, -0.664407, 0.180302, 0.024027, -0.168956, -0.060498, 0.176705, 0.197362, 0.810919, 0.100887, -0.343077, -0.250311, 0.202211, -0.533182, 0.565174, 0.071214, -0.071025, 0.294692, 0.345120, -0.369266, -0.363377, 0.094061, 0.053798, -0.675334, 0.105947, -0.366219, -0.233770, -0.146928, 0.059898, -0.260274, -0.351740, 0.509495, 0.466080, 0.533641, -0.283434, -0.170006, 0.497754, 0.113559, -0.234416, 0.006083, 0.091491, -0.393112, 0.243943, 0.068198, -0.135965, 0.071724, 0.267782, -0.299795, 0.225524, 1.235563, -0.060205, 0.328835, -0.436151, 0.408247, -0.826190, 0.398270, 0.206538, 0.275721, 0.916620, -0.380989, -0.155483, -0.165130, 0.108895, 0.192115, -0.203651, 0.378474, -0.264306, 0.075935, -0.263781, -0.194646, -0.089090, -0.463814, -0.084640, -0.277837, -0.342353, -0.024847, -0.192613, 0.186768, 0.111856, 0.178706, -0.120027, -0.541726, -0.337925, 0.118754, 0.238603, 0.208689, -0.145369, -0.150549, 0.653374, 0.094129, 0.106789, -1.053561, -0.196887, -0.240114, -0.745993, -0.152362, -1.204002, -0.082581, 0.305619, -0.058988, -0.598331, 0.185987, -0.060489, -0.423577, 0.511601, -0.325674, -0.120736, -0.115833, 0.159326, -0.285350, 0.046486, 0.190043, -0.050766, 0.455020, 0.403656, 0.323676, -0.360419, -0.349012, 0.049382, -0.285226, -0.303766, 0.506560, -0.419006, 0.309048, 0.217033, -0.084596, -0.243125, 0.644726, -0.052139, 0.160264, -0.375426, -0.394390, 0.098200, 
0.259292, -0.638499, 0.021518, -0.129735, -0.101215, -0.224675, -0.042415, -0.101307, -0.089895, 0.188957, -0.548835, -0.143101, 0.536356, 0.472715, 0.358453, 0.414846, -0.370800, -0.235898, 0.262339, 0.449393, -0.263371, -0.217770, -0.226305, 0.047581, 0.003470, -0.028333, -0.130409, -0.138864, 0.496755, 0.231264, -0.030316, -0.476095, 0.256390, -0.104659, -0.518679, 0.125111, -0.423105, 0.494313, -0.181924, 0.242448, 0.280324, -0.129197, -0.375469, -0.433732, 0.134280, 0.389087, -0.506155, 0.527342, 0.194793, 0.178930, -0.333685, 0.181848, 0.243475, -0.612548, 0.289363, 0.029938, 0.110186, 0.255495, -0.144533, -0.387284, 0.345772, -0.311679, -0.045539, -0.294871, 0.597431, 0.305880, 0.200646, 0.282531, -0.158703, -0.103099, -0.169067, -0.008382, -0.079756, 0.170810, 0.399198, -0.066380, -0.000559, -0.004255, 0.231632, 0.278643, 0.301572, 0.158929, -0.362739, -0.174208, 0.090002, 0.658097, -0.753288, -0.115159, -0.916517, 0.345335, 0.315035, -0.185459, 0.099901, -0.345710, -0.024454, 0.149417, 0.203911, -0.621984, 0.475272, 0.381955, 0.285558, 0.620195, -0.230048, -0.193346, 0.426024, -0.082065, -0.505288, -0.264816, -0.614786, 0.032445, -0.211640, -0.743929, -0.352548, -0.282483, 0.083452, 0.155323, 0.063746, -0.081079, 0.143076, 0.304706, -0.302146, -0.586734, -0.018925, 0.135363, 1.372922, 0.022931, -0.203815, -0.249750, -0.096802, -0.322165, -0.087625, -0.354634, 0.158380, 0.221725, 0.366852, 0.150404, -0.269403, 0.051307, -0.228197, 0.501474, -0.508713, 0.310570, 0.105104, 0.175922, -0.160948, -0.532149, 0.789218, -0.296985, 0.621667, -0.034217, 0.269299, -0.473348, 0.419473, -0.168176, -0.059621, 0.347642, 0.095557, 0.119908, 0.189069, 0.072328, -0.592562, 0.205006, -0.456758, -0.051765, 0.186611, -0.559713, 0.498142, 0.081539, -0.126901, -0.188289, -0.577864, -0.548407, 0.401423, 0.509642, -0.104356, 0.108317, 0.898374, 0.040251, -0.698863, -0.078856, -0.188995, -0.003449, -0.061420, -0.096289, 0.266162, 0.108274, 0.346424, -0.441824, 0.258435, -0.142721, -0.439990, 0.212413, 0.165938, -0.020241, 0.303992, -0.042712, -0.240819, 0.041794, 0.002941, 0.399268, 0.036751, 0.134195, -0.134292, 0.143634, 0.516429, 0.132746, -0.935939, 0.153861, 0.304214, -0.235507, 0.430125, 0.079114, 0.117013, 0.439177, 0.716553, -0.093885, -0.044955, -0.330311, 0.550033, -0.535871, 0.010232, -0.314547, 0.230491, 0.654590, 0.102710, -0.238373, -0.516637, 0.401736, 0.037948, 0.276305, 0.443498, 0.228219, 0.352056, 0.012582, 0.220353, -0.407780, 0.438028, 0.208541, 0.175988, -0.270985, -0.525523, -0.548477, 0.057955, -0.194781, -0.003298, 0.269760, -0.233289, -0.540260, -0.169692, -0.484772, -0.114232, -0.401115, -0.451770, -0.181977, -0.505564, -0.083401, 0.376819, -0.421163, -0.201286, -0.602337, -0.222767, 0.614156, -0.016797, -0.176580, 0.508542, -0.081163, 0.134763, -0.419954, 0.037240, -0.185230, -0.205294, 0.193024, 0.417523, -0.698548, -0.807030, -0.149474, -0.094982, -0.464999, -0.068225, -0.432381, 0.094518, -0.034902, -0.376230, -0.190843, -0.014541, -0.007451, 0.503071, 0.039062, -0.476695, -1.198664, -0.111433, 0.107706, 0.328139, -0.316463, -0.217056, 0.050820, -0.164824, 0.319711, -0.208170, 0.614053, -0.104899, -0.555545, -0.287073, -0.262042, -0.200674, -0.006250, -0.231419, -0.642396, -0.137048, -0.145983, 0.069366, 0.240843, -0.658315, -0.597073, 0.463778, -0.183305, 0.119860, 0.648310, 0.299937, -0.208176, 0.022095, -0.278521, -0.177953, 0.202277, 0.155420, 0.234455, 0.239173, 0.262826, 0.519646, 0.006134, 0.463624, 0.502869, -0.273425, -0.230075, 0.078392, 0.024440, -0.299886, 
0.079623, -0.031235, 0.077222, -0.575926, 0.647744, -0.685791, 0.246712, -0.258738, -0.268204, -0.050904, -0.216824, 0.030123, -0.059568, 0.032685, 0.456928, 0.447742, 0.000349, -0.330170, -0.283173, 0.339716, -0.715647, 0.350537, -0.348117, -0.425378, 0.275241, -0.037290, -0.301914, -0.418955, -0.057846, 0.038219, 0.490801, -0.169052, -0.531855, -0.126881, 0.214531, -0.016972, 0.486737, -0.307092, 0.060385, -0.312028, -0.121100, -0.474154, -0.033297, -0.117146, -0.145217, -0.534372, 0.215293, 0.089433, -0.014680, 0.038871, -0.454982, 0.468077, -0.157630, 0.546171, 0.098118, 0.372806, -0.065871, 0.243025, -0.094658, -0.116481, 0.125599, -0.121057, -0.072374, 0.656453, -0.041724, 0.033926, -0.041462, -0.456399, -0.186608, -0.389824, -0.091279, -0.003108, 0.092814, 0.340213, -0.175039, 0.666803, -0.261088, 0.368768, -0.252754, 0.163965, -0.585329, -0.243940, -0.321711, -0.133012, -0.253066, -0.076904, -0.283974, -0.197103, -0.403979, 0.147247, -0.062263, -0.075245, -0.562477, -0.205014, 0.222767, -0.054252, -0.329677, 0.815129, 0.127072, -0.426652, -0.394265, 0.127829, 0.049436, -0.196066, 0.009618, 0.540864, -0.206275, -0.050193, -0.484369, 0.663735, -0.174202, 0.404585, 0.052897, 0.197160, 0.118619, 0.260436, 0.124425, -0.006648, 0.607803, -0.084435, 0.182488, -0.911126, -0.399702, -0.234616, -0.425689, 0.574178, 0.028556, 0.346813, -0.227521, -0.021377, -0.578073, -0.755381, -0.108397, 0.211026, -0.040924, -0.294499, -0.208813, -0.170734, 0.164126, 0.122645, 0.010301, -0.073459, 0.233719, -0.109440, -0.098003, -0.205121, -0.087854, 0.028307, 0.532311, -0.228900, -0.152958, 0.527488, 0.364191, 0.192383, -0.351570, -0.234025, 0.440298, -0.060356, -0.425945, 0.171791, 0.221136, -0.396888, -0.277037, -0.666172, -0.037598, -0.010303, -0.425237, -0.211664, -0.486224, 0.277630, 0.023025, 0.408915, -0.195016, 0.047634, 0.102783, -0.485706, 0.308589, -0.411144, -0.053444, 0.714636, 0.113829, 0.033845, 0.443375, -0.068617, -0.390009, 0.075894, 0.448207, -0.177000, -0.023104, -0.175386, 0.555945, 0.029201, -0.457334, -0.357339, 0.378139, 0.238245, -0.110313, -0.500821, 0.104544, -0.281480, 0.587670, -0.373495, -0.031328, -0.065652, -0.002243, 0.044564, -0.489538, 0.399526, -0.477148, 0.174514, 0.234356, -0.095146, -0.111628, -0.140329, -0.014722, 0.120857, -0.577776, -0.435228, 0.355946, -0.013572, -0.721267, -0.317641, 0.217404, 0.607119, 0.134049, -0.278283, -0.041902, -0.119571, 0.305286, -0.412544, 0.012801, 0.406974, 0.057604, -0.505895, 0.571090, 0.132650, 0.261956, 0.170358, -63.303562, 0.072853, -0.519040, -0.178422, 0.119113, 0.378141, 0.063010, 0.112086, -0.068155, 0.143270, -0.695979, -0.185095, 0.029685, 0.101319, 0.243841, -0.099949, 0.537030, -0.507941, -0.072316, -0.282923, -0.095952, 0.277304, 0.279224, 0.180773, 0.588230, -0.293736, -0.419753, 0.124619, -0.439369, -0.203452, 0.322170, 0.218235, -0.135500, 0.153501, -0.101683, -0.188600, 0.075205, 0.135442, -0.469343, 0.088359, 0.188366, 0.252388, -0.129069, 0.600527, 0.231637, -0.129259, -0.012753, -0.375286, -0.155583, 0.010971, 0.067799, 0.301400, -0.200182, -111.950508, -0.203151, 0.112066, -0.561678, 0.199826, 0.240353, 0.024038, -0.051374, -0.191646, 0.126553, 0.212573, -0.397948, -0.364226, -0.430931, -0.275204, -0.089596, -0.264655, 0.148102, 0.095208, -0.158669, -0.152007, -0.115978, -0.417394, -0.417549, 0.121950, -0.005634, -0.209508, 0.026816, 0.097941, 0.356630, -0.327798, -0.577439, 0.128759, -0.181856, 0.341304, -0.401386, -0.326418, 0.910050, 0.602965, -0.501181, 0.499152, -0.081410, -0.445701, 0.283482, 0.128661, 
-0.162188, 0.477510, -0.479452, 0.445276, 0.613333, -0.230295, -0.706247, -0.385147, -0.216893, -0.355799, -0.426881, 0.294281, 0.029298, -0.469063, 0.032645, 0.377577, -0.136977, 0.229560, -0.248002, 0.231691, 0.195523, -0.626372, 0.487773, -0.046927, 0.258056, 0.435159, -0.148099, -0.214715, 0.064264, -0.436486, 0.581707, -0.436593, 0.105906, -0.026306, 0.014113, 0.179391, -0.207917, -0.343595, -0.005207, 0.196769, -0.042901, 0.063794, 0.336295, -0.545930, -0.357988, 0.289421, -0.300983, 0.284619, -0.280840, 0.172100, 0.115409, 0.575850, -0.238892, 0.714082, -0.077031, -0.006099, 0.256360, -0.335220, 0.010254, -0.108844, 0.427106, 0.144753, -0.501155, 0.062318, -0.124000, 0.221674, 0.834580, 0.110858, -0.012859, 0.568384, -0.610597, 0.660640, -0.116576, -0.069433, 0.786274, -0.164275, -0.076165, -0.051904, 0.149984, 0.419948, 0.000599, -0.677398, 0.008406, -0.954607, 0.185710, 0.442692, 0.498803, 0.072520, -0.947522, -0.119591, -0.001380, -0.499802, 0.019757, -0.521202, -0.370886, 0.026982, 0.187350, 0.188310, 0.041130, 0.150330, -0.069001, 0.405078, -0.064037, -0.170146, 0.371683, 0.318880, -0.644247, 0.235556, -0.033404, -0.507976, -0.227250, 0.002788, 0.061252, -0.474965, -0.231972, -0.367271, 0.486901, -0.053223, -0.059750, -0.304050, -0.058611, 0.157256, 0.175846, 0.395777, 0.008041, -0.273340, -0.205810, 0.243421, -0.182496, -0.391188, 0.377847, 0.187971, 0.037746, 0.198323, -0.077953, 0.698547, -0.498333, 0.330154, -0.068499, 0.217126, 0.308641, 0.351311, 0.470557, -0.105092, 0.424691, 0.009478, 0.033756, -0.260245, 0.310565, 0.035176, 0.096856, -0.544571, 0.565485, -0.450896, 0.238636, -0.755101, 0.350827, 0.415188, -0.067474, -0.598875, 0.159576, -0.031585, 0.304653, 0.366733, 0.508857, 0.094383, 0.099464, 0.119461, 0.115205, 0.158623, 0.196619, 0.236457, 0.215316, 0.612975, 0.200485, 0.058877, -0.025619, -0.205582, 0.714231, 0.136720, -0.388014, -0.196395, 0.079210, -0.423071, -0.123676, 0.474725, -0.050195, 0.970900, -0.171853, -0.083888, -0.179501, -0.064880, -0.223606, -0.006045, -0.080197, 0.252009, -0.350478, 0.026701, 0.014843, -0.027661, -0.832059, 0.008524, 0.424898, -0.018123, -0.540465, -0.423612, -0.029289, 0.013382, -0.661031, 0.262038, -0.232594, 0.666213, 0.276438, 0.294243, -0.137279, 0.595431, -0.343497, -0.335292, 0.270682, -0.050774, -0.357238, 0.464678, 0.166497, -0.214094, 0.412270, 0.237754, -0.456134, 0.069817, -0.240430, 0.211049, 0.282824, -0.002317, 0.005693, -0.191463, -0.583706, -0.221818, -0.012686, -0.338345, -0.070691, -0.581066, 0.068721, 0.214868, 0.449330, 0.286828, -0.724386, 0.094391, -0.135015, -0.060382, 0.069094, 0.477051, -0.039083, -0.416415, -0.160632, 0.307268, 0.644104, -0.157020, -0.185252, 0.103966, -0.107401, 0.200582, -0.256117, -0.471047, 0.576146, -0.426443, -0.473182, 0.363728, -0.448481, 0.198918, 0.262472, -0.481285, 0.124921, -0.790910, 0.555112, -0.416907, -0.686696, 0.157033, 0.151132, -0.101561, -0.165456, -0.120177, 0.040872, -0.353821, 0.464011, 0.356632, -0.641347, 0.002591, -0.529365, -0.375631, -0.088245, 0.157345, -0.168277, -0.077418, -0.104852, -0.224176, -0.583345, -0.200973, 0.330438, 0.095610, 0.044952, -0.111227, 2.766827, -0.075490, -0.004877, 0.122718, -0.211410, -0.255072, 0.014199, -0.044222, -0.232582, -0.213509, -0.058883, -0.111578, 0.575816, 0.382956, 0.168038, -0.660369, -0.317659, -0.287882, -0.159493, 0.303753, 0.161251, -0.084348, 0.177310, -0.023731, -0.208123, 0.375143, -0.130619, 0.013847, 0.084544, 0.385289, -0.844378, 0.783994, -0.133632, 0.453322, -0.368930, 0.468662, 0.074394, -0.085951, 
-0.492908, -0.102703, -0.070889, 0.019265, -0.132152, -0.490296, -0.154806, -0.438457, 0.106065, -0.362225, 0.098467, -0.300806, 0.082323, -0.066897, 0.534250, 0.646450, 0.452268, -0.157001, 0.063397, -0.202829, 0.173803, -0.187386, 0.587771, -0.348677, -0.511564, 0.085221, -0.224883, 0.157183, 0.246904, 0.638365, -0.585779, -0.287903, 0.340373, -0.030720, 0.061279, 0.074192, 0.301575, 0.018036, -0.261474, 0.063272, 0.015081, 0.398179, -0.035079, -0.370379, -0.052801, -0.530252, 0.093011, 0.002836, -0.697104, 0.546006, -0.332340, -0.122048, 0.289079, -0.541533, -0.503000, -0.054692, 0.134673, 0.006598, 0.017476, -0.134000, -0.175561, -0.208997, -0.003923, 0.051280, 0.086685, -0.240404, -0.357857, 0.243271, 0.566339, -0.173989, -0.064073, 0.295804, 0.233289, -0.092659, -0.665500, 0.190993, 0.167569, -0.278074, -0.244135, -0.168051, -0.463405, -0.177617, -0.034471, 0.007265, -0.297528, -0.075948, -0.047524, 0.122029, 0.211117, 0.120729, -0.014987, -0.632226, 0.213933, 0.220751, -0.416721, -0.432320, -0.204294, 0.197827, -0.168607, 0.240733, 0.250329, 0.280815, 0.289349, -0.309672, 0.147626, -0.490905, 0.131334, 0.044870, 0.244089, -0.017633, -0.301772, -0.072316, 0.146994, -0.293942, -0.420279, 0.161965, 0.148996, -0.126067, 0.491702, -0.119142, 0.063983, 0.395035, -0.213297, 0.140833, 0.196409, -0.048128, 0.017225, -0.505947, 0.664136, -0.202357, 0.317128, 0.168667, 0.093162, -0.380574, -0.092151, 0.044294, -0.870981, 0.241743, -0.536381, -0.090947, -0.648787, -0.355641, 0.165425, -0.238059, -0.596338, 0.266993, -0.339680, -0.382530, 0.291458, 0.768516, -0.176065, -0.569123, 0.096203, 0.148821, 0.546749, 0.138208, -0.365413, 0.518124, -0.101714, 0.804016, -0.252335, 0.263548, 0.344599, -0.042573, 0.189019, 0.249793, -0.605840, 0.522157, -0.359445, 0.140590, -0.134246, 0.302941, 0.272368, -0.274690, 0.744960, 0.187399, 0.031517, -0.522490, 0.492635, -0.265579, 0.556953, 0.002599, -0.475620, -0.273551, 0.061263, -0.107502, -0.058321, 0.335491, 0.093252, 0.338381, 0.580982, 0.004722, 0.306945, -0.357699, 0.204085, -0.403294, -0.339715, 0.038231, -0.318873, -0.157163, -0.286547, -0.014830, -0.408272, 0.646497, -0.297623, -0.716684, -0.037028, -0.359614, -0.607903, -0.080210, -0.243790, -0.372580, 0.327745, 0.174814, 0.007795, -0.323086, 0.278917, 0.193464, -0.183376, -0.158650, -0.150630, 0.237812, -0.554636, -0.307655, -43.962406, 0.747826, 0.170216, -0.375895, -0.110257, -0.561347, 0.516887, -0.050687, -0.239293, 0.075187, 0.214272, -0.583057, 12.469608, 0.157744, 0.439794, 0.303756, -0.331535, 0.010205, 0.261059, 0.047994, 0.124434, 0.148169, -0.040808, -0.038213, 0.568745, 0.759167, -0.161330, -0.147977, 0.209662, 0.184281, 0.034607, -0.435261, -0.914145, -0.448240, 0.502655, 0.171096, 0.162151, -0.330909, 0.644280, -0.230030, 0.400959, -0.398285, 0.243113, 0.819535, -0.015852, -0.416345, -0.035277, 0.475769, -0.331067, 0.097865, 0.243495, -0.104292, -0.019232, 0.216380, 0.644561, -0.040559, 0.058224, -0.022566, -0.086129, -0.033864, -0.107755, -0.452065, 0.232334, 0.260739, -0.491365, -0.583293, -0.385895, -0.032558, 0.422507, -0.104322, -0.199052, -0.450761, 0.188655, -0.015979, 0.101417, -0.108862, -0.445502, -0.133705, -0.177996, -0.163099, -0.119531, -0.171117, 0.427125, 0.238072, -0.338347, 0.201365, -0.159817, -0.007986, -0.612904, -0.590535, -0.537798, 0.119414, 0.221465, 0.378790, 0.208833, -0.014505, 0.360619, -0.548349, 0.216484, 0.256683, 0.385207, 0.706974, 0.283812, -0.470752, 0.290828, -0.313211, 0.595083, -0.867086, -0.099950, -0.006093, 0.328319, -0.174707, 0.061710, 
-0.149805, 1.078194, 0.353151, -0.642559, -0.077821, 0.227179, 0.333391, 0.007280, 0.522016, 0.349228, -0.273700, -0.092780, 0.020995, -0.248111, -0.178455, 0.095094, -0.057657, 0.430333, 0.131980, 0.129095, 0.375191, -0.076389, -0.125741, 0.223384, -0.356347, 0.353186, -0.023569, -0.242601, -0.549070, 0.215102, -0.368639, -0.316551, 0.274893, -0.523182, 0.007249, 0.256059, -0.174845, 0.097209, -0.344082, -0.195790, -0.205222, -0.601514, 0.215583, -0.090739, 0.279482, -0.269076, -0.103268, 0.192448, 0.343325, -0.083246, -0.548744, 0.427873, -0.101012, -0.624601, 0.132522, 0.332115, -0.602158, 0.100890, 0.363860, -0.269838, 0.326628, 0.123457, -0.670479, -0.476265, 0.158514, 0.236981, 0.179209, -0.047529, -0.307782, 0.100836, -0.113148, -0.232984, -0.231841, -0.206003, 0.090617, 0.506436, -0.088726, 0.060476, 0.503427, 0.701742, -0.349184, 0.198925, -0.189434, -0.242362, 0.060725, 0.099748, 0.038268, -0.391647, 0.629047, 0.516061, 0.209719, 0.497822, 0.031479, -0.268111, -0.201153, -0.449376, -0.248165, 0.497311, -0.135929, -0.426607, 0.067370, -25.183310, -0.217668, 0.244560, -0.113699, -0.034418, 0.279075, -0.087385, 0.090707, -0.057644, -0.038125, 0.556788, -0.185287, -0.570150, -0.224455, 0.177524, 0.295346, -0.079438, 0.456904, -1.079334, -0.205251, -0.241942, 0.030973, -0.412797, -0.039332, -0.078231, -0.625990, -0.752923, -0.103901, -0.260115, 0.343021, -0.274866, -0.338814, -0.084887, 0.440932, 0.021582, 0.504624, 0.122857, -0.114787, 0.298529, 0.105958, 0.489619, 0.023746, 0.315594, 0.449650, 0.291635, 0.456601, 0.009884, 0.411796, -0.053264, 0.003709, 0.146712, 0.551897, 0.747416, -0.292591, -0.418243, 0.235221, -0.076943, -0.266084, -0.249324, -0.296882, -0.198107, -0.645373, -0.300805, 0.539361, -0.779099, -0.247917, 0.190751, 0.161664, 0.476388, -0.065680, -0.249199, 0.418452, 0.159107, 0.429605, 0.161158, -0.122370, 0.595451, 0.982097, 0.093595, -0.179688, 0.247968, 0.154720, 0.151772, 0.511285, 0.429992, 2.756784, -0.477258, 0.210764, 0.060735, -0.206560, 0.050323, 0.226779, 0.488080, -0.322912, 0.215526, 0.079592, 0.581980, -0.074688, 0.156174, -0.439357, 0.177422, 0.084449, -0.244841, 0.041477, -0.124318, 0.522598, 0.331280, 0.219801, -0.001475, 0.191698, 0.235577, 0.397007, -0.140321, 0.127137, 0.326186, 0.080475, -0.198508, -0.016927, -0.647423, 0.118334, 0.151125, -0.260203, 0.370355, -0.372183, 0.684103, -0.044755, 0.411336, -0.508753, 0.137960, -0.578130, -0.308091, 0.662036, -0.223479, 0.528630, -0.088039, 0.301694, 0.581005, -0.571935, -0.326560, 0.074047, 0.560205, -0.517768, -0.118131, -0.825478, -0.052467, 0.087111, 0.156990, -0.125347, 0.200030, -0.232904, 80.721504, -0.353159, 0.366250, 0.661608, 0.076759, -0.675517, 0.160898, -0.327791, -0.218044, -0.286360, -0.110733, -1.071516, -0.166569, 0.236425, 0.076636, 0.110097, 0.025876, 0.136520, -0.289324, -0.052212, -0.116298, -0.413791, -0.261230, -0.305566, 0.213748, -0.213800, -0.181269, 0.551312, 0.220690, 0.754416, 0.246657, 0.569155, 0.251539, 0.176920, 0.161606, -0.531014, -0.195418, -0.095278, 0.553006, -0.337154, -0.577858, -0.308738, -0.046533, -0.260912, -0.037302, -0.461034, -0.245663, -0.409261, -0.124479, 0.556365, -0.376639, 0.100875, 0.092979, 0.564593, 0.361897, -0.145732, 0.145205, -0.663406, 0.029300, -0.542234, 0.114352, 0.108448, 0.103333, -0.109936, -0.598299, 0.671874, 0.007481, -0.067422, -0.047742, 0.225532, -0.224111, 0.202110, 0.233249, 0.132545, -0.249386, -0.251535, -0.008667, 0.486993, -0.417082, 0.657399, 0.234556, 0.103305, -0.333804, 0.601285, 0.350520, 0.213891, 0.336645, 0.362093, 
-0.728231, 0.082136, -0.602861, -0.315520, -0.174943, 0.939620, 0.721859, 0.055698, -0.052252, 0.005834, 0.295090, -0.253800, 0.549283, 0.408780, -0.037937, 0.030686, 0.093003, -0.155114, -0.104932, 0.285636, 0.134702, -0.235902, -0.073232, -0.016380, -0.196810, -0.231825, -0.039652, 0.191879, 0.708907, 0.046643, -0.679296, -0.272560, 0.220830, 0.632073, 0.011146, 0.338630, -0.006068, -0.417746, -0.790530, -0.044891, -0.112715, 0.119640, -0.555511, 0.385198, 26.227066, 0.327086, -0.359925, -0.246532, 0.315119, 0.315461, 0.184825, 0.874743, -0.243538, 0.034515, -0.099371, 0.384990, -0.588993, -0.329512, 0.202681, 0.108792, -0.522285, 0.317060, 0.258312, -0.340065, 0.353208, -0.226905, -0.467262, 0.197749, -0.205172, -0.173064, -0.003351, 0.592039, -0.122060, 0.194374, -0.540868, -0.138176, 0.341844, 0.081864, 0.242733, 0.373292, -0.228865, 0.148295, 0.120463, -0.094172, -0.023198, -0.016959, 0.237407, -54.570545, -0.573048, 0.675045, -0.190351, 0.164020, -0.054506, -0.056234, -0.145207, 0.239472, 0.296177, -0.228059, 0.566965, 0.066044, -0.390903, -0.266558, -0.301220, 0.327292, -0.387228, -0.326956, 0.046026, -0.537017, -0.113912, 0.009175, -0.175402, -0.095018, -0.330905, 0.084470, -0.136133, -0.354516, -0.469037, 0.438247, -0.231660, 0.032670, -0.097507, 0.094830, 0.058257, -0.154819, -0.390733, -0.600798, 0.396636, -0.294010, -0.608886, 0.159131, -0.488123, 0.253940, 0.310930, -0.223843, -0.505954, -0.019320, 0.451123, 0.033688, -0.305418, -0.521554, -0.435363, 0.323155, -0.312042, -0.008847, -0.252040, -0.180144, -0.310767, -0.178879, 0.130031, 0.857571, -0.107182, -0.299440, 0.559908, 0.155175, 0.366574, 0.170108, -0.399408, 0.217482, 0.401690, -0.321029, -0.045608, -0.068220, 0.157180, -0.177041, -0.105864, 0.044081, -0.435274, -0.008077, -0.537582, 0.047345, -0.313762, 0.100752, 0.147813, 0.158115, 0.128763, 0.052037, 0.156659, -0.200804, -0.530616, 0.043076, -0.484326, 0.115019, -0.426879, 0.149550, -0.379401, 0.265588, 0.110010, -0.137560, -0.046419, -0.190895, 0.185832, -0.424423, 0.238436, -0.331180, -0.208932, 0.093624, -0.136191, -0.024762, -0.284244, 0.031795, -0.036643, 0.343705, 0.086551, 0.040446, 0.576904, -0.533034, 0.577420, -0.326808, -0.414999, 0.202424, 0.118209, 0.829163, 0.636830, -0.537408, 0.020064, 0.206316, -0.161511, -0.346788, 0.161839, -0.026227, -0.223244, 0.261731, -0.382454, 0.108277, -0.355422, 0.103880, -45.383808, -0.162695, -0.150417, -0.127754, -0.070344, 0.119024, -0.116959, -0.034194, 0.281006, 0.047009, -0.478871, -0.337656, -0.401619, 0.260010, 0.109437, -0.230819, -0.169927, 0.253672, 0.187743, -0.142163, 0.026885, 0.312044, 0.012883, -0.075687, 0.187942, -0.864674, -0.161560, -0.180769, 0.423245, 0.286899, -0.322536, 0.082467, 0.129985, -0.038326, 0.400418, -0.058516, 0.277353, 0.456197, 0.539164, -0.083526, 0.026757, 0.039200, -0.235375, 0.052175, -0.201285, -0.225762, 0.416601, -0.418963, 0.252038, 0.136844, 0.261401, -0.321984, 0.021641, 0.297252, -0.045968, 0.015987, -0.632211, 0.079325, 0.178558, -0.686958, 0.065480, 0.519709, 0.642042, -0.067845, 0.037921, 0.154914, -0.014848, -0.024645, 0.106363, -0.512707, 0.045547, 0.496962, -0.158557, 0.217693, 0.007438, -0.582735, -0.274856, 0.251478, -0.168064, -0.218378, -0.187458, 0.235936, -0.303580, 0.133740, 0.433543, -0.416777, 0.163158, -0.312299, -0.542848, 0.314374, 0.362104, 0.192649, 0.357390, -0.229162, -0.168068, -0.437465, 0.052538, 0.281516, 0.368737, -0.097629, 0.968332, 0.208670, -0.460906, -0.274308, 0.238028, 0.236472, 0.210751, 0.070369, -0.012518, 0.160674, 0.005216, -0.122002, 
0.292114, -0.093093, 0.153105, -0.204351, 0.578618, -0.321862, -0.400429, 0.156500, 0.084270, 0.381356, 0.067640, 0.051333, 0.399535, -0.307002, 0.340882, 0.135507, 0.089646, -0.237054, 0.281415, 0.090153, 0.462730, 0.171334, 0.454288, -0.291779, -0.587445, -0.012028, -0.045764, 0.449051, 0.142538, 0.409746, -0.105807, 0.066156, 0.277036, 0.749279, 0.060583, 0.179168, 0.016475, -0.148920, -0.023366, -0.071207, -0.346201, 0.011620, 0.048880, 0.374774, -0.428500, -0.617913, 0.200059, -0.312047, -0.069751, 0.212112, 0.469310, -0.322054, -0.436160, -0.151759, -0.362400, -0.100673, -0.390196, -0.720022, 0.013152, -0.093669, 0.500391, -0.181266, 0.171757, 0.205245, -0.491997, 0.150053, -0.303285, 0.213764, 0.181915, 0.042309, 0.117224, 0.243289, 0.163959, 0.249842, 0.096863, -0.027731, -0.371531, -0.153938, 0.580561, 0.096046, -0.302468, 0.400171, 0.505695, -0.165858, -0.123545, -0.023400, 0.045407, 0.485218, -0.918825, -0.379058, 0.030079, -0.237506, -0.267864, -0.423086, -0.861124, 0.339907, -0.120479, -0.469097, 0.068623, 0.029885, -0.089538, 0.517054, 0.389653, 0.260604, -0.697998, -0.163827, 0.174017, -0.102305, 0.016205, -0.635075, -0.597663, -0.165878, 0.092621, 0.079050, 0.743767, 0.460755, 0.141955, -0.437859, 0.222894, 0.521355, -0.322920, 0.021801, 0.437041, 0.586200, 0.036765, 0.155797, 0.139985, -0.041782, -0.709892, 0.013074, -0.547880, 0.155922, 0.070960, 0.569953, -0.219364, -0.014088, -0.335218, 0.293984, -0.440478, -0.020124, -0.123338, 0.212336, 0.409897, -0.151092, 0.206929, 0.108662, 0.003318, 0.480850, -0.388049, -0.373116, 0.508014, 0.646062, 0.338924, -0.014462, 0.072739, 0.503070, -0.571416, -0.053392, 0.261631, 0.593776, -0.324953, 0.396804, 0.309049, 0.453608, 0.104420, -0.480800, -0.556374, 0.342257, 0.100447, -0.134773, -0.252847, -0.256652, -0.206923, 0.125899, 0.127910, 0.259252, -0.247190, 0.271972, -0.559636, -0.121707, -0.268752, -0.576536, -0.150227, 0.483635, -0.634572, 0.468136, -0.471012, 0.473711, 0.300323, -0.380895, -0.137505, -0.299252, 0.208652], "falcon2:latest": [-1.275306, 3.837295, 3.104335, -0.132523, 3.717942, 0.372535, -1.772705, 3.119587, 0.330449, -0.700744, -2.615506, 10.353819, -2.361413, -2.383462, 1.663645, 2.710775, -4.230299, 2.722366, -0.357314, -2.037119, 1.449030, -4.184108, -1.876729, -0.485423, 1.748516, -0.127038, -3.764583, 0.180338, -1.962256, -1.628213, -0.470965, 0.818017, 0.607546, -1.474351, 3.853329, -2.645384, 0.169052, 1.954102, 2.811140, 0.609534, 0.137802, -0.385182, -3.722800, -2.822938, 1.028885, 4.215508, -1.851904, -0.174632, 0.901177, -1.909700, 1.275259, -3.331348, -0.757433, 0.559766, -3.048050, -1.044024, 1.142050, -0.407469, -3.040350, 3.580981, 4.013700, 3.739869, 0.390057, -0.821299, 0.094822, 3.214658, 2.524949, -0.300330, 0.556939, -3.069934, -0.515992, -2.658362, -0.433554, 9.095484, 2.195112, -3.849095, 5.091149, 0.481200, -1.134630, -0.239280, 0.887909, 1.042396, 2.646552, 1.802093, 1.645549, -3.790758, 0.988161, -1.200287, 1.894759, -1.166587, -0.253824, -1.039379, -2.276851, -0.460683, 0.297967, -3.424223, -4.195358, -3.542181, -1.769864, -1.338850, 0.911988, -2.302943, 2.484190, 2.044442, -4.596624, 1.267139, -2.049352, 2.546231, 0.792628, -1.380133, -0.790615, 6.012039, -0.964435, -0.969051, -1.812450, -0.890375, -0.798536, 4.244349, 0.251844, -1.319340, -8.636320, -0.711426, -0.413526, -0.978065, -2.059388, 3.617761, 0.892199, -0.481894, 2.143296, 0.515694, -0.860783, 4.284967, 0.612026, 4.006313, -0.335522, 0.782485, -0.058630, -0.286828, 0.756606, -1.973024, -1.502667, 0.269806, 2.558606, -0.341148, 
3.807795, 2.500325, 0.260572, 3.994771, 3.990319, 4.135689, -5.712500, -2.158962, 0.670838, 1.928773, 0.011099, 2.180364, 4.558554, -5.859202, -1.996603, 1.865865, -2.038666, -2.903919, 2.060443, 0.326185, -0.932059, -1.541007, 3.649105, 1.813717, 1.723021, -0.905025, -3.768096, -1.637242, 3.691015, -2.636121, 1.495227, -0.221481, -2.330304, 1.152045, 3.441514, -1.925960, 0.132836, 1.376882, 0.099456, -0.139153, -2.827578, 3.249744, 2.968992, -3.029808, -0.102355, -3.398517, -2.040715, -2.388101, -0.241652, -3.439781, -1.627563, -3.316260, 1.659161, -1.469869, 2.303240, 0.234823, -2.916763, -3.792274, -3.562071, 1.947090, -0.283635, 1.218568, 3.385237, -0.217122, -2.976565, 1.250648, -1.260653, 1.940458, -2.756780, 1.184896, 2.428695, 0.372655, 2.108495, 2.850650, 1.011860, -3.017907, -1.272859, -5.748411, -2.802590, -1.651772, -3.874090, -6.691458, 1.628203, 0.902252, 1.731206, 1.233933, 2.390242, 4.491738, 1.581355, 1.289003, -3.980048, 0.603783, -3.222681, -0.143940, -4.617625, -2.092155, 0.132560, 0.233982, -1.955884, 3.025106, -1.231423, 1.970144, 1.114148, -0.862942, -0.308265, 0.604513, 1.433046, -3.165818, 5.211211, -0.752265, -0.261660, -0.435479, 0.435807, -3.130022, -3.382930, -0.302572, -0.304755, -0.875783, 2.048395, -1.610556, 0.004490, 3.130698, 2.056630, -0.268167, 1.429440, -1.549552, 2.836768, 1.683631, 0.112271, -2.846099, -3.467909, -1.734015, -1.098224, -0.865726, 3.147281, -1.294847, 0.867880, -0.441601, 0.848505, 0.525628, 3.181535, 3.977831, -2.883601, 1.395165, 1.380046, -1.757408, 3.363719, 2.468548, 1.804424, -1.532946, -2.130015, 2.167905, 0.806237, 2.135587, -0.616036, 2.803680, -2.744563, 3.122983, 2.290845, -1.941239, -9.350484, -3.405049, -2.718211, 0.868880, -11.208566, -2.547097, 1.621465, 0.609452, -2.762423, 0.948383, 2.396591, -3.263968, 2.348562, -0.143500, -0.433124, 3.383017, 1.096289, 0.702994, 1.397233, -3.194034, 3.071927, -0.490006, 1.249623, 0.095233, 1.227319, 2.841045, 1.574195, 0.175490, 4.525906, 2.250069, 4.384015, 0.057987, 0.493614, 2.313399, 1.154731, 3.197756, -1.679319, 1.088407, -2.359752, -0.140352, 2.276346, 1.908259, -1.822850, -7.785120, -3.179761, -1.742176, 1.736056, 0.866745, 1.359070, 1.303184, -2.294070, 0.053626, 0.213622, -3.149813, -1.677805, 5.298675, -4.036638, -1.087946, -9.100941, 0.924279, 2.780944, 0.060562, 0.627181, -3.341824, -3.136374, 1.235826, 3.671900, -3.921594, -3.728671, -0.480427, 6.906900, 2.382487, 2.390993, -2.643352, -0.856610, -1.719015, -1.395070, -0.955716, -0.926107, -0.762820, -0.575991, 2.222496, 0.387985, -1.718496, 2.484186, -1.179515, -0.119917, -2.718995, -0.767698, -0.807503, 0.944953, -0.757398, -2.630700, -2.638915, -2.941369, -2.706702, 0.776204, 1.073829, -2.168399, 2.919386, 0.639097, 2.180970, -3.533285, -3.003216, 0.523227, -2.920606, -1.932676, 3.508812, -0.450955, -2.594667, -2.833239, 0.684165, 1.521983, -4.122576, 0.048962, 6.750394, -0.751528, 4.363237, 2.375494, -1.676940, -0.282620, -1.074734, 5.402434, -1.364364, 2.213356, 3.225933, 0.355178, 0.533272, -0.000414, -1.026736, -3.608583, 3.379501, 0.643868, -0.129584, -2.414845, 0.739444, 1.379362, 1.128999, 1.212591, 2.126804, -3.392489, 2.190962, -1.066917, 0.072771, -1.777203, 0.017392, -0.069404, 2.575250, -0.143043, -1.058601, 0.293741, 1.353118, -1.655753, -2.091888, 0.584350, -0.720455, -3.424674, -1.981546, -9.107293, 1.949826, -1.522261, -2.681044, -8.341246, -3.131934, -0.714103, 0.207412, 2.508620, 1.767241, 2.730778, -0.825572, -4.030791, 0.238825, -0.371033, 0.656116, 3.989294, 1.633813, 0.325307, -0.367170, 
3.002364, 5.156171, 1.634276, -0.971831, -1.604515, -2.526582, -2.878766, 2.807352, 1.909165, -3.935943, -1.386310, 3.170664, -1.649927, 2.220213, 1.246705, -3.735313, 0.272806, 2.752307, 3.109334, -0.102484, -2.823759, -0.632412, 4.712577, 0.411905, 1.390527, -0.006540, -0.578229, -0.902223, 2.687959, 0.999556, 5.464525, -1.069448, -1.659709, 1.730871, 3.159081, -3.505104, -1.294691, 0.470974, 2.520426, -0.375588, -1.659444, -2.902283, -0.643385, 0.652468, 0.084223, -0.908147, -1.238602, -1.975286, -2.141201, 0.736527, -1.523522, 1.027745, 3.354010, -2.870191, -2.455250, -0.858900, 1.902766, -1.251234, -0.778018, -0.829765, 1.343457, 3.192241, 2.146025, -2.868140, 1.453027, -1.014266, -0.626704, -1.431984, 1.272131, 1.996223, 2.216051, 1.321754, -3.712698, -0.555156, 1.665618, -2.150003, 2.469344, -2.743378, -0.621012, 2.153028, -1.323079, -2.781879, -3.508905, 2.613299, -1.131783, 1.250944, -0.726568, -0.350890, -0.510844, -0.491322, -0.969947, 0.056512, 2.073022, -0.684992, 2.324330, 1.388874, -0.592233, -0.007646, -2.144584, 3.341875, -3.342694, 1.472340, 1.924615, 0.260784, 1.455808, 2.527028, -0.212072, -2.648695, -3.802433, 2.224674, 1.380913, -2.425954, 0.170378, -3.111950, -2.677550, -2.053871, 3.144357, 2.069789, 1.802598, 2.778758, 1.854536, 0.374600, 2.614195, 0.576047, 0.823646, 2.809750, 2.347219, -1.944691, -4.136095, 0.170414, -2.563500, -3.029089, -2.152708, 0.650496, -0.629906, -6.837932, -0.400384, -2.685872, 2.135639, -1.354106, 1.782213, 1.624843, 0.522975, 0.623956, -3.265278, 4.157569, 1.269328, 2.334650, 2.017716, -1.755639, 0.314134, 2.929660, 1.085749, 0.075581, 5.186268, -3.722156, -0.716771, -0.292780, 0.854932, -1.265511, -1.754162, -2.801681, 2.599700, 1.192626, 0.355987, -2.978914, 3.488926, -7.477931, 0.302190, 0.159773, -1.786207, 5.423014, -3.205537, 3.759773, -0.824988, 1.123907, 1.247544, -1.165879, 4.103333, 1.861792, 0.432820, 2.554196, -6.796187, 3.449815, 2.632658, -0.926445, -1.542485, 3.184329, 2.700121, 1.650769, -4.734762, -0.469165, 1.700503, 0.710945, 0.347955, 3.436646, -1.542240, -1.133071, 2.124078, 2.539431, -5.082206, -1.449178, 0.902514, 0.330178, 0.733471, 0.341730, 1.352243, 1.385499, -1.848245, -2.310890, 1.524840, -0.910095, 3.168741, 0.410063, 0.614422, 1.316848, 1.423225, -2.094517, -3.665807, -0.068259, 0.396121, 0.212791, -1.601394, 1.471721, 0.772471, 2.288241, -0.320014, -2.428612, -2.265923, -2.968658, -0.796963, 5.248973, -6.998604, 2.486512, -4.433386, 0.621024, 0.518264, 0.806498, -2.155303, 0.587934, 1.328967, 2.631720, -3.549683, 2.099039, 10.083544, 3.482533, -1.516241, -1.431416, -2.913141, -0.618922, 0.779420, 0.236411, -0.174201, -0.076979, 0.220478, -2.811640, -2.158289, 1.648178, 1.729170, 1.846626, 4.329294, 2.369144, 1.389106, 2.679754, -2.294376, 0.030183, -1.699179, -3.449418, -2.010730, 4.066484, -4.480805, 2.193635, -3.104744, 3.195236, -1.664788, -5.720037, -1.266570, 3.600276, -3.170063, 1.536892, 1.569534, 1.513302, 1.860432, -1.550267, -4.262663, -2.133216, 1.650067, -1.284678, -6.684666, 2.014430, -0.744968, 1.426262, -1.076141, -0.831970, 0.365885, -0.528754, 0.758301, 3.348044, 0.187185, -1.889567, 2.917908, 3.023930, 3.023178, -2.817112, -3.179446, 2.497720, 2.188169, 1.301504, 3.268865, -1.754922, -1.822780, -0.972488, 0.873712, 1.323133, -2.205761, -1.612030, 2.303227, 2.813283, -4.188452, -2.182941, 0.085170, -3.564689, -2.566697, 0.568231, 0.818893, -1.662748, 1.361568, -3.690748, -0.252375, 0.785948, 1.647538, -0.423860, -0.906425, -1.824527, -2.521552, 0.867587, -2.148677, 2.452386, -1.397286, 
1.692210, -1.310338, 0.637889, 2.583428, -1.523493, 1.296878, -3.746370, 2.423034, 1.349175, 0.721364, 3.474914, -0.023369, 0.163030, -1.579503, 0.504772, 0.925550, 2.454133, 0.964500, 1.823297, 0.661305, 1.748695, 1.492398, 0.669027, -0.978568, -3.156673, 0.232972, -0.302810, 1.174894, 5.136952, 1.104236, -2.518059, 3.446657, 3.263038, 1.778748, 0.553110, -0.299201, 1.262304, -0.469016, 0.049288, 2.680809, 3.362026, 1.534753, 2.141226, 0.736711, 0.353176, 1.463385, -1.981672, 1.908352, 2.260112, -2.573204, 2.284262, 3.457025, 1.571169, 0.957621, 0.818471, 2.919568, -2.219922, -0.831442, 2.851260, -0.567659, -2.098528, -1.197574, -0.571810, -3.110029, 4.160790, 0.262709, 2.311606, -0.834847, 2.457886, -2.212020, 3.465058, -1.780708, -0.743038, 2.232378, -1.010238, -3.758393, 3.163526, 1.855109, 0.639158, 0.407191, -2.256367, 1.018602, 2.291389, 2.191854, 0.564081, 0.571482, 0.289375, 1.938669, 2.964486, -1.157557, 3.346216, -0.604904, -1.879804, -3.309341, 2.884830, 1.600047, -1.182005, 0.213239, 2.088271, -2.216653, 0.805338, 3.107358, 1.409771, 2.426908, 4.041484, 2.601931, -2.011261, 1.826019, -3.202612, -1.295027, 3.832757, -1.571364, 6.595190, 2.578595, 1.157802, -0.552183, 2.183033, 0.745915, -1.118431, 0.077764, 3.768408, -1.987089, 0.194293, 0.610980, -0.361441, 2.990337, -2.133753, 2.919108, -1.559100, 2.185312, -1.411286, 2.620373, 3.767267, 2.179324, -1.849180, 0.756302, 1.529904, 3.719940, -1.179238, -2.207529, -3.936229, -2.072637, -2.803760, -3.367758, 1.098284, -2.009740, 3.083918, -0.394910, 2.485883, 0.113272, 2.655967, 1.349244, -2.255511, 1.025767, -1.384176, 1.732267, 2.225488, 1.098336, -0.509963, -0.263632, 2.146725, -0.957762, 1.923358, 0.133071, -3.363287, -1.703886, 1.032311, -0.755408, 2.498776, -2.764693, 1.509607, -1.617618, 1.436502, -1.559158, -0.746124, 1.562070, 17.831846, 4.211090, -1.706081, 2.346309, -3.273780, -2.834237, -3.982527, -1.515350, -3.250439, 0.453186, 0.603446, 3.211556, 3.159271, 0.628849, -4.001279, -0.854389, 3.856479, -3.991511, -0.177681, -2.286148, 3.187693, -0.223602, -0.075862, 2.957560, -2.254858, -0.967292, -3.214899, 2.144798, 2.144349, -4.565749, 3.582543, -0.960689, 2.410637, -2.425848, 2.851839, 1.097054, -0.605512, -1.531101, 0.845345, 1.495862, -0.475145, -1.574789, -0.029720, 0.645808, 3.104814, -2.337604, 0.435249, 1.841378, 0.299883, 2.139250, 2.561849, -2.265870, 2.402302, 0.791734, 4.197929, 0.943617, 3.297447, -2.381050, -3.254916, -3.195335, -2.113678, 1.651127, 0.783532, 3.214834, 0.408355, 3.205199, 1.115050, 2.330830, -2.771334, -1.927293, -1.882555, 7.298583, 1.038048, 3.148403, -2.869358, -1.115117, 0.074848, 4.928357, 2.089816, 1.986463, 0.470056, -6.525270, 1.073113, -4.135253, 0.774483, 1.268632, -2.197856, -2.068014, 4.605621, -2.308162, -2.780638, -0.721838, -4.479808, -3.508407, -2.428396, 4.441323, 1.775349, 0.214881, -1.635818, 3.045053, -0.610118, 1.187438, -1.759295, 3.817451, -1.664751, 2.842919, -2.662443, -1.073685, -1.129181, 0.843486, -0.775921, -2.891870, 2.612444, -3.581708, 0.276234, -1.549791, -0.161535, 9.922145, -2.840818, -1.211635, 2.635808, 6.841765, 0.938128, 0.082862, -0.197744, 0.354956, -2.333372, -0.819113, 0.838161, 5.306095, 1.898514, -0.049039, 2.551995, -0.244460, 3.227154, 0.455535, 2.356509, 1.989387, 1.096240, 1.646904, -4.098421, 0.545672, 1.364193, 1.341040, 1.556323, 5.145891, 2.229922, -1.214043, 1.089752, -1.778068, -2.560244, 2.266327, 1.981164, 2.707444, 2.781528, 0.979474, 0.972002, -0.075105, 0.934945, -0.866869, 3.448736, -1.561079, 1.061687, 2.550811, -0.469708, 
4.136203, 3.849202, -1.115006, -1.362522, -0.477934, -0.217236, 3.059588, -1.913679, -0.223996, -2.238563, -0.246944, -2.901762, 2.800288, 0.591907, 2.385238, 1.304711, 1.502628, 0.314458, -0.560291, 2.228894, 4.770694, 0.590195, 1.130904, -0.154598, -2.602068, 2.284669, -2.444100, 2.262833, 2.987651, -1.718937, 0.172004, -4.404370, -0.713307, 3.951679, 1.743177, 2.184051, 2.084052, -1.200920, 3.804940, 1.190594, 0.423731, 0.387230, -2.698928, 4.313253, -1.562275, 3.057487, 1.332809, 3.277320, -0.025386, -7.029213, 2.271002, -1.368629, 1.512957, -1.627040, -1.321970, 1.505093, 2.593931, 2.138204, -0.703714, -2.467005, 2.581830, 0.924391, -0.011639, 0.399110, -1.663732, -1.625977, -2.453845, 1.651928, 2.514879, -1.845028, -0.415811, -2.509327, 2.051333, 1.113616, 0.354290, -3.648528, 2.132520, 1.173429, -1.130877, 5.016101, -0.490695, 3.052016, -3.462008, -1.558317, 2.028475, 2.340829, -2.054028, -2.028140, -1.076622, -3.786443, 2.211594, 0.838984, -4.107341, 1.741582, 2.438528, -1.545592, 0.031897, -4.673401, -1.738998, -0.626893, 1.342387, -0.145155, -1.596622, 2.264874, -2.561344, -1.419518, -2.779258, 3.840589, -2.814966, -1.121065, 3.103562, -1.434957, 1.904631, -0.152767, 3.093784, -0.006062, 1.727092, 3.195707, 1.766008, 2.366390, -2.307591, 2.373812, -0.476952, 1.527025, -0.465218, 2.363796, 1.407915, 2.833245, -2.019148, -0.643388, 2.679688, 5.222930, -1.854012, 2.572557, 1.569281, -0.477092, 1.902260, 3.180279, 6.306729, -0.274454, 0.718802, -2.418252, 4.774065, -1.660186, 0.724020, 1.382207, -0.139734, -1.164008, 4.906361, -2.018308, 3.283710, 0.376118, -0.720316, -4.012407, 0.609760, -0.314082, 1.398531, -0.268018, 1.786567, -2.602562, -0.330188, -4.301880, 0.335341, 4.711812, 0.695186, -2.297723, 0.472864, 0.569482, 1.624462, 2.681783, 3.021969, -0.104063, -1.222748, -3.950817, -0.500057, -0.442732, 1.211303, 1.949326, 1.100488, -5.104456, -1.600076, 1.789613, 0.082084, -0.346776, 0.059769, 1.983953, 2.033277, 0.305617, 2.394171, 1.896901, 6.016169, 0.465130, 2.148189, -3.048455, -1.015694, -0.439929, -4.321608, 2.767549, -2.717652, 0.469900, -0.449218, -1.194940, 0.407131, 0.130723, -0.135469, 2.412801, -0.450518, -1.383515, 0.950040, -1.357381, 3.228314, 3.361074, 0.332888, -5.870872, -2.020130, -0.178779, -1.861960, 2.273883, 0.218612, -2.976734, -1.733326, 1.392309, 1.946056, 0.799269, 1.524984, 2.629119, 1.725265, 1.525396, -2.106735, 2.488540, -3.239527, 0.826539, 0.666062, -1.867959, 0.354207, -1.698327, -2.927954, 2.341576, 0.872006, -2.815492, 3.722144, 2.325369, 1.456939, 2.497020, 1.038249, -1.285768, -1.675847, 0.893679, -1.395089, 0.595987, -0.750167, -0.563275, 1.091686, -2.418337, -1.896940, -1.411161, -0.380414, -0.252327, 0.983415, 3.090965, -3.424773, 1.611721, -1.380765, 2.076887, -1.467507, 1.811858, -3.494227, -3.977528, 1.935094, 0.442732, 4.073760, 0.244701, 3.578159, -1.013995, -2.507242, 4.465893, 1.057177, -0.537271, -4.829791, 0.020822, 1.384580, -0.917904, -0.844748, -2.344644, -0.291398, -1.531452, -0.735474, -2.581677, 1.842135, -0.054317, 2.477489, 0.283114, 1.382037, 1.793713, 2.417314, -3.699762, -2.272878, -3.177811, -0.718206, 1.148291, -1.529859, -0.163814, 2.781793, -2.223887, 3.794726, -1.517594, -1.921671, 3.090774, 0.501923, -1.972393, -1.791274, -2.481785, 0.448921, -2.090209, 0.220071, -1.382417, -0.468002, -3.140492, 0.560268, -3.443594, 0.177877, -1.951589, -2.809496, 0.517052, -1.871615, -2.020593, -2.291572, -1.610467, -0.813363, -3.394322, -2.246128, 1.871319, -3.381363, -3.987760, -2.516682, 0.588713, 3.890212, 2.469473, 0.362648, 
0.561302, -1.753007, -1.627349, -0.349749, 2.216681, 0.383506, -2.713041, -0.593570, 3.251638, 2.689493, -3.748345, 0.691237, -0.569738, -5.734797, 5.413714, 2.996217, -3.365990, 1.122713, -0.547549, -2.552277, 3.383361, 5.260242, 2.122134, -0.637885, 2.626056, 1.089457, -2.199468, -2.468286, -0.702738, -0.091756, 0.488842, 0.760422, 0.146176, -1.661150, 2.917534, -1.514377, -0.663864, 2.496569, 1.366415, -2.151349, 1.885142, -0.772022, 2.252527, 2.092843, -2.031601, -0.813497, 0.407781, -3.519864, 1.166523, 1.347374, -2.144313, 1.894274, 2.548582, 2.886257, -2.012259, -0.423815, -1.669818, 4.414027, -2.174315, -0.024239, 3.345891, -3.157781, 3.547366, 3.083466, -2.391912, -2.350487, 1.912315, 2.167852, -1.419118, -0.530295, 1.211723, 2.307965, -3.893567, -1.757907, -2.166578, -0.312290, -2.318677, 2.761449, -2.065255, -2.718894, -5.395889, 1.222173, -1.110870, -1.290285, -1.207875, -0.060934, -2.584382, 1.737085, 1.880211, 2.757859, -2.688734, -2.082012, 0.599087, 2.687817, -2.736868, -2.483763, 1.212993, -1.285007, -3.371017, -4.127361, -2.956272, 1.772995, -1.661328, 1.707176, 0.474840, -0.346689, 1.808019, -1.051578, -3.156462, -3.182742, -2.470137, -0.530100, 1.256876, -3.449591, 1.078454, 3.284560, -1.910391, -0.292548, -1.595873, -0.838040, -0.740394, 1.511373, -3.185318, 0.501699, 2.445569, 1.670159, -2.597500, 3.368602, 4.472101, -4.204967, 4.533583, 0.047076, -3.118933, 1.523812, -1.076397, -3.584929, 1.941660, -2.212678, -0.267279, -2.351005, 3.205257, -1.552729, -2.789754, 2.306149, -4.425440, 2.926135, -1.727820, 4.657523, -3.581824, 1.874127, 2.977703, 3.189226, -1.316201, -2.322132, 0.326548, -2.243823, -0.581202, -2.148850, 1.752387, 3.638435, 1.993092, 1.322640, 3.349422, -4.158694, 2.212160, 3.103670, 2.673969, -1.280546, 1.500715, 2.859776, -0.207381, 4.820920, 2.770439, 1.037140, 2.432384, 1.314973, -3.747844, -0.571894, 0.404972, -0.784428, -4.287580, -0.850005, -0.596455, 3.271060, -2.868608, 3.010384, 2.683059, 4.961196, -1.193018, -2.061228, 0.384417, 0.839436, 1.829330, 1.032372, -2.438330, 0.873286, -5.543126, -1.238505, 4.828424, 4.276352, 2.086277, 1.759339, 3.404096, 1.391880, -2.645986, 1.286137, 2.212531, 2.400476, -1.635331, -1.826915, 1.386136, 0.734565, 3.339298, -2.710637, -3.658201, -0.764404, 3.978921, 1.805965, 0.612052, -0.069873, 1.873704, -0.291247, 0.808841, 1.046961, -2.339138, 2.548506, 0.116501, -1.358457, -2.428479, 0.405364, 0.049829, 2.078218, 0.630792, 2.682093, -0.111304, -1.323117, -2.635784, 1.362048, 3.085995, 2.625044, 1.898022, 2.371150, -0.770906, -0.275337, -1.812634, 0.451130, 0.030746, 1.921646, 2.202017, -0.176585, 0.895559, -0.876122, -3.598070, 0.757027, 0.715836, -3.268029, -0.826421, 1.655850, -0.267985, 0.701595, 0.129097, 1.087256, -2.069307, -1.865456, 1.049002, 1.045302, 1.117406, -0.491220, 1.752171, 0.180681, 3.506818, 1.705417, -3.404651, 2.519688, 2.092036, 2.918577, -1.644121, -0.248030, 0.368383, 1.397353, 0.468383, 2.009109, 2.615247, 1.785939, -2.678337, 1.617231, -1.236114, -0.461577, -1.838622, 1.968311, 1.328617, -2.911062, -1.685913, -1.801324, 1.286906, -4.115755, -2.113906, 9.896759, 0.374081, -0.388752, -1.223068, -3.063830, 2.229397, 2.612700, -1.033353, -1.022675, 3.386715, -1.121697, 1.809654, -2.561433, -2.942688, 2.570541, 2.310937, -1.491891, 0.108844, 1.215748, -1.947517, 0.629480, 0.692682, -0.667800, -2.369210, 1.485672, 0.038969, 0.029837, -0.422000, -0.252593, -0.688025, -1.452450, 4.435425, 3.307030, -1.265886, 6.128372, 1.978850, -3.929060, -0.298456, 2.120240, 3.654680, 1.866614, 2.054912, 
-1.318845, 1.181852, -0.038600, 3.942681, 4.401767, -1.473351, -0.744864, -2.225775, -0.871413, 1.849269, -1.780350, 2.124613, 2.299182, -0.689189, -0.497090, 0.518188, -2.052712, 3.142647, -2.284230, 4.709138, -0.259247, -0.405531, 1.163305, 1.855179, -0.130810, 2.581027, -3.659903, 2.676016, 1.587614, -2.458660, 1.384374, -0.823230, -1.355044, -2.172476, 1.655872, -0.893984, -2.042878, 1.969854, 0.140095, -5.839048, -1.551351, 0.726457, 0.911761, -0.065500, 0.825390, 1.000783, 2.737443, -4.026385, 3.902370, -1.769917, -0.966249, -0.816267, -0.570867, 1.182167, -3.117018, -2.331257, -2.639359, -3.660809, 1.104480, 1.201332, 0.084591, -0.062481, -1.295772, 1.735669, -0.666152, -0.181148, -0.707571, 6.112964, -5.684958, -1.611841, 0.320344, 1.758454, -2.201783, -0.981238, -2.890517, -4.545226, -2.521300, 0.036706, 0.128801, -2.098386, 2.333826, 2.365282, -2.611004, 2.553960, -1.038355, -5.702994, -2.997832, -2.794518, -2.389665, 2.133088, -1.820822, -2.890388, 2.260902, 3.086394, 0.570292, 1.511309, -3.697427, -0.810149, -0.843711, -1.795822, 0.495115, 2.823005, 1.649202, 8.319246, -0.411636, -2.392526, 2.539201, 2.370214, 4.756253, 0.828395, -1.034127, -2.109726, -1.572361, 5.465284, -3.657965, 4.423150, 2.149361, -7.255781, 3.108456, -2.962606, 2.280480, 1.510164, -0.990311, -3.053618, 2.630822, -1.767082, -0.539244, -2.688886, -1.095631, -0.003539, -0.584830, 0.941815, 1.863402, -0.325740, -0.093675, -0.985750, -2.204835, -2.334163, 0.659979, -1.940781, 0.892017, -0.047762, -0.801082, 5.068970, -2.416367, -0.054956, -0.609716, -0.925279, 3.985359, -2.428799, 1.385789, -2.310544, -0.390019, -3.638839, 1.571243, 0.829102, -1.987846, -1.974930, 0.148836, -2.629555, -1.189142, -1.211454, 1.648244, 1.809920, 0.444209, 0.857393, -0.552437, -1.938646, -1.704941, 1.153294, -7.150531, 2.521330, -4.333313, -1.203517, -6.791515, 5.610275, 2.526631, -2.193799, 0.316134, -2.975452, 0.043736, -2.145086, 0.947292, -1.770846, -1.070288, -1.937447, 2.345026, -3.003102, -0.961535, 4.269505, -1.473615, -2.028548, -1.439545, 3.920681, 2.086540, -0.087132, 2.692583, -1.969973, 0.640000, 0.759344, -0.851917, -2.110132, -1.116281, 1.290830, -0.036435, 1.081315, 3.280384, 1.031457, -3.773770, 2.426805, -2.144011, 2.387061, -1.523329, 1.364333, 0.632966, 1.254096, 0.397885, 1.691442, -0.060497, -2.736957, -0.955565, -1.479424, 1.189433, 4.488828, -1.039956, 4.110622, -4.271568, 3.211975, -0.931945, 2.384370, 1.593602, -0.530324, -1.894910, 0.872313, 0.775410, 0.611552, 3.815946, 5.441214, 2.328136, -3.694026, 0.856822, -0.358736, 1.209829, 1.578911, -1.869051, 2.328872, 1.902640, 0.171978, 1.517722, 0.237618, -2.074708, 3.189997, -0.351501, -3.595257, -3.138206, 0.722451, -0.521091, 2.513267, 0.863403, -3.036732, 0.966440, 0.639682, 3.953517, 1.480138, 2.872857, -1.725542, -0.765049, -3.056712, 0.961703, 1.613330, -3.012677, 1.984263, 0.603873, -0.146085, 0.401721, -1.095984, 1.858853, -2.949477, -0.214432, 0.300439, -1.510662, -1.520183, 3.227612, 3.277660, -0.816754, -1.828793, 1.171779, 2.029778, 1.869878, -2.729131, -2.555247, -3.633353, -3.591051, -1.858859, 2.480379, 0.767383, 3.103071, -1.344633, 2.634771, 4.324559, 0.995620, 1.743588, -4.545441, 3.179331, 2.600043, -1.224859, -0.539547, -2.826778, 2.330503, -2.919528, -2.707401, 3.279189, 0.743885, -1.030735, 1.944464, 1.497105, -0.072477, 9.878094, 0.991437, 1.252940, 0.908459, 2.009145, 2.427585, -2.123645, -2.284483, -0.918611, 2.746569, -2.033646, 0.837004, -1.525458, 0.870853, -3.641463, -2.634501, -0.521927, 0.874221, -1.667083, 1.903460, 3.153025, 
-1.037110, 2.270567, 1.085054, 0.564745, 1.921359, 2.154752, -1.015654, 2.087633, 3.454697, 1.214256, -1.918541, -1.607991, 1.002951, 1.865858, -0.964412, 2.511755, -2.399036, 1.691220, 2.473380, 0.841120, 6.731910, 2.276564, -0.619042, 0.109877, -1.801440, 2.544270, -3.653917, -0.909440, -1.673509, -3.541651, -1.475499, 0.974824, 1.341343, 3.371499, 2.327110, -4.265070, 2.730826, 6.252456, 0.442818, -2.496783, 0.133818, -1.569381, 1.881387, -1.776341, 0.944535, 1.976379, 1.078518, 0.454756, -0.553306, 1.956321, -2.193647, -3.906279, 0.997558, -2.133405, 0.813385, 0.066720, 0.675073, -1.305464, 1.049415, 0.058389, -3.035835, 0.971060, 1.673939, -1.002198, -2.784204, 1.691542, -22.129820, 0.332251, 1.568601, -1.923303, 0.294346, 1.423302, -2.935637, -1.775013, -3.467926, 2.910514, -1.682199, 1.219355, 0.793816, 2.940609, 2.325723, 1.023719, -2.592135, -1.507832, -1.641595, 1.456100, -1.044729, 4.106136, 2.221826, 2.339298, 3.302754, 1.938588, 0.728833, -1.555544, 2.474434, -0.518467, 0.533135, 3.680743, 1.677448, 2.104536, 0.646286, 0.555699, 2.477109, -3.550225, -3.647983, -2.079800, 4.112247, 2.804218, -0.919420, 1.293995, -3.565257, 0.565476, -1.367818, -1.903385, -1.448610, 0.866282, 2.322896, 1.147820, -0.627827, -1.059485, 4.482340, -2.255801, -6.158365, -1.201316, -2.617374, 0.849946, -3.063239, -0.336349, 3.439678, 1.400132, 0.616048, -2.214755, -0.108316, 1.013055, 1.614219, -2.119207, 2.570008, 1.538140, -1.337971, -2.840325, -0.002898, -3.976804, -0.981789, -2.240550, -1.108801, 2.162783, -0.593558, 0.575897, 2.254664, -1.405173, 1.071949, 2.099885, -1.849970, -0.874483, 2.513324, 2.759261, -1.591699, -2.328701, 1.031207, -1.108524, -12.576934, -0.212726, -0.139737, 1.239497, -1.242406, 1.892567, -0.104317, -1.160928, -0.283431, -1.366523, -0.203520, -0.691903, 1.787807, 0.201596, -4.096425, 2.446693, -3.331512, 2.033213, -3.020568, -1.573980, -0.388636, -1.998314, -1.966732, 1.677706, 3.899530, 2.133426, 0.836722, 0.754416, -0.003854, 0.806098, -0.711075, -0.452569, 0.355189, 1.279910, -1.450249, 1.334980, -0.518858, -3.259822, 3.693094, -2.071981, 0.501570, 0.056253, -2.921665, -0.392897, -0.867844, -0.986845, -1.664861, 1.737591, -3.061367, 0.302886, 2.094111, 0.045325, -1.595525, 0.674877, -1.471849, -3.893967, 1.412523, 1.773032, -2.750422, 1.035602, -0.516791, 0.286685, -0.128167, -3.022458, 0.957886, -0.954299, -2.147751, -0.348507, 1.870028, 4.274771, 3.953058, 1.525810, 4.137014, -2.523398, 4.372852, 0.338937, -0.623839, -1.676739, 9.756101, -3.779910, -1.706488, -2.624722, 1.853259, -1.887885, 0.109111, 2.570731, 1.300253, 0.520757, 0.706603, -4.339977, -5.510975, -3.262259, 0.894060, 0.797520, -1.387151, 1.952152, 3.178773, -0.055920, -0.988293, -2.944663, 0.906312, 1.589234, -1.910973, 1.550346, -3.043045, 1.202343, 2.156839, -3.822891, 1.310066, 0.109305, 0.393427, 0.848353, -0.702017, 1.135580, -0.544394, 0.898604, 2.183110, 2.875517, -2.742109, -2.914018, 2.957861, 1.904441, 0.188540, 3.428993, 4.164488, -3.325378, -0.339448, 1.521152, -0.671186, 0.412715, -0.244523, -0.876270, 1.905240, 1.004341, 1.861009, 3.023627, 1.829536, -0.710838, 2.065416, 3.122601, 2.638688, -2.839584, 2.718982, -0.912577, 3.302638, 2.473169, 0.684764, 1.555684, -1.825787, -1.142594, -0.022323, 3.818839, -3.767341, -2.613019, 0.704796, 2.798040, -5.006337, -0.892998, 2.696245, -2.926975, 2.355693, -0.578362, -1.544222, 3.422838, -2.041148, -0.578029, 0.011704, 1.837247, -1.490923, 2.378649, -2.392335, 3.045223, -3.292506, 0.487439, 2.134249, -0.827777, 0.996064, -1.616960, -0.817308, 
1.334069, 3.380960, 2.541771, -0.265344, -0.917402, 2.407108, 1.253521, -1.562716, -1.517113, 2.796017, 0.775317, 0.575597, 1.323530, 1.529689, -16.116417, -2.998111, 0.450969, -1.814111, 0.349480, 2.782877, 1.295974, 1.560289, 1.718585, 1.898549, 2.660482, 0.864414, -2.238849, 0.737516, -2.422750, 0.247123, -2.743533, 4.227241, 0.526669, 0.501835, -2.821635, -4.300069, 1.813544, -1.345552, -3.812800, -0.391366, -1.456246, 2.738957, 1.825692, -2.190766, -3.353376, -1.940057, -0.862514, 0.672473, -1.073232, -4.133321, -0.304357, -2.691906, 4.606014, 2.997205, 1.211570, -1.067043, 0.287098, -2.027639, -2.065545, 3.585075, -1.943740, 1.839725, 0.222802, -4.045444, 0.650917, 0.830172, -2.850742, -2.450385, -0.215431, 1.970153, 3.219507, 0.619687, -0.603187, -2.238926, 2.604592, -1.533692, 1.733842, 1.440334, 0.821643, 1.787324, -3.166633, 0.036767, 5.421640, 1.393930, 3.653860, -1.986755, -3.910746, 1.559711, 0.495545, -0.567174, 3.627657, 0.452299, 0.548954, 4.050268, -3.818210, -0.528180, 1.463299, 1.876337, -3.187659, 0.062530, -3.048895, -0.426209, 1.873657, -0.866349, -2.653108, 3.005883, -3.116135, 1.542959, 0.393652, 2.553407, 0.303429, 2.347852, 1.130731, -1.145539, -0.604549, 0.910890, -2.142335, -0.729891, -2.702992, -1.919118, 0.967466, -2.112257, 2.792009, 0.179999, 2.758096, -1.864986, -3.021833, -0.889339, 1.378984, -0.177384, -1.409245, 1.099375, -0.886126, 1.222357, -1.046859, -1.496866, -1.473668, -1.838376, -2.917238, 0.983777, 2.499777, -1.375984, 2.721064, -2.649018, 1.615053, 0.037384, -2.364583, -0.474144, -1.747913, -2.922879, 0.293949, -1.100694, 1.720940, 2.906822, 0.333392, 3.154211, -2.480198, 1.405208, 0.162729, 1.072986, 1.403922, -2.553919, -1.325863, -0.578505, 1.225022, 1.082428, -1.728628, 2.811140, 3.167954, 2.595941, -0.468482, 0.676855, -3.399291, -2.395138, 2.171390, 5.313373, 0.593171, 1.174135, 1.041957, -3.592760, -1.115999, -1.609447, 0.996160, -4.086498, 1.880493, 2.586645, -1.809857, 1.197866, 3.047547, 2.049238, -5.638014, 3.052633, 1.992575, -0.746054, 2.904683, -0.634189, -1.680446, 6.441619, 0.607398, 1.153746, 0.197727, -0.397482, 3.676062, 4.964014, 3.061106, -1.133064, 0.068444, 0.732943, 1.284233, 0.121063, -2.615230, -0.237515, -0.004988, -0.984315, -0.205606, 0.886206, 2.857014, -1.104957, -0.110875, -1.748340, -1.275044, 0.566729, 2.455837, 0.216097, 1.074522, -1.081340, 1.013576, -0.510011, 0.004325, -1.918031, 2.226862, -1.485101, 1.199568, 3.146095, 2.427711, -3.011320, -2.449924, 1.329107, -1.326373, 0.970822, 0.849875, -1.601499, 2.314740, -3.488747, 1.154833, 0.087346, 0.803074, -0.903432, -3.312709, 2.299358, 2.803974, -3.286255, -1.573510, 2.070856, -0.101941, 2.171456, -1.329902, -3.907687, -3.720195, -2.571179, 2.813491, 1.830530, -2.511291, 1.457561, -2.576567, -2.362876, -1.616081, 1.032160, -1.181725, -0.276253, 2.621583, -0.266221, -0.139193, 0.748835, 1.204025, -1.065338, 0.172388, -0.772580, 1.074110, 1.518136, 1.267711, -1.981061, 1.981899, 2.800466, 2.231948, 0.890822, 1.847108, -2.885270, 1.300715, 4.157722, -3.683444, -3.738341, -1.370671, -0.604141, -1.896416, -3.395854, 5.339676, -3.188531, -1.775950, -1.851381, 1.067175, -1.844483, -1.300046, 3.536031, 1.521053, -3.789303, 0.140510, 1.047382, -0.060062, -2.875290, -1.034876, -1.860420, -3.490496, 0.724952, 0.459753, 2.903262, 1.382170, -1.717902, -1.420321, 3.068552, -2.760029, 0.564609, 1.620253, 0.562308, 0.044785, 0.964442, -2.613660, -1.785604, -0.199799, 1.718101, 1.127491, 2.559109, 0.299823, 0.551968, 2.438937, 0.701297, -0.019573, -2.164789, 1.922659, 
-0.866623, 0.326152, 2.629571, -0.972274, -4.290656, 4.196831, -3.415634, 0.356076, 3.115250, 1.733862, 1.982695, -3.026798, -2.023067, 2.122068, -2.348504, 0.413962, 0.148813, -3.872636, -1.810721, 0.762815, 0.159291, -0.137686, 2.732738, -1.524898, 3.947837, 2.365946, -2.701674, 1.253832, 0.194691, -0.156369, -3.185858, -1.945729, 2.551816, 3.177832, -0.077237, -2.811372, 3.560195, -1.534038, -1.941651, -1.060778, -1.237860, -2.827729, 1.169655, -1.817541, 1.247796, -2.044961, -2.042051, -0.980262, 0.946876, 1.271373, 3.471315, -2.489898, 2.386957, 0.589401, 0.960726, 4.046332, -0.806419, -0.904894, 3.152041, -5.319210, 2.374078, 4.134824, 1.937350, -0.497493, 2.662346, 0.755571, 4.101086, 0.119435, -2.297841, 1.126628, 1.798876, 1.621213, 3.012361, 0.051148, -2.327565, 0.777902, -2.543133, 0.416235, 1.116755, 3.242164, 1.235808, 0.284160, -1.247335, 1.107888, -1.079982, 4.659342, 2.768052, 1.012518, -3.917892, 0.223945, 0.545874, 4.944964, 2.193990, 3.037240, 1.796157, -1.105980, -2.231532, 2.658256, 2.005485, -3.134518, -1.135793, 0.886341, 2.554361, -2.021382, -2.364516, -0.521016, 1.373525, 1.369994, 1.462440, -0.309788, -2.899583, -3.485990, -0.327034, -0.483858, -2.365628, 0.571344, -3.879119, -0.263073, -0.500833, 0.611228, -1.481678, -2.642653, 1.774129, -3.965195, 2.924938, 0.561954, 2.093593, 2.242171, -3.799051, -0.805658, -2.153137, -0.395024, -3.239681, 2.431444, 0.556913, -3.113872, -2.407112, 0.681333, 1.893931, -2.904344, -0.771003, -1.218895, -1.536474, 2.928406, -4.104526, 1.685589, -1.799101, 2.594079, 1.383285, -3.314305, 3.277896, -3.116689, -0.780264, 2.376668, 2.787229, 1.384615, -2.993711, -3.415761, 3.395609, -1.612519, 5.627576, 3.240973, 0.276432, -2.974000, 5.787770, 2.183374, -0.118661, 2.462688, -1.815760, 3.092844, -1.904415, -1.680902, 3.111355, 2.789873, -3.434824, -2.328745, 4.971395, -0.408980, -1.288155, 1.367070, 0.403338, 0.400134, 1.119727, -0.911876, -1.399861, -3.973023, -2.791090, 3.630412, -1.710031, 2.134027, -0.959766, -0.329367, -0.732914, -0.993124, -2.181825, -1.024484, -1.577234, 1.415553, 1.184569, 0.033538, -0.182342, -1.553549, 0.714704, -2.457281, -1.513178, 1.668061, -1.185722, -0.021128, 0.509903, 0.016285, 2.772515, 3.466581, 2.642076, 2.103426, -0.777547, -3.492056, 0.086423, -1.734952, -2.397410, 0.079009, -0.129437, 1.052238, -0.897817, -1.193079, -2.432753, -0.610183, 0.759082, 2.856251, 0.527911, 2.582767, -1.680882, -0.903450, 0.290197, 1.900693, 3.820286, -1.310768, -2.894516, 0.749423, -2.294792, 1.678832, -1.254640, -2.198882, -1.182828, 2.383785, -2.344207, -3.577337, 1.876101, -3.005041, 3.585013, -0.243899, -2.782204, -0.227220, 0.830313, -0.217250, -2.924293, 1.900393, 1.335294, -0.480179, -2.973958, -3.055072, 0.840056, 3.724781, -0.353757, 0.307534, 0.540906, -2.654098, -0.348129, -0.920658, 2.132657, -2.957996, -3.357490, -1.639126, 0.272579, -1.003739, 1.129556, -0.414330, -1.633175, -2.019671, -1.955855, 1.487665, -1.117603, -2.708709, -0.485532, 1.924051, 2.907635, -4.480110, -1.866212, 1.161165, 1.436610, 0.408536, 0.190726, 1.837151, 3.994542, -1.868298, 0.735382, 3.357700, -3.131468, 1.535451, -2.268453, 1.079755, 1.087334, 2.349835, -2.999949, 0.427312, -2.889404, 1.677031, -0.963640, -0.438927, 3.972819, -0.491260, 3.205900, 2.318801, 3.833720, 1.883632, 0.465372, -2.379404, 2.530885, -2.490902, -1.817383, 2.394443, 2.693515, -2.303585, 2.991621, -0.318521, -2.124806, -1.568704, -1.105953, 1.689304, -1.503239, -0.745483, 1.219584, -0.008324, -2.262623, 2.536577, 0.300485, -0.126827, 0.379301, -2.164514, 
-1.050668, 2.279346, -0.929581, -2.010704, -1.761641, 0.769421, 5.870451, 2.409795, 1.549106, 0.406771, -2.387406, 2.617296, 2.226446, -2.372685, -2.909598, -4.472449, -4.199341, 0.783240, 0.096103, 0.230260, 2.392704, 0.130306, -1.671189, 0.000460, 0.859836, -2.856006, 0.879900, 2.923421, -3.459907, -2.481248, -0.380551, -2.122747, 2.509279, 1.280891, 1.923709, 0.026194, 0.975075, 0.535158, 5.102134, -3.227482, -0.724829, -0.011868, -2.003006, 0.030337, 1.065228, -0.093162, 4.732137, -2.835727, 2.164323, -3.163384, 1.464387, -9.556238, 3.294225, -1.643234, -0.544956, 1.026036, 1.801332, 0.531682, 0.875548, 2.275990, -5.821035, -2.165171, -2.959899, -3.164179, -0.533030, 0.882995, -3.908379, -1.524249, -3.931026, -1.867308, -7.501748, -3.151053, -3.374834, -1.482490, -0.023572, 4.518243, 0.564633, 4.339701, 3.183874, 4.368966, -1.674653, -1.995406, -1.902929, 2.051385, -1.286106, 2.168142, 1.894793, 3.113796, 16.570700, -2.124722, 0.712696, -8.329745, 1.154208, -1.029663, -1.493598, 0.788374, 0.373883, 2.354367, -2.372883, 2.818204, -3.362396, 2.617382, 4.474661, -0.396233, -1.267438, 2.482839, 0.489908, 2.244136, 1.239966, 4.426870, 5.876467, 5.865771, -2.613004, -0.572432, -0.640416, 1.465602, 0.965603, 1.523670, 3.018555, 2.185856, -0.685083, 1.380377, -1.892750, -3.034616, -1.156111, 0.307927, 4.088897, -0.267865, -0.748823, 2.313815, 1.472056, 3.863946, -1.217377, 0.140903, 2.896098, -1.744379, -1.723044, -0.629762, -0.662425, -2.445518, 0.152551, 1.374967, -1.386010, -2.313100, -4.004970, 1.415225, -4.312975, -1.410863, -2.737320, 1.355401, 0.709947, 1.922767, -0.346310, 0.500168, -0.085046, -3.296778, 1.980763, -1.139568, 1.725973, 3.104150, 2.869179, 0.393733, -0.193732, 3.668149, -2.017278, 0.133616, -0.618817, -1.574684, -3.663116, 0.169122, -0.268842, -1.601942, 0.325557, 1.805874, 2.177101, -2.602196, -1.913941, 0.011868, -0.395234, -0.834341, 1.420007, 1.540699, 3.013688, 1.357066, 1.180240, 1.042086, 2.019283, -2.652542, -2.059645, 0.797553, -0.451454, 2.375256, 2.512674, 1.454196, -2.217665, 0.813907, 1.702392, -0.050121, -6.994725, 5.303644, 1.784331, 0.242761, 1.062950, 3.647518, 2.466606, 0.988599, -0.931017, 2.402037, -1.814979, -0.855702, -3.275341, -1.074031, -2.235515, -1.613209, -1.744707, -1.840088, -0.779895, -4.490575, 0.831230, -0.156800, -2.528859, -3.127295, 2.660365, 1.944716, -0.364012, -1.861313, 3.323568, -0.811558, 0.169003, 2.853401, 3.639811, -0.011596, 1.986325, -1.664246, -4.615153, 2.271653, 1.025806, -3.154962, -2.319878, 2.225085, 0.531775, 0.322223, 1.468869, -0.566937, 1.309493, -1.211456, -3.419137, -1.585997, 1.178995, 0.171657, 3.465562, -0.804610, 1.665783, -5.088279, 0.650534, 1.791170, 17.250408, -1.626894, -1.974006, -0.824130, -3.331557, -3.880378, 0.599182, -0.694505, 4.215900, -1.447300, 2.230931, -0.596584, 0.340238, 4.081385, 3.625847, 2.839355, 3.756844, -2.777805, 0.100937, -1.989295, 3.277542, -4.379092, 3.087123, 1.403637, -2.231328, 2.390607, 1.557257, 3.298607, -0.299247, -1.181531, 0.525384, -0.890912, 1.824297, -3.995151, -5.042112, 0.744427, -3.067411, 1.595778, -2.571892, 0.771376, -3.277676, -1.455596, -0.250049, -1.351680, 0.697943, 1.069591, 2.857038, -0.159182, -0.362470, 1.539649, 0.099324, -0.135235, 1.614402, -1.260010, -0.059322, 0.792264, -3.511580, -0.926815, -0.584613, 3.695359, -1.960196, 3.956642, -2.118308, 1.945329, -3.706782, -4.705505, -1.869684, 1.638058, -4.319848, 0.475216, 3.160248, 1.229439, 5.102840, 1.172704, 0.917782, 2.616993, 2.139804, 0.179050, 0.635365, -2.586541, 3.264217, 2.315623, -0.899583, 
-0.615601, -2.657061, 1.547319, -2.286578, 1.879086, 1.495636, 2.321740, -0.555475, -1.915786, 1.741425, 0.884898, 2.387746, 2.810685, -1.226168, 0.692660, 2.960999, 3.579106, 2.618080, 2.850123, 3.143281, -1.445385, -0.869172, 2.514671, -2.716269, -1.470776, 1.258983, -3.679865, -2.217870, 2.349101, -1.496878, -3.025600, -4.349153, -1.707175, 3.005045, -1.947022, 0.834064, 2.545904, 0.842057, 1.460314, -0.993782, -0.211882, -3.322590, 0.401144, 2.192401, 0.763440, -2.603230, 2.001291, 0.178745, -0.747411, 2.043627, -2.383173, 1.185634, -0.005095, 2.036742, -2.243795, -0.514880, -1.657962, 0.009154, -0.846694, 3.579744, -1.727863, 1.975658, 0.298879, -1.385962, 3.904871, 0.650457, -0.503490, 1.660019, -1.920314, 1.000721, 2.794953, 0.703100, 1.590531, -3.563776, -2.024322, 0.831120, -0.502338, 1.180401, -1.431693, -3.374807, 1.813871, 0.377378, 2.058053, -2.708455, -0.701740, 0.503954, 3.542927, 1.265793, -2.440325, 5.357823, 1.181739, 1.192558, 0.281461, 2.353562, -0.804080, -1.500842, 0.188305, -3.794515, -1.650937, 1.416369, -2.651186, 3.189369, 1.791879, -2.396369, -4.074958, -2.053515, -2.250571, -4.064375, 0.532033, 1.440506, -1.240853, 2.492684, -2.390124, -1.024883, -0.555563, -1.956213, 1.623435, 2.391842, 3.169787, -2.010656, -2.048950, -0.517389, 3.292646, 2.329223, -2.762945, -1.225980, -3.282871, -0.160141, -1.514427, 1.987719, 0.649399, 0.815709, -2.400442, -2.567317, -1.862416, -1.602954, 4.048628, 1.432278, 1.482120, -0.028168, -1.211920, -1.063073, 0.239240, -0.337652, 2.459109, 0.510653, -1.598884, 2.341745, -3.938889, 0.773527, 1.360329, 3.298347, 3.038607, -1.644866, -0.088439, 1.705638, -1.667633, 3.251128, -0.499025, -0.244674, -2.124416, 1.485698, 1.383972, -0.360520, -1.537704, 0.886796, 1.275228, 1.144015, -0.614845, 3.147204, -2.104358, -1.287056, -1.382660, 1.712896, -1.635278, -2.644013, -0.343970, 1.227356, -0.518890, -3.459199, -1.418720, 0.039872, -2.601501, -1.806357, -1.778686, 2.005520, -1.855919, 0.793212, -1.463252, 1.586207, -0.184600, 1.077291, -3.230673, -1.065376, -2.426338, -4.433507, 3.362201, -2.072861, 4.426600, -1.687956, -0.826343, -0.115859, -2.048442, 0.140565, 4.825446, -1.373407, -0.139355, -0.565521, -3.447233, 2.419903, -1.851389, -3.471667, 1.108885, 3.593758, 0.237525, 2.659810, -1.035963, -2.568973, 1.054946, -1.254823, -1.618325, 0.781841, 1.970722, 3.680123, 1.463840, 0.097049, 1.916009, -0.708227, -0.067044, -1.741496, 2.598838, 3.050692, -0.787915, 2.766759, 1.675597, -1.173892, 2.223792, -1.062551, 2.273316, 3.011199, -1.650119, -2.648605, 0.158647, -2.177063, 0.186268, -3.177166, -0.526156, -1.345172, -2.188882, -0.780098, 2.949754, 2.012983, 0.683741, 1.745923, 3.283293, -2.406209, 2.281897, 1.034458, 0.070274, -3.126071, -3.453828, 0.506103, 4.258954, -0.538088, -2.218058, 3.084713, 5.720852, 2.743039, -1.810076, 6.272157, 0.664128, 3.120400, 0.742479, 3.379241, -2.731331, -0.166025, -6.347305, -2.193065, 0.273632, 0.289647, 3.115033, -2.160245, 0.387171, -0.448133, -1.982356, -4.029318, 1.811508, 3.046365, -2.964187, 2.094236, -3.426285, -3.485981, -1.995775, -1.983477, -2.634778, 0.562914, 1.780642, -2.165194, 0.609340, 0.511937, -2.450258, 3.387038, 2.438292, 1.092778, 1.200358, 2.149229, -3.404688, -3.018096, 4.586117, 1.999421, -1.138862, -1.361518, -0.968755, 1.716326, 2.557005, 1.137697, -1.860175, 4.096773, -2.173637, 0.623251, -0.005716, 0.244621, 2.366596, 3.502475, 1.700907, 1.084500, 3.542811, 0.157646, 0.709190, -1.301711, 1.101328, -1.765094, -2.921493, -0.108164, -0.767839, -1.458841, -3.287886, -2.644441, 
-0.287442, 0.930482, -1.790940, 2.306692, 2.406610, -3.171745, 3.804529, -2.363433, 2.127100, -0.969682, -1.289734, 0.080165, -0.030003, 3.477387, 1.284148, -0.626319, -1.433724, 1.803858, 3.479468, -0.094739, 0.400459, -7.394617, 0.835967, -2.398387, 0.867873, 2.937330, 0.751119, -1.297753, 0.425168, -1.585854, -3.647148, 3.178249, 1.690364, -1.492471, 2.513674, -1.554596, 3.298636, 0.609163, -2.723015, 2.000729, 1.757294, 0.908518, -3.078389, -2.418847, 7.257916, -1.492727, -3.191773, 2.915990, 1.081085, -1.266900, 1.227162, -3.655681, 0.880118, -0.415529, 4.598647, 0.800119, 3.403203, 1.137017, 0.396906, 0.119401, 1.960427, 3.879504, 6.221431, -3.068269, -2.315856, 0.694096, -0.145249, 0.976113, 1.862161, 0.650839, 0.130839, 0.288261, -4.273160, 0.193502, -4.425699, -2.068643, -1.223312, 1.958007, -0.721088, 1.534529, 2.555326, 2.301806, -1.540642, 2.078291, -3.631954, 4.562294, 1.094564, -0.621842, 1.441936, 2.677450, -1.552978, -0.132519, 0.788240, 1.321109, 3.038334, -5.551532, 2.999269, 0.364990, -0.272237, -2.743863, -2.561628, -1.798099, -0.921997, 2.971652, -1.706339, -1.688948, 0.070037, 3.021770, -3.601783, 2.029795, -0.783538, 1.702649, -0.212672, -1.084160, -0.049358, 5.279011, -1.903958, 2.085022, -3.444110, -1.312608, 2.920285, -1.673897, -0.170632, 0.140543, 0.217857, 0.993657, -0.147226, 4.131040, 1.917654, -0.402115, -1.762819, 4.605012, -0.253315, -0.367822, -1.636496, 2.923368, -3.571045, 1.615221, -0.274513, 1.293480, -3.736974, -2.906220, 1.564071, -1.271615, -4.054400, -0.437894, -2.497492, -4.102562, 0.335778, 4.307206, 6.871069, -1.353800, 1.235324, 1.515420, -0.432864, -0.915976, 1.982390, 1.529526, 1.321939, 0.553168, -0.996906, 1.229996, -1.395458, -0.720184, 3.174892, -0.654703, -5.431517, -2.564596, -2.954030, -1.324461, 1.512441, -2.701057, 0.388167, -2.322500, 2.430663, 1.636868, 3.658733, 0.276975, -2.526526, -2.830755, 0.008060, 2.742812, -1.348127, 1.398821, -1.734482, 0.392166, -0.090562, -1.914905, 1.780919, -2.045624, -1.376605, -1.993155, -0.685609, 2.082545, 2.854451, 1.075901, -1.963066, 2.440422, 0.654377, 1.624016, 0.347185, 3.520130, -3.108392, -0.240567, -3.161610, -0.800270, 0.307272, -6.462657, -2.478560, -1.957913, 1.711907, -0.733501, 0.139884, 1.806551, -1.349770, 3.346692, -7.009873, -2.372363, 0.695869, -0.527927, -0.040519, 3.153802, -0.830838, 2.495985, -0.613522], "minicpm-v:latest": [0.373405, -0.434066, 0.402574, -0.150118, -1.009159, 0.055999, 0.578607, 1.257351, -2.809614, 3.972482, -0.813682, 1.450852, 0.222104, -1.309440, 0.825239, 1.990135, 1.091002, -1.035189, -0.127123, 1.494732, 4.583488, 2.152238, 1.950469, 0.040962, -1.026761, -2.280671, -0.764667, -2.645734, 0.738428, -0.337854, -0.514919, -2.169918, -1.606695, 0.726714, 2.363379, 0.812033, -1.017132, -5.906273, -0.068586, -3.161905, -0.668377, -0.930418, 0.602283, 1.411093, 0.929184, 0.361665, -7.188779, 0.122860, -0.208246, -0.857899, -1.278283, 0.683655, -1.143659, 3.433253, 0.186680, 0.858210, 0.086462, -1.969538, -0.585241, 2.501438, 1.663121, -1.452310, -2.782187, 2.376974, 0.011909, -2.539715, 0.295956, 0.566496, -1.961629, -0.696331, 2.636359, 1.223098, -0.417314, 0.676321, 2.153070, 0.455967, 0.085189, 1.253216, -0.196767, -0.790615, 2.291733, -0.371285, -3.192962, -1.635582, 0.242907, 17.824533, -1.191271, 0.693028, 0.503808, -2.743020, -0.241273, 1.333885, 1.035344, 1.961190, -0.152041, 1.285618, 6.496006, 0.763207, 0.087968, 2.870465, 0.356444, -1.683651, 2.216422, -1.844649, -0.691816, -1.050912, -0.499342, 1.075545, -0.171938, 0.909891, 1.241483, 1.092643, 
-2.173166, -12.015554, -2.856367, -20.698908, -0.609858, -3.207479, -0.977120, -0.093685, 2.417937, 2.442160, 0.848334, -2.646831, -1.455137, -0.189020, 3.002412, -1.256232, -21.787876, 0.788017, -0.381844, -1.949451, -1.868758, 0.008738, 2.679282, -0.296270, -1.594275, -2.922423, 0.540774, 0.256479, 3.338656, 0.494479, -2.322596, 2.338914, 0.057826, 1.467675, -1.584482, 0.633437, -3.083071, -0.641232, -1.212952, 0.981840, 1.525307, 0.569190, -2.707587, 1.231240, 0.272207, 0.788035, 0.123544, 1.150698, 0.283472, -2.258527, -35.561195, 2.163777, 6.679248, -1.705841, -0.327031, 2.162935, 1.658646, 0.227390, 0.758502, 1.687766, 1.669509, -0.139092, -0.986089, -0.276972, 1.080841, -0.210213, -0.943579, -2.606741, -4.459008, 0.478328, -1.479767, -3.058852, -8.545797, -0.395627, -1.164287, 2.495408, -0.183550, -2.715141, -2.877285, 0.344198, 0.790495, 0.623198, -1.037656, -0.456738, 1.659658, -0.046735, 2.193197, -0.659875, 2.066572, 2.893802, 0.885306, 0.174187, 1.547745, 1.911764, -1.608837, 0.710351, -1.673273, 9.898326, -2.634362, -0.044190, 0.545811, 0.707455, 3.859440, -3.805061, 2.133735, 4.158154, -0.054382, -0.429803, 0.409739, -0.137998, 2.086717, -0.399401, 2.022264, -0.645827, 1.551178, 0.694332, 0.139730, -0.094900, -0.723024, -2.779635, -0.507423, 1.105360, -0.154632, 0.736738, 0.161376, -0.676730, 15.677238, -0.188134, 1.856025, -1.736571, -0.953508, 0.939773, 4.345387, 0.713690, -2.220577, -2.284271, 2.869463, 1.600713, -0.709076, 0.347308, -1.171167, 1.196288, -2.898170, -0.818068, -0.079971, 1.877244, -0.987878, -2.073211, 2.558323, -1.109507, -0.980287, 0.290609, -0.468093, -1.300264, -2.062796, -0.978946, 1.764146, -0.106737, -1.404154, -0.090169, 0.652458, -0.804112, -2.321134, 1.668066, 0.430334, 6.668191, -1.203562, -0.704168, -0.549548, 2.036619, 0.800234, -2.810486, 0.518899, -0.560133, 1.006556, -2.317722, -0.750900, -0.774626, -2.130997, -3.248784, 0.649599, 1.748983, -0.228464, -0.605739, 0.412901, -0.329177, -1.682082, -0.877227, 0.548816, -0.893511, 0.946567, -2.476345, 1.110094, 1.972772, 0.139028, 2.039542, 1.314990, 0.706638, 0.704419, 1.276759, 0.549634, -0.312816, -6.987369, -2.040702, 0.164418, 0.289892, -1.125816, -0.748568, 1.327388, 0.417259, -1.186356, 0.290069, 0.164743, -1.728422, 0.379547, -1.688991, -1.351178, 2.636830, -5.866957, -0.173038, -0.586063, 2.464278, 1.674203, -0.039280, 2.671825, -0.426402, 0.797512, 1.504823, -4.485917, 0.373612, 0.854094, 1.092571, 1.066606, -0.742925, -1.399240, -0.229709, -0.043383, 1.223743, -0.173675, -1.400634, -0.751707, -1.184195, -2.087879, -1.150729, -3.714264, -1.186671, -2.254113, 0.601697, 2.189452, -2.337301, 39.923878, 1.442919, -6.136178, 0.997319, 0.443302, 1.893847, 2.134698, -1.521084, -2.211249, -0.605163, 2.920052, -1.224859, -0.488684, 0.174961, 1.765274, 0.043963, 1.291592, -0.027254, 0.362563, 0.535911, 0.055040, 0.965669, -0.998588, -0.965053, 0.910799, -0.635001, -0.755441, -0.216526, -0.230160, 1.201640, -1.004903, 4.072732, -3.240913, -0.649104, -0.221394, -0.652424, -0.344757, -0.897428, 1.605987, 0.713921, 1.132797, 3.124265, -4.620122, -1.233402, -2.835165, -1.103005, -0.272193, 1.986733, -2.367633, 2.668632, -0.008239, -0.637520, 1.968715, 0.809258, 1.002304, 0.056953, 0.215855, -0.626684, -1.733515, -0.024491, -1.386338, 3.167961, 0.568686, 1.992621, -0.684541, -2.265276, -4.273585, 3.601405, 0.195045, 1.275817, -1.411008, -1.897714, -1.553356, 0.561319, -1.729886, -0.169032, -2.529392, -1.037613, 3.703908, 3.448970, -1.749579, -2.396593, 0.606615, -1.292753, 0.056893, -1.645130, 1.672513, 
0.194815, 0.816912, 1.116137, 1.635205, -0.354506, 0.295139, -1.623365, 1.817265, -0.269048, 1.538462, -1.011906, 1.269872, 3.067373, 0.211793, -0.732102, -0.391167, -2.233507, -2.231629, -2.095895, 0.727035, 1.411009, -0.877170, 0.340414, -0.075507, 0.745245, -1.728194, -0.083872, 3.342732, 3.725381, -0.929764, -0.378220, 0.475852, 0.933637, 0.376741, -0.074854, 0.588735, -1.907476, 2.541552, -0.915148, 1.144570, 1.298011, -1.079804, 0.938594, -0.981594, -1.589298, -0.491617, 0.186467, 0.123616, -2.508697, -1.810076, -1.044597, 1.582594, 0.258120, -0.006478, -0.524830, -1.418215, -2.215704, 0.407240, 1.460952, -1.709083, -1.259769, -1.154309, 1.705392, -0.556009, 1.421965, -0.456303, -2.493070, 0.633029, -0.641055, -0.343872, 1.294190, 2.638387, -0.840192, -0.314008, 11.737316, -1.054496, -0.697600, 0.080303, 0.745963, -0.622276, -1.977137, -1.301768, 0.361449, -1.011482, 0.635370, -2.172765, 1.191688, 1.330068, -3.170839, 0.580865, 1.834052, 4.704100, 3.587384, -3.430662, 2.697701, -0.127231, -1.113158, -1.443183, -41.022354, -0.110150, -1.822315, -0.207086, -0.767085, 1.362608, 0.168367, 2.363555, -0.680236, -0.087632, 1.561876, 1.680736, 3.146271, 1.144814, -0.160254, 1.950765, -1.504552, -0.332065, -0.360760, 1.558014, 0.636103, 0.785471, -1.704202, 0.857587, 1.007262, 1.093875, 2.384038, -3.869771, 0.113459, -0.496516, 1.827181, -1.552594, -2.527205, 0.675593, 0.022145, 1.097947, 0.381638, 1.416903, -1.212677, -0.577768, -1.223361, -0.348717, -0.045128, 0.010273, -1.609390, -1.338410, -1.802680, 0.713390, 1.108162, -0.479905, -2.831465, 0.522394, -0.296082, 2.357434, 2.456294, -1.470178, -0.920917, -2.398948, -0.204510, -1.523816, 0.510649, -0.536751, 1.747033, -1.597472, -0.743485, 0.238079, -1.569961, 1.913989, 1.672282, 2.897475, 0.532682, -2.069995, -2.510436, 0.142607, 0.483538, 0.606211, -0.185770, 0.913036, -0.824144, -2.868243, -1.664191, -0.130939, -1.009496, -2.834906, -2.953358, 1.251532, -2.528788, 0.715784, 0.619456, 1.275110, -0.968184, 0.420924, 0.751423, -0.012198, -3.663082, -0.932058, 2.118438, -0.064653, -0.722701, -0.739168, -0.247635, -0.043738, -0.452323, 3.432029, -0.554806, 2.927828, -0.212855, 0.003955, -0.357271, -2.083921, -0.759885, 1.128178, 0.407497, -0.346585, 0.254764, -0.036644, 1.292589, 0.889476, 4.094336, -1.213452, -0.292327, 0.835652, -1.123920, -1.357339, 0.583533, -0.212889, -2.109420, -2.039322, -2.049470, 1.393434, 0.227105, -1.943285, -1.751864, 0.368406, -0.076198, -0.553293, 0.431516, 1.632055, -2.800219, 0.425088, -0.216394, -0.355467, 3.298864, -1.703126, -0.529324, -3.259636, -0.005124, 2.216942, 0.056490, 1.522583, 0.341275, 0.279314, -0.579731, 2.031629, -1.494830, 1.357635, -0.870218, 0.445755, 1.674716, 0.256306, -0.997950, 1.777514, 2.048767, 0.416811, 4.965404, 0.141465, 0.773394, 1.545753, 0.027247, -3.331308, 0.706755, -2.109440, -1.031737, 0.457077, -0.085427, -0.124898, 2.836118, 0.668133, -0.377359, 0.098304, 0.442726, 0.469875, 2.438758, -0.113918, -0.708185, 0.850951, -0.269033, 1.265060, 1.252891, -0.723365, -0.106239, -0.572127, -1.127482, 0.805131, 0.789896, -1.664021, -0.853497, -1.080191, -1.777647, -1.330180, 0.512299, -1.983276, 0.702110, -7.340586, -2.208733, -3.390751, 0.570193, -0.615102, 1.125478, 1.215835, 0.822542, 0.171175, -0.069884, -0.123099, 1.784375, -2.423002, -0.737408, 0.343006, -0.787060, 1.550688, -1.225983, -4.489320, -0.243526, -1.443461, 0.465390, 1.505774, 1.369661, -1.840881, 5.110126, 2.507883, -0.404607, 0.081059, 0.555840, -1.737800, -0.521636, 0.646257, -0.794755, 1.622429, -1.143865, 
0.247397, -3.387873, -1.999178, -0.475409, 0.254099, -1.216121, 0.947898, -0.436287, 0.588959, 3.871037, -2.230380, -1.394659, 0.590486, -1.651382, -2.237124, -3.214393, 0.829730, -2.667794, -1.414842, -2.266326, -0.213557, 3.285220, 0.244696, 1.276757, -1.010511, 0.954373, 2.183401, 0.428658, 0.591693, 0.545552, -0.059087, 1.103354, 1.915429, 2.638312, 0.722118, 0.093238, 0.864807, -0.179161, 2.299291, -4.168321, -0.407841, -2.142166, -0.239048, 2.303017, -1.151473, -0.927492, -0.527071, -0.511980, -0.261069, -0.298833, 1.963749, -0.033439, -1.881962, -0.522094, -2.649604, 1.647931, 0.776876, 0.432121, -0.191135, -2.145623, -1.231426, -0.372664, -0.452278, 1.274758, 0.170175, 1.026496, 0.073504, 0.640993, -1.226367, 0.783692, -0.448901, 1.416342, -0.930619, -0.322302, -0.675186, 1.486036, -2.058913, 1.081755, 1.013078, -0.047287, -0.117357, 0.994664, 0.758084, -0.193374, 0.876904, 0.891312, 1.532211, -0.034118, 0.127642, -0.266024, -0.440258, 0.074581, 1.515373, -9.780887, 0.753208, 0.330636, 2.945389, -0.220795, 0.140223, -0.131821, -1.725950, -2.171350, -0.204008, 0.595772, -0.116645, 1.457196, 3.947017, -0.784874, -0.888277, -1.782836, 0.817804, -0.192799, 4.299161, -7.505760, 1.463305, -0.106789, 1.174378, -0.811028, 2.220318, -0.583385, -1.241430, -1.802632, -0.958205, 2.339732, 2.392081, 0.028450, 0.596011, -1.068670, -0.249349, -2.410244, -0.316100, 3.512356, 1.815074, -2.563461, -1.023207, -1.255088, 0.645512, 0.079494, -0.294559, -0.094625, 3.387850, 0.226355, -2.717346, 0.559185, 1.418335, 0.593394, -1.490641, -0.128680, 0.076089, -1.944839, -1.451210, 2.537951, 0.006042, 0.122956, 1.893836, -0.406169, 0.839640, -0.788404, 4.068511, -0.677647, 1.193237, 1.380761, -1.895364, 1.780263, -0.083210, -0.052489, -2.717657, 0.885126, 0.945189, 0.714716, -0.331114, -0.276344, -3.294456, -1.361389, -1.806947, 0.547034, 0.398358, 2.003053, 0.901725, 0.410779, -0.864764, -1.618749, -1.349252, -0.911480, 2.053375, -1.349382, 0.849684, 1.225166, -0.703413, 1.470940, -1.646121, 1.467508, 1.322925, 1.738584, -2.498531, -1.925045, -0.384096, -0.101580, -2.523852, 0.767522, 0.719983, 1.575168, -0.290012, 1.028273, 1.360790, -1.643553, 0.725088, -3.634740, -0.672812, -1.429943, -0.466053, -1.564547, 2.244541, -0.990287, 0.477694, 0.142603, -1.236258, -0.260318, 0.138124, 0.848498, 0.986988, 1.307112, 0.753924, -0.791700, 0.239069, 1.626977, 0.944337, 2.356667, 4.252893, 1.440436, -0.834279, -3.924565, 1.486321, 1.781534, 1.588995, -2.617434, -1.426156, -1.792942, -0.477611, -0.760941, 2.425961, 1.197546, 4.233758, -2.818418, -0.567543, 1.363195, 0.341048, -2.289856, 0.309621, -0.637264, 0.347269, -0.461381, 2.523387, -1.344993, 0.216397, -10.708355, 0.919107, -2.543254, 1.904163, -0.058510, 2.088098, 0.810954, -0.385477, -1.210636, 0.094202, 0.535237, -1.502995, 0.344693, -1.197936, -1.386554, -0.128568, 1.160851, 2.858701, 1.907802, 1.539349, 2.433446, 2.448377, 2.303145, 4.565366, 2.033096, -2.155319, -0.801175, 0.520414, 1.972140, -2.310103, 0.173755, 0.974195, -1.724003, -1.241399, -2.017481, -2.318946, 0.835495, -1.525946, 0.359016, -2.165900, 0.091703, 4.673479, 1.606368, 0.459343, -0.392127, -1.453846, -0.353931, -2.310848, 1.236771, -0.827690, -1.652173, 8.877085, 2.143196, -1.679855, 0.654059, 2.409938, 0.361839, -1.841513, 0.169567, -0.140878, -0.171590, 1.312745, -0.134062, 0.724259, 2.047288, -0.841137, 1.260102, 1.002513, -0.647772, -5.369482, 5.233755, 0.599639, 1.172586, -1.073408, 0.885488, -1.003686, -3.339315, -0.038161, 0.559816, -0.626176, -1.193738, 8.828257, -1.190498, 
0.449845, 1.863093, 0.556650, 1.450109, -0.126581, 0.105580, 0.397996, 0.313131, 0.914442, 0.404302, -3.006561, -1.363588, -0.628297, 0.217007, -2.647120, 2.862575, -8.684146, -0.418185, 1.437103, 0.828107, 0.458347, -0.435039, 1.646789, 0.926643, 2.910391, 0.709955, 0.771045, 0.037429, -4.088374, 0.162264, -3.077456, 1.285309, 1.316735, 1.387373, -1.066935, -0.645049, 1.368517, -1.300493, 1.187293, -0.031566, -0.225020, 0.646678, -0.172955, 3.641774, 0.928097, -1.952956, 0.098479, 0.563732, 2.789780, 1.441335, 2.300636, 1.045073, -1.739837, -0.522630, -0.936236, -0.113540, -0.295716, 0.952094, -12.808439, 0.464352, 1.135275, -1.872086, 0.042147, -3.117124, -0.354251, 2.127711, 0.385427, 1.285458, 0.280779, 1.432706, 3.229187, -0.091171, -2.339383, 2.241873, 0.316786, -2.255201, -0.556684, 2.200058, -0.417242, -1.281321, -0.334675, -0.578597, -1.088588, 2.267492, -12.715529, -0.348637, 1.471534, -1.676870, 1.577533, 0.676559, 0.080321, -1.086842, 0.430531, 0.437493, -2.491856, -1.025787, -5.674278, 0.392208, -4.118344, 1.785830, 3.346591, 0.304010, -2.162539, -1.001513, 0.018220, -2.174213, 0.871597, -0.414440, 0.716925, 0.457252, -1.066866, -1.192133, -1.134685, -0.413272, -0.989214, -0.402608, 0.400279, 2.370824, 0.565737, -1.827672, 0.184565, 1.494704, 1.196620, 1.353731, -1.431783, -0.185922, -0.168908, -0.945395, -1.528083, 0.495618, 1.339592, -0.752455, -0.017693, -1.226821, -1.587509, -1.819557, 0.250778, -0.129144, -0.049336, -0.556312, 1.388641, 5.209940, 7.888215, 2.105105, -4.130414, -1.113280, 0.696917, -1.569371, 1.314536, -1.086163, 1.195435, -0.062427, 1.109358, 2.636901, 0.217930, 0.296666, -1.974165, -1.735777, -2.618426, 1.400363, -0.096162, 0.028053, 2.971062, 0.302502, -1.890169, 0.857939, -2.612601, 1.933837, -2.467517, 1.603533, 1.537552, 0.085175, -2.052856, 0.524604, 0.883950, 0.739990, 1.870276, 1.625467, -4.919993, 1.174270, -2.426991, 1.683943, 2.972154, -1.803723, 1.008075, 0.248989, -0.385458, -0.196188, 1.270961, -0.229944, -3.168777, 1.614013, 1.595189, -0.410707, -1.044544, 0.778994, 0.653459, 0.350870, 4.649518, -1.140979, 0.838352, 1.230573, 2.527621, 1.062342, -0.298138, 0.888464, 0.313968, 1.429981, -1.374527, -0.592750, -1.111212, 0.250075, -1.495723, -0.555394, 1.749684, 1.169427, 0.184614, 0.222946, -2.256650, -1.231119, 1.744124, 2.778740, -0.821602, 0.101260, 1.169010, 0.798608, 0.031937, 0.170795, -7.833639, -0.522136, -0.191751, -0.644957, 1.360851, -0.476764, 0.153982, 1.444253, -0.697542, -1.839324, 1.011768, -1.918130, 1.279057, -0.911504, 1.704320, 1.411318, 0.101799, -1.732975, 1.152074, 1.164443, -0.210665, -0.760696, 0.550097, 0.398409, 1.169000, 4.403631, 0.044010, 0.429579, 0.020382, 1.313668, 0.072535, -0.443568, -0.970095, 0.235293, 3.075127, -0.917624, -0.191897, 0.649122, 0.604821, -2.162972, 1.809951, -0.566550, -0.368535, 0.561067, -1.963957, 0.591505, -1.989305, 2.010246, 1.025328, 1.118531, 0.765658, 1.132761, 0.590744, 0.309546, -0.072684, 0.927601, 0.402006, -0.279634, 3.336041, 0.363340, -0.084176, -0.376793, -0.792103, -0.910649, 0.842618, 0.704083, 1.336435, 0.633091, 1.232460, -0.126867, 1.860313, -0.582771, 2.027243, -0.734689, 0.396841, -4.301981, -2.570962, -2.346615, -0.294627, -9.963593, -0.950176, -0.416823, 0.185868, -0.575314, -0.248928, 0.115231, -0.267261, -0.893238, -0.038151, -1.312479, -0.859603, -0.981597, 0.905031, -1.797287, 1.575514, -0.293734, -2.374883, -2.280471, -0.114908, -8.689425, -3.053759, -0.254290, -1.284394, -2.571275, -0.515252, 1.703866, 0.979483, -1.641806, 0.677989, 0.632244, -0.935906, 
-0.334838, -0.395663, -1.000913, -1.619586, -0.240378, -0.943641, 2.728803, -0.003069, 1.764210, -1.329910, 2.901473, -0.663974, -2.739002, -0.557714, -1.467272, 0.430610, -2.140990, -1.018700, -8.117669, 2.187021, 1.585709, 0.303729, 2.666512, -0.322125, -0.510872, -2.895965, 0.332371, -0.563060, -4.857492, -0.213120, 0.974745, -0.822440, 1.557947, -4.240340, -2.603575, -0.108165, -1.109751, 2.159279, -3.169286, -0.512767, -0.863623, -0.091780, -1.966987, 2.861250, 0.022738, 1.359011, 0.319737, 1.852149, -0.047252, -0.426285, -0.759734, 1.233245, 1.222208, -0.428217, -1.377964, -0.014197, -1.666247, -0.693576, -2.364216, -2.303179, -1.875806, -2.193077, -2.161515, 0.730956, 0.106795, -0.440026, 0.882222, 1.271376, 0.223913, 0.026457, 0.875005, -3.332191, 0.711084, 1.622856, -1.125061, -0.091555, 0.591242, -1.053302, -0.374932, -0.451309, 0.116873, 0.916229, -1.526904, 1.396491, -4.383001, 0.277899, -0.744718, 3.206128, -0.688903, -1.531305, 3.416818, 2.721693, 2.085379, 3.218796, -6.358753, 1.035356, -2.443428, -1.629758, -0.956840, -2.862966, 1.097265, -0.132868, -1.223482, -0.695732, 0.078407, -1.925692, -1.591343, -0.446446, -0.413490, -3.946516, 0.004555, -0.718744, 0.671976, 0.724541, 0.277439, -1.528037, 0.149476, 1.957371, 0.107017, 1.056521, -0.852286, -1.079987, 1.865555, 1.076413, 0.002297, 2.019732, -3.465175, -1.121418, -0.474052, -1.834379, 0.894442, 0.671292, 1.058087, -0.456843, 1.975728, 1.927830, 0.147125, -1.154134, 0.253917, 3.004951, 0.158338, 0.203460, 0.986412, 0.876946, 3.303209, 0.924066, 2.434523, 1.368330, -1.493254, 0.657588, -2.332864, -0.716405, -1.623935, 0.734781, 2.475377, -3.862651, 0.639123, -1.024320, -2.099458, -1.505555, -2.106091, 0.105197, -1.276832, -1.604650, 11.007366, 16.257780, 1.212517, 0.889604, 3.437125, -1.665995, -0.542497, -0.793582, 0.072068, 0.891223, -1.907271, 1.315802, 0.976274, -2.103378, 0.598498, 1.317193, 2.073358, -5.785201, 0.120337, -0.128886, -0.625547, 1.823134, -0.627097, -0.074800, 0.489213, 0.000787, -0.695088, -0.151693, -1.164222, -0.719642, -4.420443, -1.641346, 0.223747, -4.239794, -2.342813, -0.701424, -0.777337, -0.055989, 2.989524, 1.142629, -1.108115, 1.921261, 2.104734, 0.713200, -1.836161, -0.769459, -0.096665, -1.405661, 1.133891, 0.677240, -0.079669, -0.786968, 0.757151, -1.115593, 1.729968, 3.429117, 3.333342, 0.889537, 1.064027, 2.452963, 0.146189, 1.487560, 0.159354, -1.003925, 3.638438, -1.171568, 0.858463, 1.559678, -1.309983, 0.294997, -1.911492, -0.658933, -3.117077, -1.047560, 0.714687, -2.235785, -0.924417, 1.562038, 1.057431, -0.458399, -0.513814, -0.561772, -1.739037, 1.467458, 0.620015, -0.726114, 0.533329, -0.823658, 0.602171, -4.322502, 0.547450, -0.053952, 5.291243, 0.374018, 0.543503, 0.658229, -1.971978, -0.861369, 0.358919, 0.152244, 0.556775, -2.041901, 0.755251, -0.084016, 2.247265, -1.645395, -0.975744, -0.170778, 2.687211, 0.454293, -0.637791, -0.689573, 2.783953, -0.462319, 1.623681, -0.080127, -2.483746, 0.395303, 2.702187, 0.863832, -0.829897, 1.051659, -0.179475, 1.012557, -3.725295, -2.425763, 1.506878, 0.261926, -0.952743, 1.053141, -0.772923, -0.683897, 3.952535, 2.515576, -1.145893, -1.876673, 1.160748, 0.543679, 0.466676, -0.385127, 1.240535, -0.393820, -2.291663, 0.888950, -1.278874, 0.038750, -0.752375, 2.060377, -0.336541, -1.480818, -1.426695, -1.427118, 2.019039, -2.795073, 4.602587, -0.728483, 0.066261, 0.140672, 0.995434, 6.288233, -0.336479, -1.012862, -0.521627, 2.346851, 1.099883, -1.747922, 0.289940, -2.412572, -0.726209, 1.113508, 1.037497, -0.935774, 1.114324, 
1.137784, -0.378494, -0.369090, -0.495506, 0.839105, -4.885697, 2.921585, -0.534774, 2.706858, -2.904579, -3.026774, 1.074224, 0.988017, -1.126216, -2.458224, -0.514051, 0.257682, 1.672144, -3.153304, -0.512526, -1.063563, 0.611227, -0.169314, 0.288846, 0.492560, 0.557903, 1.448651, 1.216152, 1.514556, 0.348303, -1.505949, -0.923314, 3.453034, 1.904600, -0.675914, -0.474720, -0.985341, -0.153247, 1.354074, 14.742016, 0.398167, 2.527965, 1.902573, 0.569317, 0.713961, -0.157248, 0.583093, 1.446733, -0.671570, 3.754446, -1.230091, 2.326427, -1.630548, 1.877837, 0.963890, 2.939709, -0.352764, -2.149770, 0.465506, -0.063179, -1.710148, 0.230100, -0.749501, -1.599947, 0.353103, -1.376222, -3.326641, -0.228345, 0.709135, 0.279200, -0.913855, -1.260106, -1.632902, -0.528416, 0.245233, 1.025672, -1.362333, -1.633941, -0.171049, 0.478239, 2.395704, 1.798306, 0.518881, 2.364272, -0.769217, 2.540560, 1.193590, 2.762551, 1.171583, -0.288009, 3.315905, 0.178900, -0.414532, 1.629692, 3.644429, 0.047435, -0.409813, 1.277344, 1.235892, -0.370595, -1.994537, -1.594649, -2.062027, 4.117460, -1.693915, 0.134951, -0.276196, 1.733694, -0.138643, 0.993175, 0.659948, -1.959156, -0.568687, -0.549599, 1.432800, -1.232484, -3.082007, -0.631581, -2.420118, 2.023217, 1.324583, 0.041360, 3.613342, -4.568541, 0.285068, 0.292985, -0.218380, 3.284291, 1.405751, -0.301350, -1.727242, -1.090432, 2.118229, 0.914769, 0.380845, 0.296591, 0.511121, -0.837049, -0.388810, -0.239591, -4.444277, 4.205221, -0.768113, -0.578623, -1.128701, 1.552697, 0.367937, -0.099581, -1.281450, -0.908604, -0.632442, 13.450798, -2.382877, -3.409620, -1.395153, -0.904329, -0.477906, 2.242418, 1.626651, 1.743318, 0.201355, -1.874433, 1.069358, -1.122438, -0.513217, 0.798678, -2.774002, -3.890982, 1.194817, 0.417884, -2.555126, -1.121142, -1.795019, -0.905862, 0.113841, 0.568253, 0.533732, 0.223853, -0.951292, 0.625884, -0.039080, 0.570316, 1.236212, 0.025467, -1.700714, 5.611750, 1.285293, 0.344259, 0.667707, -0.425748, 0.564495, -1.901882, 1.431199, -1.215952, 1.253814, -0.354065, 0.312732, -3.331754, -1.358605, -3.347685, -1.173771, 2.645103, -2.667670, -1.189343, 0.628767, 0.290056, -1.889051, 1.207483, -0.013234, -1.082175, -0.463480, 1.240325, -0.788110, 3.318880, 1.457310, -1.047825, 0.244308, -1.829554, 3.589651, -0.124967, 2.177041, -1.269532, 0.687182, 0.477322, -0.286502, 2.742045, -1.479311, -0.366762, 0.130173, -3.005942, -4.997714, -1.074178, -1.553302, 2.917926, 0.381329, -1.468071, -1.214693, 0.776294, 0.101943, 2.781591, -0.340745, -0.709129, 1.041831, 0.696277, 0.774845, 2.691426, 0.668740, -0.311207, 1.233782, -1.170120, -0.034383, -1.132921, -1.765879, 0.484065, 0.475109, 0.550853, 0.381090, 0.574175, 0.871582, 2.628418, 1.365183, 1.224013, -3.376759, 1.086175, -0.021419, -2.019457, 0.234232, -3.502301, 1.270517, 4.130678, 1.725621, -2.442235, -10.042397, -0.838784, 0.022949, 2.967072, 0.118950, 2.288779, 1.842968, 2.072146, -0.317232, -1.301560, 0.340098, 0.647872, -0.179999, 0.620119, -0.987134, 1.012957, -0.834723, 0.834885, 0.751951, 1.262253, -1.385338, -1.551879, -1.337877, -1.756028, 0.510630, -2.394951, -0.205486, 1.575394, -0.480835, -0.409149, -0.437463, -2.282324, -1.881278, 1.116361, 1.136089, 24.378939, -0.393524, -3.075753, 1.290074, 1.087091, -2.154333, 0.970394, -1.926258, 1.618542, 0.412278, -1.739195, 1.623174, 1.373538, -0.979996, 0.275235, 0.773335, -0.732338, -2.702239, 0.798534, -1.287799, 35.395630, -0.367900, -1.554014, -3.089915, -0.288484, -2.492246, 1.115393, -1.127128, -3.936405, -0.125779, 0.485530, 
0.389982, -1.279955, -1.730957, -0.593967, 0.948370, -2.064061, -0.272153, 1.658313, -1.792256, 1.410611, 1.607257, -0.524023, -1.193940, -0.477432, -0.203510, -5.396678, 0.878312, -0.600852, -0.958319, -1.770715, 2.374568, -0.837991, 0.792972, 1.333605, -2.388574, -0.201241, 0.498677, 0.179003, -1.049678, -1.267674, 0.086085, 1.527040, -0.410888, 0.113187, -0.560847, 0.438254, -0.419042, -8.495057, 1.416575, 0.312171, -4.181562, -3.007250, 0.894191, 0.282944, -0.465586, -1.058151, -0.204914, -0.380061, -2.493015, -0.319261, -2.098388, -2.529561, -2.387559, 1.457654, 2.064481, -0.206003, 0.331642, -0.909741, -0.725373, -1.512754, 0.902448, 1.214116, 0.187296, 0.355552, -1.346207, 3.032812, -0.008650, 0.665337, 0.523351, 2.378387, -3.201352, 1.975095, -0.343192, -0.887129, 0.839616, 2.507690, 0.034270, -0.425381, 0.244614, -1.063135, -0.416553, 0.151447, 0.544476, -0.904222, -1.981529, 0.008429, -1.530195, -0.380718, 2.639323, -1.974096, 2.537336, 0.128485, 0.057991, -1.839553, -2.247260, 0.850893, -0.609513, -0.073828, -6.912158, -0.139270, 0.179913, -1.241016, 0.320185, 1.688670, 2.298871, -0.673911, 1.046304, -2.560003, 2.457667, -1.773535, -0.873110, 1.303595, -1.350625, -2.650466, 1.821359, -0.091552, -1.048395, 4.371029, -1.531572, -3.530663, -7.938312, 0.371839, -3.025643, -1.249872, 2.221552, 3.361130, -0.400779, 1.653993, 1.089241, -0.370755, 1.477701, -0.303671, -0.078472, 1.358668, -0.792539, -1.049296, 0.496935, 0.733483, 1.849233, -1.298069, 1.658483, 1.070524, 1.064211, 2.341386, 2.022804, -0.712600, -0.828958, -4.590330, -2.193731, -1.187169, -0.390252, -1.434384, 0.475188, 0.472390, 1.016687, 0.816242, -2.084652, -0.264609, 2.821970, 0.642034, 0.144703, -0.085207, 3.882366, -0.335585, 2.945193, -8.780211, -1.788703, -0.816980, -0.324785, -0.251749, 0.550231, 1.536315, -0.976531, -0.001636, 0.025487, 2.723643, -1.050987, 1.859178, 1.747509, 0.632397, 0.674326, 1.942476, 0.170618, 0.628320, -2.931181, 1.509731, 0.052616, 0.396503, 2.053801, 0.786840, -1.765508, -2.049108, 1.391213, -1.846742, 0.302443, -0.235106, -4.113284, 2.388442, -4.504088, 1.407906, -0.792122, 0.238252, 0.770481, -2.253059, -0.076944, -2.216919, -1.822923, 1.290645, 1.323500, -1.547410, 1.743029, -1.069493, -0.496441, 0.411108, 0.256616, 1.004339, -0.964894, 1.061500, -0.336986, 0.609277, 1.396947, 0.389251, -0.800786, 2.296143, -0.148887, 1.665507, -1.662055, -0.582016, -2.904270, -1.138959, -1.115410, 0.504910, -0.440490, -1.917381, 1.905744, 2.691540, 0.668207, -1.137727, 1.715427, 2.627678, -0.848056, -2.206196, -0.485603, -0.104143, -1.581868, -3.105368, -0.136548, -3.890651, 2.893072, 1.311447, 1.207675, -0.110239, -0.015520, 3.073158, 0.722297, -0.317380, -1.639348, 2.564957, 0.707945, -2.089616, -1.077968, -1.993640, -0.184228, 2.603096, 0.495441, -0.690588, -1.996336, -0.615599, 1.252049, -1.305657, 2.258763, -0.367589, 0.744874, -3.983390, 0.641868, 0.682344, 1.534273, -0.078887, -0.638862, -0.436887, 1.282581, -0.474767, 0.933970, -0.562142, -1.196221, 0.620703, -1.794334, -0.201897, -0.273618, 0.759434, -4.578209, 1.045878, -0.396665, -1.393743, -0.258993, 0.909860, 0.065080, 2.534648, -0.621033, -2.463706, -1.267582, -0.949879, 0.684556, -3.153200, -1.322336, -0.846883, 0.756270, -1.246623, 1.325826, 0.269144, 0.655697, 1.508641, 0.955546, -4.049324, 1.236385, -2.208986, -2.164684, 0.483166, -1.556875, -2.399807, -0.071894, -0.598155, -1.504630, 0.909837, 0.441980, -0.142267, 3.186562, -9.105115, -0.706504, -0.904092, -1.254219, 1.707472, -0.484811, 0.243699, -1.906812, 0.557756, 0.399335, 
-0.668094, 1.150291, -33.314419, -4.579463, -0.827204, -2.219878, 0.864965, 0.843359, -3.120395, -3.499207, -1.056821, 2.018512, 1.058076, -2.007281, -1.928121, -3.263878, -0.079084, -1.419108, 0.705255, -2.721349, -0.210468, 1.977742, 0.119329, -3.372694, -1.065624, 0.044773, 0.949414, 1.982919, -2.992223, 0.043504, 0.458767, 2.387087, 1.143641, 2.626701, 1.635916, 1.130192, -1.146586, 2.295970, -0.108903, 1.978274, 0.249793, -1.822635, 1.072817, -1.037818, -1.654801, 0.506986, 0.223898, -0.801533, 0.471886, 0.821947, 0.165247, -1.029804, 1.522923, 1.681942, -34.283497, -1.443581, 3.120758, 0.109961, 1.252267, -1.838808, 4.105927, 1.430098, 0.795596, 2.285416, -0.350729, -0.183396, 2.008162, 2.535472, 1.442522, 0.067094, 1.726215, -0.465237, 0.228092, -0.898866, -0.351523, 0.943222, 2.762748, -0.150182, 0.072098, -3.666024, 0.861893, 1.339004, -4.193988, 0.934029, 1.222872, 1.534581, 0.536860, 0.175307, -4.649261, -0.155842, 0.486639, -2.415943, 4.921558, 0.074272, -0.523580, 9.426488, 1.450721, -0.273441, -0.685305, -0.208207, -0.141061, -0.689686, -0.430765, 1.203310, 0.457785, -0.636253, 1.884096, 0.180858, -0.601352, 1.059752, 1.353793, -1.144091, 1.012244, 0.111028, 1.282956, 0.539753, -2.136105, -0.362957, 1.343894, 1.246425, 0.270625, 0.846098, -1.814227, 0.723642, 0.896651, -0.492568, -0.483125, 1.335945, -1.528607, -12.522959, -2.514193, -0.385231, -0.627579, 0.507236, 1.104174, 0.487314, -0.776387, -0.105157, 2.839599, -0.512341, -1.799638, -1.330242, -0.425150, 0.598084, -9.454448, -0.575648, -1.355067, -1.314143, -3.041453, 0.646404, -0.381917, -0.618457, -0.122956, 1.836147, 0.686982, 1.346205, -0.397530, 0.177667, -0.312596, 0.959667, 0.490264, -0.229002, 2.652546, -0.523528, -0.295107, -0.722657, -0.770971, 0.501242, 2.509320, 0.498323, -2.765550, -0.476504, 2.484814, 1.173968, 0.066951, -0.033656, 1.939883, 0.595368, -0.492711, -0.485204, 0.524125, 2.012069, 1.902331, 1.441705, -1.873747, 8.485007, 0.809767, 2.354154, -1.817922, -0.857522, -1.486599, 0.853559, -3.121060, 1.469213, -0.247292, -0.377132, -1.203432, 1.732122, -0.968094, -2.430906, -0.646179, -0.096946, -1.891861, -1.449054, -1.289801, -0.931208, 3.503360, 0.109478, -2.145627, 1.445237, 0.036277, 0.486036, -1.857045, -0.748751, 0.455339, 0.367962, -2.402283, -0.665134, 2.883188, 0.057114, -2.703108, -1.443505, 0.250365, -0.242302, 3.324877, 0.508369, -0.850818, -1.103852, 0.660547, 3.891412, -0.431533, 0.720094, 0.351299, -1.011191, 2.850002, -2.083087, -1.732875, -0.878147, 0.754441, 1.108804, -1.122806, 0.565206, 0.396117, 3.356467, 0.028610, -1.452879, -1.373615, -0.454277, 0.141389, 3.266813, 12.898273, 1.677444, 0.586111, 1.705890, 1.566172, -2.538870, -2.231463, -1.102204, -2.527648, -1.085532, 0.478172, 0.187650, -6.529508, 0.846004, 2.000021, 0.900888, 1.543347, 1.928105, 0.956930, 1.169501, 2.499456, 0.512169, 1.320805, 0.813092, -0.862943, -0.914188, 1.019531, -1.543529, -3.052305, -0.000092, 0.458211, 1.184352, -0.189562, -0.504885, 0.651876, -2.231333, 0.150909, 2.522482, 1.546768, -1.469670, 0.283542, -0.038828, -0.221745, -0.613487, 0.986294, -2.087503, 1.868805, -0.862680, -2.028013, -1.342783, 2.996248, -0.964869, -0.889650, -1.940610, -0.491476, 4.659435, -0.629713, 0.880112, 0.013399, 1.543464, 1.811594, -1.919105, 0.579925, -0.054963, -0.611301, 1.213541, 0.044412, 2.428571, 1.650359, -0.011562, -3.940373, 0.219158, -0.250527, 2.196884, 2.333903, 2.026985, -2.534095, -1.620452, 1.754802, -2.421583, 0.130233, 1.375082, 1.599321, 1.592287, 1.072141, -1.522067, 0.396378, -1.969304, 1.184472, 
1.064752, -0.051353, -0.429749, 3.536501, 1.289551, -1.872306, 0.962450, 0.012061, -0.633179, -5.554706, 3.344516, -1.520184, -0.216105, -1.681684, 4.008332, 2.174610, -0.961140, -0.686181, -1.402003, -0.401292, -0.713743, 3.171560, -0.512009, 0.219556, 0.559843, -0.798941, -2.584601, 1.708179, -1.446687, 3.116506, 0.701865, 0.474630, 0.662607, -0.341657, 3.770340, -1.970125, 0.079163, -0.033681, 2.390205, -0.802647, -0.419331, -0.633927, 0.158110, -1.025442, 0.967359, -0.777828, 0.594151, -0.833795, -0.444425, -1.413129, -0.685053, -0.072876, -0.182840, 0.716580, 2.238798, -0.586169, 1.660746, -1.904589, 4.398918, -1.573961, 0.223843, -2.490864, 1.426156, -1.309565, 0.297844, -0.122553, -4.125076, -1.175787, -0.808162, 3.899071, -3.727594, -1.969113, -1.416097, 0.433778, 0.566248, 2.189430, 13.170939, -0.437235, 1.563687, -1.015102, -10.265989, 0.488794, -1.776708, 0.143081, -0.918150, -1.271382, 4.251751, 2.227284, -0.468386, -0.818391, -1.076629, 0.803174, -6.778745, -2.332751, 3.314303, -0.854069, 2.525487, -0.036530, 0.712685, -2.247428, 1.711706, -0.839512, 2.300299, -0.371730, -0.143227, -1.000555, 2.392848, 3.181103, 1.384507, 0.174168, 0.332910, -0.797669, -1.976347, -0.308509, -1.638045, -0.427884, -1.578577, 1.571453, -1.381518, -0.068900, -0.008149, -0.877026, 0.584573, 0.371052, -0.273352, -0.496754, 0.173186, 1.495046, -2.134991, -1.174339, -2.233019, -1.859445, -0.982453, 1.005392, -3.032267, 1.807362, -1.046106, 1.126163, 1.836564, 1.041053, -0.354948, -2.419614, -1.992808, 1.194339, -2.727120, -2.960786, 1.990473, -0.249863, -2.641395, 2.517450, 1.227859, -0.836136, -0.104000, 1.062187, -1.290700, 1.327066, -0.019623, 5.412015, 0.280590, 2.486149, 0.634194, 0.961871, -1.225696, 2.310711, 0.309103, -1.714766, -0.548396, -0.683925, -3.823319, 1.304894, 1.286261, 0.745414, 3.101287, 0.891395, 1.379889, 1.123019, 1.053005, 2.294336, 1.014520, -2.056264, 0.847806, 1.838467, 0.661211, 0.911672, 2.108672, -1.296724, -0.994294, 2.204733, 0.060958, -3.965935, -0.375877, 4.837624, 1.608993, 0.728340, 0.086287, -0.533132, 0.689582, -0.234022, -0.732648, 1.564658, 1.145785, 1.684504, -0.951721, -2.480582, 0.639165, -0.163595, 0.932444, -0.321780, -1.511633, -0.700274, -2.130924, -0.589078, 2.558939, -0.031520, -0.466372, -0.661476, -0.310624, 0.919263, 2.138045, -2.980811, -1.367038, 1.232296, 0.824462, -1.153319, -1.915287, -0.933534, 0.126949, -0.378158, 0.226273, -0.199483, 4.345228, 1.491282, -15.042450, -0.682710, 0.315321, 0.360537, 0.098894, -1.003842, -1.970495, 0.370964, 0.917547, -2.802694, -0.426734, -0.872954, -0.628911, 0.498273, -1.146430, -0.303363, 0.423223, -1.333830, 3.313324, -0.368433, -0.129785, -1.392973, 0.117655, 0.907470, 0.862268, 30.468317, 1.916252, 0.630351, -2.087536, 2.504394, 1.090119, -1.520968, -0.478656, -0.055788, -1.631508, -0.471805, 0.263214, 1.737623, -2.067608, 0.341913, -0.636885, -1.703791, -0.276518, -1.013657, 0.735331, 0.755858, -7.211546, -2.701548, -1.511194, 0.885756, 1.038019, -0.747457, -1.262162, -2.229699, 0.619854, -0.506964, -2.198322, 1.772692, 0.917558, -0.345676, 0.773990, 0.077003, 0.222814, 0.570620, 13.318283, 5.154265, -0.133818, 1.382821, -3.381096, -1.348515, 1.883829, 1.128500, 1.431208, -2.484697, 2.832257, 2.581187, -0.643860, -0.449537, 2.258231, -0.865092, -2.880285, -4.159235, 0.895552, -0.865830, 0.576851, 0.660565, -0.503361, 2.477618, 0.888527, 0.642267, -0.203590, -0.203652, 0.518252, -0.481242, -1.399775, 0.071838, -1.687822, 1.408637, -1.714611, -1.262840, 0.550977, 1.361458, 1.093706, 0.218738, 1.694335, 
-0.816886, -1.712049, -1.776350, 0.478118, 0.873915, 0.157106, -6.387165, 0.410236, 2.402042, -1.496522, 0.605822, -1.822789, -1.357874, 0.902568, 0.476761, -0.997130, -0.898986, -0.416213, 0.628011, -0.191016, -2.507825, 2.000534, -1.100833, 1.287321, -0.746681, 0.251129, -0.452919, -2.379659, 0.467141, 2.813297, 3.019051, 0.894045, 0.513003, 0.247807, 2.164137, -0.861424, -1.187832, 1.431251, -1.880267, -2.707241, 0.882622, -1.083287, -2.503087, -0.331748, -0.253308, 1.218072, 0.465451, 2.239259, 0.033480, -0.988318, -1.066254, 1.289019, 1.244425, 1.395825, -2.000483, 8.393459, 2.063885, 0.351303, 1.736136, 1.678012, 0.466673, -0.735065, 2.319815, 0.587236, -0.219373, 4.404061, 0.069035, -0.615926, -1.054781, -0.281119, -2.315298, -0.598869, -1.699938, 0.399952, 1.414117, 3.159378, 0.971840, 1.468741, 0.738671, -1.217917, -0.005656, -1.837169, -1.033584, 1.689942, 0.296139, -0.159082, -0.282257, 1.146659, 0.589294, 1.239523, 0.584590, 0.732567, 1.415604, 0.229542, -1.803726, 1.402160, -2.028142, 1.227429, -1.287164, 0.215315, 0.091171, -0.837675, -1.208822, -0.082121, -0.389508, -0.755331, -1.751180, -0.248557, -1.661491, 0.068512, 2.337994, 1.096209, 2.914422, 1.787869, 3.141023, -3.783829, 1.704273, 1.094445, 0.711651, -0.020473, -0.531446, -0.418532, -0.951771, -1.334428, 3.076059, -8.316907, 0.220659, -0.314838, -1.326944, 0.409516, 8.980949, 0.336135, -0.271100, 0.718912, -0.735141, 2.867171, 0.454470, -1.684832, -0.667039, 30.702013, 0.883889, 0.531766, -0.452834, -1.168136, -1.349033, 1.579897, -9.245929, 3.323505, 0.975022, 1.259668, 1.623177, 1.048897, 0.252429, 0.494380, -1.923661, 0.298914, -1.570870, -2.745278, -1.792378, 0.748294, 5.594098, -1.439187, -1.536032, 2.820908, -0.333814, -0.420440, 2.032806, -1.904369, -0.007114, -0.871720, 0.579277, 0.640428, -0.765459, 0.173253, 0.786739, -0.283481, 0.956241, -1.522335, -3.105874, -2.173901, -0.282985, -1.197513, -2.059102, -1.084585, -0.648405, -0.228381, -0.374229, 0.766084, -2.314189, 0.978999, -0.764588, -0.302045, -1.880411, -0.740701, 1.435884, 0.545858, 1.537653, 1.470700, 0.422056, 0.317391, 2.463825, 0.103666, 1.313960, 1.550173, 1.734758, 2.650702, 0.980899, -4.777202, -0.605416, -1.772424, -1.299403, 1.382249, -2.957427, 1.906892, -0.161418, 0.556454, 1.740305, 0.006908, 6.763729, -0.278677, -1.766817, 0.121939, 1.601009, -0.562171, 1.424215, 0.542079, -0.740449, 0.396567, -1.856834, -1.074328, -1.226289, -0.696100, 0.464042, 1.506954, 0.080288, -2.100569, 0.921613, 0.558173, 2.382897, -2.477536, -1.884583, -0.054904, -0.519191, -0.969873, 0.959781, 0.231381, 1.963495, -0.388346, 0.184281, -0.207316, -1.240958, 0.680580, -0.335134, -0.166712, 2.305028, -0.148513, 5.842828, 1.085152, 0.373194, -1.860990, 0.415133, -1.615274, -2.057973, 1.233435, 0.705104, -0.175169, 0.365537, -3.113515, -0.644583, -1.924930, 0.268149, -3.299269, -0.044249, 0.769384, 0.747958, 0.502507, 1.253872, 2.567127, -1.534801, -6.195432, 0.239830, 1.061098, 1.498278, -1.147298, -0.012934, 1.279267, 0.256549, 0.932009, 2.801676, 2.601096, -0.057946, 0.010808, 0.225650, -0.888552, -1.055726, 2.577467, 0.214108, -2.550973, -2.023343, 0.168402, 1.144072, 1.188154, 0.007975, -1.752963, -0.353960, -1.058180, 1.402650, 2.647588, 1.578010, -0.623806, -5.160738, 0.940677, -2.087744, 2.504496, 2.207713, 0.738647, -3.554515, 0.701976, -1.332600, -0.665026, -0.837372, 0.036542, 0.498252, -0.463899, -1.281477, 0.037951, -2.718753, -2.519139, 0.797102, -0.969279, -0.007088, 1.170141, -0.555038, 0.601634, -0.725569, 1.991015, 1.571092, 1.635221, -0.142499, 
-0.239051, -0.623268, 1.279354, 1.173028, -0.863181, 0.221621, 50.890709, 0.459216, -0.634014, -1.232200, 2.451216, 0.997746, -0.070813, 4.072617, -1.105848, -0.707585, 0.129221, -1.666970, -2.221236, -2.830456, 0.259465, 0.248408, 0.067695, -3.389023, 2.130977, 2.272633, -2.333500, -2.583345, -0.533235, -0.838626, 0.675928, 2.638815, 3.564664, -0.836662, -0.464054, 1.113580, -0.739706, -1.093256, -0.590220, 1.229130, -2.069568, -1.136705, 1.287993, 1.863708, -1.156420, -0.803547, 1.157883, 0.605184, 1.073040, 0.025076, -1.252429, 1.357709, -0.065110, -0.653028, 0.069790, 4.635374, -0.414350, 1.573298, -0.352398, 0.538733, 1.097928, 1.345788, -1.038578, -0.814868, -0.190454, -1.798921, -1.403554, 3.546619, -1.749692, 0.060874, 0.027193, 0.516291, -0.682459, -0.085446, -0.505916, -1.123992, -0.330597, 0.062728, -0.334435, 0.210847, 1.488047, -1.092073, 1.356015, 3.290716, -3.166292, -1.558093, -0.793964, 3.183681, 3.087046, 8.326931, -2.114156, -1.133654, -0.425647, -1.151897], - "llama3.2-vision:latest": [-3.078805, -1.697646, 4.600920, 2.136139, -1.034632, -0.008391, -0.913782, 2.059704, 1.261938, 0.314856, -0.313179, -2.066850, -0.737111, 2.565375, -1.834170, 3.589492, -2.283495, 1.541524, -2.779084, 1.555042, -0.403128, -0.313654, 1.059721, 2.011516, -5.750325, -0.567921, 2.883536, -0.803768, -2.809469, -2.432648, 0.492391, -1.709821, 0.459170, -0.239044, 8.034880, 2.616068, -0.866570, -0.313675, -0.625120, 0.185346, 2.798899, -4.163560, 1.685675, 0.213714, 3.102599, 2.594100, 1.551703, 0.171097, 1.063220, -1.646748, -0.573196, 2.495553, -0.831499, -1.483251, 2.482893, -0.726516, 0.400158, -0.145284, -3.525773, -2.577699, -2.111852, 2.612575, -3.947710, 1.561014, 0.210879, 1.777725, -0.181272, 2.283506, 1.242015, -0.025549, 0.957746, -0.246194, -0.365402, -0.971253, 4.563991, 1.184461, -0.725158, 0.744156, 0.815184, -1.637800, -4.056623, 4.689488, 0.249045, 5.708639, -3.208293, -2.040814, 0.996086, -1.227924, 2.843411, 0.019621, -0.695887, 2.414440, -0.218883, 0.861006, 3.528445, -1.916659, -0.619786, 3.501029, -0.093414, -3.492441, -1.942818, 0.583332, -0.770895, 3.453049, 0.567662, -1.010252, 2.237015, 1.537082, -2.533475, 1.944466, -0.632733, -0.084942, -0.505062, -0.605591, -1.142169, 2.620798, -1.018153, 0.334478, -1.901000, 0.225330, 0.280273, -0.487890, -1.552703, -0.719227, -3.150110, 1.320974, 3.735471, 1.304873, 5.072580, -2.039533, -3.994229, 0.517429, 1.133909, -1.422137, -1.049232, 0.620438, 1.793169, -0.807552, 0.183158, -0.329035, -0.641384, -1.251977, -1.347493, 0.622846, -0.984846, 1.988598, 0.296134, -2.342518, 0.632345, 0.030586, 0.768713, 0.437475, -2.269910, -2.521681, 1.579402, -2.377503, -2.105729, 1.594941, -2.283249, -2.322279, -0.602992, -4.794743, -2.325619, -0.762731, -0.249267, -2.240785, -2.439840, -0.962278, 1.386412, 0.236573, -0.113179, 1.957054, -0.561917, -0.478650, -1.572174, -1.008380, -2.675298, 0.341852, 2.328920, -1.276262, 0.605668, 2.816123, -0.020307, -1.087129, 18.897930, 1.684022, -5.963904, 2.153839, 0.326535, 1.419691, 1.839680, 0.152121, -0.443793, 0.515642, -1.226702, -2.276216, 1.966202, 3.762245, 1.711866, 0.671341, -3.111674, -1.433167, 1.465878, -4.289611, 2.021384, -1.996823, 0.042214, -0.409925, -0.922436, -1.212982, 1.138044, -1.522454, -3.863219, -2.712918, 2.454779, 2.351706, 0.824271, 1.279557, 2.515165, 0.249954, -0.514616, -1.325547, -1.099375, -1.732998, 1.701366, 0.903615, 1.821438, 0.362362, -1.209264, -0.406613, 0.343815, -0.926961, 2.383206, -0.131069, 2.882418, -0.794012, 1.588547, 0.142149, 1.719791, -0.516556, 
1.390384, 1.157517, -2.705374, 3.152814, 1.891767, -1.230698, -2.227765, -0.907071, -2.255760, -3.713498, 0.391423, -0.435654, -2.571611, 0.210851, -2.287234, -0.744805, -0.057202, -4.453759, -0.717052, 2.153277, 2.111250, 0.667674, 2.024823, -0.369146, -2.738459, -2.051172, -1.193694, 3.160493, 1.187114, -0.628924, -1.819261, -4.691652, -1.558387, 0.826536, 2.190940, 1.007532, -0.518849, -2.375483, -3.083109, 0.717804, 0.421144, 5.876964, -1.385087, 0.068475, -0.233602, -0.143602, -0.377704, 1.460576, 1.612279, -9.952719, -0.860141, 0.009690, 3.998900, -2.104997, 0.633108, -0.447114, 1.322761, -1.092791, 1.379224, -3.249701, 0.721551, -1.894614, -0.110262, -0.938271, -2.394038, -0.695555, 2.551907, -0.666710, -1.585857, -0.769654, 3.607650, 4.665731, 0.014892, 0.578692, -0.946742, 0.955832, -1.183421, 1.518056, -0.766180, -1.134929, -5.823644, 0.987941, -1.487348, 2.554693, -0.725647, 2.227943, 1.358883, 0.959986, -1.001250, 2.546376, -1.055410, -0.220511, -0.629391, -1.756114, 0.999128, -0.474006, 0.172467, -0.361360, 0.087746, 0.241353, -1.736117, 0.641458, 1.114634, 2.208075, 1.267806, 0.002817, -2.069037, -3.187589, 1.093324, -2.561264, 2.068681, 0.762202, 0.362883, 0.582804, -1.015167, 0.106795, 2.033842, 1.161639, 0.377664, 0.452618, 3.925405, 1.386955, 3.140606, 1.267758, -1.133554, -2.524645, -0.955275, 2.711424, 0.770822, 1.372985, 3.062992, 1.510188, -0.150879, 2.115181, -0.426800, -1.095572, -0.336918, -1.500960, -0.942303, -0.226177, -0.733918, -0.031548, 0.936019, 1.037607, 2.740880, 3.736936, 0.013158, 0.783627, -2.299813, -0.794128, 1.020153, 3.677419, -1.156537, 1.287886, -0.735975, 0.271875, -0.541371, 2.015721, -1.514776, 0.217231, 1.587353, 0.698595, -4.475769, -1.661238, 1.777915, 1.495372, 8.351849, -0.992776, 2.359664, -3.734209, 0.888680, 4.039773, 1.338669, -1.540372, 2.827198, 4.282894, 0.447615, -0.325227, -5.473198, 0.158931, -2.082453, -0.471492, -0.908210, -0.229417, -0.812437, -1.082374, -2.361205, -3.133377, 2.533987, -2.056405, 0.330013, -2.951894, -0.052760, -1.753070, 0.411255, 1.419914, 1.145815, -0.162324, 0.704823, -0.556056, 1.881800, 0.478933, 1.291103, -0.936796, -4.451693, 1.578889, 0.183439, -0.686146, 3.263305, 1.043620, 2.137517, -1.800166, 4.166996, 1.393071, 4.382342, -3.738013, -0.384553, 1.550048, -12.476042, 0.659573, -0.358259, 1.873343, -0.827457, 0.601567, -1.043456, 3.390819, 0.940549, 0.042124, -0.425845, 1.186888, -0.495413, -0.849585, -1.485713, -2.023389, 3.484549, -1.132264, 2.287027, -0.184956, 0.862702, -4.589354, 3.184383, -0.465502, -0.568603, 1.523978, 1.301721, 2.099811, 1.112086, -1.031363, -5.436310, -1.959793, 2.574366, -1.392580, -0.500089, -1.667079, 0.266578, 1.470267, -2.391178, -2.849307, -2.344163, 0.151920, 0.573642, -1.668947, -1.790319, -2.520195, 1.811686, -0.149748, -2.250818, 1.122893, -2.718899, 0.765947, 0.051963, 1.497161, -2.095890, -2.668485, -0.750255, 1.363428, 0.544781, -1.652589, 3.956085, -1.285231, 3.104691, 1.370670, -0.483672, 2.409342, -1.230793, 0.431622, -2.930366, -1.572196, -1.468563, 1.024296, 0.239093, -2.253988, -0.034438, -0.042014, 0.333263, 0.990264, 0.407541, -0.725304, -0.715123, 0.676341, 0.069297, 0.102975, -0.306056, -1.024400, -0.794437, -0.089673, 0.943760, 0.686709, -3.228787, 0.669426, -5.304088, 0.056853, 1.038435, -0.478588, -0.638576, 1.946829, 2.407026, 3.066535, -1.730606, -0.888373, 2.951096, 3.153862, 0.237999, 0.558091, -2.061528, 0.783972, -0.139891, 0.005644, -0.081209, -1.382260, 1.061232, 0.716218, -2.590728, -0.990372, 1.266114, -0.707920, -3.964265, 0.948937, 
-4.407448, -0.231940, 0.264556, -0.267132, -3.774856, 2.497759, -0.914523, 0.970749, 3.113668, 1.660681, -0.062968, 0.503072, 1.886241, 1.214914, -0.328673, -1.813600, 0.017637, -0.110344, 3.279947, 1.179452, -2.311359, 2.184004, -1.036836, 2.163837, 0.106519, 0.639714, 2.886490, -2.178060, -0.599363, -1.816274, -0.672925, -1.578791, -0.505086, 4.701667, -0.958578, 5.368207, -0.715092, 1.727489, 2.659025, -1.855298, -2.369414, -4.724561, -0.572714, 1.991303, 2.016687, 1.089363, 3.728837, -1.101324, 2.744128, 2.082385, -0.082763, 0.874934, -0.554436, 3.319520, 0.412499, -0.356465, -0.818570, 2.214372, -0.868555, -5.018708, 1.333610, -1.507039, 1.224960, 9.023772, -0.491060, 1.643431, -1.231728, 3.141759, 0.249819, 0.606284, 1.929034, 0.079260, 0.980546, -2.081781, 0.985507, 3.136811, 1.951684, 1.112473, 3.010893, 3.541259, 3.218399, 1.626346, 0.683307, 1.771878, -1.257802, -0.757362, 1.514842, -0.636001, -0.430890, 1.693416, -1.821938, -0.486594, 2.916702, -4.420597, -0.468898, 1.278626, 1.723206, 2.731259, 2.001235, -0.968886, -0.304650, -1.206299, -2.135083, -0.916858, -0.571528, 0.047041, 3.504944, -2.133256, 0.494332, -0.482203, -3.457354, -1.942995, -0.520856, -0.325685, 2.403196, 1.200350, 3.393925, -2.494207, -1.327450, 2.736926, 1.736917, -2.051530, -0.506557, 0.020822, 2.461713, 2.484358, -1.360792, 0.210698, 2.571049, -2.315030, 4.404229, -2.766439, -0.356564, 0.609894, -0.005560, 1.713071, -0.469658, 1.197658, 0.040987, -2.285223, 1.484401, -1.409689, 4.578481, 2.506946, 0.425305, 2.446197, 3.263529, -3.631989, -0.371354, -0.413407, 4.208085, -0.127080, -1.246227, 1.026809, -0.691294, -2.296336, 0.541863, -0.415415, -2.213721, -2.229713, 5.652533, 0.325638, 0.895194, 0.288386, 2.395745, 1.893746, 1.656826, -1.891253, -0.981483, 0.977041, -1.021206, 1.278371, -0.601382, -0.586926, 1.910359, -2.599929, 2.117822, -0.329484, 2.511219, -2.765614, 1.057920, 4.557327, -1.286663, -2.828605, 1.715522, 0.892571, 1.328999, -1.452793, 3.062454, -0.846668, 2.685005, 0.747479, 2.616977, -0.771560, -1.170870, 0.398534, 0.343821, -0.425957, -1.216428, 1.275198, -0.774221, -0.835212, -0.086740, 2.298670, -0.739717, 0.506853, 0.935785, 3.649465, -2.203318, 2.277899, -0.553957, 0.137781, 0.643336, -2.148784, 1.281616, 1.584459, 0.310634, 1.217097, -1.326976, -0.676382, 2.128299, -0.981715, 0.061699, 2.889244, -0.561955, -0.823019, -1.973488, -0.304435, 0.426746, 1.849452, 0.824656, -0.400681, 1.364501, 2.863911, -1.468426, -2.614557, 2.445089, -0.290599, -0.084631, -0.135070, 1.321539, -1.531789, -0.796952, 1.839522, 1.379722, 2.139342, -0.658808, -1.957961, 1.888349, -0.784370, -1.480167, 0.501366, -2.519140, -1.949068, -2.253042, 0.033506, 0.485347, 1.676559, -1.090220, -0.454366, 2.439051, 4.116050, 0.226810, -1.543618, 0.369009, -1.356788, 1.256831, 2.512082, -5.875708, -0.840281, 1.064519, -1.075835, 0.661105, 0.859369, -2.564416, 0.931348, -0.657630, 0.169598, -0.473798, -1.840275, 1.967578, -1.207092, -2.994502, 0.979585, -2.226941, -0.148251, -2.205890, 0.192183, 0.761379, 0.832209, -0.757570, -0.044144, -5.811568, 3.817863, -0.739124, 2.566487, 1.508290, 1.364889, 1.391061, 1.753602, 2.484661, 0.561939, -2.577687, 1.308320, -0.658141, -0.613247, -1.009616, 3.694613, 3.931254, 1.646078, -2.630128, -4.893660, -2.893271, -0.949201, -0.252491, 8.096486, -2.053660, -2.715685, -0.488898, -0.369804, 1.844169, -0.557177, -3.507263, 0.625425, -0.046343, 1.116201, -2.843462, 0.430182, -0.984149, 0.188606, -4.804567, -3.662477, -12.423755, 0.505854, -2.556453, -0.261161, -2.620847, -1.616541, 
0.459177, 0.800119, 2.372884, 1.032130, -0.833381, 1.391556, -0.065205, -0.212230, -1.258538, -1.861240, -0.342623, 2.821825, -3.306765, 1.124912, -2.588235, -1.198842, -2.072963, 1.051923, 2.765494, 1.943674, -0.649781, -0.090553, 0.010493, -1.542902, 7.146881, 2.114514, 2.671738, -2.078392, 2.603645, -4.148600, -2.695807, 3.128589, -1.877043, -0.506973, 3.315228, 3.669570, -1.232590, -0.327188, -1.176315, 2.137116, -0.240232, -1.170810, -3.713598, -0.264318, -0.798505, 2.017999, 0.936734, 2.098501, 1.205739, 2.284950, 0.970075, 3.022959, -0.798790, 0.537358, 3.912182, 1.456274, 0.925978, 0.233295, 0.655816, -0.774384, 0.546590, -7.141086, 2.124636, -1.520990, 0.233711, 2.825353, 0.444932, 2.467602, -0.684698, -2.486389, -1.738088, -3.114126, 2.726388, -1.019151, 4.807260, -0.181204, -1.084334, 1.163428, -0.109641, 0.964845, 0.559943, 0.336826, 0.089202, -0.693184, -0.897094, -0.925852, -0.158974, 2.087272, -0.549617, 1.179534, -0.658688, -0.299359, -2.097598, 0.542013, 1.209597, 0.868445, 3.104147, -2.820510, -1.204089, 3.737103, 0.770909, 3.525891, -1.135699, -1.655476, -2.240339, 2.525290, 1.211621, -0.105237, 0.486678, 0.251381, -2.101022, -4.316065, 1.071823, 0.767139, 0.222749, -1.501596, 2.875869, 2.010502, 2.222627, -2.201579, 1.083319, -1.388173, 1.759563, -2.029091, -1.098797, -2.860517, -0.240031, 0.644087, -1.932882, 1.401318, 2.215424, 0.366056, 0.841219, 0.509652, 1.678419, 0.370270, -1.852237, -0.313416, -2.199522, -0.294784, -2.797149, -1.652146, -0.632833, -4.488834, -0.143081, -6.097857, -5.642374, 1.487491, -1.104113, 0.571834, 0.434285, 2.254967, 1.653841, 2.349760, 2.489481, -2.731234, -0.345902, -3.468615, 0.002949, 1.483104, 1.326606, 1.378591, 0.229062, -0.190730, -2.039232, 0.562338, 0.645633, -2.672769, 3.342596, 0.399291, 0.816063, 1.965873, 1.918716, -1.087965, 0.048307, -0.756520, -2.151526, -1.697711, 0.347615, -3.180706, -0.146576, -2.279339, 3.271249, -4.460975, 2.981601, -2.770810, -0.264376, -0.206536, 0.586737, -1.217057, 1.638721, -0.420798, 1.459608, 2.833084, 0.065077, -3.276213, -4.026783, 1.695028, -0.414748, -1.509182, -2.335618, 0.645046, -4.259199, -0.605817, -2.499427, 0.038157, -0.863183, 2.061970, 0.611956, -2.115386, -3.359080, -1.596068, 0.307425, 0.191455, 0.361744, 0.899717, -1.701859, -4.992336, 1.584893, -2.895921, 0.149289, -1.404445, 0.501034, 2.682879, -0.675067, -0.786691, 1.069297, -1.640006, 0.099055, 0.312075, 1.645067, -0.090822, -2.817954, 0.371475, 3.697231, -0.769372, 1.777448, -2.683017, -0.970097, 3.940720, 2.429331, -0.532402, -0.217043, -1.411375, 2.891897, 0.184554, -1.693691, 0.541098, 2.303737, 0.010059, -2.598566, -2.101953, 1.095212, -0.516849, 1.995777, -3.868278, -3.872117, -2.956234, -2.438829, 1.945905, -2.324775, 1.419934, 4.604033, -0.205442, -1.559651, -1.537539, -0.980349, 2.164563, -1.639275, 2.282555, 0.196895, 1.739233, 6.665299, 11.612860, 2.788843, -1.838532, 0.987776, 0.625764, -1.816129, 1.568187, 2.474130, 0.996020, 3.790505, -0.408657, -0.948997, -3.530251, -1.080017, -3.417130, -2.058712, -1.956407, -2.989211, 1.230417, -1.237799, 1.329787, -1.045655, 0.436384, -0.770900, -0.847061, 0.136578, 0.058188, 3.483011, 2.304829, -0.042124, -0.562030, -0.265483, 3.568855, 2.773987, 0.164709, -6.854160, 2.262440, -1.915387, -0.935466, -2.306059, 1.185262, -2.824844, -1.654353, 3.939613, 3.268545, -0.325672, 1.448018, -0.088753, 1.695897, 0.997020, -0.638259, 5.060489, -1.404778, -0.066517, -1.533605, 1.931493, 1.549286, 1.261844, -0.025092, -5.829014, -1.885451, 0.096401, -0.849005, 1.323331, 1.640723, 
2.387309, 0.977560, 1.003233, -2.851422, 2.125631, -0.622645, -0.251111, 0.108200, -0.860243, 2.165220, -1.747716, -1.526603, -0.160664, -2.234724, -3.378771, -1.829369, -0.828802, 1.092334, -2.838180, 2.241061, -0.070938, -0.125325, -0.533843, -0.900653, -3.907611, 0.291193, 1.380849, 3.148268, 2.248498, 2.641789, 1.162023, -1.307409, -7.452969, -5.762230, 0.570644, 2.225794, 2.268607, 2.220694, -1.868391, 1.590621, -1.087466, -1.318701, 0.355766, -2.192402, 5.148515, -1.260565, -0.340788, -3.066326, -4.802943, -1.822399, 2.027146, 2.033635, 2.484090, 0.864079, -2.444166, -2.060366, 0.691616, -0.893325, 1.614154, 2.483134, -7.538818, -2.377872, -0.144321, -0.343106, -0.466051, -1.690653, -3.003664, -0.220598, -1.996093, -1.322911, -1.798978, 0.656198, 1.923437, -0.542048, -0.335357, -0.584021, 0.888298, -1.122800, -1.792932, 0.603619, -1.189901, 0.216393, -2.956912, 0.420574, -1.393399, -1.954148, -1.921767, 4.165194, -1.175791, 2.798207, -10.199874, -0.837972, 1.866549, 4.939780, -1.918338, -5.765625, 1.894684, 2.931093, -2.691194, -1.582052, -0.238202, -2.357203, 3.227861, 2.676049, -1.514403, 0.610364, -2.333433, 1.532472, 3.116742, 3.468613, -0.903491, -3.069241, 2.971029, 1.897196, -2.358795, 1.614014, -3.038753, 0.296462, -3.413616, 1.476674, -2.205294, -0.426963, -1.149141, 0.687046, 1.459785, -1.579808, -0.128799, 1.095678, -4.413157, -0.583210, -0.037564, -3.217687, -3.874143, 1.367258, -0.658092, -4.242138, -1.566653, 1.616565, -0.231850, -0.561782, 0.175397, 0.873898, 2.102082, -0.481464, -1.669100, -0.535181, 1.875431, 4.580070, -2.197358, -0.910670, -0.982496, -2.303578, -1.195334, -0.761116, 1.036271, 2.430554, -1.472700, 0.805644, -0.503828, -0.386881, -0.879527, 0.931079, -0.833261, -0.988179, -1.804157, 1.147231, 2.970528, 0.660973, 1.180153, 1.487581, -2.030739, 1.501949, -0.520467, 1.889769, 1.071903, 0.089176, 0.463579, -1.126580, -1.210711, 0.082390, -1.625102, 0.874795, -1.195674, -3.522628, -1.638729, 3.287249, -1.166453, -0.828902, -1.406641, -4.649971, -0.544490, -1.771114, -0.274173, -1.582270, -1.786603, 3.387163, -0.859107, 3.550398, 0.630316, -1.103999, -2.080497, -2.685435, 4.383327, -0.171877, -0.829238, 1.843727, 0.737404, 1.495080, -0.509761, -0.942684, 0.598197, 0.137417, -2.112840, -3.016319, 2.016804, -0.613345, 2.634976, 3.474667, 2.299239, 3.921109, -0.259313, 2.293547, -0.668590, 2.097881, 0.135472, 1.272800, 1.023938, 3.750861, 3.546648, -5.987249, 5.189409, -2.012692, -2.080787, 3.648663, -1.718726, -3.059613, 0.914977, -0.070629, 2.184004, -0.554282, -1.572647, -4.378010, -1.895564, 2.295761, -1.807297, 0.939028, 3.319588, 0.981454, 1.846804, -0.074813, -0.959925, 1.180125, 0.214293, -1.340116, 2.026832, -0.275083, -2.391880, -2.757796, -2.484968, 0.950552, -1.653520, -1.723869, 1.662331, 0.535817, 0.273211, -1.151067, 2.993492, -0.075704, 0.320338, -1.407653, -1.041616, -1.881412, -2.501787, 1.301393, -1.517647, -2.464800, 0.252449, -1.997724, -3.026290, 0.212039, 1.502458, -4.246790, 1.029287, 2.246881, 0.741828, 0.158541, -1.028801, 2.529005, -0.893468, 1.443282, -0.346494, 2.669097, -2.864852, 3.209790, 3.041502, 0.020234, 0.115243, -2.031523, -0.282542, 0.396332, -1.363705, 3.021288, -0.318689, -1.363607, 0.955074, 0.509633, 2.074913, -1.201262, -2.390836, -0.396984, -1.613606, 0.360301, 1.139240, 0.839843, -0.285194, -0.322745, 0.437023, 1.281257, 3.713523, -2.683018, -2.837252, 2.113353, 0.791203, 2.814537, -2.278843, 0.702416, -1.589167, 1.516128, 0.348960, 1.021993, -2.720528, 2.671391, -1.391619, 2.161263, 1.234758, 3.476467, 2.706942, 
-2.214298, 0.062933, 3.817600, -1.251961, 3.566387, 1.513068, 0.747667, 1.597302, 3.858117, -1.373156, 0.729746, 1.749554, 2.064092, 0.612888, -1.871164, -0.993573, 1.980641, -0.394882, -0.771789, -2.300255, -1.864526, -1.332325, 0.679321, -1.628580, 1.684532, 0.794181, -0.341031, 0.865944, 1.570801, 3.436136, 1.568980, 0.631498, -0.048965, 0.418611, 0.775022, 4.532741, 0.286989, 1.137858, 0.944515, -2.167217, 3.400816, -2.120444, 1.896021, 0.739563, 1.800057, 0.976187, 2.241848, 1.109004, -0.139124, 2.273750, 2.243190, -0.000260, 7.102096, -0.981638, 0.517041, -0.106855, 1.611364, -0.680890, -1.851746, -0.071771, -0.558747, -0.778105, 1.001288, -2.798460, 4.107984, -2.121557, 2.265193, -0.130054, 1.358769, 0.994828, -2.313165, 0.441207, -0.300783, -0.672292, -1.479555, 3.422815, 0.001658, 0.206383, 3.627066, -1.661040, -1.218103, -1.546490, 3.201838, -0.425097, -1.208788, 1.069553, 1.511283, -2.078781, -0.688469, -2.191390, -2.106024, 1.625274, 2.402612, -0.825243, -4.379784, -1.812769, -0.058542, 1.346276, 0.333447, 2.418711, 1.475576, -2.620394, -0.280505, 1.131126, 0.607015, -1.094618, -2.319695, 1.346097, 2.262267, -1.525486, 1.444179, 0.501193, -1.753824, 0.733545, -2.394020, -1.725506, 0.408842, 1.683417, -3.182556, -0.488479, -1.059273, 2.058362, 3.142289, -11.317917, -2.676118, 2.479192, 2.741839, -1.815452, -1.900564, 3.863253, 0.957858, 1.753211, -2.094190, 1.561213, 1.603490, 1.401700, -1.423863, -1.557342, -1.070505, -1.258466, -2.814864, -0.233111, 1.705125, 1.368987, 0.674738, 1.317454, 1.040991, 0.379262, -3.057135, 1.303494, -0.719236, -0.150981, -4.246114, 0.012882, 2.754486, 0.887157, -2.154360, 0.434403, 2.080772, 12.592675, -1.400895, 3.682422, 18.500687, -1.824731, -1.229960, -1.306891, 4.426475, 4.153411, 0.296232, 0.334915, -0.198403, 0.207817, 1.373839, -1.809203, 3.648643, -3.357051, -0.607143, 0.804729, -2.215043, 1.579057, -1.446862, 2.116117, 0.022489, -0.106560, 3.259476, 2.179984, -0.827409, 0.466238, -0.174084, 0.749496, -2.032082, 0.819768, -0.012092, -1.404604, 2.040248, -4.373999, 2.905264, 0.524823, 3.290520, 1.260755, -1.269325, -0.546155, 0.301613, -3.231517, -2.633416, -0.817336, -2.118700, -0.650470, -2.887156, 1.295805, 1.893788, -0.623003, -0.415323, 3.246930, 0.012536, -2.414260, -1.570900, -0.459119, 2.363632, -1.078479, 0.124088, 3.440009, 1.118437, -1.124332, 4.910594, -1.549336, -1.430183, 1.136268, -2.112774, -1.333619, 2.121091, 2.231785, 1.296702, -0.906199, 0.965406, 3.869365, -1.652183, 0.759212, -2.980289, 0.682221, -0.984810, 0.634504, -1.211771, -2.491594, -0.945581, -2.850454, -0.138457, -0.431604, -0.632796, 1.385889, -0.483601, -1.077885, -1.354634, 1.187483, -0.412081, -0.877794, -0.520237, 0.527379, -2.126064, -1.391075, -0.027463, -3.005242, 0.156661, 2.754908, -1.378801, 0.082020, 0.123864, -0.290277, -3.230394, 1.301592, 2.213902, 3.094155, -1.755002, -2.322603, 0.064931, -1.328647, 0.874909, 1.640637, -0.022136, -1.437375, -0.168740, -1.580431, -1.466243, 0.074297, 1.542410, 0.640596, -2.803895, 0.838751, 1.192846, -2.151078, -1.352274, 4.536688, 0.652296, -1.681036, -1.748077, 1.116713, 1.505466, 1.093051, -0.850750, -2.080233, -1.093351, 0.568372, 1.615424, -2.187084, -1.067516, -2.221610, 0.670807, -0.322722, -0.557603, -2.128840, 2.599048, -0.295556, -2.070664, 0.996682, 2.272573, 0.704867, -0.482912, 0.789556, 2.423104, 1.283592, -1.183174, -1.525468, 0.091626, 1.759403, -0.894384, 1.708647, 17.310020, -3.481026, -5.126164, -0.462365, 3.699655, -4.189553, 0.381989, 1.898774, -0.363996, 0.458372, 1.472605, 2.367271, 
0.744145, 1.278426, -1.720926, 0.309616, -0.775783, -1.905312, -0.546735, -1.651521, -1.946627, 0.678068, -0.779717, -0.981420, 2.827802, -2.157915, 0.100543, 2.626758, -0.105437, -3.918166, 0.863819, -3.190261, -1.860274, -3.934666, 6.241655, 0.965956, -2.913852, 0.003365, 1.201703, -2.250198, 0.700441, -0.116006, -2.632267, 0.289158, -1.789185, 0.158896, -2.259541, -2.592633, -1.587397, -3.143592, 0.232405, -0.945033, -2.211038, 2.132655, -12.288932, 3.241026, -1.518620, 1.077391, 1.059972, -1.065658, -0.746511, -1.032448, -0.192777, 0.732772, -1.637340, -0.360361, -1.155054, -0.207818, 1.369705, 2.599172, 2.482543, -1.180554, -0.888696, 2.530247, -0.034697, 0.959176, 1.193800, 0.326233, -1.076948, 3.587785, -0.672961, -4.438885, -0.397180, -3.531070, -6.830092, -0.541381, 0.651230, 0.792656, 1.801674, 0.653098, 0.020145, -0.930110, 1.829328, 0.417755, -1.133758, -0.575342, -0.281619, -2.819133, 0.293893, 1.141983, -1.487329, -0.180050, -0.480632, 0.239821, 1.104678, -0.611782, -2.685412, 2.376577, 1.076170, 1.586790, -0.723516, 0.346962, 2.835624, -0.571163, -0.018835, -1.004567, 1.764784, -1.652679, 0.181482, 1.126542, -1.868064, 0.287467, -2.103253, 2.532168, 4.365986, 0.254799, -0.283266, -2.060601, -1.498800, -2.974444, -1.227938, 2.405658, 0.629228, 2.409974, 0.530177, 1.364315, 4.653522, -0.811898, -1.583435, -1.069651, -2.269577, -1.162140, 3.601405, -0.490714, -1.997232, -3.724590, 3.772781, -2.163326, -0.496667, -4.260191, -1.616646, -1.990680, 3.304270, -2.352931, -1.146297, -6.734396, 1.459528, -0.291793, -0.521500, 13.051759, -2.432004, -2.658777, -1.149259, 1.436345, -0.751880, -2.189231, 0.615586, -2.498409, -3.291502, -2.095250, 1.040146, 0.714031, 0.970416, -0.616222, -2.000639, 1.994921, 2.395315, -0.561833, 0.121436, -1.408910, -2.248885, -1.112001, 1.088048, -0.414269, -0.435527, 3.829697, 0.528129, 2.375221, -0.519734, -0.971582, 1.406152, 2.238408, 2.720003, -0.267679, -0.791799, -0.090492, 1.500209, -1.946325, -1.748135, 6.836010, -1.218125, 0.738863, -1.162376, 1.206608, -0.377882, 1.355038, 0.775188, -0.811568, 1.407529, -0.604065, 4.010166, -1.165298, 0.672806, -5.079141, 0.690641, 0.379462, -1.214549, -1.517621, 1.051511, 4.106417, 0.001753, 1.941271, -0.016778, -0.369193, -0.438197, -3.510782, -0.501950, 3.121536, 0.598490, 3.841994, 0.205602, -1.246792, -1.616723, -1.143543, -2.465981, -2.489971, 2.632675, -1.313524, 0.345787, 0.992921, 0.425921, -2.188962, 0.680292, 0.194884, 1.859877, -0.628118, 0.726368, -2.322854, -1.007814, 0.255692, 0.737325, 2.278359, 0.366714, -1.926799, -1.290199, -0.082554, 0.797631, 3.216966, 0.944166, -3.854357, 1.882313, -0.757823, -2.803020, -1.485754, 2.979990, 0.822588, -0.639839, -0.333204, 1.326457, 0.428489, 2.403607, -0.028208, 1.258138, -1.414546, 2.595982, -1.312079, 1.871267, 3.542688, -1.192679, -0.747623, 0.372664, 0.500303, -2.334963, -5.068396, -1.425156, -2.123881, -1.776343, -0.095469, 1.043374, 1.254840, -0.709997, -0.800484, 0.942643, -1.944042, -0.205155, 0.038740, 2.047468, -1.089243, 2.242003, -1.200643, 5.502473, -0.559836, 0.693430, -0.248119, 3.774752, -1.164453, 0.814244, 2.204838, 3.310038, -1.259518, -1.539988, 0.631875, 1.811592, 1.660107, 1.233070, 1.175259, -0.912320, -3.419650, 3.499059, -2.343933, 2.245733, 0.961249, -1.360731, -2.247202, -1.547391, 0.428645, 3.232787, 0.094637, -0.361900, 0.301173, -0.484404, 0.424156, -1.495115, 2.174474, -4.887064, -3.342853, 2.512707, -5.732465, 2.330478, 0.607593, 2.459414, 2.199535, -0.540271, -0.431682, -0.428440, -0.331239, 7.473131, -0.676626, -1.463685, 
0.149552, 0.061583, 1.886732, 1.978416, -1.099899, -2.279263, 1.284642, -2.398491, 1.477496, -0.910583, 1.879131, -0.457095, -0.015456, 0.256275, -0.456831, -3.490171, -0.004046, -0.551340, -0.684976, 0.764194, 0.847659, -3.879746, -3.626539, -1.568061, 0.991960, -0.737746, 2.614909, 1.940545, -3.691171, -3.245214, -2.179925, 2.383390, -1.783122, -0.506950, 1.127885, -0.890014, 1.435303, 3.048434, -1.386346, -1.076663, -2.896395, -1.343953, -2.565495, -1.381629, -1.268511, 1.228591, 0.993910, -2.575418, 0.648286, 1.532965, -0.935785, -1.605901, -1.246196, -2.201324, -0.857666, 3.537265, -3.682135, 0.672675, 0.951825, -0.827279, -3.059412, 4.575532, -3.718941, -0.977951, -1.166684, -1.237993, 3.090737, 1.565580, 1.461544, -1.929372, 1.394221, -3.346243, -0.226639, 1.119265, -1.345803, -0.899012, -2.456377, 1.593067, 0.910089, 1.503470, -1.812514, -1.400435, 1.177375, 0.096689, -0.294497, 0.204516, 0.525131, 1.091468, 0.787260, 0.021002, -1.185784, -0.505473, -0.187676, -1.750899, -3.509349, 2.650220, 1.842912, -1.387142, -2.165509, -0.041363, 0.090037, 3.887540, -2.342398, 0.609889, -0.786447, 1.820952, 1.236354, -1.621533, 0.662158, 2.572353, 0.277775, -0.603314, -1.811826, -0.732945, 1.049378, -2.206546, 1.959944, -1.303532, 0.861363, -0.612073, 0.364254, 1.629579, 0.628865, -0.018568, -3.558367, -0.744735, -1.004377, -0.180408, 0.018036, 0.942299, 2.755231, -3.410764, -1.202904, 1.628659, 0.269436, 1.631451, 2.589946, -0.040498, 1.630141, 0.496223, 1.869666, -0.404186, 1.478820, -0.379239, -3.533465, 2.040182, -0.441357, 0.470875, -0.983391, 1.173682, 1.085656, -0.516549, -0.635032, -0.766274, -2.671732, -1.281379, 1.456411, -0.915409, -3.183290, -0.204772, 2.401025, 2.152182, -0.233675, 0.305950, 2.888514, 1.930429, -1.184927, 1.662104, -0.271050, 1.988250, -0.984551, 1.636173, -0.936373, -0.207487, -3.862692, -0.202516, 1.630704, -0.943317, 0.691296, -1.044768, 0.531007, 0.471725, 0.821349, -1.234496, -1.380861, -4.413808, 0.437657, -2.809109, 0.977942, 0.700759, -0.938202, 2.930912, 0.906982, 0.537006, -0.933399, -0.012241, -0.700445, 1.411654, 1.602838, 0.170198, 1.034546, -0.294352, -0.277783, -3.215640, -0.310807, 1.199179, 0.741775, -2.316380, -0.154027, -2.767129, -2.038565, 1.119279, -2.533041, 0.076981, -0.187397, 1.210819, 1.391541, -2.467751, -0.143722, -0.905126, 2.260527, -0.993221, -1.232143, 1.834083, -0.804748, -1.330043, -0.310126, -2.712278, -2.764476, -1.708571, 1.639717, 9.467136, 1.056757, 2.668218, 0.011055, 0.992315, -2.199364, 2.513154, 2.435208, -2.443968, 0.233445, -2.318432, 0.602261, 0.931487, -2.197106, 2.200380, -1.881728, 3.087734, -0.060384, 1.490917, -3.820469, -2.404525, 0.082236, -1.416263, 6.533288, -1.271866, 0.874339, -0.064727, -0.294185, 0.155135, -0.258834, -3.481010, 2.522374, 1.355910, -1.698231, -2.581586, 1.589780, -1.474311, 3.495509, 0.679703, -4.357119, -0.388887, 2.020856, -1.231388, 1.933426, -3.399568, -2.166835, 3.597775, -2.225888, -2.851705, -4.492297, 2.081487, 1.752849, 0.931443, 1.775176, -1.427293, 3.582803, 0.461577, 0.847037, 0.082594, -1.585400, -1.084166, -1.576557, -0.957542, -0.661302, 0.782861, -0.174181, 0.403607, -0.843751, 0.629765, -0.536972, -0.854492, 3.225008, -3.574810, -2.818572, 4.156142, 1.869532, 1.431887, -0.001005, 0.652833, -0.933383, 0.058137, -1.479301, -2.311369, -3.555691, 0.354053, 1.014045, -2.245627, -3.368932, 0.047818, -3.167487, -1.033651, 0.841527, -1.111248, 3.357744, 1.951621, 0.393261, -3.040123, -3.178878, 1.615363, -0.041975, -2.667777, 0.146986, -1.417583, -1.627755, -0.685473, -3.537882, 
0.029217, -0.811283, -0.627015, -0.022089, -1.536894, -1.870402, 1.232272, -0.325790, 1.623947, 0.759523, 1.878842, 2.110918, -0.386001, 0.917781, 0.609635, -0.115149, 3.891633, -0.426539, -0.111032, -0.530549, 1.704758, -2.846103, 1.041818, 0.211156, -1.941607, 0.620646, -2.002606, 2.378968, 1.088245, 4.909698, 2.291356, -1.610609, -2.184232, -1.090413, -0.585811, 1.194546, -1.638153, -0.257659, -0.407240, 0.275248, 1.816893, -0.549051, -4.227658, -0.838440, 0.708863, 3.270518, -3.819626, 1.362298, 3.398038, 0.340970, -0.061836, -0.640408, 2.750417, 0.624174, 1.774026, -2.950428, -0.631979, -3.549893, 0.611664, -3.074150, 2.049044, -0.319013, 0.005977, -0.195755, -0.366373, -1.888581, -3.412463, 2.255754, 1.286729, -0.145309, 1.804818, 2.039304, -1.039636, 1.837572, 3.214562, -3.287841, -0.662308, 3.181302, -3.620912, -0.062886, -0.094632, 0.150107, 0.930212, -0.841085, -2.030969, 1.515888, 0.988359, 1.924312, -1.151969, 0.427673, -0.768772, -3.342158, 1.639557, 0.227930, -0.515050, 1.370292, 0.266383, 1.212152, -0.133667, 0.940615, -1.409638, 1.942192, 2.937795, -0.097905, -1.931062, 0.930637, 1.869293, 1.406385, -1.296549, 0.331738, -2.143225, 3.932092, 2.715863, 2.412196, 0.995437, 3.460501, -1.302808, 2.907689, 1.655309, 2.478021, -0.200031, 3.047060, -0.410674, -2.622407, 0.302213, -2.410561, -0.752223, -2.774459, 0.095784, -0.415811, -1.300381, -0.542583, -2.131463, 2.296709, 1.587769, -0.620450, -1.230258, -1.553957, 1.986467, 2.022719, 0.715672, 2.198387, -1.387220, -1.690441, -4.472634, 0.718367, -1.587503, 1.961340, 2.033993, -2.388419, -0.369963, 0.497210, -3.200239, -1.881070, -0.903583, -2.291441, -1.807256, 0.693820, -1.195831, -0.751228, -1.074330, 0.841028, 1.857826, 3.043664, -0.462683, -1.359275, -3.413938, -1.533384, 0.597736, -1.686284, -4.281181, 2.230808, -1.951093, 0.725956, 2.111386, -1.241247, -0.219312, -0.911483, 1.301086, 2.288175, 2.471712, 1.183948, 1.253952, -1.300000, -3.339571, 1.425936, -3.129603, 0.875244, -1.445471, 1.345114, -3.602144, -0.701855, 1.367753, 0.410516, -3.014481, 0.615465, -0.351461, -1.487927, -2.222125, -3.888845, 2.094670, -1.978802, -0.517767, 0.058138, -0.444669, -2.294036, 1.998730, 0.297695, 0.018321, 0.268706, -0.879354, 0.274088, -0.445150, 2.828982, -0.520435, -1.824163, -3.204499, 0.636438, 1.238371, -0.863154, 4.537728, 2.181285, 2.711806, -0.882168, -2.096454, 1.998263, 3.602588, 2.114964, -1.150518, -0.076333, 1.211806, -2.149293, -0.410827, 0.744497, -1.594841, 2.430060, -3.093566, -2.016082, -2.308710, 2.795972, -2.289312, -0.833563, 0.530762, 2.017911, 1.745501, 3.915513, -3.470293, -6.470500, -0.332230, 0.011448, -0.394733, 2.631524, -1.241708, -0.602433, -2.349052, 1.143003, -1.018770, -0.317715, 1.176016, -0.424689, 0.303274, -2.129854, -0.518942, -1.190350, -3.030563, 2.986264, 0.322974, -1.171382, -2.080170, -1.518298, 1.755744, 2.481019, -1.135633, 1.196923, -2.293517, -0.416247, -0.419748, 2.791564, 2.139885, 3.192137, -0.693876, 0.322415, -3.658330, 1.089203, 1.927066, -3.449436, 0.115316, -1.598851, -2.275302, -2.185232, -1.625353, -0.050324, 0.472127, 1.869982, 2.535981, -4.294853, -2.277530, 1.433655, 1.217977, -0.048806, 2.771290, -1.207959, -2.310196, 2.591723, -2.335873, -3.282704, 1.380378, 0.943002, -3.358639, -0.853271, -2.042148, 0.580582, 0.059815, 2.079012, -2.376285, -0.196453, -3.680857, 0.270507, -1.599318, -0.332833, 1.543115, -0.472989, -0.988075, 1.807841, -1.925379, -0.623305, 6.014649, 0.391908, -1.238512, 4.130530, -3.661225, 4.244806, 1.479349, 0.690953, 2.257133, -0.744723, 6.228973, 
-1.201092, -0.397441, -1.781538, 6.628207, 4.155041, -0.528522, 0.690781, -1.653167, 2.763517, 1.429862, 0.232714, 2.562456, -1.089225, -1.969519, -2.903962, -1.258486, -1.445670, -0.693330, 2.552694, -0.960994, -2.646074, 1.737455, -0.539961, 2.642196, -0.004014, -1.921011, 0.863813, -0.337553, -0.190130, -3.074884, -0.279881, -2.351591, 0.643860, -1.237673, -2.046865, 0.629775, 0.710130, -11.075646, 0.213202, -0.315120, -1.107641, -3.423430, -0.593594, -2.706415, -1.796721, 1.453879, -2.855584, -4.217345, -0.273067, 0.940133, -1.611402, 1.074368, -0.007585, 0.811808, -0.326986, -1.716057, -1.484485, 0.644776, 1.441688, -1.257548, -0.016782, -8.116377, -1.092561, 2.480797, 2.979291, -0.185106, -1.670235, 2.845420, 0.410733, -1.724248, -1.519051, 1.600042, -3.040879, 0.611754, 2.122114, -1.111939, -1.519439, -0.995059, -1.118172, 1.793956, -0.745843, 1.341350, 1.434149, 2.423525, 1.703777, 0.244239, 4.135608, 0.343924, 0.921309, 1.121611, 0.995589, 4.442313, -0.761273, 3.066107, -2.289390, 1.500767, -3.724264, 1.227392, -1.030693, 0.063110, 2.629669, 1.274395, 0.414828, -1.593909, -1.180849, 1.649198, 0.442606, 0.564936, 1.496706, 1.280906, -0.746933, 1.430545, -1.602895, -1.896697, 0.114705, -3.046932, -2.504452, -0.288728, -0.457942, 2.345242, -0.296523, 2.460886, 1.491791, 1.460493, 2.949695, 0.646835, 1.938821, 1.221826, -3.029753, 1.027855, -1.815510, -0.346786, 0.590048, -2.604692, 2.580613, -5.464456, -2.245464, 2.986708, -1.448367, -2.161349, 3.038980, -0.680506, -0.707855, -0.048277, 2.409130, -3.707219, -1.432573, -1.219632, -0.352755, -3.242210, -2.011038, -1.698111, -1.534498, -1.411064, -1.235086, -3.260119, 0.751513, 2.037949, 6.087719, -0.354985, 2.296287, -0.823595, -2.583332, 2.302197, 2.243156, 0.054603, -0.797066, 3.940995, -0.038713, 1.843858, -3.207862, 0.474149, -1.900639, -0.749204, 1.216251, -1.687184, 0.714031, -0.671786, -4.782870, 0.280564, -4.122239, 0.000554, -1.740705, 0.414890, -0.243921, -0.324114, 0.220153, 0.294286, -0.331063, 1.128937, -0.042819, -2.156856, -2.752139, 1.753495, -4.386309, 2.646769, 1.862037, -0.717543, -1.894679, -0.222581, -1.100053, 2.620292, -2.685768, 3.412238, -3.131670, 2.225328, -3.709669, -1.838988, 1.399511, -0.762291, 1.787900, 2.572352, 3.131334, -1.663082, -1.124640, 1.764323, 3.182696, -2.977489, 1.514403, -0.245818, -2.238850, 4.151056, 0.036864, -0.186692, -2.001691, 1.246032, 1.557085, 0.019165, 0.828518, 3.140849, -0.298136, 3.447491, 2.868086, 2.243513, -1.050800, -3.414328, 3.056563, -3.797553, 0.326561, -2.106754, -1.037465, -0.276275, -2.951207, 2.119463, -3.765703, 2.635464, 1.421755, -1.351970, -3.440002, -1.215871, 2.068440, 0.206381, 2.305964, -1.791120, -1.052097, 0.080710, 1.619676, 2.618592, -1.497641, -4.078834, 0.682429, -0.353784, -0.943733, -1.674624, 0.674289, 1.971855, -1.487211, 0.966145, 3.574993, -1.054608, 0.972266, -1.674012, 0.498910, 2.235878, -2.991302, -0.194584, -0.462182, -2.855854, 2.037934, -1.383906, 0.133169, -1.456793, -1.011050, -1.552601, 0.229914, -0.716652, -0.209454, 3.012683, -4.222848, 0.078118, -2.066082, -3.165344, 0.016393, -2.448389, -0.584697, 2.741099, 2.008625, 0.237100, 3.490952, -2.384674, -0.600877, 4.660543, 0.360961, 0.290765, -0.756962, -2.478991, 2.993396, 0.240600, 2.667300, 1.342028, 0.278699, 2.355097, 0.520877, -0.399971, -0.270937, 0.079413, -0.333582, 0.991009, -0.277407, -0.802320, 0.877358, -0.158195, -2.252553, -0.421490, -1.147035, 1.896074, 0.303360, 0.284368, 0.645959, -1.861567, -0.189420, 2.494082, 1.622385, -0.301365, -3.662149, 4.510878, 1.651112, 
0.298648, 2.755453, 0.300312, 0.577870, -2.157631, 0.022099, 4.050114, -1.113486, 1.166550, 1.433385, 0.636592, 0.365205, -1.710035, -5.819573, -0.750998, -0.700895, -0.252428, -2.303637, 0.122997, -2.406902, 0.796248, 0.074350, -0.191512, -1.336334, -0.727378, -0.217502, 0.768734, 0.026973, 1.666830, 0.837306, 2.636509, -4.413403, -3.378228, -1.265510, -0.660484, -0.527256, 0.652431, 0.810927, -0.955311, -2.194607, 3.390256, -0.676141, -2.678200, -2.246846, 4.665495, -0.300063, -1.505307, 0.372656, -0.314319, -1.683735, 1.244436, 7.202695, 0.347428, -3.931497, -0.384359, 0.070719, 1.927199, 2.370592, 1.543958, 1.292192, -1.171941, 0.515360, -1.998340, -2.632734, 1.160288, 0.932155, -2.313407, 1.186146, -1.297318, 0.411109, 2.077339, 0.377120, 3.878205, 1.169885, -0.514475, 1.380647, -0.220764, -6.186738, -0.987336, -1.161258, 0.022470, 2.944921, -3.944496, -2.454340, -1.350916, -3.333500, -3.976480, -0.813268, -0.936630, 4.470922, -1.197749, 0.999399, -0.848027, -0.556465, 0.876649, 4.624232, 1.716997, -1.316871, 2.161342, 2.106090, -0.884313, -0.900795, -1.254095, -0.401763, -2.136899, -1.426937, -1.633984, -1.857804, 4.080161, 2.620926, 3.743167, 1.912780, 1.157939, -0.232982, 1.485714, 3.776951, -0.728675, 0.096456, 2.419091, -2.536920, 1.749187, 3.919883, 1.045194, 0.124967, 5.034767, -0.020524, -5.442020, -0.340481, 0.847831, -0.570193, -3.708969, -1.694553, -1.775758, -1.774149, 1.016557, 2.744298, -0.981895, -3.203657, 2.381986, -0.130370, -0.575154, 0.981555, -2.114384, -0.703772, -1.544474, -4.912024, 1.436929, -3.612131, -0.513180, 1.124501, 1.500321, 1.228162, 2.164581, -1.008544, 3.386724, 5.138061, 2.143028, -0.163646, 3.132955, 0.232388, 0.140801, 2.374613, -1.336730, -3.779842, -0.695596, -1.955346, -5.124892, -0.852984, 1.089569, 1.047508, -0.899278, -2.671301, 0.775844, -0.359940, 2.370226, -1.953356, 1.204961, 0.575502, 4.395379, -1.004487, -2.043077, -0.088922, 2.446170, 1.487957, -0.670360, 0.311429, -0.970048, 4.440516, 0.804933, 1.320401, -1.760813, -1.227641, -0.585840, -1.419913, 2.561329, -1.393864, -0.514215, -1.485557, 2.117904, 2.916862, -0.557396, 3.088374, -0.601427, -2.839745, -0.335838, 1.241137, -0.483900, 0.794831, 1.195930, 0.649772, 4.032490, -0.526397, 0.963251, -0.133096, -0.534765, -0.111903, -1.648276, -1.044494, 0.361883, -0.918383, -0.419664, 0.178241, -0.791128, -1.730827, 0.185987, -2.588084, -0.286288, -1.370608, -4.314114, 1.166656, -0.317811, 0.544867, -1.790729, 1.485108, 3.057710, -0.402884, -3.525095, 1.046638, 0.469837, -10.401346, -0.187911, 0.602185, 0.784814, 0.686189, -0.628216, -2.192061, -0.726722, -0.583705, -1.816725, 2.630092, 2.962842, -0.098243, -1.397867, 4.522958, -2.306017, 0.158243, 0.021326, 0.669872, 1.986373, -0.030303, -0.977807, 4.633324, 0.498535, 3.087742, 3.187632, -1.225902, -1.390914, 0.624044, 1.865394, -3.805171, -0.420694, 2.681516, -4.424584, 0.019501, 1.461479, 0.945931, -0.460550, -0.178802, 1.539152, 1.010580, 1.188666, -3.036549, -0.074547, -0.933213, 1.277559, -0.826474, -1.801009, 1.385077, 0.442225, 3.276604, -0.478880, 1.084784, 0.558545, -2.902088, 1.052059, -1.853553, 0.754707, 1.477189, 0.535306, 1.659573, 0.157024, -2.818528, 4.611130, -0.069930, 1.186805, 3.071634, -1.309555, 2.216107, 1.327526, 5.102565, -1.661079, -0.484750, 2.699060, 2.793719, 6.261371, 0.641963, -0.288440, -0.679968, -2.719732, -1.884756, 4.892308, 1.928781, 0.405360, -0.971816, 2.584059, -0.217052, -0.968458, 1.336929, 3.495005, 1.708722, 0.110665, 0.281588, -0.007339, 0.161777, 2.841527, 2.780258, 0.515291, -3.523671, 
0.947459, -0.148462, -1.500179, -3.493957, -2.649297, 1.730525, 1.296232, 0.803082, -1.144793, 2.993541, 0.136460, 2.948369, 4.233619, -0.316875, -1.924663, 2.759971, 1.683363, 2.597641, 0.497635, 1.238039, 0.183853, 0.467309, 2.465760, 2.984601, -0.765347, -1.204559, 1.072209, -3.060474, -0.113188, -1.049060, -1.903181, 2.465544, 2.311688, 0.107047, -0.454180, 1.233230, -0.411675, -1.004517, -1.172773, -1.903938, 3.785788, 2.654215, -0.098252, 2.272377, -2.599964, 1.309094, 4.073755, 5.101512, 0.368129, -2.872931, 0.665823, -0.812287, -1.531622, 2.246672, 1.337333, 0.032180, -1.559512, -1.714184, -0.499598, -0.752196, -0.304698, 0.928405, 2.025278, 1.975079, -1.431077, -0.394958, 0.576032, -1.239387, -2.800098, -3.549074, -2.340450, -3.898644, -2.728451, 3.891414, 0.831638, 2.061581, 1.196748, 1.816497, 0.156793, 1.248291, 1.444935, 2.782322, 0.900815, 3.566337, 1.390690, -4.955343, 0.791208, -1.397928, 1.828276, -0.509986, -1.943480, -4.032582, -3.864328, -1.491560, 1.075016, 4.650043, 0.521032, -0.454500, 0.902760, 3.573976, -1.945750, 1.468956, 0.404240, 0.927548, 2.688647, -1.748163, 2.081175, 2.447916, -0.936185, 0.596772, -2.287498, 0.668699, -0.389346, -4.987719, -1.310262, -0.750813, -0.004687, -1.002931, -2.260232, 2.612141, -0.452705, 3.334452, 4.471656, 0.040492, 0.043492, 1.906444, 0.762864, 0.013549, -1.428000, 0.704618, 4.283254, -0.447447, 0.242987, 1.894843, -1.150813, 1.425953, 0.571005, -1.205558, 2.250842, 1.010226, 0.975433, -0.190685, 2.520607, -0.260550, -1.565487, 2.237750, 5.694161, 1.458744, 1.499833, -0.964584, 1.126270, -4.017033, 3.835517, 3.099368, 2.741929, 3.745870, 1.924112, -2.769214, -8.153638, -1.366967, -0.222473, 3.104762, 0.408815, -3.334017, -2.308936, -2.294003, -2.051878, 0.342104, 1.393579, 0.115670, 0.384816, 2.063236, -0.901094, 0.488691, 0.358792, 0.578016, 3.229413, 0.266376, -0.348818, -4.958932, -0.798056, 0.753551, -4.482573, -2.943725, -4.519968, -3.022888, 1.796830, 0.265710, 1.506794, 0.902000, 1.777172, -1.468453, 0.287163, 4.323891, 0.919122, -0.175923, -1.098132, 0.103859, 0.678686, -1.698002, 0.703255, 0.417760, 0.247697, 2.633260, -4.531164, -1.101134, -2.511764, 0.845692, 0.613309, -1.745634, -3.350141, 1.451234, -0.067550, 2.045863, 0.066918, -0.112667, -1.870865, -1.044443, -2.118799, 1.548138, -3.268719, 3.594374, -2.292007, -0.731400, 0.638149, 1.171789, -0.541452, 2.279704, 0.189897, 2.355322, 0.354103, 0.437733, 2.829152, 1.429048, 0.933853, -0.411465, -0.497428, -1.422664, 1.682440, -0.538568, -0.112830, 1.735603, -0.933443, 2.339074, -3.122605, 1.465831, 0.569450, 1.968063, -1.178774, 1.301933, -1.622244, -4.631080, 2.460362, -1.732073, -3.284402, 2.536322, -3.206595, -0.230824, 2.381750, -0.767115, -7.129733, 0.176164, 0.331321, -0.749577, -0.663327, 0.478035, -1.901979, 1.249618, 5.354519, -3.093740, -0.885269, 0.886041, 1.691122, -1.000635, -1.291685, -0.424720, -0.564069, -4.108914, 0.154632, 0.835152, -1.706249, -0.724605, 1.825291, -2.242805, 0.980393, -0.696546, 0.399662, 2.416517, -2.199752, -0.551790, 2.116515, 0.207906, 1.171264, 0.569540, 0.002868, -1.336563, 0.920318, -1.586634, -1.307527, -1.973062, -4.813841, -0.904314, -0.464118, -1.834300, 0.694039, 2.206343, 0.832556, -2.252901, 1.668644, 0.868798, -3.234017, -0.940149, 2.856216, 23.049131, -1.368276, -1.221370, -0.546990, -0.740844, -0.585601, 3.795197, 0.829124, -1.421780, -2.747960, 3.710470, -1.213560, -1.911591, 5.013275, 0.673200, 2.373624, 5.593239, -0.657087, 0.307902, -0.746474, 1.093833, -2.048117, 1.662055, 7.558481, 5.067765, -3.247648, 
0.402622, 1.452313, -2.936994, 1.637158, -0.491472, 2.239504, 1.444507, 2.413685, 0.213852, -2.163628, 3.432185, 6.881503, 1.099702, -0.305388, -0.558462, 1.285792, 0.311286, -0.607929, 1.466767, 0.773958, -0.885835, 0.165974, -3.087383, 3.033762, -0.593204, 2.427591, -1.382962, 0.657607, 1.489019, -0.537272, -0.025472, -2.250768, 2.848840, 4.054736, -0.610624, 2.775694, 0.896121, -1.135586, 3.143772, 0.262527, 2.628227, -0.285485, 1.432166, -0.431331, -2.357857, -3.904627, 1.781239, 0.233672, 0.943503, 3.086826, -11.265800, -1.675461, 1.690181, -1.013600, -1.180526, 3.663422, -0.594121, 2.752018, 0.648223, -1.827538, -0.465675, 2.111429, -1.462047, -2.965230, -0.563627, 0.869006, -0.651719, 1.894362, -3.315145, 3.054309, -0.267607, -0.312651, -0.644268, -0.030751, -0.662597, 1.378941, -2.936244, 1.098406, 0.021220, -0.233837, 0.533106, 0.207417, 0.242259, -1.860722, 2.138731, -0.163239, 0.123441, 3.316773, 3.244506, -0.519945, 0.607942, -1.150754, -2.454610, -1.068099, 1.211501, -0.863424, 1.330000, 0.356083, 2.411207, -0.378593, -0.009602, -9.444021, -1.246032, 1.298337, -5.255346, -3.966677, -0.270956, 0.393291, 0.164479, 0.425593, 2.018335, -0.624935, -0.044252, 2.483628, 0.823812, -1.851756, -1.228499, -1.269864, -1.755970, 0.393510, -6.143491, -0.245994, -0.033852, -0.529157, 0.950466, 2.724706, 1.496825, -1.469126, 1.589233, -3.745199, -0.860516, -0.340473, 0.733098, 0.764605, -3.821471, 0.853751, 0.148301, 2.213405, 4.123124, 1.283300, -0.087948, 1.402946], "qwen2.5-coder:latest": [-0.436007, -1.314809, 0.139666, 5.148615, 1.838311, -1.521601, -3.773632, 0.581152, -0.971788, 7.219580, -0.066787, 1.198955, 1.659349, 3.621571, 1.779006, -0.560876, 0.355461, -1.897615, 2.303185, 0.109255, 6.044055, 1.446214, 0.674157, -0.692821, -1.987091, -2.048842, -1.936660, -2.349168, -3.152107, -3.172126, 0.873809, -1.490791, 0.970821, 1.441998, 2.091025, -0.651286, -2.027265, -1.386296, -0.700867, 2.765764, -1.228565, -0.294553, 0.431368, 1.284794, 0.631266, -0.846027, 16.100187, -0.651704, -0.213284, -1.980950, 0.748919, -1.524519, -3.034139, 1.428813, -0.906473, -0.359399, 0.568063, -0.806468, -0.336932, -0.204364, -2.249646, -1.886204, -2.921830, -2.873020, -2.915986, -0.558776, -5.761330, 0.710662, -14.035521, -2.583575, -0.747539, 0.809441, -0.951276, 0.723559, 1.804405, 0.222171, -1.118399, -0.074350, -11.002135, -1.436381, -1.212464, 0.143454, -2.309623, -2.390476, -0.117248, 1.324861, -1.891298, -0.915926, -0.236022, 2.277486, -1.947304, -1.112398, -0.341477, -1.563656, -1.073132, 0.275794, 2.828944, 0.914620, 2.530140, -2.008723, 1.740619, 1.433238, -0.137113, 1.737560, -2.091690, -0.144877, -1.682601, -0.273133, 0.891697, 3.031501, -0.963722, 2.925961, -0.128959, 5.033491, -2.237801, -0.329063, -0.569064, 1.594402, 0.993208, -5.571981, 3.424447, -0.005736, -1.073074, -0.499183, -1.037315, -0.981740, -0.731634, -2.787812, -2.274329, 0.616070, 2.383073, 0.642813, -0.994558, 0.252382, -4.711788, -0.271676, 3.495092, 1.964713, 1.607625, 1.494871, 1.336313, -1.587251, -1.514070, -5.126953, 0.097945, 0.919678, 2.441731, -2.928982, -0.321582, 0.855205, 1.654471, -1.153838, 4.138180, -0.955069, 1.078292, -1.350932, -4.036607, 1.604253, -0.872791, -1.828661, 2.580978, -0.709960, 6.295347, 1.293245, -1.309786, -2.217097, 0.007649, -2.884886, 1.301318, -0.435597, -0.482649, 3.068917, -1.476866, 2.451068, -1.917819, -1.498407, 0.184997, 1.833188, 0.816717, -0.623813, -2.193322, -1.512282, -1.615667, -1.072993, -4.262381, 0.010529, 1.257217, -2.236494, -4.156431, -0.603306, 2.949536, -0.846157, 
-0.592448, 1.623096, 0.024929, -0.389215, -1.058699, 0.131432, -1.387448, -3.724334, -0.617063, -1.108744, 0.636820, -1.469211, 2.240824, 0.869600, 4.049538, 3.993536, -0.610457, -0.671499, -3.111218, -2.991330, 0.696155, 2.608358, 1.001497, 1.474154, -3.532830, 2.660499, -1.066343, 2.081527, -1.972203, -0.570178, 1.080361, 0.286836, 0.940870, 1.073439, 0.588628, -2.507718, 2.726036, 0.290909, 1.218498, -1.227438, 0.014571, -0.629023, -0.212335, 0.783704, 4.464511, -1.721464, 8.572509, -4.255687, -0.337520, 0.154504, -0.440107, 1.332823, 6.227478, 1.853769, 0.825418, -0.090258, -0.175270, -0.294410, 2.580056, 0.677362, 2.338051, -0.415923, -1.507566, -3.802147, 0.664998, 1.294685, 0.533289, -4.037803, 1.228877, 1.029611, -2.024710, -2.125656, -0.549981, 2.533221, -2.395383, 0.313111, 0.601733, -2.608969, 0.216491, 0.406232, 0.609647, 0.711135, 2.503493, -1.242730, 0.346222, 0.542468, -3.621418, 0.442082, 0.813194, 2.384302, 0.784717, 0.717723, -1.476249, -1.676991, 0.117224, 0.378211, -0.194512, -0.284983, -3.506894, -1.025167, -0.378918, -1.602937, 1.909185, 2.551661, 2.117800, -0.885118, 0.708412, -1.859785, -0.479130, -0.738324, -2.670425, 2.509346, -1.601109, -2.061625, -1.747532, 2.314368, -2.034420, -2.830469, 0.645515, -0.335051, -1.528542, 1.276098, 1.399137, -2.062893, 3.157044, -0.355756, 0.008239, 0.529171, -3.172691, -1.092223, 4.341345, -4.409624, -0.345799, -1.003428, -4.002488, 1.766916, 1.763788, 1.652610, 0.630642, 0.222056, -1.701876, 2.214486, 2.379781, 0.916326, 2.501186, -0.940371, 0.257316, 0.774674, 2.868283, 0.917253, 0.978641, 0.544919, 7.170478, 1.259203, -1.779769, -1.019169, 3.777537, 0.795997, 1.963864, -0.853626, -0.343095, -1.443662, 0.394236, 0.305709, -0.351705, 0.524120, -1.773943, 2.920990, -0.804305, -3.644200, 2.719176, 2.336826, 2.026261, 0.880051, 0.758393, 0.228448, -2.986163, 5.670512, -0.611930, -0.002579, -0.251571, 0.224900, -1.935357, -0.920759, 8.231796, 1.971320, 1.576411, 1.943412, -0.953862, -0.375397, 3.036218, -2.298655, 0.889425, 1.108079, 0.258971, 2.647190, -1.539036, 0.500190, -0.727908, 1.263729, -3.033226, 1.961906, 0.810480, -2.089427, 0.708266, -4.047872, -0.527808, -14.221658, 2.361469, -1.320704, 0.836442, -0.386591, -2.236613, 0.486746, 1.005684, -0.821714, -0.599291, 3.795557, -4.883873, 3.555826, 0.689591, -0.508781, 1.518971, -2.128448, 0.553367, -0.200060, 1.611065, -1.504008, -3.721884, -3.394505, 3.972555, -0.445324, -1.183890, 3.563913, 1.988339, -0.476866, 0.289509, 0.277188, -0.175615, -1.041347, 1.197368, -0.477900, -3.759550, -5.051634, -1.572159, -0.226668, 0.180172, -0.086704, 2.489282, -3.111502, -2.003207, 1.001518, -0.583694, 1.663272, -1.637953, 0.545772, 2.723981, 0.770260, 0.925456, 0.989657, 2.473933, 2.747635, 2.175160, 1.760652, -0.013310, -4.443879, 37.422813, 1.288954, 2.467471, 2.146290, 1.013962, -2.084025, 2.127673, -0.888463, -1.521253, -0.895446, 4.524404, -0.104275, -1.633331, -9.016782, 1.046254, -1.463881, 3.555183, -1.504391, 0.277646, -1.592795, 0.272081, -0.698069, 0.696722, -1.410604, 1.937863, -0.376410, -1.499335, 1.668633, 1.260768, -2.925495, -1.584171, 1.157681, -3.337044, 3.435848, 0.386049, 1.061394, 1.006677, 0.669047, -1.470088, 1.235580, -1.276175, -0.555348, -1.114300, 0.408720, 0.459978, -2.024472, 1.675984, 1.306379, -0.254271, -2.635481, 1.731658, 2.153453, 0.928214, 0.306720, -3.400702, 2.915029, 0.031397, 1.996717, -1.877975, -4.729939, -0.609253, 0.541097, 1.393276, -0.035789, -1.044343, 0.889185, -0.336026, 2.131327, -1.250257, -1.924997, -0.287107, 1.208007, -1.054186, 
1.337112, 1.274661, -2.434684, 3.016158, 1.469676, -0.695139, 1.505548, 2.100273, 1.219837, -0.772411, 3.593307, 5.785521, 2.612813, -3.146140, -0.555663, 0.243268, -3.966340, 0.486868, -1.174454, 1.676140, 0.317372, 0.213547, 0.103339, -1.636872, -1.078531, 3.002460, 0.008469, -2.326029, 0.867810, 0.871880, -1.080869, -0.768073, -4.836300, 0.383782, 0.691735, -0.713520, 1.354015, 2.611808, 6.183017, -1.084267, 0.292993, 0.816155, 1.273668, -0.560137, -1.290062, 1.882704, 1.010641, -1.717645, 1.066451, 0.092508, 0.932700, 0.062565, -0.448221, -1.144008, 1.872820, -1.933916, -4.155032, 1.226316, -0.842341, -1.021217, 1.994580, -0.036678, -0.072384, 0.812870, 1.374167, -2.501688, -0.321656, 3.544239, 2.224328, 2.181247, -1.169514, -2.582330, -1.573699, -2.363550, -2.202179, 1.673287, 0.711271, 3.818364, -1.154218, -0.725751, -1.511678, 0.080292, -8.115035, -3.127221, -3.172623, -1.133258, 5.428519, -0.574502, -2.239282, 3.485080, -2.038507, 2.057469, -0.269610, 1.192541, -1.718417, 1.374550, -1.552978, -2.797242, -2.503451, -4.544235, 0.636139, 1.108707, 3.459209, 2.595264, -0.305967, -0.003920, 5.865139, -2.725132, -1.437748, -1.930311, 1.015183, 0.153521, -0.693563, -1.841589, 1.740456, -0.131857, -1.259349, 1.107379, 0.814856, 2.672195, 1.490787, 0.823584, -0.483895, -0.674276, 3.958681, -4.697596, 0.824732, -0.919115, -4.634318, -0.438301, -0.317027, -0.150343, 1.811067, 1.930733, 2.765487, -3.315078, -1.065359, 1.905147, 1.231166, -0.812233, -0.079013, 3.098343, 1.337479, 0.038435, -1.502123, -0.722991, -0.738154, 1.651970, 3.549967, -2.832844, 2.526853, -4.530358, -1.405437, -0.020393, 1.433086, -0.355914, -1.094193, 0.618523, -0.507370, 1.855727, -0.352494, -1.034884, -0.052144, -1.694263, -1.334652, -0.578301, 1.324574, -1.522585, -0.899184, 3.180330, -2.992112, -0.966538, 1.624621, -2.214644, -0.544830, 0.516665, -2.317800, -0.701623, 0.938447, 0.935423, 7.276221, 2.074192, -0.549290, -0.723959, -1.024809, -2.093009, 0.378594, 1.273304, -1.574900, -2.294054, 1.627634, 2.471876, 0.272425, 0.226814, 0.288442, 0.485079, 0.329602, 0.418512, -1.030336, 0.618441, -0.205079, -0.751919, -0.533747, 0.987899, -0.886300, -1.857596, 0.970968, 0.782056, 1.377219, -3.131898, -2.278232, 0.560626, -5.161348, -0.380426, 0.495337, 0.181925, -0.484390, 2.722185, 1.441381, -3.070866, -1.380223, -2.827929, -0.428279, -2.406868, -1.231688, -1.393025, -2.365047, 1.180624, 4.354129, 0.116944, 1.813338, 0.807496, 5.356595, -0.893292, -2.298326, 0.608894, -2.458173, -3.912065, 1.234876, -0.789027, 0.989614, 1.082788, -0.383455, 0.617175, 3.955452, -2.602732, 0.340088, -2.055584, -1.332299, -2.340340, -2.757793, 1.896885, 1.322887, 0.021037, 2.907897, -0.321884, -2.944226, -1.640788, -1.364418, 2.449870, -4.300892, 4.101160, 1.867850, 0.166516, -0.710645, -1.172884, -1.457727, 2.929085, -0.218713, 2.206084, -0.883459, -1.040819, -2.344896, -1.173216, -0.345644, 1.559119, -1.632681, -2.044267, -2.017350, -0.654807, -0.716796, -0.901010, -1.560858, -2.691737, 2.685868, 1.782925, 3.339304, 0.565915, 0.391605, 3.610030, -3.298944, -0.705625, -1.543735, -1.870102, 1.338298, 1.612513, 0.385103, 1.112341, -0.001102, 0.658675, -0.503109, 1.369964, 2.857534, -0.630183, -0.912625, -1.217065, -0.550542, 2.285744, -2.127434, -3.157195, -2.480932, -0.733192, -1.617258, -1.182549, -1.050172, -0.540184, -1.505584, -1.097327, 1.250695, 0.358068, 1.122536, 0.025612, -0.006341, 1.774774, -2.286098, -2.659644, 3.301547, 0.470509, 1.290470, -1.164262, 0.626534, -3.474944, -1.804067, -1.576454, 3.310174, -0.299973, -1.118997, 
0.285524, -0.434010, -0.235016, 2.567713, 2.659858, 0.159478, 0.923787, 0.271707, 0.721747, 1.390933, 1.695482, -35.562675, -0.141364, 3.048325, -1.559244, 0.738925, -1.291286, -2.524163, -0.560224, -2.540498, -0.174527, 7.668949, -0.468271, 2.184141, -0.495266, 4.118879, -0.306242, 1.102313, -4.138959, -3.319368, 1.504961, -0.296103, -0.015277, -0.136177, -2.483171, -0.736024, 2.367176, 2.261982, -0.849145, -0.117148, 0.933736, 1.702317, -0.301388, -0.906758, -1.500765, -3.250679, 3.582183, -0.331119, -0.037043, -1.458076, 1.892284, -0.766386, 0.352996, -0.095548, -2.071965, -0.491389, 2.410537, 2.485983, 0.411316, 1.657690, 2.453979, 0.544419, 1.520648, 0.534818, 5.743890, -1.727980, 1.763634, -1.548289, 4.157279, 2.225596, -0.064131, 1.865851, -2.642000, 0.274984, -0.866992, -0.558188, -6.135097, 1.808120, 1.006938, -0.790857, 0.461167, -0.860196, 2.084737, 5.253679, -0.978117, -0.453410, -0.819495, 0.236075, -2.136300, 0.021472, 0.437208, -2.464820, -0.033580, 0.188308, -3.331381, 3.856862, 3.718570, 1.367965, -0.220747, -2.310231, -1.149265, -1.234862, -1.002847, -1.974621, 1.130993, 4.297782, 0.930710, -1.492884, -0.713347, -0.377052, 2.645738, -0.489495, 4.203647, -0.183929, 2.602280, -0.300018, 0.319702, 0.484708, 1.726325, -2.608667, 0.224787, 0.628384, -0.105292, -0.063091, 3.132477, -1.491148, 0.832812, 2.437400, -1.012723, 0.005830, -1.205948, -3.225458, 3.295642, 0.629336, 1.091727, -3.268245, -4.394265, 1.717201, 0.162753, 2.099977, -0.146153, 0.175116, 2.906495, -3.290455, -1.285598, 7.942502, 0.160152, 0.179591, 4.543355, 3.116941, 3.622699, 3.588044, 0.600032, 1.724521, -1.439306, 0.903875, 1.313146, -0.755100, 8.836950, -0.952139, -0.333382, 0.589961, -1.261379, -1.519937, 0.594025, 1.764304, 4.243717, -1.108132, 1.925209, -3.976291, 0.235073, 1.686457, -2.469280, -8.696437, -3.778089, 0.477731, 2.229237, -1.451623, -0.649171, -0.769915, 4.691844, -0.204187, -1.182377, 1.737289, 0.001245, 0.825427, 0.922635, 0.588298, 1.344133, -0.989039, 4.205564, -0.115000, -0.715023, 0.977006, -0.833281, -0.086541, 7.658555, 1.852724, -1.467018, -4.778613, -5.723665, 11.013165, -0.535247, 1.304788, -0.278120, -3.020401, 1.007590, -1.811615, -2.275402, 0.979134, -1.302686, -1.351974, -1.836746, 0.542104, 0.011732, 0.467072, 1.436075, -1.912158, -0.329361, 0.906835, 1.310337, -1.657316, -1.454618, -0.057573, 1.054785, 2.734224, 0.864511, -2.898540, 0.721743, 3.449452, 2.660782, -3.281970, 0.793531, -0.939987, 2.136473, 2.435774, -0.931134, 2.375545, -1.911816, 1.508261, 1.438431, -1.554907, 10.636431, 10.181509, 2.162733, 0.092647, -1.967986, -0.263800, -4.868186, -6.291202, 2.514089, -1.563930, 2.339781, 1.775288, 0.459960, -0.443059, 1.163096, 0.124251, -6.209301, -1.731759, 0.933085, -2.717155, -2.114019, 0.764707, -3.598078, -3.171012, -3.224734, -2.070172, -1.446331, 2.061223, -1.382807, 2.091001, -0.697143, -1.502571, -0.759867, 0.295158, 0.411237, -4.074205, -0.500290, -1.297131, 0.634780, -0.869266, 4.393310, 0.298832, -2.684585, 2.340415, -2.036564, 0.622905, -2.296228, 1.926786, 3.597500, -1.945964, 0.066769, 0.744542, 0.415474, -0.575763, 1.847393, -0.139588, -4.165627, 4.270953, 2.814140, 3.188056, -0.450949, 0.978394, -0.130957, -0.081690, 2.020355, 1.264999, 2.465885, -1.305273, -1.484652, 1.913352, 1.274480, 1.071047, 32.553612, -3.937133, 1.768175, 3.947704, -3.099431, 1.153636, 2.373486, 2.360721, -2.020087, 2.675651, 2.172042, 1.795803, 0.559099, -1.527675, 1.322240, 2.126581, 1.912614, -4.063071, 1.399081, 4.218424, 5.575891, -0.873354, -0.366251, 0.625111, -3.856376, 
3.276441, -1.155635, -2.581620, -3.619178, -2.831986, 1.644085, 0.133040, 1.446548, -1.878526, 1.286804, 2.888686, -2.470215, -2.362903, -0.971186, 1.676210, 0.187425, 1.839463, -1.896287, -2.081707, 1.814273, 0.938150, 1.786313, -3.683124, 3.041575, 0.135497, -0.442793, -1.932262, 3.042321, -1.016880, 2.041277, -2.040796, -0.421785, -1.340821, 0.105589, 2.341746, 0.292477, 0.475112, -0.495902, -0.475660, -1.491468, -2.629827, 3.888282, 2.804010, 0.150179, -0.908230, -1.783017, 0.495803, -0.610453, 0.289567, -0.078798, -1.560782, -1.805168, -2.729794, 1.172817, 2.256393, -0.442398, 0.522677, -1.631465, 0.356985, 3.064646, 2.438519, -4.161843, -2.152719, -3.548280, -1.114138, -0.859323, -1.193447, 1.367581, 0.220056, 0.163054, 1.273387, -0.971761, -2.162730, 0.737844, -0.055890, -0.216024, 0.862837, 0.151634, -0.601276, 0.358210, 1.970998, -0.229344, -0.017123, -4.781201, -1.324280, -2.443600, 1.611152, 0.149101, 3.477759, -0.751891, -0.717900, -1.417735, 1.407221, -1.185286, 2.538493, 2.014459, 1.288729, 0.121079, -0.775372, 1.095880, 2.870790, 2.828023, -0.678468, 0.029732, 0.468892, 0.173412, -0.554108, 0.868722, -0.347043, 0.670441, -0.115965, -1.428901, -1.543623, 1.486332, -0.581373, 2.728561, 0.982189, -2.209894, 3.659011, -0.832777, -2.435964, -1.310557, -1.055155, -1.834021, -1.562546, -3.038025, 3.090860, -2.067988, -3.092913, 0.118994, -1.485082, 2.321303, -1.523272, -1.430369, -0.029205, -1.739594, 0.019325, 0.569824, -1.613913, 1.451680, 0.781437, 2.524419, 1.962079, 0.689057, 0.560689, -1.565743, 0.606738, -0.366487, 0.825362, 0.567166, -2.249720, 1.356489, 1.744161, 1.633906, -1.390061, -0.507055, 0.629742, 1.448011, 1.131591, 1.812740, 1.272440, -4.645047, -4.487141, 1.566564, -1.905134, -2.191869, 0.572741, -0.455768, -2.307575, 1.607099, 2.300626, -0.653849, 0.221068, -1.337414, 3.450238, 1.781016, 1.688878, -1.666610, -1.287050, 1.609571, -1.283102, 1.244545, -1.443785, 1.087902, 1.376294, 0.741840, 3.563230, 2.121260, 1.941692, -1.210015, -1.075073, -1.537348, -3.691505, 1.077912, 1.348834, 0.498320, -1.474098, 2.513743, 0.034244, -0.481899, 1.553230, 2.030468, -0.253737, -0.082557, -1.063913, -2.305557, 0.269592, -2.979993, 0.824435, -2.439241, 0.592147, -1.105869, 0.421400, 1.004838, -1.749238, 0.952104, 0.919140, 1.684337, 2.167483, -0.795320, 3.446412, -0.198248, -3.708080, 1.209328, -2.476055, -2.325776, -3.323361, -0.395597, -2.130917, 3.682760, 0.225562, -0.801695, -0.575638, 0.496647, -1.356318, 0.600407, -3.152001, 0.148890, -0.074623, 0.417407, 2.860341, -1.896055, -2.015702, -0.401882, -9.507044, -1.447032, -0.036139, -0.848994, -0.379429, -0.110780, -0.272023, -0.108900, 3.433757, 0.450129, 0.396789, 1.146661, -3.451382, 2.723109, 2.754865, 0.767603, 1.128464, 1.264460, -1.380370, 9.841789, 1.019593, -0.664542, 0.108049, -4.518657, 3.028623, 0.955966, 2.159739, -0.992145, -3.089396, -2.380461, 3.170674, 3.399100, -1.193787, -1.100064, -1.222812, 1.331865, -4.171266, -0.224741, -0.748337, 0.600880, -4.090579, 1.785532, 0.843806, -0.897074, 1.120391, 0.226116, 1.339815, -0.558596, -0.229787, 0.421276, -1.492851, 2.736720, -1.181484, 1.033056, -2.783668, 0.868677, 1.473027, -0.409419, -0.926960, -1.578116, -1.657176, -4.058239, -2.139534, -4.226902, 0.042924, -4.044942, 0.975387, 3.117355, -2.011942, 1.897419, -0.394071, -2.194358, 0.765811, -1.147434, -0.198820, 4.750773, 1.958392, 0.566013, 0.746863, 0.205276, 6.244852, 0.292197, -0.264540, -1.483035, -0.521862, -0.821273, 0.241537, -1.867573, 0.887766, 0.970351, 0.943935, 0.907276, 0.863435, 0.609040, 
0.758961, -2.844856, -4.021582, 2.337291, -0.033980, -1.227764, -2.930726, -3.163921, -0.012551, 2.236899, 0.202573, 4.800804, -3.107728, -2.295274, -2.600798, 0.614929, 5.242928, 1.385072, -0.441010, 0.496632, 3.172925, 1.638507, -0.125987, 5.617575, 0.405079, -2.154732, -2.598893, 3.025715, 0.283551, -1.228085, -3.021509, -1.903265, -1.605352, -2.429972, 0.022392, -1.591423, 1.237664, -0.669618, 2.921721, -3.849840, 0.507070, 2.304445, 1.411633, 0.925293, -1.438281, 4.524236, 3.233037, -0.987758, 4.162241, 1.087106, -3.555248, -1.064314, -1.531470, -0.240191, -2.561610, -1.132247, 1.349483, 2.722615, 0.037841, 1.215080, -1.428522, -2.192810, 1.110667, -2.803056, -0.672909, -1.977815, -0.052312, -2.126122, 1.366835, -3.752233, 1.900298, -0.700606, 0.833546, -0.918877, -1.297674, -1.865373, 2.671888, -0.841652, 0.527628, -3.107635, 1.778736, 6.087658, 10.168410, 1.901435, 0.008429, 1.880625, 0.691697, 1.251415, 2.020111, 0.165611, -1.962046, -2.567108, -1.646126, 1.804067, -4.159413, -2.876052, 0.582987, -2.315321, -2.834074, -0.143538, 0.333040, 2.520455, -1.053464, -0.936291, 0.497383, 2.377161, 0.688116, 0.252176, 1.265777, 1.036058, -1.827014, 0.232887, -1.180766, 1.928753, 0.833420, -5.280427, -3.046776, -0.869522, 2.473118, -4.426275, -1.009449, -0.393237, -0.908902, -0.088011, 0.554225, 0.937001, 0.837067, -3.216558, -0.756160, 1.128917, 0.964452, -0.273247, 0.137980, -0.334299, 0.644827, 1.049165, 1.687600, -0.582325, 0.678831, 3.991718, 3.445086, -2.419575, -1.215780, 0.836874, -1.800161, -4.176029, 1.932638, 4.352362, -0.640140, -0.239435, -0.297160, 0.884801, -5.469331, -0.966645, -2.994428, -1.457712, -1.884226, -1.562829, -0.063225, -1.810441, 1.137481, 1.578315, 4.733838, -0.413119, -2.760687, -4.963616, 2.259133, -1.462614, -0.125429, 1.826625, 0.243741, -1.940729, 4.980882, -1.852582, -2.769213, 3.123670, -3.025451, 1.775786, 0.248842, -0.728544, -1.077011, -1.391178, 0.466446, -1.327900, 0.010156, -1.119019, 0.406476, -3.023509, -2.991761, -2.242916, -2.700393, -1.149276, -0.574640, -0.623269, 4.592867, 0.085469, 2.845264, 0.500940, 1.761536, -0.758278, 5.217238, 0.938784, -0.517966, 0.385037, -1.285301, -2.375467, -3.277109, 5.389163, -1.802665, -1.425073, 2.843237, -1.299174, -2.772480, 1.326686, -0.461601, 2.181789, -1.470306, 0.363675, -1.825438, -1.595971, -2.633875, -1.263443, 11.719139, 0.751982, -5.437104, -0.402077, -1.769803, -0.488290, -3.136333, -0.206896, -2.774237, -6.601932, 3.922718, -4.739939, 2.086078, -3.384333, -0.272934, -1.534686, -0.467060, 1.184582, -0.470652, 12.171447, -2.855459, -8.415219, 1.164989, -1.923474, -0.980914, -0.601838, 1.957440, 0.195562, 1.815339, 0.264663, 0.158703, -1.772841, 1.136608, -0.290822, 1.927877, 0.562484, 2.267569, 0.140409, 2.506416, -0.439031, 0.860347, -1.896669, 1.397222, 1.579570, 0.485520, 0.957377, -1.102430, 0.442603, -0.920362, -1.657130, 4.251542, 0.022865, 1.276556, 0.728996, -1.456246, 1.469049, 17.869566, -3.002699, -2.320802, -0.407301, 0.102661, 1.159996, 3.357108, -0.603254, -1.308291, -3.801401, 0.142082, 0.596891, 3.836693, 1.812102, -2.562486, 2.357852, 7.259953, 0.651906, 1.206739, 2.925079, 0.821211, -3.675958, 4.616876, -0.251318, 1.483648, -0.104531, 3.231146, 1.392708, 0.213286, -4.196676, -1.593157, -3.684262, -0.936107, -1.851555, 1.740610, 0.759534, -0.725802, -32.275948, 0.821350, -2.229753, -3.098523, 0.335518, -3.705818, -2.480252, -2.382177, 2.335834, 0.559883, -0.078840, 0.265768, 0.672536, -0.617955, -2.180997, 0.050136, 1.086448, 2.409880, -1.411031, 0.431328, -0.727582, 1.202057, 
0.676885, 2.333050, 0.667056, 0.704034, -3.499928, 0.377022, -3.475610, -0.461822, -0.347930, 0.258168, 1.602084, 0.519829, -0.155424, 1.457234, 0.126837, 0.867928, 4.091778, 1.680606, 4.067370, -1.098706, 1.745742, -1.184344, 2.251384, -0.088541, 0.205009, 0.219193, 0.667914, 3.437428, -1.572693, 0.404962, -1.385043, 2.847858, -0.488513, -0.725883, 0.752501, -0.153214, 1.029474, 1.962504, 3.080513, 2.361320, -0.633588, -1.436562, 0.260860, 26.884304, 0.354994, 0.023750, 0.750965, 1.353084, 0.426553, -1.632285, 0.965408, 4.016201, 2.177340, 1.253002, -1.670665, 1.720569, -2.180720, 0.322581, 0.279140, 3.985285, -3.724750, -3.344991, 0.349461, 1.836719, -0.266715, -0.634795, 1.627708, -2.453827, 2.661302, -3.227677, 0.053888, -2.480497, -0.681098, -0.049629, 2.152981, -0.581374, 0.205937, -1.006680, -0.501193, -2.547718, -0.441294, -0.789560, 1.551607, 1.530261, 3.002646, -3.386208, 0.647514, -1.249280, 0.674235, -1.530674, -3.838077, 3.722666, 2.963236, 2.088184, 3.554217, -1.200083, 1.763361, -1.686896, -1.194727, -2.250880, 0.875894, -0.186709, -25.303444, -0.505925, -1.991530, -1.867339, 1.825814, -0.766634, -0.213235, 4.148894, -0.046314, 2.148987, -1.871356, 0.335448, -0.485913, -0.093247, -3.264498, 0.528065, 0.522954, -1.395236, -2.970262, -0.581401, -1.861303, 0.173270, 1.324822, -0.018929, 1.717697, 1.011000, 2.334809, 0.878979, -0.626252, 0.987577, 2.981012, 2.250051, 2.358181, 1.915170, -1.237683, -2.741322, 2.350392, -2.240174, -2.580384, -0.634321, 0.317450, 1.725892, 0.162523, -0.422389, -0.178484, -2.114431, -4.840956, -0.001833, 0.139491, 4.319082, -1.384395, 0.323176, -1.353467, -1.339914, -1.007622, 8.807545, 1.286002, -2.342683, -1.185613, -1.472908, -1.924402, 4.893974, 0.222800, -0.740518, 2.156360, 2.747664, -0.312752, 2.395748, -0.615992, -8.395935, -0.730813, -0.073218, 1.852324, 0.291219, 1.468868, -1.620990, -0.756095, 1.401205, -1.078258, 0.483770, -0.073835, 0.852245, -2.719850, 1.460309, 1.374558, 8.152399, 2.677590, -1.674278, 0.375696, -0.073122, 0.925756, 0.923439, -0.069477, 2.279387, 2.252507, 1.538246, 1.814035, 1.929125, -1.111000, 0.341246, -1.168864, 2.812251, -3.087615, -0.544177, -0.092432, -3.101908, 2.925842, -1.453098, -0.303561, -1.021411, -0.136682, -5.494426, 0.610433, -2.182517, 3.119542, -1.883906, -0.745869, 2.522353, -1.648916, 0.309569, -0.577917, 1.033753, 2.487350, 7.941953, -0.071100, 0.527296, 0.779083, 6.350358, -3.325049, 1.075220, 0.105128, 0.276124, 0.347600, 0.306576, 3.274002, 1.036085, -2.652688, -2.018159, -0.050256, 0.416610, 0.418685, -1.532887, 0.523052, 3.134366, 0.387958, 1.859184, 2.089394, 1.539085, 1.657541, 1.209471, 1.878232, -3.116538, 0.800451, -3.243739, 2.656432, -0.071025, -0.537716, 2.768167, -0.689752, -4.555896, -2.193995, 1.733658, -1.575865, 0.762150, 2.772280, 0.646617, -2.128708, 2.234904, -2.126125, 0.459754, 1.413508, 0.166073, -2.205717, -0.741679, 0.306937, -1.672999, 0.225218, 0.055466, -1.127936, 7.234384, 0.063245, 1.979858, -2.608743, -2.807106, -2.397985, 0.106909, -3.808125, 1.513469, -1.036234, 0.811388, -2.431126, 2.732509, 0.589472, -1.097014, 0.551920, -1.516936, -2.533307, 0.797511, -2.425043, -3.588099, -1.160765, -0.590146, 3.004026, 0.810287, -2.107759, -0.651307, -5.370883, 2.917656, 0.991122, 0.444265, 2.693725, 0.286769, -1.541028, -3.018210, -2.283916, -0.207154, 1.837811, -1.245008, -5.110644, 1.981874, 1.236664, 1.905553, -0.207308, -1.105416, 4.194646, 0.440569, 1.659566, -0.880465, -4.547189, -2.118434, 1.557453, 2.841252, -5.531410, -2.707672, -0.314554, 0.362334, -2.255540, 
0.359243, -0.935208, -0.932006, -1.131725, 1.080974, -1.834844, -0.971595, 2.742725, 3.401529, -1.261523, 0.220123, 2.862516, -0.858204, -0.541691, 1.122402, 3.115028, 0.782756, -1.248350, 0.043220, -0.986485, -0.844554, 2.879936, -0.039591, 0.383270, -0.683167, 1.174577, -0.593699, -1.137541, 1.152888, 1.766484, -7.748785, -0.404558, -0.467036, -0.500680, 1.611474, -0.383380, -3.673885, 1.017248, -0.770349, 0.237903, 2.069979, 1.630354, -2.739210, 3.819390, -1.732684, 2.563601, 2.213180, -2.380856, -2.852253, 1.170615, 0.828863, 1.211223, 2.499185, -0.747931, 0.118144, -1.666906, 0.565364, -1.657013, 0.774405, 0.693492, 7.705479, -1.408086, -0.155176, 1.412633, -5.123066, -0.853785, 0.116899, 2.256178, -2.218943, 1.518238, -0.196591, 0.143943, 0.285298, -2.482928, 0.615830, -2.046060, -0.975004, -2.833664, 3.254058, -1.370345, 0.944582, -0.014555, 1.834112, 2.484418, 0.814740, -0.132238, 1.918314, -1.774436, 0.460359, -0.580844, -1.645435, 1.954685, 2.618668, 0.267995, -2.560254, -0.594974, -2.286581, 4.714139, -0.405705, 1.295618, 0.840673, -2.838477, 0.366501, -1.036157, -0.400613, -1.738227, 1.300794, -1.672292, 3.829979, -3.479953, 3.891050, 0.341126, -0.399483, 1.662642, 3.562313, -1.184650, 2.612743, -3.805794, -2.371483, 1.948942, 2.188965, -0.459029, -0.178023, -1.700147, 3.120115, 5.629900, 1.224204, -1.879678, 1.268884, -2.494606, 0.783073, 0.035119, -0.997701, 0.350684, -1.231518, 1.909020, -2.098758, -0.661888, 0.994528, -1.374923, 2.376079, 0.031113, -1.458987, 0.113488, 0.266556, -2.248442, -0.549227, -1.980329, -0.843419, 0.893711, -1.482391, -0.773147, 1.418742, -0.100246, -2.365190, 1.173016, -0.702880, 0.275237, -1.607930, 0.260333, -2.186061, -0.709117, -1.991098, 3.764254, -3.194555, -1.260547, 1.071692, 3.033343, -0.641039, -0.255529, 3.194899, 1.048945, -0.669115, 4.109255, -2.829791, -0.658942, -0.670331, -3.036335, 1.109055, 0.316420, 9.496139, -1.423056, 4.316561, -3.777977, -0.611273, 4.206706, -3.108094, 2.093183, -0.592171, -2.694616, -2.441919, 0.851335, 0.160457, -2.691463, -0.074577, 0.744847, 1.683688, -0.708158, -1.727233, -1.502183, 0.525043, 0.983311, -0.089715, 0.532941, -0.811899, 0.976230, -2.550968, -0.344250, -0.052567, -0.187679, -0.121404, 2.285113, 1.596615, -1.304425, -2.099068, 1.705707, 0.282685, -1.729276, 1.459553, 0.320318, -4.074892, 1.004075, -3.545393, 1.803405, 0.490817, -4.327343, 1.060507, 2.926682, 1.222938, 3.830746, -0.337875, 1.288773, 0.835695, 1.570575, 3.477370, 0.256512, -1.542086, 0.159785, 0.399716, -0.581507, 2.310691, -0.635375, 0.457800, 2.778989, 4.189120, -1.164997, -2.390188, -1.172381, -2.444343, 0.283850, 2.672297, -0.091714, 0.491060, 3.292467, 0.893021, -0.021946, -4.014757, -2.941429, -3.471601, 0.691645, 0.227577, -0.660446, -0.993292, 1.157482, 0.663260, -0.723520, -2.946966, 0.588723, 0.340043, -0.358396, 1.354849, 1.527313, 0.792265, 0.354935, -3.001401, 2.202762, -0.826216, 1.678419, -0.252368, 2.635601, 3.578719, 3.746924, -0.271551, -2.890261, 2.253434, 0.673195, 1.846275, 0.377794, 1.797091, -0.269892, -2.769761, 0.315330, 3.191075, 1.445889, 1.961443, 0.250564, 1.217402, 1.685299, -0.773906, 3.594047, -0.128995, 0.446818, -2.274753, 1.524932, -5.203915, 1.088118, 0.079942, 3.207416, -18.327145, 1.500175, -0.097854, 2.531832, -1.950670, -0.083165, 2.955576, 0.681629, -0.329289, -3.521765, 2.633065, -0.085364, 5.152068, -0.129515, 2.816390, -3.628945, 0.186241, -1.867233, 1.983360, -0.515092, 1.410643, 0.049975, -0.440827, -0.180698, -0.876025, -1.237767, 2.805179, 1.542505, -0.951790, 0.443897, -2.973520, 
-3.115988, -3.136214, -1.392136, -4.330337, -2.264652, 0.625757, -4.091477, 5.732088, 0.250295, -0.520872, -0.999017, -5.399399, 1.408189, -1.518100, 0.076899, 0.173335, -0.000062, -0.360869, 1.392707, -0.592256, -0.071057, 1.330815, 1.857339, 1.405391, -1.056357, 3.606451, 0.844912, 0.430513, 0.705153, -0.095922, -3.005373, -6.676332, -1.697640, -2.368549, 0.382992, -0.944774, 1.198584, -1.091452, -1.749086, -3.316520, -2.743624, -0.783803, 0.405243, 0.186438, -2.742365, 2.742394, -0.300057, 1.709888, -0.119355, -1.055460, -0.554546, 1.549400, 3.316050, 1.876061, -0.066178, 0.378583, 2.503380, -0.843418, -2.350612, 1.370083, 4.067267, -0.592636, -1.435324, -0.224314, 3.440698, -3.128674, -1.354647, 0.871077, 0.835582, -2.731527, 3.152105, -13.977840, -3.494694, -1.888668, -0.369791, -2.067304, -1.631454, 1.369503, 1.902894, -0.525999, 0.394126, -0.526112, -1.452425, 0.266547, 0.910412, 2.690451, 0.452772, -0.904265, -0.274242, -1.941391, -0.781456, 1.339111, -0.650924, -2.150021, 0.411827, 0.306410, -1.720717, -0.847376, -0.433327, 8.572508, 5.576869, -1.604961, -3.355456, 0.578090, 3.117163, 0.740736, -1.816481, -1.542356, 1.402068, -3.337177, -3.460914, 0.022969, 0.263521, 2.447577, 2.692053, -2.677152, -1.154691, -0.166149, 2.455483, -1.960949, 0.644558, 0.451422, -2.235790, 0.731520, -0.340948, -1.902708, 0.762279, 0.125709, -1.536236, -0.245376, 5.871339, -1.315382, 1.152163, 0.313443, 2.684904, 1.522253, -2.293488, 1.985134, -1.717681, 1.696683, -2.022664, 2.500051, 0.610928, 2.378994, 0.165070, 2.061111, -0.533743, 2.667622, 3.499480, 2.485019, 1.861316, -1.150906, 1.217355, -2.907215, 1.377991, 2.626987, 2.557068, -2.540422, -6.635588, 0.997638, 3.295728, 0.550338, 1.183280, -0.518508, 2.825861, 3.199516, -0.683444, 2.162984, 2.370837, 4.094376, -0.561323, 0.784378, 0.182726, 1.457809, -1.172417, 2.387145, 0.324454, 1.190073, 4.428410, 1.672801, 2.330206, 3.335100, -0.557584, -1.865616, 0.313762, 2.122341, 1.508437, 2.887865, 3.216261, 3.003069, 4.676777, 3.199836, -1.936886, -1.224461, -0.051272, -3.227903, 1.606121, 1.955314, 0.546526, 0.661211, -1.968910, 1.787858, 1.048485, -0.461875, -0.509854, -1.147480, 1.632553, 1.572888, 1.340511, 2.246652, -0.744147, 1.609240, 0.752408, -1.323570, 1.025877, 0.814479, -1.259313, 2.315588, -1.214857, 0.447328, 1.474373, 1.504266, -0.497109, 0.577923, -0.198901, 0.571817, 1.572346, -0.229665, -0.482301, 0.700038, 1.471916, 0.754951, -4.222556, -0.885760, 2.337687, 3.346586, 0.457164, 1.315627, 1.394005, -1.703673, 0.883069, 1.166987, -0.816095, 1.200109, -2.375040, 0.826257, 0.166096, -1.786063, -3.522976, 3.031999, -3.488905, -0.379346, -1.583687, 1.950091, -0.198649, -1.979665, 0.650037, -1.176644, 2.323772, -1.853436, -1.898267, 0.950301, 2.705135, -1.034136, 1.889340, -1.965656, -1.262732, -1.637557, 0.371705, 0.813203, -2.314916, -2.003534, -0.387944, -1.389285, 0.283866, 3.240730, -5.791040, 2.754301, -1.986605, 5.498390, -0.273563, 2.022370, -0.106619, 5.943636, 2.984904, 0.613079, -0.669535, -0.343581, -2.248723, 0.348882, 0.099603, 0.015564, -0.590038, -1.586601, 2.235744, -1.424522, -0.791221, 0.587769, -1.771873, 0.902260, -0.221643, -1.720520, 0.883625, 2.038339, 1.595812, -0.467120, 1.634040, 1.504307, -0.748396, -1.020328, -4.903410, 1.616802, 1.099766, 0.745407, -0.021960, -0.519794, 1.157142, -0.792646, -3.190317, 2.819183, -1.442791, -0.101592, 0.509061, -1.473928, -2.708030, 0.791176, 0.473671, -1.907742, 2.050101, 0.003960, -1.289529, -2.341501, -0.373291, -0.925099, -0.040393, -0.479414, -0.349596, 1.877697, -0.776685, 
1.993453, 0.372975, 0.333932, -2.029835, -1.045179, 2.716208, 1.865170, 0.747971, 4.798818, 1.579798, 1.555352, -1.155734, 3.145859, 0.887819, -3.669680, 0.728159, -0.804880, 1.060545, 0.031082, -2.118540, 1.153332, 3.899940, -1.163471, 1.295803, -2.362670, -2.000762, 0.001854, 0.792970, -2.817198, 1.936855, -3.864910, 2.709139, 1.039248, 0.088502, 0.231339, 0.591631, 0.030345, 1.299663, 3.123892, -0.961180, 0.043213, 5.733878, -4.098113, 3.308362, 0.510799, -4.140917, 0.843890, -1.273466, -2.071602, 1.445319, -1.818631, 0.162340, 1.681147, 0.362149, 5.492742, 0.825089, 1.722855, -0.476926, 0.432826, -2.043187, -2.016047, -2.091608, 0.597438, 0.042428, 3.973989, 0.883174, -0.129641, 2.237979, 0.567980, 0.839164, 0.976711, -2.264026, 1.649265, 1.344430, 1.434795, 5.406022, 3.077146, 2.661374, 3.106198, -2.270719, 0.887519, -0.145979, 3.447535, 1.555766, -0.138652, 1.195158, 3.220088, 0.879781, -2.362654, -6.360222, 0.395982, -2.533304, 4.291794, 3.155651, 1.251960, -2.407456, -0.218786, -2.583162, -0.899505, -0.780827, -0.983244, -3.478741, 1.778161, -0.658125, 0.597271, 1.897745, 1.777378, -0.100952, 1.271212, -0.681034, 2.460438, 1.110631, 1.252983, -0.114646, 0.479300, 1.725348, 4.230619, 1.187448, -2.427088, -1.236009, 2.918612, -1.148390, -1.583798, -1.435729, -1.783818, -2.767493, -1.541569, 0.203426, -1.242959, 0.281229, -0.392559, 0.520429, -3.234192, -5.539012, -2.151313, -2.462922, 0.635300, 1.190563, -0.151796, -0.622898, -0.042642, -0.083337, -0.730799, 3.834823, -0.641445, 1.817954, 3.308098, 2.103870, -1.456245, 1.461707, 1.371928, 0.733169, -2.590485, 0.514485, 3.581903, -1.606874, 0.716330, 1.596098, -0.624766, 0.570248, 0.343349, 0.814959, 2.110700, 3.385014, -0.654017, 0.103144, 1.129105, 0.854295, -1.497854, -0.833896, 0.883463, 2.740201, 1.252455, 0.383430, 4.764862, 0.244701, 0.397635, 2.114073, 3.711519, -1.009765, -2.999290, -0.075239, 1.702090, 4.073386, -2.221128, -0.173297, -1.598970, 1.764099, 1.255920, -3.153074, 0.940827, -0.085776, -0.153299, 3.531004, -2.893436, 2.231733, -2.386345, -1.483328, -3.745522, -5.938141, -1.462745, -1.746019, -2.920442, 0.672212, 0.022831, -2.624349, -2.061772, 0.922139, -2.596588, 2.745512, 1.169222, 0.959605, 3.072976, -0.254427, 5.414840, 0.527497, -1.430076, -1.501413, 2.057433, -1.026962, 5.627439, -2.945267, 0.809274, 2.203258, -0.872946, 4.923897, -1.509921, 0.499868, -1.122034, 0.442121, 4.887639, 2.859808, 0.291162, -0.565278, 0.837086, -5.220106, -0.498070, 1.504637, -0.144360, -0.114923, 0.522100, -2.818848, 1.758005, -0.885172, 1.853490, -2.924307, 2.890154, 1.534062, 1.918770, -0.117491, -0.226110, 0.317553, 0.141368, 1.936240, 2.881628, 0.301003, -2.461584, 0.041076, 0.719569, -0.117806, -2.094253, -2.824019, -1.391719, 1.183342, 1.017699, 0.376079, -0.095749, 0.889862, 1.062274, -0.524244, 2.349395, 1.186792, 2.231821, -1.102010, -1.332299, 2.127689, 0.351990, 1.511401, 0.622317, -3.630643, 0.548977, -3.208701, -2.996888, 2.649668, 0.304146, 1.341268, -2.146057, -1.861894, 4.507286, 0.904826, 1.395878, 0.450848, 0.070200, 3.053755, -2.140768, -2.565916, -0.972909, 1.667059, 1.065236, 0.645459, 0.884246, 0.220186, -3.282768, 1.298033, 4.002408, 0.833334, -3.617452, 0.368626, -1.903755, -0.626035, 1.850133, -3.082982, -2.774525, 1.769177, 0.522702, 0.263647, 1.313907, 3.750313, 2.017340, 0.823528, 2.732456, -3.571347, 2.283923, -0.022426, -0.600996, 1.081142, 0.014277, 2.022802, 0.375120, 1.400345, -2.918789, -1.019804, -1.212387, -4.197097, 0.703760, -1.529950, 2.411883, -0.584985, 1.346535, -0.723685, -2.590233, 
3.926456, 2.828357, -1.109949, 1.414086, -0.135390, 0.382299, 2.244257, -0.295509, 4.518786, -3.660137, -2.117182, -2.384901, 3.393075, 0.985744, -0.516841, -0.374062, -4.132094, 2.249375, 2.650640, 0.171410, 3.396479, 0.069148, -0.270228, 0.905647, 3.294580, -3.019566, -3.331932, 2.340150, 2.807488, 2.162702, -3.298753, -1.766656, 1.310992, 0.260271, -0.273308, 4.299369, 3.715046, 1.070362, -1.441203, 0.129578, 0.900721, 5.234731, 2.976517, -0.659467, 0.284069, -0.954808, -0.256831, 0.583463, 1.284158, 1.351618, -0.708611, 2.189399, 0.599912, 1.235909, -1.296227, -0.026225, -0.115726, -1.447470, 1.321840, -0.550714, -0.555223, 1.529182, -1.148362, -1.568398, 3.317645, -0.735568, 0.437838, -0.237062, -1.593969, -1.894876, -1.147040, 6.875043, 11.417304, -1.270446, -0.386883, 0.448277, 0.306192, 0.444976, 0.047800, 0.323642, 0.030898, -0.246496, 1.848722, -0.699640, -2.337017, 3.233449, 1.195664, -1.263332, 2.178815, 0.288599, -1.581660, 3.966269, -0.985900, -0.975200, -0.793356, -2.546527, 0.314768, -1.653216, 0.076431, 0.527626, -4.124629, -0.522641, 2.863382, -0.581339, -0.483502, -1.371282, -2.072694, 2.361188, 1.388546, 0.461689, 0.054946, 2.079017, -0.905662, -1.667842, 1.560647, -2.268772, 2.163057, -1.461485, -9.571809, 2.473650, 27.804079, -0.824428, 1.293797, -3.527821, 0.322644, -0.829934, -1.641960, 1.559845, 0.651142, -2.018788, -0.652320, -0.534693, 2.995028, 0.591905, -0.210980, 1.812501, -2.794134, -0.021838, -1.462607, -4.406371, 0.911137, -1.410916, 1.293619, 0.665017, -0.952757, -1.092961, 3.622439, 4.556889, -0.322486, 3.173908, 2.919918, 1.061108, 1.607022, 0.961167, 1.944426, -2.335304, -0.616037, 0.400255, 3.586429, 0.725041, 0.298737, -1.890886, -0.024518, 0.091631, -1.731019, -1.398399, 1.815781, 4.581513, 2.576745, 1.146513, -0.740690, 0.073969, 3.108618, 0.535761, -0.192938, 0.454423, 2.053000, -0.904300, 2.072179, 0.649410, -0.662264, -0.673923, -1.733026, 2.902329, -1.831697, 3.206669, -1.994342, 1.821010, -1.573532, -0.257321, 5.394223, -0.162241, -3.188801, 0.847548, -2.007671, 3.947463, 1.168257, 2.246848, 1.873971, -0.494318, 0.724031, 2.609424, -1.787059, -0.378604, -1.377902, -3.019529, -1.382988, 0.023967, -0.011094, 0.253847, -1.189952, 0.510796, 1.386401, -2.568677, 1.666898, 2.934337, -1.311064, 0.238594, 0.870015, -0.393368, 2.161388, 3.384138, -2.131927, -2.342558, 0.287042, 3.170932, -4.181534, 1.297255, -2.535244, 0.296611, 0.657501, 2.486350, 0.579065, 0.901807, -1.826978, 3.008395, 0.015162, -0.812392, -5.237899, -0.064044, -0.666688, 1.184950, 1.343734, -1.661986, 0.091535, 2.635372, -0.006127, 9.887297, 4.918875, 0.266848, -2.460409, 0.854952, -1.334359, 0.044901, -0.075265, 0.200052, -2.027168, 0.649583, -0.283191, -0.853923, -2.430825, 2.559544, -0.007875, -0.466313, 0.781030, -0.850323, -1.138282, 1.897385, -0.009445, -0.401657, -2.866511, -2.329610, -1.208160, -1.773448, 0.369010, 0.188112, -0.367383, -0.806061, -0.871504, -1.204538, 0.251074, 0.966655, -0.356678, 2.718896, 3.092988, -2.580250, -0.622785, -0.762455, 0.026455, -1.173746, -3.874841, 1.271729, 1.204147, -1.713592, -0.083661, -2.208164, 1.115615, 0.098561, -0.295923, -1.642114, 3.539276, -1.431937, -0.611073, -2.333883, -0.713340, -1.436290, 0.124211, 0.463780, -1.330325, -2.417066, -1.452549, -0.621094, -0.331997, -0.248763, -1.276592, -0.835736, -0.789976, 0.210436, 2.124712, -1.141259, 0.878823, 3.689657, -0.996731, -0.788607, -1.662502, 0.702889, 0.455417, 0.196082, -1.905845, 1.417846, 0.184089, -0.428133, -1.289425, 1.620126, -0.999889, 0.915726, 2.355841, 8.382513, 
-2.161212, 15.398536, 2.307024, 1.641678, -1.617959, 1.106972], "qwen:latest": [6.398593, -9.147916, 3.167985, -0.016698, 5.146078, 5.260313, -0.588165, 6.041094, -0.448700, 1.538580, 4.409100, 1.965835, -2.027954, 3.322869, -0.209723, -1.591358, -0.332919, 0.490199, -1.801314, -1.394502, 6.208906, 2.582299, 8.829220, -1.215299, -10.441714, 6.180038, 2.357235, -0.251530, -1.217379, 6.260386, -1.005010, 3.707588, 2.900968, -2.015647, 1.153357, 2.200070, 4.747961, -2.433741, 4.030592, -9.292274, -3.870678, 2.177808, -0.545548, -2.237058, 2.649582, -0.528622, 1.250894, -0.112586, -0.969227, -0.571061, -0.680663, 1.271584, -5.168596, 4.699327, 5.181798, 1.421188, 6.487319, 3.149934, 2.146110, 0.893968, 4.691792, -2.652787, 0.780389, -7.469417, -1.913996, -4.114168, -1.824573, 1.825537, -7.670074, 0.083751, 1.497249, -1.842984, -0.207182, -1.217132, 1.720826, -0.462654, -0.142980, 6.752104, 0.513007, -9.219392, -6.861326, -1.046578, -3.621952, 8.216535, -1.929723, -0.226388, -0.364569, 0.592417, 0.661270, -2.502738, 0.655540, -2.301271, -2.658660, 6.579635, -2.761786, 3.214799, 1.964015, 0.085705, 0.268774, 1.773046, 6.180820, -4.607996, -0.740156, 2.677974, -0.014020, -5.367133, -0.792135, 1.014724, -2.928968, -2.636251, -0.764111, -2.006562, 0.120694, -4.609838, 1.088676, -11.941098, 5.737967, -3.500586, 1.158638, 5.277419, -0.824252, 3.719745, 1.067214, 6.999010, -3.490391, 3.380606, -3.941285, 3.037470, -3.074491, -5.476095, 0.703748, 0.590704, 1.712015, 2.907949, -3.779428, 4.336009, -3.208945, 6.012998, -0.633373, -0.858766, 0.509552, -2.374973, 2.670326, -3.898677, 3.515145, -1.504274, -2.528835, -4.196266, 1.497983, -0.251315, 1.430840, -6.129501, -0.432433, -2.067062, 0.683940, -0.651003, 2.075870, 0.301573, 0.456413, 1.512750, -2.011982, -2.651834, -3.443306, -1.217246, 2.249681, 4.056913, -0.001803, 1.856310, -1.120802, 5.100811, -4.818334, -1.832536, 0.575663, 2.821508, -2.484176, 1.090846, 3.982687, -3.890761, 1.816324, -1.156088, -1.480178, -0.946538, -0.579305, -0.901073, -0.626901, 0.268467, -3.557676, -1.586237, 1.791939, -0.206871, -0.901410, 1.202474, 2.744930, -0.571335, 3.200516, 0.411471, -0.096053, 1.948472, 1.908952, 2.884496, 1.760468, -0.835073, 3.515426, 4.381332, -0.223358, -1.809401, 0.086503, 2.059222, -1.261783, -0.836733, 0.856923, -4.898800, 1.888360, -2.207174, 0.376906, 3.295334, 5.689600, -2.268684, 1.449032, -1.591826, 1.197192, -0.958380, -1.880106, 2.903637, -1.158304, 0.286425, 1.411350, -1.044144, 4.642242, 5.629546, -1.222408, -0.653535, -0.909632, 1.304384, 0.949307, 2.316329, 2.287262, -0.217201, -0.789907, -0.414622, -3.416742, 1.308199, -4.681273, 5.369122, -4.293337, 0.764565, -2.343703, -3.794211, 1.932296, 2.057181, -1.281097, -5.943415, 1.662839, 0.180384, -4.695090, 0.702479, -3.153823, -3.411024, 0.180862, -3.886384, -1.007891, -1.704118, 0.862702, -0.511424, 3.460708, -1.867585, -7.966740, 2.017166, 7.204095, -0.777631, -2.125801, 1.848246, 2.023623, 2.431535, -0.120389, -0.336207, 2.110109, -0.700987, 4.741506, 2.285265, 2.249003, -3.817211, 3.017461, 1.867846, 4.302247, 0.304751, -0.310234, -2.985993, 4.513823, -1.711970, 0.424881, 1.872621, 8.673007, -2.507097, 0.834635, 3.987681, 0.865946, -2.122818, 2.411204, 0.166906, 4.962452, 2.623420, -0.873380, 1.751310, 3.454344, -4.624747, 0.288112, -1.057867, -7.632612, 3.383047, 2.529626, 0.388937, 0.680255, 0.010386, -0.285309, 2.821573, 3.907364, 0.626498, 6.285956, 2.837718, -4.136279, 2.172159, -0.740020, 2.603785, 0.488508, -2.008139, 1.194486, 2.840358, 1.642457, 1.124606, -2.682470, 
2.333769, 0.111492, 1.035283, 1.694300, 2.123012, -2.884374, -7.135779, 4.317023, -0.086166, -1.382437, 0.327938, 0.949700, 6.212725, -1.359681, 0.690782, -1.888311, -3.168814, -4.167558, -6.230211, -4.251054, 1.301581, -2.542017, -27.238344, 0.095754, 4.638476, 3.855103, 0.669284, 0.051903, -0.231133, 5.462236, 1.029191, -1.093251, -2.784316, 2.413422, 4.149455, -6.006116, 1.867116, -1.621314, -2.286732, 1.713983, 1.969203, 2.893601, -1.096513, 2.859676, 5.299241, -1.564409, -4.103854, 3.312290, 7.535967, -10.267817, 0.146586, 0.198330, -0.458738, 5.533854, -0.592334, -1.642502, -0.867962, 0.357863, 1.499991, 0.317374, 1.710249, -3.821599, -4.164293, 0.626525, 2.896627, 4.243000, 2.967313, 1.367663, 6.382295, -6.377450, -4.389627, -0.473153, 2.991648, -1.669134, 1.115779, -0.026947, 0.960267, -5.584852, -1.439730, 0.244562, 4.521901, 2.087858, -0.596442, 1.345291, -1.198195, 1.134727, 7.460937, 3.136116, -1.059861, -3.781231, -3.531691, 1.444958, -3.786484, 4.863374, -4.276469, 5.663068, 0.108844, -4.433540, 3.566478, -0.737263, -0.857777, 3.881980, -2.940099, -0.232970, 1.024321, 0.370978, 1.810182, -1.729423, -0.953740, 0.363640, 0.101128, -0.457497, -5.657916, 2.896876, -2.673734, -0.363334, 2.990057, 1.737989, 1.253162, -2.239347, -0.736923, -1.326416, 1.898398, 0.172377, 3.468918, -1.249191, -2.190415, 4.681655, 4.550338, 8.617194, 1.506140, -4.036048, -5.532407, 3.284861, 0.496790, 0.007859, 1.394254, 0.463913, -0.189794, -3.210449, -5.304472, 1.114539, -1.199341, 0.368409, -3.468653, 0.319924, -7.521216, -0.515546, -0.846355, 3.365871, -7.414912, -1.118039, 0.057726, 0.471262, -0.671277, 5.010963, -1.456864, 0.795316, 5.710810, 5.483665, -3.680223, 3.203779, -7.778675, 1.581216, 0.493372, 1.308644, -0.168083, -1.437484, -4.021763, 3.567986, -1.012697, -1.095128, 4.802341, -1.501106, -3.554114, 1.340577, -2.658166, 0.273735, -2.363988, -3.568919, 2.061281, 1.113543, 0.817031, -1.828942, -3.359740, 5.046678, 4.811258, 1.904614, 1.466768, -1.282211, -2.547651, 4.968533, -4.411788, 4.293465, 0.164253, -1.659536, 0.885092, -0.371292, -1.131706, 5.356121, -4.582285, -6.341745, 2.657330, -1.279451, -3.364145, -4.528135, 6.154132, 0.255915, 0.995843, -6.160520, -4.956539, 0.385687, 1.730230, -4.108953, 1.728342, 1.137076, 0.444596, -3.889635, -4.341253, -2.400633, 0.832284, -3.842432, -2.179685, 1.179994, -3.570244, -3.793373, -0.992903, -6.269731, -2.026199, -2.740402, 3.908176, -0.131522, -2.611959, 5.417125, 4.601712, -0.944981, 1.923583, 3.380219, 0.557484, -4.402332, -1.821094, -1.393727, -4.922822, -0.122208, 1.627198, -2.698848, 0.543942, 7.662592, 6.865628, -3.118904, -2.938929, -1.204334, 0.880950, -2.432030, 0.659579, -5.937887, -3.082295, -2.329674, -0.929039, -6.670197, 4.624016, 1.847857, -6.151865, -1.295520, 4.561952, -4.189340, -1.271205, -3.114521, -5.400505, 2.505269, -1.440362, -0.543130, 1.583606, -3.754081, 1.425768, 4.600868, 2.031716, 4.381232, 0.756865, 1.269595, 2.452235, -3.499653, 0.843460, 5.095802, -3.224932, -1.973590, -0.918793, -2.948713, -1.726048, 0.723157, -1.867589, -3.809328, -1.288017, -1.415320, 9.147604, 4.031230, -2.515405, 6.017965, -0.808787, 6.938612, -0.940846, 0.514639, 4.288442, 0.824478, 0.728683, 1.763265, 0.545236, 1.356232, 5.527868, 4.475016, -3.437302, 3.826543, 5.086852, 0.299217, 4.161529, -0.210087, -2.249442, -2.974098, -6.259721, -1.801933, -1.427602, 3.724947, 3.003747, 0.540840, 0.380978, -0.046774, -0.128234, -6.908992, 0.317432, -0.887537, 3.312163, -0.027360, 2.139082, 2.805890, -0.598460, 2.746891, 3.195262, -0.988383, 
-0.554773, -1.027135, 0.100964, -3.613074, 5.187125, 0.852667, -0.236328, 6.884120, 2.533068, 6.731904, -0.778514, 0.375824, -1.817388, -5.946012, -2.352830, 5.102455, -1.938184, -1.387404, -4.757504, -2.173933, -0.201531, -3.642277, -5.362231, 3.637059, 2.751554, -1.022320, 0.236827, -3.948135, 0.148865, -1.120026, 7.616945, -0.963430, -0.756691, -0.919667, 4.724895, 0.976052, 2.488966, -3.964747, -3.973845, -0.865214, 6.001307, -2.119917, 4.599735, -0.121487, 4.304985, -1.779739, -1.172909, -0.084037, -4.013283, 2.303892, 3.205657, 3.503476, 1.411669, 1.280174, 8.759250, 0.880316, -2.128790, -4.705835, -3.441471, 1.247740, 1.331217, -2.363749, -0.643617, 1.385127, -5.501627, -7.879881, 5.820022, -1.216314, -2.178036, -1.108843, 3.346485, -3.363919, 3.609971, 2.944907, -1.553591, 0.286604, -1.090472, 3.108707, -0.752903, 3.523269, 4.811034, 5.104235, -0.589819, -1.991645, 0.047370, 0.897678, 1.814540, -2.876569, 0.355313, -0.919469, 0.483877, 4.114913, -4.182666, -1.667883, -0.516528, 3.272009, 1.569783, 1.219359, -2.658862, 1.495385, -6.089864, -2.128447, 2.838350, 0.987129, 1.348928, -2.442407, 3.521696, -0.835038, -0.587476, -0.638110, 0.922019, 2.216001, 1.777817, 1.454185, 4.067125, 1.120436, 0.289011, -10.992048, 1.672040, -0.465922, 0.592983, 0.989681, 1.752660, -0.118764, -1.185995, 4.017984, -0.896778, 1.672302, -2.030242, -4.911530, -5.827189, 1.958246, -7.221211, -2.034618, 1.940285, -1.227662, 1.219494, 4.101542, -2.650769, -2.734393, -6.333424, 2.812163, 2.962675, -4.904971, 0.171724, 1.579389, 3.322502, -0.514203, 1.357068, 5.346297, 0.754811, 2.778355, 1.658731, 1.977090, -1.235332, 5.064486, 1.987611, 6.833618, 1.541649, 2.747869, -2.104572, -1.799155, 0.295171, -4.860865, -0.765238, -1.785131, 2.577568, 0.511845, 0.550628, -2.406874, -2.713742, -2.492991, -0.175184, -0.126996, -1.339363, 0.823035, 2.378861, -2.361350, 1.065711, 2.798916, -0.116275, -5.790211, 4.252931, -0.497029, 1.009890, -1.029344, 1.774927, -6.555468, 2.879664, 0.601462, -1.747875, 1.505472, 1.982710, 2.030839, -2.763254, -1.348869, -0.266905, 4.583949, 3.799467, 3.299871, -6.750926, -5.246260, 0.655915, 5.968640, 0.782474, -1.063692, 1.318094, 3.220077, -3.783729, -0.677141, 0.595987, -2.890465, -1.360495, -0.124686, -0.284035, -2.925553, 0.262872, 2.786888, 4.160244, 0.014939, 0.569116, 1.181584, -2.572151, -1.554686, 1.284756, -4.822996, 0.997169, -3.813555, 0.617060, 2.565912, 1.037117, -0.167606, 7.223352, -2.058073, -3.086306, 4.202200, 0.180718, 3.078883, -1.246702, -2.439780, -1.529070, 2.548267, 3.340835, -6.063392, -1.775246, -0.201369, 1.530203, -0.590155, -4.917980, -2.405702, -1.290209, -0.190081, 4.806739, 4.704314, 3.928058, 2.274630, -0.550331, 0.822520, 1.818900, 1.580151, -3.903281, -0.857364, -3.976201, -0.785557, 1.069647, -5.818671, 2.766473, 1.259937, 2.320919, -1.525085, 2.384599, 4.469339, 0.205931, -0.547983, 2.354457, -0.327334, -6.448855, -0.907820, 5.117530, -0.255594, -0.240747, -1.073850, -1.768409, -5.428776, 0.481095, -3.812433, -1.717587, -3.733663, 2.083388, 10.582418, -4.710001, 0.563532, -7.536781, -6.452072, 2.862794, -1.463039, 0.230586, -1.815489, 1.481611, 1.544786, 1.551767, -0.059472, 0.789418, -0.825706, 1.940825, 5.768491, -0.494084, -2.538201, -1.392608, -1.908880, -1.700553, 5.082959, 10.069242, 0.678830, -0.559667, 2.036356, 1.434546, -2.558431, 2.288113, 0.170179, 2.484860, -2.987775, -0.382073, -0.226642, -2.446935, -3.155464, -3.223151, 8.751858, 0.507641, -2.614630, 1.343406, -0.892121, -2.539207, -4.111343, 1.182877, -4.679571, -2.524526, -5.096138, 
-1.301956, 2.281354, -2.029088, 0.427091, 3.804364, 5.483494, -40.825272, -1.143677, -2.260099, -0.089059, -1.493274, -1.944442, -2.385689, 1.698596, -1.873040, -0.443924, 2.621414, 1.287797, -0.057905, -2.352337, 1.238259, 1.387846, 1.417577, 0.688409, 4.645106, 2.341688, 0.465816, -7.549288, -5.974134, 6.261028, -5.144201, -2.522077, -0.662837, 4.664794, -0.539391, -3.439147, 4.087629, -1.963182, 2.466938, 2.107603, -1.249598, -6.342202, -3.447833, -3.704789, -1.074862, 0.765216, -1.964713, -5.504405, 1.663442, -2.911618, -0.616493, -2.083917, 2.209042, 0.487668, 1.796368, -1.596250, 4.042785, 2.681681, 2.703725, -1.306239, 2.601149, -2.099759, -5.318709, -1.065740, 0.589329, -3.567276, 0.062211, -0.425451, -3.580077, -1.296509, -5.680976, -0.678171, -0.762381, 0.299684, 0.240645, -1.079323, -2.957309, 1.849129, 0.099766, 4.726409, -2.363460, -4.875949, 1.315865, 1.462351, -4.889824, 1.677129, -0.667212, -5.138101, 5.330096, -2.835877, -3.902706, 3.045518, -0.134772, 6.075926, 4.500954, 5.112908, -0.905090, -1.909402, 0.904969, 0.690467, -0.619620, 2.666605, 0.904155, -4.029288, 0.402307, -3.036969, -1.037772, -1.147319, -1.899678, 6.248024, 1.974826, 5.616739, 8.500142, 2.245967, -3.625181, -2.428448, -1.810710, 0.331268, -3.135995, 1.302269, 1.225240, 3.909755, -0.403967, -3.433037, -1.311207, -2.177593, -18.017111, 1.118036, -0.756810, -6.615274, 1.566789, -2.418086, 2.953141, -2.826252, -0.735017, 1.528646, -3.723853, -1.885919, -4.900423, -2.478693, -3.619526, -1.294044, 1.890493, -1.871122, 3.827642, 2.343641, -2.662672, 3.404760, -0.482313, 5.578103, -2.134901, 5.369441, 2.492493, 2.104894, -5.205935, -1.008856, 3.011516, -0.641482, 2.548511, -8.027479, 0.661283, -7.434981, 1.621635, 2.033537, -1.930799, -1.076036, -0.960546, -0.117069, -3.372064, -2.063029, 1.654161, 1.716305, 4.944497, -0.019142, -4.051437, -3.766835, 5.842964, -3.690862, -3.004082, -4.533800, -1.790580, -2.050976, 0.561251, 1.564617, -2.369565, 2.445515, 0.568967, 0.080670, 1.160943, -1.780555, -0.186662, 0.478309, -4.549850, -17.848110, -4.087846, -0.273795, 5.222282, 5.777647, 0.494441, -1.833347, 0.257310, 1.124015, -1.849150, -1.647193, -1.132576, -2.164145, 4.289577, 0.453893, 1.341683, 0.748200, -2.471226, 5.312476, 0.836187, 1.723641, 0.498636, 1.428005, 1.828248, -3.398380, -2.883598, -5.426562, 2.745332, 1.425643, 0.006252, -3.581231, -0.262443, -0.622419, -1.944646, -3.869738, 4.476512, -1.191560, 4.310348, -1.112125, 3.697780, -1.439761, -4.078893, 2.179358, 2.530555, -0.148791, 2.168117, 0.114731, 1.831150, 1.157180, 2.166352, 2.055449, -2.871636, 4.420805, -1.390404, -6.722301, -1.769241, 1.519223, -2.692869, 1.015409, 7.691347, 0.583279, -4.058763, -1.453730, 2.806040, 0.556832, 2.730164, -2.572670, 2.370497, 1.912015, 3.515903, 1.919521, 0.369309, -2.022592, -0.218010, -6.205793, -5.902825, 4.063861, -2.432916, 33.217953, -0.551205, 3.915136, -3.102407, 3.210474, -1.160345, -2.887495, 1.523478, -1.018772, -2.422345, 1.810487, -5.200218, 1.671701, -0.721633, 4.423970, -1.442638, -5.779491, -0.397599, -3.750891, 2.698724, -5.718669, -2.086298, -7.792905, 3.388355, -3.770853, -0.215906, 2.560900, -2.680165, -2.986103, -6.097780, 4.229813, -2.723863, -3.065816, 2.248178, -0.651940, -1.214029, 2.257283, 2.486542, 2.257068, 4.830973, -2.277972, -1.946075, 3.999813, 3.456074, 0.536131, 1.805786, 0.709500, -4.097561, -5.412663, 4.285580, -4.488030, 1.119286, 2.302969, 5.947579, -0.113177, -1.724484, -0.104090, 1.840069, 4.592496, 1.569632, -2.483522, -0.311809, -0.194584, -2.869655, -2.849751, 4.997175, 
2.512164, 1.758786, 0.490910, 4.247337, 0.569464, -0.659693, -0.454139, 4.440271, 1.410414, -0.773389, -2.376852, -6.240281, -4.227293, -1.671386, 0.603003, 2.479946, -2.180943, 3.127448, 1.668763, 0.149739, -0.781542, 7.098460, 2.111330, -4.436161, -2.082523, -2.478790, 49.008598, 1.723189, 5.866392, 3.153973, -0.181730, -7.328019, -1.992281, -2.027254, 2.432771, 0.979790, -0.092340, 1.070737, 0.099132, 6.923749, -4.058023, -8.293395, -1.186165, -3.612353, 0.829185, -0.439513, -1.879160, -1.854795, -5.092451, -1.970279, -4.055690, -7.000295, -4.616829, -0.039388, 4.131409, 1.653316, -7.131061, -5.091387, 3.990695, 3.182390, -1.955781, 0.335848, 1.516590, -0.282827, 2.731199, -1.598010, 5.144428, 2.064570, 2.552294, 1.812606, -2.513990, 2.353807, 7.561320, -0.036158, -0.385612, -4.383836, -3.971509, 0.043287, 4.296186, 0.464179, -1.848343, -5.514888, -8.349360, -8.976414, 0.569201, -4.740505, -1.152213, 5.905676, -5.700530, 3.814966, 2.234695, -0.222299, 4.771212, -3.930703, 2.960777, 0.165897, -1.333587, 2.028787, -0.778738, -1.235554, -1.246747, -0.041303, 3.006766, 3.820721, -5.493714, -0.189577, -3.010185, -2.883236, 0.024274, -1.494312, -2.523447, -4.104508, -2.666597, 3.281853, 3.544635, 3.610449, -0.587052, -0.098227, 0.073750, 0.127269, -0.344085, 0.770601, -2.514192, -4.431854, -1.949836, -1.391094, -2.886564, 0.702709, -0.731393, -2.069504, 1.040801, 3.144605, -0.115405, 2.797428, -1.210172, 3.943987, 2.697510, 1.059327, 0.742219, 4.559521, 1.323614, -0.211233, -0.215014, 1.990037, 1.956536, -2.729569, -3.183045, 2.447714, 2.988651, 5.736022, -3.185564, -2.579111, 3.323712, 4.193132, 0.727872, 5.551432, -0.744345, -2.344970, -3.441572, 5.568137, -4.830486, 3.880134, -5.891489, -2.210011, 0.171487, -1.457955, -2.245640, 2.203776, 0.875658, 1.959566, 8.602451, 4.474270, 2.726350, -0.354296, -2.258420, -2.513044, 4.931829, 2.946697, 0.520752, 4.333879, -0.821970, 2.029501, -2.277242, -4.952377, -1.538025, -1.064874, 3.308603, 1.247338, 2.504045, -3.224561, -2.891461, 3.316285, 7.715738, -1.605185, 3.243022, -5.836881, 2.714357, -0.388968, -1.375407, -3.556258, -2.474644, 1.088091, -3.216538, -2.776881, -8.227687, 5.125639, 5.283110, 6.973928, 37.955891, 3.139960, -5.763666, 1.350248, -1.406692, -3.986167, 2.335608, 3.157670, 4.455172, 2.308789, 2.309107, -3.130976, -4.247937, -2.270962, 1.574682, 5.501863, 4.485770, -10.403132, 0.773148, -1.224575, 5.244150, -3.847297, -1.800287, 6.947053, -2.622383, 4.851179, 3.233861, 4.995902, -4.162736, -1.983544, 3.995517, 6.687618, -3.040680, -6.543100, 5.237826, -4.066863, -1.226237, 1.639323, -4.240799, -0.423647, -2.178972, 1.593127, -3.968011, 5.461793, 2.277310, -2.744098, -3.358952, 4.035235, -3.137929, 2.964127, 7.214087, -1.364202, -4.651634, 6.503850, 2.740077, 0.886651, 2.558752, 0.043657, -5.242559, -0.792984, -2.882527, -5.403186, 3.155617, 3.991355, -0.568835, 1.141021, -3.702596, -0.279778, -0.912082, -3.422060, -1.879990, -0.733728, 2.278115, -2.372427, -2.681229, -1.556143, -0.770521, 0.899023, 4.508717, -3.010523, -1.908418, -1.475849, 3.509949, 3.582316, -0.681324, -7.056856, 1.278887, 0.770643, -0.584454, 2.825036, -0.821392, 1.240007, 7.577941, 0.096160, -3.572768, 1.793741, 3.848424, 0.995781, 0.054073, -0.235984, -0.301966, 2.916689, 1.391444, 1.480205, -3.852633, -1.223310, -3.153660, 3.194536, 1.030388, -4.751256, 1.885638, -2.114385, -6.170693, 1.415366, 4.334911, -4.597432, -0.846934, -3.220494, 7.086485, 3.530614, -2.059356, 1.625898, -2.086513, -1.870273, 3.097336, -2.899379, 4.410741, -2.872575, 0.711264, 5.131933, 
1.355852, -2.781525, 3.417987, -2.958153, -0.075279, 1.298355, -0.051176, 2.940977, 0.224740, 4.669080, -3.098835, -1.002103, -2.227010, -0.529945, 1.581029, -1.380316, -0.444712, 1.033249, -2.976829, 1.184253, -3.129241, -4.516237, 1.861109, 0.749785, -0.883805, 2.467990, -1.366948, -2.021918, 9.678317, 0.780995, 5.006135, 2.355187, 4.709684, 0.273654, 0.848736, -1.546677, 0.190088, -1.301066, -1.275526, -0.592527, 2.147533, 4.496069, 2.915365, -2.232816, -3.967589, 2.198257, 3.257962, -1.514701, -1.426605, -0.670297, 0.013288, -1.436101, -3.523503, 1.392464, -7.692491, 4.621131, 4.215765, -6.433372, 8.560532, -5.166466, 1.641891, 0.939057, -1.161749, -0.620703, 0.268485, 0.818933, -1.119831, -2.502043, 1.727721, -2.059391, 2.365984, -3.621562, 0.293849, 2.106696, 2.435274, -0.056334, 1.744176, -0.872306, -3.566818, -3.476950, -13.363617, -2.873412, -0.464867, 0.363577, 2.328689, -3.345537, -8.831505, -0.320157, 0.871519, -4.472153, -0.401718, -0.096133, 0.386455, -1.806300, 1.170147, 1.817253, -3.103240, -0.165836, 3.598591, 0.253634, 2.874893, 3.903717, 1.369801, 5.628721, 2.572816, 2.252870, -1.170226, -1.062752, -0.456143, -3.078095, -6.341308, 4.122602, 2.681516, 0.093924, 3.844733, 3.950950, -3.356711, -0.854817, 3.384428, 13.341409, -4.234371, 2.496033, -1.629354, -1.507377, 0.173900, 2.698258, 0.267310, 1.372167, 0.510639, 2.101537, 1.261789, 2.597848, 2.449376, -0.840756, -1.046823, -0.002541, -2.707355, 0.755944, 1.257627, 1.465595, 0.194379, 7.094273, 0.264183, 5.946443, -6.767539, 3.796284, -2.374620, -2.461099, -1.391973, -3.764230, -2.892979, 1.555705, -1.332523, -1.799763, -2.815821, -4.461638, 0.016501, -0.694294, -0.483337, 1.215167, 0.758542, 4.294414, -2.490471, -2.232953, 0.058542, -0.601252, 4.924240, 3.834749, 2.027948, 1.471411, 0.901828, 0.966709, 0.390534, 7.367773, 1.259584, -0.460943, -6.608699, -4.643585, 2.040176, -4.100492, -1.690598, 3.704851, -2.805748, 1.226204, 6.179536, -1.176640, 3.004384, 3.111051, 3.486775, -4.034345, 4.785528, 7.299822, -0.907930, -5.061491, -3.685342, -1.961922, -3.682397, -0.804982, -1.361447, -0.922343, -4.764962, -0.093592, 6.292956, 9.184029, -1.652289, -0.349144, 2.227105, 2.586640, -1.344222, -0.799372, -1.067394, 1.337438, -4.368445, -3.637234, 0.453892, -2.348681, 3.619864, 0.782791, -0.751199, -2.257583, -0.637428, -2.538015, -4.910626, 0.804029, 1.904546, 2.071048, -4.913878, 7.321102, 2.329751, -4.918932, 9.630699, 4.401609, 4.836075, -37.672218, -0.989470, -3.400167, -1.611549, -4.914345, -2.178263, -0.507991, 7.461368, 1.499078, 9.014656, 0.385251, -5.354575, -0.235111, 1.188299, 2.999619, 4.158133, -2.458529, 0.836758, -3.781934, 2.581318, -0.546479, -0.767487, -1.108148, 1.394199, -6.728708, -0.049908, 2.439957, 2.281411, -0.256135, 2.625314, 0.526615, 5.245579, -0.693234, -4.652315, -6.053971, 0.761856, 1.380219, 2.186297, 1.574326, 3.023049, -3.008269, -5.497355, 1.208180, 1.866580, 0.383009, -5.021918, -1.859053, -0.577350, 0.791435, 1.862842, -2.977753, -2.982435, -3.968694, -2.911249, -0.671418, -0.664998, -4.328650, 0.777280, -3.322049, -1.996393, -10.946344, 1.487512, 3.325927, 2.512698, -4.560525, 2.487353, -3.221335, 2.498028, 1.558956, -7.027573, -4.410831, 1.153316, -2.383291, 2.752918, -2.445125, 1.669130, 2.055993, 3.435565, -2.200916, 7.625603, -0.429790, -7.823750, 0.715926, 1.946298, 0.931786, -0.959709, -2.558171, 1.349378, 2.164468, -1.039176, -0.840470, -1.273818, -1.788402, -0.181308, 4.731328, -1.308624, -4.392232, -3.329735, 3.710122, -0.471847, 0.321802, -2.903600, 2.763454, 1.606396, 5.018901, 
-4.307723, -5.778127, 1.896904, -1.911034, -0.877533, -0.970224, -6.048330, -1.055960, -0.004360, 3.615595, 0.670895, -3.405262, -6.979682, 1.391201, -2.248485, -0.812938, -3.361127, -0.245261, -2.004265, 2.180632, -2.492358, 0.106926, -2.312480, -0.849407, -3.182122, -3.868700, 1.483516, -2.800287, 1.606942, 5.240799, -4.588236, 3.825581, 1.475691, 5.782595, 0.572563, 0.615392, -3.263512, 2.357608, 2.504305, -0.131105, -4.494020, -3.431349, 1.323660, -1.233401, 5.227014, 1.833779, 4.529938, 0.172702, 2.495649, 1.844747, -0.571699, 6.221216, 3.510139, 2.034399, -2.708287, 2.504865, 0.827564, -0.389984, 0.973059, -1.210372, 3.136084, -0.528568, -2.895905, -2.945923, -3.354907, -6.284786, 2.405454, -3.611116, -1.510917, -0.275045, -2.606582, 0.023821, -0.977702, 4.496226, 3.893765, -5.102979, 4.505174, -2.151086, -2.598680, 3.437677, -1.525504, -3.606384, 2.925199, -1.819598, -1.297506, -3.938237, 0.758268, -6.536557, 2.823950, -1.268384, -4.748070, -1.569543, -4.040869, -4.199319, -1.537972, -2.470041, -2.424089, 4.282330, 2.027144, -2.279008, 1.956247, 0.243131, -1.307445, 7.631332, 3.666424, 3.244223, 0.806356, 1.559336, -6.454810, -2.210939, 6.478909, 8.370586, -4.877525, 1.632561, 0.528654, 7.175992, 3.294758, -2.347389, -0.656673, 2.440072, 0.033162, -3.974950, 3.992282, 1.041370, -0.452447, 1.250809, 5.725498, 0.952154, -1.769804, -6.182436, -1.491944, 0.978641, -1.848162, -3.526021, 2.130781, -1.247035, 0.430292, 2.094477, 1.015652, -0.982822, -3.514047, 1.362636, -1.385784, 7.721636, 0.452686, -0.514129, -0.185515, -4.155198, -2.091428, -0.342165, 3.609978, 2.325354, -2.549308, -1.935902, -4.843817, 4.849944, 0.559908, 0.317363, -6.979578, 4.078461, -1.781615, -0.168588, -3.608682, 0.540197, -3.224210, -2.160827, -2.483818, -1.395126, 2.122100, 0.045637, -5.244937, 1.460025, -1.313215, -1.831590, -3.608512, 2.137432, 1.063356, -4.977770, -4.077838, -0.227993, -2.521503, -2.196034, 2.424290, 3.695994, -0.664042, 3.369231, -0.868323, -1.362390, 2.049519, 1.623333, -3.785452, -0.366721, 0.929944, 1.189942, 2.357296, -2.012867, 4.039164, 5.018045, 1.173269, 1.037018, 1.239052, -1.506088, 1.858684, -8.143816, -8.377998, -4.266102, 2.238616, 0.990164, -5.564924, 4.017815, 0.013090, -2.655519, -2.138789, 0.309147, 0.815738, -3.671095, -1.768397, 0.642272, -4.239707, -1.738047, -3.186784, 0.928931, -3.288824, 2.778164, 2.127247, 1.010160, -0.380883, -1.794696, -0.380974, 2.568671, 2.952252, -0.378777, 3.388682, 1.112046, 3.843087, -4.014625, -2.586132, -1.373591, 0.875761, -2.153665, -4.098034, 4.174786, -3.971316, 3.831267, 2.632764, -2.082423, 2.323014, 0.126692, -1.658272, 3.813766, 0.731241, 5.020086, -1.314192, -0.455286, 2.529340, 1.737364, -2.307101, 3.249065, 3.359480, 2.342399, 2.964352, 0.450493, 6.821503, -0.876081, 0.104224, -4.359997, 6.337316, 1.961707, 1.584909, 5.782849, -7.702838, 0.486187, -2.076824, 0.556629, -0.377218, 1.867574, -2.546370, 3.420962, 3.743823, 0.830919, -3.154605, 2.442055, -0.077978, -2.174074, 7.835155, 0.042525, 5.061534, -1.297285, 0.468504, -0.896670, -2.810953, -2.835888, -4.904580, 1.410659, 6.832096, -1.246026, -2.643165, 2.461378, 1.465698, -2.603248, 3.614096, 0.604979, -0.203376, 0.620558, 2.010493, -3.164634, 2.355133, 0.910522, -9.153761, -6.733269, 0.191007, -2.222224, -3.317362, 1.621990, -3.271222, -3.361245, -2.236486, -3.204127, -0.102103, -6.418847, 0.475259, 0.960290, 0.809332, 2.043769, 0.233913, 1.443959, 3.518958, 0.381731, 3.579408, -1.736145, 0.935460, 0.490841, -1.445789, -2.606658, -0.390318, 1.241003, 1.852338, 3.745817, 
-3.404975, -2.927765, -1.375834, 4.025413, -2.450278, 0.526100, 0.871327, -8.299149, -3.752483, 8.046568, -3.801634, -2.328504, -2.873852, 12.037412, 5.806139, 0.151948, 1.294600, 0.820967, 1.635987, 1.551868, -16.845692, -0.875519, -2.038996, 4.211426, -4.056794, 5.402084, -3.281292, 7.430756, 1.138992, -3.348103, -0.064485, 2.009815, 3.773692, 3.913712, 4.532695, 1.372892, -1.063987, 1.025051, 2.728509, 9.491262, 4.673415, 2.517928, 0.824100, 1.517722, -2.984458, 0.270283, -1.110453, -2.754004, -3.045094, 1.386551, -6.637009, -6.467078, 5.209693, 0.221588, 1.345757, -1.233974, 1.406563, -11.624565, -2.558307, 3.355196, -1.574055, 11.278919, -4.175966, -3.528599, -5.955521, 5.521868, -6.089912, 1.871637, 0.704803, -0.803571, -3.412199, 6.814842, -2.581755, 1.461685, -1.340520, 1.030475, 0.972731, -2.055774, 1.705598, 3.048807, -1.037460, 2.906647, -0.434317, -0.054622, -3.738979, -4.256308, -4.723948, -0.522000, 4.060871, -4.338176, -24.310600, 3.466682, -0.756065, -4.103305, 1.243985, -2.868507, 0.153364, -0.084377, 3.205434, -1.317540, -0.319762, -0.802727, 2.456871, 3.382993, 4.963143, -0.354739, -2.737841, 2.374343, -1.887784, 1.198460, -7.294523, -2.117028, 46.393829, 2.054618, -1.339482, 2.647988, 1.026573, -0.775336, -1.399183, -6.095242, 2.313640, -2.377140, -2.874966, 1.462420, 1.763429, -4.175523, 0.322729, 0.076063, -0.492210, 5.528355, 2.494064, 3.575900, -4.731384, 1.914505, 0.780838, -0.513604, 0.834317, 4.637135, 0.748915, -2.242534, 5.284515, 0.941967, -4.771373, 12.168940, -2.509539, 2.931109, 6.130014, 2.304723, -0.401287, 1.846620, 2.191286, 0.522786, -2.212207, -2.330939, 3.671052, -3.602032, -8.045081, 1.841934, -15.802244, -4.091495, 2.758163, 3.801088, -0.116542, 4.200107, 3.809458, 2.594901, 8.304158, 2.272768, -3.162868, -4.738754, -1.238095, -2.981887, -0.739677, -8.554259, -1.512607, -3.465696, -0.641592, -5.622955, 0.712421, -0.786531, -3.055792, -1.708393, -1.229299, -2.440192, -2.387127, -1.278995, 0.990231, 0.675825, 7.402359, 2.521222, 0.413226, -1.471801, -0.305300, -2.968605, -0.631904, 2.190862, 1.293332, 0.988720, 1.396958, 1.053343, 0.096911, 5.328904] } From a45231af47035e89031625d47210663881a4e2cf Mon Sep 17 00:00:00 2001 From: Min Yoo Date: Sun, 25 May 2025 05:18:32 +0900 Subject: [PATCH 062/108] readme: Add macLlama to community integrations (#10790) This commit updates the README to include macLlama within the community integrations section. macLlama is a native macOS application built for lightweight and efficient LLM interaction. Key features include: * **Lightweight & Native:** Designed to be resource-friendly and perform optimally on macOS. * **Chat-like Interface:** Provides a user-friendly, conversational interface. * **Multiple Window Support:** Allows users to manage multiple conversations simultaneously. The primary goal of macLlama is to offer a simple and easy-to-run LLM experience on macOS. --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 6a4815c1..00de95a7 100644 --- a/README.md +++ b/README.md @@ -406,6 +406,7 @@ See the [API documentation](./docs/api.md) for all endpoints. 
- [AppFlowy](https://github.com/AppFlowy-IO/AppFlowy) (AI collaborative workspace with Ollama, cross-platform and self-hostable) - [Lumina](https://github.com/cushydigit/lumina.git) (A lightweight, minimal React.js frontend for interacting with Ollama servers) - [Tiny Notepad](https://pypi.org/project/tiny-notepad) (A lightweight, notepad-like interface to chat with ollama available on PyPI) +- [macLlama (macOS native)](https://github.com/hellotunamayo/macLlama) (A native macOS GUI application for interacting with Ollama models, featuring a chat interface.) ### Cloud From 012cf65340ccd44de76017c56eb553b0803fde95 Mon Sep 17 00:00:00 2001 From: RAPID ARCHITECT <126218667+rapidarchitect@users.noreply.github.com> Date: Mon, 26 May 2025 14:05:03 -0500 Subject: [PATCH 063/108] readme: add AWS Strands Agents SDK example to community integrations (#10865) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 00de95a7..d7cf5bfe 100644 --- a/README.md +++ b/README.md @@ -450,6 +450,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [orbiton](https://github.com/xyproto/orbiton) Configuration-free text editor and IDE with support for tab completion with Ollama. - [orca-cli](https://github.com/molbal/orca-cli) Ollama Registry CLI Application - Browse, pull, and download models from Ollama Registry in your terminal. - [GGUF-to-Ollama](https://github.com/jonathanhecl/gguf-to-ollama) - Importing GGUF to Ollama made easy (multiplatform) +- [AWS-Strands-With-Ollama](https://github.com/rapidarchitect/ollama_strands) - AWS Strands Agents with Ollama Examples ### Apple Vision Pro From aea6fb9b5862932ea4622e854be9ddae8c320658 Mon Sep 17 00:00:00 2001 From: Parth Sareen Date: Mon, 26 May 2025 17:16:00 -0700 Subject: [PATCH 064/108] tools: remove newline stripping (#10869) --- tools/tools.go | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/tools/tools.go b/tools/tools.go index 509ca90a..529bd3be 100644 --- a/tools/tools.go +++ b/tools/tools.go @@ -137,11 +137,6 @@ func parseJSONToolCalls(s string, name, arguments string, prefix string) ([]api. 
// - The processed string with prefix removed if found // - error: ErrAccumulateMore if prefix is incomplete, or nil if successful func (p *Parser) checkPrefix(s string) (string, error) { - original := s - if strings.ContainsRune(s, '\n') { - s = strings.ReplaceAll(s, "\n", " ") - } - if s == "" || p.prefix == "" { return s, nil } @@ -158,7 +153,7 @@ func (p *Parser) checkPrefix(s string) (string, error) { // Return everything except overlapping portion p.sb.Reset() p.sb.WriteString(s[idx:]) - return original[:idx], errAccumulateMore + return s[:idx], errAccumulateMore } // Check if prefix appears in middle of string @@ -167,7 +162,7 @@ func (p *Parser) checkPrefix(s string) (string, error) { p.sb.Reset() p.sb.WriteString(strings.TrimSpace(s[idx:])) // Return everything before prefix - return original[:idx], errAccumulateMore + return s[:idx], errAccumulateMore } // No partial prefix found @@ -181,9 +176,6 @@ func (p *Parser) checkPrefix(s string) (string, error) { // - tools: Any parsed tool calls // - content: Non-tool call content func (p *Parser) Add(s string) (tools []api.ToolCall, content string) { - if strings.TrimSpace(s) == "" { - return nil, s - } if p.done { if p.index == 0 { // Return original string if no tool calls found at start From 066d0f474671fd38532f4a245533158312a68e75 Mon Sep 17 00:00:00 2001 From: Parth Sareen Date: Mon, 26 May 2025 18:59:06 -0700 Subject: [PATCH 065/108] tools: relax JSON parse constraints for tool calling (#10872) --- tools/tools.go | 46 ++++++++++++++++++--------------------------- tools/tools_test.go | 45 ++++++++++++++++++++++++++++++++++++-------- 2 files changed, 55 insertions(+), 36 deletions(-) diff --git a/tools/tools.go b/tools/tools.go index 529bd3be..914a5eaf 100644 --- a/tools/tools.go +++ b/tools/tools.go @@ -17,15 +17,14 @@ var ( ) type Parser struct { - parseLeadingJSON bool - prefix string - prefixFound bool - tmpl gotmpl.Template - sb strings.Builder - index int - name string - arguments string - done bool + greedyParseJSON bool + prefix string + prefixFound bool + tmpl gotmpl.Template + sb strings.Builder + index int + name string + arguments string } // parseJSONToolCalls attempts to parse a JSON string into a slice of ToolCalls. 
@@ -176,14 +175,6 @@ func (p *Parser) checkPrefix(s string) (string, error) { // - tools: Any parsed tool calls // - content: Non-tool call content func (p *Parser) Add(s string) (tools []api.ToolCall, content string) { - if p.done { - if p.index == 0 { - // Return original string if no tool calls found at start - return nil, s - } - // Return empty if no tool calls found after start - return nil, "" - } p.sb.WriteString(s) s = p.sb.String() @@ -195,7 +186,7 @@ func (p *Parser) Add(s string) (tools []api.ToolCall, content string) { } // Exit if prefix exists in template, greedy parsing is off, and prefix not found - if !p.parseLeadingJSON && !p.prefixFound { + if !p.greedyParseJSON && !p.prefixFound { p.sb.Reset() return nil, s } @@ -206,10 +197,9 @@ func (p *Parser) Add(s string) (tools []api.ToolCall, content string) { return nil, "" } p.sb.Reset() - // Do not try parsing leading JSON if JSON not found - p.parseLeadingJSON = false - if p.prefix == "" { - p.done = true + // Only do greedy JSON parsing if there is no prefix from template + if p.prefix != "" { + p.greedyParseJSON = false } if p.index != 0 && p.prefix == "" { return nil, "" @@ -253,11 +243,11 @@ func NewParser(templateToProcess *gotmpl.Template) (*Parser, error) { } return &Parser{ - tmpl: *tt, - sb: strings.Builder{}, - prefix: tp, - parseLeadingJSON: true, - name: name, - arguments: arguments, + tmpl: *tt, + sb: strings.Builder{}, + prefix: tp, + greedyParseJSON: true, + name: name, + arguments: arguments, }, nil } diff --git a/tools/tools_test.go b/tools/tools_test.go index 1ae3bff8..5fee8f57 100644 --- a/tools/tools_test.go +++ b/tools/tools_test.go @@ -536,11 +536,18 @@ func TestParseToolCalls(t *testing.T) { expectedTokens: "", }, { - name: "model without prefix in template, prefix in output", + name: "model without prefix in template, prefix in output, multiple tool calls in list", model: "llama3.2", output: ` [{"name": "get_current_weather", "parameters": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "parameters": {"format":"celsius","location":"Toronto, Canada"}}]`, - expectedToolCall: []api.ToolCall{}, - expectedTokens: ` [{"name": "get_current_weather", "parameters": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "parameters": {"format":"celsius","location":"Toronto, Canada"}}]`, + expectedToolCall: []api.ToolCall{t1, t2}, + expectedTokens: ``, + }, + { + name: "model without prefix in template, prefix in output, individual tool calls", + model: "llama3.2", + output: ` {"name": "get_current_weather", "parameters": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "parameters": {"format":"celsius","location":"Toronto, Canada"}}`, + expectedToolCall: []api.ToolCall{t1, t2}, + expectedTokens: ``, }, { name: "model with prefix in template, no prefix in output, tokens before", @@ -567,15 +574,37 @@ func TestParseToolCalls(t *testing.T) { name: "model without prefix in template, no prefix in output, tokens before", model: "llama3.2", output: `some tokens before [{"name": "get_current_weather", "parameters": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "parameters": {"format":"celsius","location":"Toronto, Canada"}}]`, - expectedToolCall: []api.ToolCall{}, - expectedTokens: `some tokens before [{"name": "get_current_weather", "parameters": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "parameters": 
{"format":"celsius","location":"Toronto, Canada"}}]`, + expectedToolCall: []api.ToolCall{t1, t2}, + expectedTokens: `some tokens before`, }, { - name: "model without prefix in template, prefix in output, tokens after", + name: "model without prefix in template, prefix in output, tokens after", + model: "llama3.2", + output: ` + [{"name": "get_current_weather", "parameters": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "parameters": {"format":"celsius","location":"Toronto, Canada"}}] some tokens after`, + expectedToolCall: []api.ToolCall{t1, t2}, + expectedTokens: ``, + }, + { + name: "model without without prefix, match all jsons", model: "llama3.2", - output: ` [{"name": "get_current_weather", "parameters": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "parameters": {"format":"celsius","location":"Toronto, Canada"}}] some tokens after`, + output: `model outputs some text [{"name": "get_current_weather", "parameters": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "parameters": {"format":"celsius","location":"Toronto, Canada"}}] some tokens after`, + expectedToolCall: []api.ToolCall{t1, t2}, + expectedTokens: "model outputs some text", + }, + { + name: "model flushes tokens if tool call doesn't match", + model: "llama3.2", + output: `{ "user": {"id": 12345, "name": "Alice", "preferences": {"theme": "dark", "notifications": true}, "stats": {"points": 987, "level": 42}}}`, expectedToolCall: []api.ToolCall{}, - expectedTokens: ` [{"name": "get_current_weather", "parameters": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "parameters": {"format":"celsius","location":"Toronto, Canada"}}] some tokens after`, + expectedTokens: `{ "user": {"id": 12345, "name": "Alice", "preferences": {"theme": "dark", "notifications": true}, "stats": {"points": 987, "level": 42}}}`, + }, + { + name: "model flushes tokens if tool call doesn't match array", + model: "llama3.2", + output: `[ { "user": {"id": 12345, "name": "Alice", "preferences": {"theme": "dark", "notifications": true}, "stats": {"points": 987, "level": 42}}}]`, + expectedToolCall: []api.ToolCall{}, + expectedTokens: `[ { "user": {"id": 12345, "name": "Alice", "preferences": {"theme": "dark", "notifications": true}, "stats": {"points": 987, "level": 42}}}]`, }, } From 9239a254e054d24b0de3358ba8c4bd9b50730bfd Mon Sep 17 00:00:00 2001 From: Kyle Steere Date: Tue, 27 May 2025 18:28:48 +0000 Subject: [PATCH 066/108] server: abort download on empty digest Signed-off-by: Kyle Steere --- server/download.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/server/download.go b/server/download.go index 6f79fd2d..784ba2d5 100644 --- a/server/download.go +++ b/server/download.go @@ -464,6 +464,10 @@ type downloadOpts struct { // downloadBlob downloads a blob from the registry and stores it in the blobs directory func downloadBlob(ctx context.Context, opts downloadOpts) (cacheHit bool, _ error) { + if opts.digest == "" { + return false, fmt.Errorf(("%s: %s"), opts.mp.GetNamespaceRepository(), "digest is is empty") + } + fp, err := GetBlobsPath(opts.digest) if err != nil { return false, err From ea79003180205680000bacf97466fc9d78d71f5e Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Tue, 27 May 2025 13:33:57 -0700 Subject: [PATCH 067/108] kvcache: Skip computing causal mask for worst case graph reservation Computing an attention mask for a large context and max batch is expensive - over 100ms. 
Models like Gemma3 that have multiple types of caches and custom attention masks need to do this 4 times, so this adds approximately 500ms to startup time when using 128k context. When we are reserving the worst case graph, we don't need the mask, only its shape, so we can skip this. --- kvcache/causal.go | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/kvcache/causal.go b/kvcache/causal.go index f6bacaaf..b594d0b4 100644 --- a/kvcache/causal.go +++ b/kvcache/causal.go @@ -30,6 +30,11 @@ type Causal struct { // ** current forward pass ** + // curReserve indicates that this forward pass is only for + // memory reservation and we should not update our metadata + // based on it. + curReserve bool + // the active layer for Get and Put curLayer int @@ -159,12 +164,13 @@ func (c *Causal) Close() { } func (c *Causal) StartForward(ctx ml.Context, batch input.Batch, reserve bool) error { + c.curReserve = reserve c.curBatchSize = len(batch.Positions) c.curSequences = batch.Sequences c.curPositions = batch.Positions c.opts.Except = nil - if !reserve { + if !c.curReserve { c.updateSlidingWindow() var err error @@ -304,6 +310,11 @@ func (c *Causal) buildMask(ctx ml.Context) ml.Tensor { c.curCellRange.max = roundUp(c.curCellRange.max+1, c.config.CachePadding) - 1 length := c.curCellRange.max - c.curCellRange.min + 1 + + if c.curReserve { + return ctx.Input().Empty(c.config.MaskDType, length, batchSize) + } + mask := make([]float32, batchSize*length) for i := range c.curBatchSize { From aa25aff10d1ccc6dd4e85952678d63946bdf89dc Mon Sep 17 00:00:00 2001 From: Patrick Devine Date: Tue, 27 May 2025 16:50:57 -0700 Subject: [PATCH 068/108] client: add request signing to the client (#10881) If OLLAMA_AUTH is set, sign each request w/ a timestamp and pass the signature in the token header --- api/client.go | 50 +++++++++++++++++++++++++++++++++++++++++++++ envconfig/config.go | 2 ++ 2 files changed, 52 insertions(+) diff --git a/api/client.go b/api/client.go index 3dffce60..9f0dba8d 100644 --- a/api/client.go +++ b/api/client.go @@ -24,7 +24,10 @@ import ( "net/http" "net/url" "runtime" + "strconv" + "time" + "github.com/ollama/ollama/auth" "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/format" "github.com/ollama/ollama/version" @@ -76,6 +79,14 @@ func NewClient(base *url.URL, http *http.Client) *Client { } } +func getAuthorizationToken(ctx context.Context, challenge string) (string, error) { + token, err := auth.Sign(ctx, []byte(challenge)) + if err != nil { + return "", err + } + return token, nil +} + func (c *Client) do(ctx context.Context, method, path string, reqData, respData any) error { var reqBody io.Reader var data []byte @@ -97,6 +108,21 @@ func (c *Client) do(ctx context.Context, method, path string, reqData, respData } requestURL := c.base.JoinPath(path) + + var token string + if envconfig.UseAuth() || c.base.Hostname() == "ollama.com" { + now := strconv.FormatInt(time.Now().Unix(), 10) + chal := fmt.Sprintf("%s,%s?ts=%s", method, path, now) + token, err = getAuthorizationToken(ctx, chal) + if err != nil { + return err + } + + q := requestURL.Query() + q.Set("ts", now) + requestURL.RawQuery = q.Encode() + } + request, err := http.NewRequestWithContext(ctx, method, requestURL.String(), reqBody) if err != nil { return err } @@ -106,6 +132,10 @@ func (c *Client) do(ctx context.Context, method, path string, reqData, respData request.Header.Set("Accept", "application/json") request.Header.Set("User-Agent", fmt.Sprintf("ollama/%s (%s %s) Go/%s",
version.Version, runtime.GOARCH, runtime.GOOS, runtime.Version())) + if token != "" { + request.Header.Set("Authorization", token) + } + respObj, err := c.http.Do(request) if err != nil { return err @@ -143,6 +173,22 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f } requestURL := c.base.JoinPath(path) + + var token string + if envconfig.UseAuth() || c.base.Hostname() == "ollama.com" { + var err error + now := strconv.FormatInt(time.Now().Unix(), 10) + chal := fmt.Sprintf("%s,%s?ts=%s", method, path, now) + token, err = getAuthorizationToken(ctx, chal) + if err != nil { + return err + } + + q := requestURL.Query() + q.Set("ts", now) + requestURL.RawQuery = q.Encode() + } + request, err := http.NewRequestWithContext(ctx, method, requestURL.String(), buf) if err != nil { return err @@ -152,6 +198,10 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f request.Header.Set("Accept", "application/x-ndjson") request.Header.Set("User-Agent", fmt.Sprintf("ollama/%s (%s %s) Go/%s", version.Version, runtime.GOARCH, runtime.GOOS, runtime.Version())) + if token != "" { + request.Header.Set("Authorization", token) + } + response, err := c.http.Do(request) if err != nil { return err diff --git a/envconfig/config.go b/envconfig/config.go index 9d7c2e21..763f0464 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -183,6 +183,8 @@ var ( NewEngine = Bool("OLLAMA_NEW_ENGINE") // ContextLength sets the default context length ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 4096) + // Auth enables authentication between the Ollama client and server + UseAuth = Bool("OLLAMA_AUTH") ) func String(s string) func() string { From 5f57b0ef4268a6bd9e8043d54c351a608a7e1bca Mon Sep 17 00:00:00 2001 From: Devon Rifkin Date: Wed, 28 May 2025 19:38:52 -0700 Subject: [PATCH 069/108] add thinking support to the api and cli (#10584) - Both `/api/generate` and `/api/chat` now accept a `"think"` option that allows specifying whether thinking mode should be on or not - Templates get passed this new option so, e.g., qwen3's template can put `/think` or `/no_think` in the system prompt depending on the value of the setting - Models' thinking support is inferred by inspecting model templates. The prefix and suffix the parser uses to identify thinking support is also automatically inferred from templates - Thinking control & parsing is opt-in via the API to prevent breaking existing API consumers. If the `"think"` option is not specified, the behavior is unchanged from previous versions of ollama - Add parsing for thinking blocks in both streaming/non-streaming mode in both `/generate` and `/chat` - Update the CLI to make use of these changes. Users can pass `--think` or `--think=false` to control thinking, or during an interactive session they can use the commands `/set think` or `/set nothink` - A `--hidethinking` option has also been added to the CLI. 
This makes it easy to use thinking in scripting scenarios like `ollama run qwen3 --think --hidethinking "my question here"` where you just want to see the answer but still want the benefits of thinking models --- api/types.go | 21 +- api/types_test.go | 47 ++++ cmd/cmd.go | 178 +++++++++++++-- cmd/interactive.go | 32 +++ cmd/warn_thinking_test.go | 63 ++++++ docs/api.md | 3 + model/bytepairencoding.go | 11 +- readline/types.go | 2 + server/images.go | 20 +- server/prompt.go | 14 +- server/prompt_test.go | 3 +- server/routes.go | 89 +++++++- server/routes_generate_test.go | 19 ++ server/thinking.go | 300 ++++++++++++++++++++++++ server/thinking_test.go | 403 +++++++++++++++++++++++++++++++++ template/template.go | 38 ++-- types/model/capability.go | 1 + 17 files changed, 1195 insertions(+), 49 deletions(-) create mode 100644 cmd/warn_thinking_test.go create mode 100644 server/thinking.go create mode 100644 server/thinking_test.go diff --git a/api/types.go b/api/types.go index 602f93da..94d49200 100644 --- a/api/types.go +++ b/api/types.go @@ -83,6 +83,12 @@ type GenerateRequest struct { // Options lists model-specific options. For example, temperature can be // set through this field, if the model supports it. Options map[string]any `json:"options"` + + // Think controls whether thinking/reasoning models will think before + // responding. Needs to be a pointer so we can distinguish between false + // (request that thinking _not_ be used) and unset (use the old behavior + // before this option was introduced) + Think *bool `json:"think,omitempty"` } // ChatRequest describes a request sent by [Client.Chat]. @@ -108,6 +114,10 @@ type ChatRequest struct { // Options lists model-specific options. Options map[string]any `json:"options"` + + // Think controls whether thinking/reasoning models will think before + // responding + Think *bool `json:"think,omitempty"` } type Tools []Tool @@ -126,8 +136,11 @@ func (t Tool) String() string { // role ("system", "user", or "assistant"), the content and an optional list // of images. type Message struct { - Role string `json:"role"` - Content string `json:"content"` + Role string `json:"role"` + Content string `json:"content"` + // Thinking contains the text that was inside thinking tags in the + // original model output when ChatRequest.Think is enabled. + Thinking string `json:"thinking,omitempty"` Images []ImageData `json:"images,omitempty"` ToolCalls []ToolCall `json:"tool_calls,omitempty"` } @@ -478,6 +491,10 @@ type GenerateResponse struct { // Response is the textual response itself. Response string `json:"response"` + // Thinking contains the text that was inside thinking tags in the + // original model output when ChatRequest.Think is enabled. + Thinking string `json:"thinking,omitempty"` + // Done specifies if the response is complete. 
Done bool `json:"done"` diff --git a/api/types_test.go b/api/types_test.go index 1a6fc811..9c2fb1f1 100644 --- a/api/types_test.go +++ b/api/types_test.go @@ -372,3 +372,50 @@ func TestPropertyType_MarshalJSON(t *testing.T) { }) } } + +func TestThinking_UnmarshalJSON(t *testing.T) { + trueVal := true + falseVal := false + + tests := []struct { + name string + input string + expectedThinking *bool + expectedError bool + }{ + { + name: "true", + input: `{ "think": true }`, + expectedThinking: &trueVal, + }, + { + name: "false", + input: `{ "think": false }`, + expectedThinking: &falseVal, + }, + { + name: "unset", + input: `{ }`, + expectedThinking: nil, + }, + { + name: "invalid", + input: `{ "think": "true" }`, + expectedThinking: nil, + expectedError: true, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + var req GenerateRequest + err := json.Unmarshal([]byte(test.input), &req) + if test.expectedError { + require.Error(t, err) + } else { + require.NoError(t, err) + assert.Equal(t, test.expectedThinking, req.Think) + } + }) + } +} diff --git a/cmd/cmd.go b/cmd/cmd.go index b9047529..2d165379 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -39,6 +39,7 @@ import ( "github.com/ollama/ollama/format" "github.com/ollama/ollama/parser" "github.com/ollama/ollama/progress" + "github.com/ollama/ollama/readline" "github.com/ollama/ollama/runner" "github.com/ollama/ollama/server" "github.com/ollama/ollama/types/model" @@ -46,6 +47,23 @@ import ( "github.com/ollama/ollama/version" ) +// ensureThinkingSupport emits a warning if the model does not advertise thinking support +func ensureThinkingSupport(ctx context.Context, client *api.Client, name string) { + if name == "" { + return + } + resp, err := client.Show(ctx, &api.ShowRequest{Model: name}) + if err != nil { + return + } + for _, cap := range resp.Capabilities { + if cap == model.CapabilityThinking { + return + } + } + fmt.Fprintf(os.Stderr, "warning: model %q does not support thinking output\n", name) +} + var errModelfileNotFound = errors.New("specified Modelfile wasn't found") func getModelfileName(cmd *cobra.Command) (string, error) { @@ -265,6 +283,9 @@ func loadOrUnloadModel(cmd *cobra.Command, opts *runOptions) error { req := &api.GenerateRequest{ Model: opts.Model, KeepAlive: opts.KeepAlive, + + // pass Think here so we fail before getting to the chat prompt if the model doesn't support it + Think: opts.Think, } return client.Generate(cmd.Context(), req, func(api.GenerateResponse) error { return nil }) @@ -299,6 +320,22 @@ func RunHandler(cmd *cobra.Command, args []string) error { } opts.Format = format + thinkFlag := cmd.Flags().Lookup("think") + if thinkFlag.Changed { + think, err := cmd.Flags().GetBool("think") + if err != nil { + return err + } + opts.Think = &think + } else { + opts.Think = nil + } + hidethinking, err := cmd.Flags().GetBool("hidethinking") + if err != nil { + return err + } + opts.HideThinking = hidethinking + keepAlive, err := cmd.Flags().GetString("keepalive") if err != nil { return err @@ -362,6 +399,11 @@ func RunHandler(cmd *cobra.Command, args []string) error { return err } + opts.Think, err = inferThinkingOption(&info.Capabilities, &opts, thinkFlag.Changed) + if err != nil { + return err + } + opts.MultiModal = slices.Contains(info.Capabilities, model.CapabilityVision) // TODO: remove the projector info and vision info checks below, @@ -923,17 +965,19 @@ func PullHandler(cmd *cobra.Command, args []string) error { type generateContextKey string type runOptions struct { - Model 
string - ParentModel string - Prompt string - Messages []api.Message - WordWrap bool - Format string - System string - Images []api.ImageData - Options map[string]any - MultiModal bool - KeepAlive *api.Duration + Model string + ParentModel string + Prompt string + Messages []api.Message + WordWrap bool + Format string + System string + Images []api.ImageData + Options map[string]any + MultiModal bool + KeepAlive *api.Duration + Think *bool + HideThinking bool } type displayResponseState struct { @@ -989,6 +1033,26 @@ func displayResponse(content string, wordWrap bool, state *displayResponseState) } } +func thinkingOutputOpeningText(plainText bool) string { + text := "Thinking...\n" + + if plainText { + return text + } + + return readline.ColorGrey + readline.ColorBold + text + readline.ColorDefault + readline.ColorGrey +} + +func thinkingOutputClosingText(plainText bool) string { + text := "...done thinking.\n\n" + + if plainText { + return text + } + + return readline.ColorGrey + readline.ColorBold + text + readline.ColorDefault +} + func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) { client, err := api.ClientFromEnvironment() if err != nil { @@ -1016,14 +1080,34 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) { var latest api.ChatResponse var fullResponse strings.Builder var role string + var thinkTagOpened bool = false + var thinkTagClosed bool = false fn := func(response api.ChatResponse) error { - p.StopAndClear() + if response.Message.Content != "" || !opts.HideThinking { + p.StopAndClear() + } latest = response role = response.Message.Role + if response.Message.Thinking != "" && !opts.HideThinking { + if !thinkTagOpened { + fmt.Print(thinkingOutputOpeningText(false)) + thinkTagOpened = true + } + displayResponse(response.Message.Thinking, opts.WordWrap, state) + } + content := response.Message.Content + if thinkTagOpened && !thinkTagClosed && content != "" { + fmt.Print(thinkingOutputClosingText(false)) + thinkTagClosed = true + } + // purposefully not putting thinking blocks in the response, which would + // only be needed if we later added tool calling to the cli (they get + // filtered out anyway since current models don't expect them unless you're + // about to finish some tool calls) fullResponse.WriteString(content) displayResponse(content, opts.WordWrap, state) @@ -1040,6 +1124,7 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) { Messages: opts.Messages, Format: json.RawMessage(opts.Format), Options: opts.Options, + Think: opts.Think, } if opts.KeepAlive != nil { @@ -1101,13 +1186,32 @@ func generate(cmd *cobra.Command, opts runOptions) error { }() var state *displayResponseState = &displayResponseState{} + var thinkTagOpened bool = false + var thinkTagClosed bool = false + + plainText := !term.IsTerminal(int(os.Stdout.Fd())) fn := func(response api.GenerateResponse) error { - p.StopAndClear() - latest = response content := response.Response + if response.Response != "" || !opts.HideThinking { + p.StopAndClear() + } + + if response.Thinking != "" && !opts.HideThinking { + if !thinkTagOpened { + fmt.Print(thinkingOutputOpeningText(plainText)) + thinkTagOpened = true + } + displayResponse(response.Thinking, opts.WordWrap, state) + } + + if thinkTagOpened && !thinkTagClosed && content != "" { + fmt.Print(thinkingOutputClosingText(plainText)) + thinkTagClosed = true + } + displayResponse(content, opts.WordWrap, state) return nil @@ -1133,6 +1237,7 @@ func generate(cmd *cobra.Command, opts runOptions) error { 
System: opts.System, Options: opts.Options, KeepAlive: opts.KeepAlive, + Think: opts.Think, } if err := client.Generate(ctx, &request, fn); err != nil { @@ -1348,6 +1453,8 @@ func NewCLI() *cobra.Command { runCmd.Flags().Bool("insecure", false, "Use an insecure registry") runCmd.Flags().Bool("nowordwrap", false, "Don't wrap words to the next line automatically") runCmd.Flags().String("format", "", "Response format (e.g. json)") + runCmd.Flags().Bool("think", false, "Whether to use thinking mode for supported models") + runCmd.Flags().Bool("hidethinking", false, "Hide thinking output (if provided)") stopCmd := &cobra.Command{ Use: "stop MODEL", @@ -1399,7 +1506,6 @@ func NewCLI() *cobra.Command { PreRunE: checkServerHeartbeat, RunE: ListRunningHandler, } - copyCmd := &cobra.Command{ Use: "cp SOURCE DESTINATION", Short: "Copy a model", @@ -1488,3 +1594,45 @@ func NewCLI() *cobra.Command { return rootCmd } + +// If the user has explicitly set thinking options, either through the CLI or +// through the `/set think` or `set nothink` interactive options, then we +// respect them. Otherwise, we check model capabilities to see if the model +// supports thinking. If the model does support thinking, we enable it. +// Otherwise, we unset the thinking option (which is different than setting it +// to false). +// +// If capabilities are not provided, we fetch them from the server. +func inferThinkingOption(caps *[]model.Capability, runOpts *runOptions, explicitlySetByUser bool) (*bool, error) { + if explicitlySetByUser { + return runOpts.Think, nil + } + + if caps == nil { + client, err := api.ClientFromEnvironment() + if err != nil { + return nil, err + } + ret, err := client.Show(context.Background(), &api.ShowRequest{ + Model: runOpts.Model, + }) + if err != nil { + return nil, err + } + caps = &ret.Capabilities + } + + thinkingSupported := false + for _, cap := range *caps { + if cap == model.CapabilityThinking { + thinkingSupported = true + } + } + + if thinkingSupported { + thinking := true + return &thinking, nil + } + + return nil, nil +} diff --git a/cmd/interactive.go b/cmd/interactive.go index d7e6fbcf..a285b365 100644 --- a/cmd/interactive.go +++ b/cmd/interactive.go @@ -62,6 +62,8 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error { fmt.Fprintln(os.Stderr, " /set noformat Disable formatting") fmt.Fprintln(os.Stderr, " /set verbose Show LLM stats") fmt.Fprintln(os.Stderr, " /set quiet Disable LLM stats") + fmt.Fprintln(os.Stderr, " /set think Enable thinking") + fmt.Fprintln(os.Stderr, " /set nothink Disable thinking") fmt.Fprintln(os.Stderr, "") } @@ -128,6 +130,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error { var sb strings.Builder var multiline MultilineState + var thinkExplicitlySet bool = opts.Think != nil for { line, err := scanner.Readline() @@ -195,11 +198,19 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error { opts.Model = args[1] opts.Messages = []api.Message{} fmt.Printf("Loading model '%s'\n", opts.Model) + opts.Think, err = inferThinkingOption(nil, &opts, thinkExplicitlySet) + if err != nil { + return err + } if err := loadOrUnloadModel(cmd, &opts); err != nil { if strings.Contains(err.Error(), "not found") { fmt.Printf("error: %v\n", err) continue } + if strings.Contains(err.Error(), "does not support thinking") { + fmt.Printf("error: %v\n", err) + continue + } return err } continue @@ -260,6 +271,22 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error { return err } fmt.Println("Set 'quiet' 
mode.") + case "think": + think := true + opts.Think = &think + thinkExplicitlySet = true + if client, err := api.ClientFromEnvironment(); err == nil { + ensureThinkingSupport(cmd.Context(), client, opts.Model) + } + fmt.Println("Set 'think' mode.") + case "nothink": + think := false + opts.Think = &think + thinkExplicitlySet = true + if client, err := api.ClientFromEnvironment(); err == nil { + ensureThinkingSupport(cmd.Context(), client, opts.Model) + } + fmt.Println("Set 'nothink' mode.") case "format": if len(args) < 3 || args[2] != "json" { fmt.Println("Invalid or missing format. For 'json' mode use '/set format json'") @@ -448,6 +475,11 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error { assistant, err := chat(cmd, opts) if err != nil { + if strings.Contains(err.Error(), "does not support thinking") { + fmt.Printf("error: %v\n", err) + sb.Reset() + continue + } return err } if assistant != nil { diff --git a/cmd/warn_thinking_test.go b/cmd/warn_thinking_test.go new file mode 100644 index 00000000..31dc4156 --- /dev/null +++ b/cmd/warn_thinking_test.go @@ -0,0 +1,63 @@ +package cmd + +import ( + "encoding/json" + "io" + "net/http" + "net/http/httptest" + "os" + "strings" + "testing" + + "github.com/ollama/ollama/api" + "github.com/ollama/ollama/types/model" +) + +// Test that a warning is printed when thinking is requested but not supported. +func TestWarnMissingThinking(t *testing.T) { + cases := []struct { + capabilities []model.Capability + expectWarn bool + }{ + {capabilities: []model.Capability{model.CapabilityThinking}, expectWarn: false}, + {capabilities: []model.Capability{}, expectWarn: true}, + } + + for _, tc := range cases { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/api/show" || r.Method != http.MethodPost { + t.Fatalf("unexpected request to %s %s", r.URL.Path, r.Method) + } + var req api.ShowRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + t.Fatalf("decode request: %v", err) + } + resp := api.ShowResponse{Capabilities: tc.capabilities} + if err := json.NewEncoder(w).Encode(resp); err != nil { + t.Fatalf("encode response: %v", err) + } + })) + defer srv.Close() + + t.Setenv("OLLAMA_HOST", srv.URL) + client, err := api.ClientFromEnvironment() + if err != nil { + t.Fatal(err) + } + oldStderr := os.Stderr + r, w, _ := os.Pipe() + os.Stderr = w + ensureThinkingSupport(t.Context(), client, "m") + w.Close() + os.Stderr = oldStderr + out, _ := io.ReadAll(r) + + warned := strings.Contains(string(out), "warning:") + if tc.expectWarn && !warned { + t.Errorf("expected warning, got none") + } + if !tc.expectWarn && warned { + t.Errorf("did not expect warning, got: %s", string(out)) + } + } +} diff --git a/docs/api.md b/docs/api.md index abd27615..11eaf73a 100644 --- a/docs/api.md +++ b/docs/api.md @@ -43,6 +43,7 @@ Generate a response for a given prompt with a provided model. This is a streamin - `prompt`: the prompt to generate a response for - `suffix`: the text after the model response - `images`: (optional) a list of base64-encoded images (for multimodal models such as `llava`) +- `think`: (for thinking models) should the model think before responding? Advanced parameters (optional): @@ -490,11 +491,13 @@ Generate the next message in a chat with a provided model. 
This is a streaming e - `model`: (required) the [model name](#model-names) - `messages`: the messages of the chat, this can be used to keep a chat memory - `tools`: list of tools in JSON for the model to use if supported +- `think`: (for thinking models) should the model think before responding? The `message` object has the following fields: - `role`: the role of the message, either `system`, `user`, `assistant`, or `tool` - `content`: the content of the message +- `thinking`: (for thinking models) the model's thinking process - `images` (optional): a list of images to include in the message (for multimodal models such as `llava`) - `tool_calls` (optional): a list of tools in JSON that the model wants to use diff --git a/model/bytepairencoding.go b/model/bytepairencoding.go index 6bb9a003..246d2ba3 100644 --- a/model/bytepairencoding.go +++ b/model/bytepairencoding.go @@ -3,6 +3,7 @@ package model import ( "cmp" "context" + "fmt" "iter" "log/slog" "strings" @@ -210,6 +211,14 @@ func (bpe BytePairEncoding) Encode(s string, addSpecial bool) ([]int32, error) { return ids, nil } +type lazyIdsString struct { + ids []int32 +} + +func (l lazyIdsString) LogValue() slog.Value { + return slog.AnyValue(fmt.Sprint(l.ids)) +} + func (bpe BytePairEncoding) Decode(ids []int32) (string, error) { var sb strings.Builder for _, id := range ids { @@ -234,6 +243,6 @@ func (bpe BytePairEncoding) Decode(ids []int32) (string, error) { } } - slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "ids", ids, "string", sb.String()) + slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "string", sb.String(), "from", lazyIdsString{ids: ids}) return sb.String(), nil } diff --git a/readline/types.go b/readline/types.go index e136d996..f4efa8d9 100644 --- a/readline/types.go +++ b/readline/types.go @@ -61,6 +61,8 @@ const ( ColorGrey = Esc + "[38;5;245m" ColorDefault = Esc + "[0m" + ColorBold = Esc + "[1m" + StartBracketedPaste = Esc + "[?2004h" EndBracketedPaste = Esc + "[?2004l" ) diff --git a/server/images.go b/server/images.go index a69e2a9f..58fb87dc 100644 --- a/server/images.go +++ b/server/images.go @@ -37,6 +37,7 @@ var ( errCapabilityInsert = errors.New("insert") errCapabilityVision = errors.New("vision") errCapabilityEmbedding = errors.New("embedding") + errCapabilityThinking = errors.New("thinking") errInsecureProtocol = errors.New("insecure protocol http") ) @@ -111,6 +112,12 @@ func (m *Model) Capabilities() []model.Capability { capabilities = append(capabilities, model.CapabilityVision) } + // Check for thinking capability + openingTag, closingTag := inferThinkingTags(m.Template.Template) + if openingTag != "" && closingTag != "" { + capabilities = append(capabilities, model.CapabilityThinking) + } + return capabilities } @@ -127,6 +134,7 @@ func (m *Model) CheckCapabilities(want ...model.Capability) error { model.CapabilityInsert: errCapabilityInsert, model.CapabilityVision: errCapabilityVision, model.CapabilityEmbedding: errCapabilityEmbedding, + model.CapabilityThinking: errCapabilityThinking, } for _, cap := range want { @@ -141,11 +149,19 @@ func (m *Model) CheckCapabilities(want ...model.Capability) error { } } + var err error if len(errs) > 0 { - return fmt.Errorf("%w %w", errCapabilities, errors.Join(errs...)) + err = fmt.Errorf("%w %w", errCapabilities, errors.Join(errs...)) } - return nil + if slices.Contains(errs, errCapabilityThinking) { + if m.Config.ModelFamily == "qwen3" || model.ParseName(m.Name).Model == "deepseek-r1" { + // append a message to the existing error + return 
fmt.Errorf("%w. Pull the model again to get the latest version with full thinking support", err) + } + } + + return err } func (m *Model) String() string { diff --git a/server/prompt.go b/server/prompt.go index 147a02b6..f8c895d7 100644 --- a/server/prompt.go +++ b/server/prompt.go @@ -19,7 +19,7 @@ type tokenizeFunc func(context.Context, string) ([]int, error) // chatPrompt accepts a list of messages and returns the prompt and images that should be used for the next chat turn. // chatPrompt truncates any messages that exceed the context window of the model, making sure to always include 1) the // latest message and 2) system messages -func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.Options, msgs []api.Message, tools []api.Tool) (prompt string, images []llm.ImageData, _ error) { +func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.Options, msgs []api.Message, tools []api.Tool, think *bool) (prompt string, images []llm.ImageData, _ error) { var system []api.Message // TODO: Ideally we would compute this from the projector metadata but some pieces are implementation dependent @@ -41,8 +41,12 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api. } } + thinkVal := false + if think != nil { + thinkVal = *think + } var b bytes.Buffer - if err := m.Template.Execute(&b, template.Values{Messages: append(system, msgs[i:]...), Tools: tools}); err != nil { + if err := m.Template.Execute(&b, template.Values{Messages: append(system, msgs[i:]...), Tools: tools, Think: thinkVal, IsThinkSet: think != nil}); err != nil { return "", nil, err } @@ -96,7 +100,11 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api. // truncate any messages that do not fit into the context window var b bytes.Buffer - if err := m.Template.Execute(&b, template.Values{Messages: append(system, msgs[currMsgIdx:]...), Tools: tools}); err != nil { + thinkVal := false + if think != nil { + thinkVal = *think + } + if err := m.Template.Execute(&b, template.Values{Messages: append(system, msgs[currMsgIdx:]...), Tools: tools, Think: thinkVal, IsThinkSet: think != nil}); err != nil { return "", nil, err } diff --git a/server/prompt_test.go b/server/prompt_test.go index fb6c96c0..0043b9a4 100644 --- a/server/prompt_test.go +++ b/server/prompt_test.go @@ -208,7 +208,8 @@ func TestChatPrompt(t *testing.T) { t.Run(tt.name, func(t *testing.T) { model := tt.model opts := api.Options{Runner: api.Runner{NumCtx: tt.limit}} - prompt, images, err := chatPrompt(t.Context(), &model, mockRunner{}.Tokenize, &opts, tt.msgs, nil) + think := false + prompt, images, err := chatPrompt(t.Context(), &model, mockRunner{}.Tokenize, &opts, tt.msgs, nil, &think) if tt.error == nil && err != nil { t.Fatal(err) } else if tt.error != nil && err != tt.error { diff --git a/server/routes.go b/server/routes.go index 42e8cdd1..236f92e2 100644 --- a/server/routes.go +++ b/server/routes.go @@ -17,7 +17,6 @@ import ( "net/netip" "os" "os/signal" - "regexp" "slices" "strings" "syscall" @@ -186,6 +185,13 @@ func (s *Server) GenerateHandler(c *gin.Context) { if req.Suffix != "" { caps = append(caps, model.CapabilityInsert) } + if req.Think != nil && *req.Think { + caps = append(caps, model.CapabilityThinking) + // TODO(drifkin): consider adding a warning if it's false and the model + // doesn't support thinking. 
It's not strictly required, but it can be a + // hint that the user is on an older qwen3/r1 model that doesn't have an + // updated template supporting thinking + } r, m, opts, err := s.scheduleRunner(c.Request.Context(), name.String(), caps, req.Options, req.KeepAlive) if errors.Is(err, errCapabilityCompletion) { @@ -254,6 +260,9 @@ func (s *Server) GenerateHandler(c *gin.Context) { values.Messages = append(msgs, api.Message{Role: "user", Content: req.Prompt}) } + values.Think = req.Think != nil && *req.Think + values.IsThinkSet = req.Think != nil + var b bytes.Buffer if req.Context != nil { slog.Warn("the context field is deprecated and will be removed in a future version of Ollama") @@ -273,6 +282,15 @@ func (s *Server) GenerateHandler(c *gin.Context) { prompt = b.String() } + var thinkingState *thinkingParser + openingTag, closingTag := inferThinkingTags(m.Template.Template) + if req.Think != nil && *req.Think && openingTag != "" && closingTag != "" { + thinkingState = &thinkingParser{ + openingTag: openingTag, + closingTag: closingTag, + } + } + ch := make(chan any) go func() { // TODO (jmorganca): avoid building the response twice both here and below @@ -297,6 +315,12 @@ func (s *Server) GenerateHandler(c *gin.Context) { }, } + if thinkingState != nil { + thinking, content := thinkingState.addContent(cr.Content) + res.Thinking = thinking + res.Response = content + } + if _, err := sb.WriteString(cr.Content); err != nil { ch <- gin.H{"error": err.Error()} } @@ -324,11 +348,13 @@ func (s *Server) GenerateHandler(c *gin.Context) { if req.Stream != nil && !*req.Stream { var r api.GenerateResponse - var sb strings.Builder + var sbThinking strings.Builder + var sbContent strings.Builder for rr := range ch { switch t := rr.(type) { case api.GenerateResponse: - sb.WriteString(t.Response) + sbThinking.WriteString(t.Thinking) + sbContent.WriteString(t.Response) r = t case gin.H: msg, ok := t["error"].(string) @@ -344,7 +370,9 @@ func (s *Server) GenerateHandler(c *gin.Context) { } } - r.Response = sb.String() + r.Thinking = sbThinking.String() + r.Response = sbContent.String() + c.JSON(http.StatusOK, r) return } @@ -1436,6 +1464,9 @@ func (s *Server) ChatHandler(c *gin.Context) { if len(req.Tools) > 0 { caps = append(caps, model.CapabilityTools) } + if req.Think != nil && *req.Think { + caps = append(caps, model.CapabilityThinking) + } name := model.ParseName(req.Model) if !name.IsValid() { @@ -1476,13 +1507,22 @@ func (s *Server) ChatHandler(c *gin.Context) { } msgs = filterThinkTags(msgs, m) - prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, msgs, req.Tools) + prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, msgs, req.Tools, req.Think) if err != nil { slog.Error("chat prompt error", "error", err) c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return } + var thinkingState *thinkingParser + openingTag, closingTag := inferThinkingTags(m.Template.Template) + if req.Think != nil && *req.Think && openingTag != "" && closingTag != "" { + thinkingState = &thinkingParser{ + openingTag: openingTag, + closingTag: closingTag, + } + } + var toolParser *tools.Parser if len(req.Tools) > 0 { toolParser, err = tools.NewParser(m.Template.Template) @@ -1516,6 +1556,16 @@ func (s *Server) ChatHandler(c *gin.Context) { }, } + if thinkingState != nil { + thinkingContent, remainingContent := thinkingState.addContent(res.Message.Content) + if thinkingContent == "" && remainingContent == "" && !r.Done { + // need to accumulate more to decide 
what to send + return + } + res.Message.Content = remainingContent + res.Message.Thinking = thinkingContent + } + if r.Done { res.DoneReason = r.DoneReason.String() res.TotalDuration = time.Since(checkpointStart) @@ -1523,12 +1573,14 @@ func (s *Server) ChatHandler(c *gin.Context) { } if len(req.Tools) > 0 { - toolCalls, content := toolParser.Add(r.Content) + toolCalls, content := toolParser.Add(res.Message.Content) if len(content) > 0 { res.Message.Content = content } else if len(toolCalls) > 0 { res.Message.ToolCalls = toolCalls res.Message.Content = "" + } else if res.Message.Thinking != "" { + // don't return } else { if r.Done { ch <- res @@ -1536,6 +1588,7 @@ func (s *Server) ChatHandler(c *gin.Context) { return } } + ch <- res }); err != nil { ch <- gin.H{"error": err.Error()} @@ -1544,12 +1597,14 @@ func (s *Server) ChatHandler(c *gin.Context) { if req.Stream != nil && !*req.Stream { var resp api.ChatResponse - var sb strings.Builder var toolCalls []api.ToolCall + var sbThinking strings.Builder + var sbContent strings.Builder for rr := range ch { switch t := rr.(type) { case api.ChatResponse: - sb.WriteString(t.Message.Content) + sbThinking.WriteString(t.Message.Thinking) + sbContent.WriteString(t.Message.Content) resp = t if len(req.Tools) > 0 { toolCalls = append(toolCalls, t.Message.ToolCalls...) @@ -1568,7 +1623,9 @@ func (s *Server) ChatHandler(c *gin.Context) { } } - resp.Message.Content = sb.String() + resp.Message.Content = sbContent.String() + resp.Message.Thinking = sbThinking.String() + if len(toolCalls) > 0 { resp.Message.ToolCalls = toolCalls } @@ -1595,8 +1652,6 @@ func handleScheduleError(c *gin.Context, name string, err error) { } } -var thinkTagRegexp = regexp.MustCompile(`(?s).*?(\n)*`) - func filterThinkTags(msgs []api.Message, m *Model) []api.Message { if m.Config.ModelFamily == "qwen3" || model.ParseName(m.Name).Model == "deepseek-r1" { finalUserIndex := -1 @@ -1608,7 +1663,17 @@ func filterThinkTags(msgs []api.Message, m *Model) []api.Message { for i, msg := range msgs { if msg.Role == "assistant" && i < finalUserIndex { - msgs[i].Content = thinkTagRegexp.ReplaceAllString(msg.Content, "") + // TODO(drifkin): this is from before we added proper thinking support. + // However, even if thinking is not enabled (and therefore we shouldn't + // change the user output), we should probably perform this filtering + // for all thinking models (not just qwen3 & deepseek-r1) since it tends + // to save tokens and improve quality. 
+ thinkingState := &thinkingParser{ + openingTag: "", + closingTag: "", + } + _, content := thinkingState.addContent(msg.Content) + msgs[i].Content = content } } } diff --git a/server/routes_generate_test.go b/server/routes_generate_test.go index 6bbf5b11..75a246fc 100644 --- a/server/routes_generate_test.go +++ b/server/routes_generate_test.go @@ -143,6 +143,25 @@ func TestGenerateChat(t *testing.T) { } }) + t.Run("missing thinking capability", func(t *testing.T) { + think := true + w := createRequest(t, s.ChatHandler, api.ChatRequest{ + Model: "test", + Messages: []api.Message{ + {Role: "user", Content: "Hello!"}, + }, + Think: &think, + }) + + if w.Code != http.StatusBadRequest { + t.Errorf("expected status 400, got %d", w.Code) + } + + if diff := cmp.Diff(w.Body.String(), `{"error":"registry.ollama.ai/library/test:latest does not support thinking"}`); diff != "" { + t.Errorf("mismatch (-got +want):\n%s", diff) + } + }) + t.Run("missing model", func(t *testing.T) { w := createRequest(t, s.ChatHandler, api.ChatRequest{}) if w.Code != http.StatusBadRequest { diff --git a/server/thinking.go b/server/thinking.go new file mode 100644 index 00000000..2213b6b6 --- /dev/null +++ b/server/thinking.go @@ -0,0 +1,300 @@ +package server + +import ( + "strings" + "text/template" + "text/template/parse" + "unicode" +) + +type thinkingState int + +const ( + // We're looking for the opening tag, but we haven't seen any non-whitespace + // characters yet + thinkingState_LookingForOpening thinkingState = iota + // We've seen the opening tag, but we haven't seen any non-whitespace + // characters yet (we want to eat any whitespace between the opening tag and + // the thinking content) + thinkingState_ThinkingStartedEatingWhitespace + // We've seen non-whitespace characters after the opening tag, but we haven't + // seen the closing tag yet + thinkingState_Thinking + // We've seen the closing tag, but we haven't seen any non-whitespace + // characters after the closing tag yet (we want to eat any whitespace between + // the closing tag and the content) + thinkingState_ThinkingDoneEatingWhitespace + // We've seen the closing tag and seen at least one non-whitespace character + // after it + thinkingState_ThinkingDone +) + +func (s thinkingState) String() string { + switch s { + case thinkingState_LookingForOpening: + return "LookingForOpening" + case thinkingState_ThinkingStartedEatingWhitespace: + return "ThinkingStartedEatingWhitespace" + case thinkingState_Thinking: + return "Thinking" + case thinkingState_ThinkingDoneEatingWhitespace: + return "ThinkingDoneEatingWhitespace" + case thinkingState_ThinkingDone: + return "ThinkingDone" + default: + return "Unknown" + } +} + +type thinkingParser struct { + state thinkingState + openingTag string + closingTag string + acc strings.Builder +} + +// addContent returns the thinking content and the non-thinking content that +// should be immediately sent to the user. 
It will internally buffer if it needs +// to see more raw content to disambiguate +func (s *thinkingParser) addContent(content string) (string, string) { + s.acc.WriteString(content) + + var thinkingSb, remainingSb strings.Builder + + var thinking, remaining string + keepLooping := true + // we loop because we might pass through multiple parsing states in a single + // call to addContent, and we want to make sure callers don't have to wait for + // data that's already unambiguous + for keepLooping { + thinking, remaining, keepLooping = eat(s) + thinkingSb.WriteString(thinking) + remainingSb.WriteString(remaining) + } + + return thinkingSb.String(), remainingSb.String() +} + +// the additional bool return is true iff we should continue eating +func eat(s *thinkingParser) (string, string, bool) { + switch s.state { + case thinkingState_LookingForOpening: + trimmed := strings.TrimLeftFunc(s.acc.String(), unicode.IsSpace) + if strings.HasPrefix(trimmed, s.openingTag) { + after := strings.Join(strings.Split(trimmed, s.openingTag)[1:], s.openingTag) + after = strings.TrimLeftFunc(after, unicode.IsSpace) + // after might contain more than just thinking tokens, so we continue + // parsing instead of returning it as thinking tokens here + s.acc.Reset() + s.acc.WriteString(after) + if after == "" { + s.state = thinkingState_ThinkingStartedEatingWhitespace + } else { + s.state = thinkingState_Thinking + } + return "", "", true + } else if strings.HasPrefix(s.openingTag, trimmed) { + // partial opening seen, so let's keep accumulating + return "", "", false + } else if trimmed == "" { + // saw whitespace only, so let's keep accumulating + return "", "", false + } else { + // didn't see an opening tag, but we have content, so thinking was skipped + s.state = thinkingState_ThinkingDone + // note that we use the original content, not the trimmed one because we + // don't want to eat any whitespace in the real content if there were no + // thinking tags + return "", s.acc.String(), false + } + case thinkingState_ThinkingStartedEatingWhitespace: + trimmed := strings.TrimLeftFunc(s.acc.String(), unicode.IsSpace) + s.acc.Reset() + if trimmed == "" { + return "", "", false + } else { + s.state = thinkingState_Thinking + s.acc.WriteString(trimmed) + return "", "", true + } + case thinkingState_Thinking: + acc := s.acc.String() + if strings.Contains(acc, s.closingTag) { + split := strings.Split(acc, s.closingTag) + thinking := split[0] + remaining := strings.Join(split[1:], s.closingTag) + remaining = strings.TrimLeftFunc(remaining, unicode.IsSpace) + s.acc.Reset() + if remaining == "" { + s.state = thinkingState_ThinkingDoneEatingWhitespace + } else { + s.state = thinkingState_ThinkingDone + } + return thinking, remaining, false + } else if overlapLen := overlap(acc, s.closingTag); overlapLen > 0 { + thinking := acc[:len(acc)-overlapLen] + remaining := acc[len(acc)-overlapLen:] + s.acc.Reset() + // keep track of the candidate closing tag. 
We have to buffer it until it + // becomes disambiguated + s.acc.WriteString(remaining) + return thinking, "", false + } else { + // purely just thinking tokens, so we can return them + s.acc.Reset() + return acc, "", false + } + case thinkingState_ThinkingDoneEatingWhitespace: + trimmed := strings.TrimLeftFunc(s.acc.String(), unicode.IsSpace) + s.acc.Reset() + // if we see non-whitespace, we're done eating the leading whitespace of the content + if trimmed != "" { + s.state = thinkingState_ThinkingDone + } + return "", trimmed, false + case thinkingState_ThinkingDone: + acc := s.acc.String() + s.acc.Reset() + return "", acc, false + default: + panic("unknown state") + } +} + +// longest overlap between suffix of s and prefix of delim +func overlap(s, delim string) int { + max := min(len(delim), len(s)) + for i := max; i > 0; i-- { + if strings.HasSuffix(s, delim[:i]) { + return i + } + } + return 0 +} + +func templateVisit(n parse.Node, enterFn func(parse.Node) bool, exitFn func(parse.Node)) { + if n == nil { + return + } + shouldContinue := enterFn(n) + if !shouldContinue { + return + } + switch x := n.(type) { + case *parse.ListNode: + for _, c := range x.Nodes { + templateVisit(c, enterFn, exitFn) + } + case *parse.BranchNode: + if x.Pipe != nil { + templateVisit(x.Pipe, enterFn, exitFn) + } + if x.List != nil { + templateVisit(x.List, enterFn, exitFn) + } + if x.ElseList != nil { + templateVisit(x.ElseList, enterFn, exitFn) + } + case *parse.ActionNode: + templateVisit(x.Pipe, enterFn, exitFn) + case *parse.WithNode: + templateVisit(&x.BranchNode, enterFn, exitFn) + case *parse.RangeNode: + templateVisit(&x.BranchNode, enterFn, exitFn) + case *parse.IfNode: + templateVisit(&x.BranchNode, enterFn, exitFn) + case *parse.TemplateNode: + templateVisit(x.Pipe, enterFn, exitFn) + case *parse.PipeNode: + for _, c := range x.Cmds { + templateVisit(c, enterFn, exitFn) + } + case *parse.CommandNode: + for _, a := range x.Args { + templateVisit(a, enterFn, exitFn) + } + // text, field, number, etc. are leaves – nothing to recurse into + } + if exitFn != nil { + exitFn(n) + } +} + +// We use a heuristic to infer the tags that surround thinking traces: +// We look for a range node that iterates over "Messages" and then look for a +// reference to "Thinking" like `{{.Thinking}}`. We then go up to the nearest +// ListNode and take the first and last TextNodes as the opening and closing +// tags. +func inferThinkingTags(t *template.Template) (string, string) { + ancestors := []parse.Node{} + + openingTag := "" + closingTag := "" + + enterFn := func(n parse.Node) bool { + ancestors = append(ancestors, n) + + switch x := n.(type) { + case *parse.FieldNode: + if len(x.Ident) > 0 && x.Ident[0] == "Thinking" { + var mostRecentRange *parse.RangeNode + for i := len(ancestors) - 1; i >= 0; i-- { + if r, ok := ancestors[i].(*parse.RangeNode); ok { + mostRecentRange = r + break + } + } + if mostRecentRange == nil || !rangeUsesField(mostRecentRange, "Messages") { + return true + } + + // TODO(drifkin): to be more robust, check that it's in the action + // part, not the `if`'s pipeline part. 
We do match on the nearest list + // that starts and ends with text nodes, which makes this not strictly + // necessary for our heuristic + + // go up to the nearest ancestor that is a *parse.ListNode + for i := len(ancestors) - 1; i >= 0; i-- { + if l, ok := ancestors[i].(*parse.ListNode); ok { + firstNode := l.Nodes[0] + if t, ok := firstNode.(*parse.TextNode); ok { + openingTag = strings.TrimSpace(t.String()) + } + lastNode := l.Nodes[len(l.Nodes)-1] + if t, ok := lastNode.(*parse.TextNode); ok { + closingTag = strings.TrimSpace(t.String()) + } + + break + } + } + } + } + + return true + } + + exitFn := func(n parse.Node) { + ancestors = ancestors[:len(ancestors)-1] + } + + templateVisit(t.Root, enterFn, exitFn) + + return openingTag, closingTag +} + +// checks to see if the given field name is present in the pipeline of the given range node +func rangeUsesField(rangeNode *parse.RangeNode, field string) bool { + found := false + enterFn := func(n parse.Node) bool { + switch x := n.(type) { + case *parse.FieldNode: + if x.Ident[0] == field { + found = true + } + } + return true + } + templateVisit(rangeNode.BranchNode.Pipe, enterFn, nil) + return found +} diff --git a/server/thinking_test.go b/server/thinking_test.go new file mode 100644 index 00000000..a2055635 --- /dev/null +++ b/server/thinking_test.go @@ -0,0 +1,403 @@ +package server + +import ( + "testing" + "text/template" +) + +func TestExtractThinking(t *testing.T) { + tests := []struct { + in, wantContent, wantThink string + }{ + { + in: " internal world", + wantThink: "internal ", + wantContent: "world", + }, + { + in: "abc", + wantThink: "a", + wantContent: "bc", + }, + { + in: "no think", + wantThink: "", + wantContent: "no think", + }, + } + for i, tt := range tests { + parser := thinkingParser{ + openingTag: "", + closingTag: "", + } + gotThinking, gotContent := parser.addContent(tt.in) + if gotContent != tt.wantContent || gotThinking != tt.wantThink { + t.Errorf("case %d: got (%q,%q), want (%q,%q)", i, gotThinking, gotContent, tt.wantThink, tt.wantContent) + } + } +} + +func TestThinkingStreaming(t *testing.T) { + type step struct { + input string + wantThinking string + wantContent string + wantStateAfter thinkingState + } + + cases := []struct { + desc string + skip bool + steps []step + }{ + { + desc: "content without a thinking tag", + steps: []step{ + { + input: " abc", + wantThinking: "", + wantContent: " abc", + wantStateAfter: thinkingState_ThinkingDone, + }, + }, + }, + { + desc: "content before a thinking tag nerfs the thinking tag", + steps: []step{ + { + input: " abc def ghi", + wantThinking: "", + wantContent: " abc def ghi", + wantStateAfter: thinkingState_ThinkingDone, + }, + }, + }, + { + desc: "building up a thinking tag partially", + steps: []step{ + { + input: " a", + wantThinking: "a", + wantContent: "", + wantStateAfter: thinkingState_Thinking, + }, + }, + }, + { + desc: "partial closing tag", + steps: []step{ + { + input: "abcdef", + wantThinking: "", + wantContent: "def", + wantStateAfter: thinkingState_ThinkingDone, + }, + }, + }, + { + desc: "partial closing tag fakeout", + steps: []step{ + { + input: "abcdef", + wantThinking: "def", + wantContent: "", + wantStateAfter: thinkingState_Thinking, + }, + { + input: "ghijkl", + wantThinking: "", + wantContent: "jkl", + wantStateAfter: thinkingState_ThinkingDone, + }, + }, + }, + { + desc: "whitespace after thinking tag", + steps: []step{ + { + input: " abc\n\ndef", + wantThinking: "abc", + wantContent: "def", + wantStateAfter: 
thinkingState_ThinkingDone, + }, + }, + }, + { + desc: "whitespace after thinking tag (incremental)", + steps: []step{ + { + input: " abc", + wantThinking: "abc", + wantContent: "", + wantStateAfter: thinkingState_ThinkingDoneEatingWhitespace, + }, + { + input: "\n\ndef", + wantThinking: "", + wantContent: "def", + wantStateAfter: thinkingState_ThinkingDone, + }, + }, + }, + { + desc: "whitespace after thinking tag with content and more whitespace", + steps: []step{ + { + input: " abc\n\ndef ", + wantThinking: "abc", + wantContent: "def ", + wantStateAfter: thinkingState_ThinkingDone, + }, + { + input: " ghi", + wantThinking: "", + wantContent: " ghi", + wantStateAfter: thinkingState_ThinkingDone, + }, + }, + }, + { + desc: "token by token", + steps: []step{ + { + input: "", + wantThinking: "", + wantContent: "", + wantStateAfter: thinkingState_ThinkingStartedEatingWhitespace, + }, + { + input: "\n", + wantThinking: "", + wantContent: "", + wantStateAfter: thinkingState_ThinkingStartedEatingWhitespace, + }, + { + input: "", + wantThinking: "", + wantContent: "", + wantStateAfter: thinkingState_ThinkingDoneEatingWhitespace, + }, + { + input: "\n\n", + wantThinking: "", + wantContent: "", + wantStateAfter: thinkingState_ThinkingDoneEatingWhitespace, + }, + { + input: "Hi", + wantThinking: "", + wantContent: "Hi", + wantStateAfter: thinkingState_ThinkingDone, + }, + { + input: " there", + wantThinking: "", + wantContent: " there", + wantStateAfter: thinkingState_ThinkingDone, + }, + }, + }, + { + desc: "leading thinking whitespace", + steps: []step{ + { + input: " \t ", + wantThinking: "", + wantContent: "", + wantStateAfter: thinkingState_ThinkingStartedEatingWhitespace, + }, + { + input: " these are some ", + wantThinking: "these are some ", + wantContent: "", + wantStateAfter: thinkingState_Thinking, + }, + { + input: "thoughts ", + wantThinking: "thoughts ", + wantContent: "", + wantStateAfter: thinkingState_ThinkingDoneEatingWhitespace, + }, + { + input: " more content", + wantThinking: "", + wantContent: "more content", + wantStateAfter: thinkingState_ThinkingDone, + }, + }, + }, + } + + for _, c := range cases { + parser := thinkingParser{ + openingTag: "", + closingTag: "", + } + if c.skip { + continue + } + for i, step := range c.steps { + thinking, content := parser.addContent(step.input) + if content != step.wantContent || thinking != step.wantThinking { + t.Errorf("case %q (step %d): got (%q,%q), want (%q,%q)", c.desc, i, content, thinking, step.wantContent, step.wantThinking) + } + if parser.state != step.wantStateAfter { + t.Errorf("case %q (step %d): got state %s, want %s", c.desc, i, parser.state, step.wantStateAfter) + } + } + } +} + +func TestInferThinkingTags(t *testing.T) { + cases := []struct { + desc string + tmplString string + wantOpeningTag string + wantClosingTag string + }{ + { + desc: "basic", + tmplString: ` + {{ if .Thinking}} + /think + {{ end }} + {{- range $i, $_ := .Messages }} + {{- $last := eq (len (slice $.Messages $i)) 1 -}} + {{ if and $last .Thinking }} + {{ .Thinking }} + {{ end }} + {{ end }} + `, + wantOpeningTag: "", + wantClosingTag: "", + }, + { + desc: "doubly nested range", + tmplString: ` + {{ if .Thinking}} + /think + {{ end }} + {{- range $i, $_ := .Messages }} + {{- range $j, $_ := .NotMessages }} + {{- $last := eq (len (slice $.Messages $i)) 1 -}} + {{ if and $last .Thinking }} + {{ .Thinking }} + {{ end }} + {{ end }} + {{ end }} + `, + wantOpeningTag: "", + wantClosingTag: "", + }, + { + desc: "whitespace is trimmed", + tmplString: ` + {{ 
if .Thinking}} + /think + {{ end }} + {{- range $i, $_ := .Messages }} + {{- $last := eq (len (slice $.Messages $i)) 1 -}} + {{ if and $last .Thinking }} + Some text before {{ .Thinking }} Some text after + {{ end }} + {{ end }} + `, + wantOpeningTag: "Some text before", + wantClosingTag: "Some text after", + }, + { + desc: "qwen3", + tmplString: ` +{{- if or .System .Tools .Thinking }}<|im_start|>system +{{- if .System }} +{{ .System }} +{{- end }} +{{- if .Tools }} + +# Tools + +You may call one or more functions to assist with the user query. + +You are provided with function signatures within XML tags: + +{{- range .Tools }} +{"type": "function", "function": {{ .Function }}} +{{- end }} + + +For each function call, return a json object with function name and arguments within XML tags: + +{"name": , "arguments": } + +{{- end }} +{{- if .Thinking }} +/think +{{- else }} +/no_think +{{- end }}<|im_end|> +{{ end }} +{{- range $i, $_ := .Messages }} +{{- $last := eq (len (slice $.Messages $i)) 1 -}} +{{- if eq .Role "user" }}<|im_start|>user +{{ .Content }}<|im_end|> +{{ else if eq .Role "assistant" }}<|im_start|>assistant +{{ if and $last .Thinking }} +{{ .Thinking }} +{{ end }} +{{ if .Content }}{{ .Content }} +{{- else if .ToolCalls }} +{{ range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}} +{{ end }} +{{- end }}{{ if not $last }}<|im_end|> +{{ end }} +{{- else if eq .Role "tool" }}<|im_start|>user + +{{ .Content }} +<|im_end|> +{{ end }} +{{- if and (ne .Role "assistant") $last }}<|im_start|>assistant +{{ end }} +{{- end }} + `, + wantOpeningTag: "", + wantClosingTag: "", + }, + } + for _, c := range cases { + tmpl := template.Must(template.New("test").Parse(c.tmplString)) + openingTag, closingTag := inferThinkingTags(tmpl) + if openingTag != c.wantOpeningTag || closingTag != c.wantClosingTag { + t.Errorf("case %q: got (%q,%q), want (%q,%q)", c.desc, openingTag, closingTag, c.wantOpeningTag, c.wantClosingTag) + } + } +} diff --git a/template/template.go b/template/template.go index 5c886cac..da910afb 100644 --- a/template/template.go +++ b/template/template.go @@ -167,6 +167,10 @@ type Values struct { api.Tools Prompt string Suffix string + Think bool + // whether or not the user explicitly set the thinking flag (vs. it being + // implicitly false). 
Templates can't see whether `Think` is nil + IsThinkSet bool // forceLegacy is a flag used to test compatibility with legacy templates forceLegacy bool @@ -222,16 +226,20 @@ func (t *Template) Execute(w io.Writer, v Values) error { system, messages := collate(v.Messages) if v.Prompt != "" && v.Suffix != "" { return t.Template.Execute(w, map[string]any{ - "Prompt": v.Prompt, - "Suffix": v.Suffix, - "Response": "", + "Prompt": v.Prompt, + "Suffix": v.Suffix, + "Response": "", + "Think": v.Think, + "IsThinkSet": v.IsThinkSet, }) } else if !v.forceLegacy && slices.Contains(t.Vars(), "messages") { return t.Template.Execute(w, map[string]any{ - "System": system, - "Messages": messages, - "Tools": v.Tools, - "Response": "", + "System": system, + "Messages": messages, + "Tools": v.Tools, + "Response": "", + "Think": v.Think, + "IsThinkSet": v.IsThinkSet, }) } @@ -241,9 +249,11 @@ func (t *Template) Execute(w io.Writer, v Values) error { for _, m := range messages { execute := func() error { if err := t.Template.Execute(&b, map[string]any{ - "System": system, - "Prompt": prompt, - "Response": response, + "System": system, + "Prompt": prompt, + "Response": response, + "Think": v.Think, + "IsThinkSet": v.IsThinkSet, }); err != nil { return err } @@ -286,9 +296,11 @@ func (t *Template) Execute(w io.Writer, v Values) error { tree := parse.Tree{Root: nodes.(*parse.ListNode)} if err := template.Must(template.New("").AddParseTree("", &tree)).Execute(&b, map[string]any{ - "System": system, - "Prompt": prompt, - "Response": response, + "System": system, + "Prompt": prompt, + "Response": response, + "Think": v.Think, + "IsThinkSet": v.IsThinkSet, }); err != nil { return err } diff --git a/types/model/capability.go b/types/model/capability.go index fb868940..cde23cee 100644 --- a/types/model/capability.go +++ b/types/model/capability.go @@ -8,6 +8,7 @@ const ( CapabilityInsert = Capability("insert") CapabilityVision = Capability("vision") CapabilityEmbedding = Capability("embedding") + CapabilityThinking = Capability("thinking") ) func (c Capability) String() string { From f15ffc432061e3d96b3412219a3a0f673b579a12 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Tue, 13 May 2025 17:26:46 -0700 Subject: [PATCH 070/108] llm: Make "POST predict" error message more informative "POST predict" basically means that the runner has crashed, which can have many reasons. However, many people think this is a specific error and either report only this message or group together unrelated bugs. This replaces it with a more friendly and helpful message. --- llm/server.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llm/server.go b/llm/server.go index 4abb569f..373f6fae 100644 --- a/llm/server.go +++ b/llm/server.go @@ -797,7 +797,8 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu res, err := http.DefaultClient.Do(serverReq) if err != nil { - return fmt.Errorf("POST predict: %v", err) + slog.Error("post predict", "error", err) + return errors.New("model runner has unexpectedly stopped, this may be due to resource limitations or an internal error, check ollama server logs for details") } defer res.Body.Close() From aaa7818000c42a82fc030212c35ef83f9799efd7 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Thu, 24 Apr 2025 11:48:49 -0700 Subject: [PATCH 071/108] ggml: Export GPU UUIDs This enables matching up devices and information reported by the backend with system management libraries such as nvml to get accurate free memory reporting. 
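For illustration only, and not part of the patch itself: a minimal sketch of how a consumer of the newly exported UUID could cross-reference a ggml device with NVML to read live free memory. It assumes the `github.com/NVIDIA/go-nvml` bindings; the function and program names here are hypothetical.

```go
// Hypothetical helper, not part of this patch series. It joins the "GPU-..."
// UUID string that the patched CUDA backend now reports (see
// ggml_backend_dev_props.uuid below) with NVML to read current free VRAM.
package main

import (
	"fmt"
	"log"

	"github.com/NVIDIA/go-nvml/pkg/nvml"
)

// freeMemoryByUUID looks up a GPU by its NVML-style UUID and returns the
// number of free bytes of device memory.
func freeMemoryByUUID(uuid string) (uint64, error) {
	if ret := nvml.Init(); ret != nvml.SUCCESS {
		return 0, fmt.Errorf("nvml init failed: %v", ret)
	}
	defer nvml.Shutdown()

	dev, ret := nvml.DeviceGetHandleByUUID(uuid)
	if ret != nvml.SUCCESS {
		return 0, fmt.Errorf("no NVML device with uuid %s: %v", uuid, ret)
	}

	mem, ret := dev.GetMemoryInfo()
	if ret != nvml.SUCCESS {
		return 0, fmt.Errorf("query memory info: %v", ret)
	}
	return mem.Free, nil
}

func main() {
	// In practice the UUID would come from the backend device properties
	// (ml.DeviceMemory.UUID in the diff below); this placeholder is illustrative.
	free, err := freeMemoryByUUID("GPU-00000000-0000-0000-0000-000000000000")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("free VRAM: %d bytes\n", free)
}
```

This join only works because the patch formats the CUDA UUID as `GPU-xxxxxxxx-xxxx-...`, the same convention used by NVML and nvidia-smi, so the string can be compared directly.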
--- .../patches/0017-ggml-Export-GPU-UUIDs.patch | 102 ++++++++++++++++++ ml/backend.go | 8 ++ ml/backend/ggml/ggml.go | 6 ++ ml/backend/ggml/ggml/include/ggml-backend.h | 1 + .../ggml/ggml/src/ggml-cuda/ggml-cuda.cu | 33 ++++++ .../ggml/ggml/src/ggml-metal/ggml-metal.m | 1 + 6 files changed, 151 insertions(+) create mode 100644 llama/patches/0017-ggml-Export-GPU-UUIDs.patch diff --git a/llama/patches/0017-ggml-Export-GPU-UUIDs.patch b/llama/patches/0017-ggml-Export-GPU-UUIDs.patch new file mode 100644 index 00000000..a2539034 --- /dev/null +++ b/llama/patches/0017-ggml-Export-GPU-UUIDs.patch @@ -0,0 +1,102 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Jesse Gross +Date: Thu, 24 Apr 2025 14:48:51 -0700 +Subject: [PATCH] ggml: Export GPU UUIDs + +This enables matching up devices and information reported by the backend +with tools (e.g. nvidia-smi) and system management libraries (e.g. nvml). +--- + ggml/include/ggml-backend.h | 1 + + ggml/src/ggml-cuda/ggml-cuda.cu | 33 ++++++++++++++++++++++++++++++++ + ggml/src/ggml-metal/ggml-metal.m | 1 + + 3 files changed, 35 insertions(+) + +diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h +index 74e46716..a880df33 100644 +--- a/ggml/include/ggml-backend.h ++++ b/ggml/include/ggml-backend.h +@@ -152,6 +152,7 @@ extern "C" { + struct ggml_backend_dev_props { + const char * name; + const char * description; ++ const char * uuid; + size_t memory_free; + size_t memory_total; + enum ggml_backend_dev_type type; +diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu +index cb0d8528..4c829153 100644 +--- a/ggml/src/ggml-cuda/ggml-cuda.cu ++++ b/ggml/src/ggml-cuda/ggml-cuda.cu +@@ -2884,6 +2884,7 @@ struct ggml_backend_cuda_device_context { + int device; + std::string name; + std::string description; ++ std::string uuid; + }; + + static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) { +@@ -2896,6 +2897,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t + return ctx->description.c_str(); + } + ++static const char * ggml_backend_cuda_device_get_uuid(ggml_backend_dev_t dev) { ++ ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; ++ return ctx->uuid.c_str(); ++} ++ + static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { + ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; + ggml_cuda_set_device(ctx->device); +@@ -2910,6 +2916,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend + static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) { + props->name = ggml_backend_cuda_device_get_name(dev); + props->description = ggml_backend_cuda_device_get_description(dev); ++ props->uuid = ggml_backend_cuda_device_get_uuid(dev); + props->type = ggml_backend_cuda_device_get_type(dev); + ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total); + +@@ -3458,6 +3465,32 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { + CUDA_CHECK(cudaGetDeviceProperties(&prop, i)); + dev_ctx->description = prop.name; + ++ #if !defined(GGML_USE_HIP) ++ char uuid[64]; ++ snprintf(uuid, sizeof(uuid), ++ "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", ++ (unsigned char)prop.uuid.bytes[0], ++ (unsigned char)prop.uuid.bytes[1], ++ (unsigned char)prop.uuid.bytes[2], ++ (unsigned char)prop.uuid.bytes[3], 
++ (unsigned char)prop.uuid.bytes[4], ++ (unsigned char)prop.uuid.bytes[5], ++ (unsigned char)prop.uuid.bytes[6], ++ (unsigned char)prop.uuid.bytes[7], ++ (unsigned char)prop.uuid.bytes[8], ++ (unsigned char)prop.uuid.bytes[9], ++ (unsigned char)prop.uuid.bytes[10], ++ (unsigned char)prop.uuid.bytes[11], ++ (unsigned char)prop.uuid.bytes[12], ++ (unsigned char)prop.uuid.bytes[13], ++ (unsigned char)prop.uuid.bytes[14], ++ (unsigned char)prop.uuid.bytes[15] ++ ); ++ dev_ctx->uuid = uuid; ++ #else ++ dev_ctx->uuid = "GPU-" + std::string(prop.uuid.bytes, 16); ++ #endif ++ + ggml_backend_dev_t dev = new ggml_backend_device { + /* .iface = */ ggml_backend_cuda_device_interface, + /* .reg = */ ®, +diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m +index 1b56f858..ee4f2dcb 100644 +--- a/ggml/src/ggml-metal/ggml-metal.m ++++ b/ggml/src/ggml-metal/ggml-metal.m +@@ -5703,6 +5703,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen + static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { + props->name = ggml_backend_metal_device_get_name(dev); + props->description = ggml_backend_metal_device_get_description(dev); ++ props->uuid = "0"; + props->type = ggml_backend_metal_device_get_type(dev); + ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total); + props->caps = (struct ggml_backend_dev_caps) { diff --git a/ml/backend.go b/ml/backend.go index 65f16948..2df6c892 100644 --- a/ml/backend.go +++ b/ml/backend.go @@ -124,6 +124,10 @@ type DeviceMemory struct { // may not be persistent across instances of the runner. Name string + // UUID is a unique persistent identifier for the device for matching + // with system management libraries + UUID string + // Weights is the per-layer memory needed for the model weights. Weights []Memory @@ -152,6 +156,10 @@ func (m DeviceMemory) LogValue() slog.Value { attrs = append(attrs, slog.Any("Graph", m.Graph)) } + if len(attrs) > 0 && m.UUID != "" { + attrs = append([]slog.Attr{slog.String("UUID", m.UUID)}, attrs...) + } + return slog.GroupValue(attrs...) 
} diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index 76172ae1..5a9fe67e 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -136,6 +136,9 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { } requiredMemory.CPU.Name = C.GoString(C.ggml_backend_dev_name(cpuDeviceBufferType.d)) + var props C.struct_ggml_backend_dev_props + C.ggml_backend_dev_get_props(cpuDeviceBufferType.d, &props) + requiredMemory.CPU.UUID = C.GoString(props.uuid) requiredMemory.CPU.Weights = make([]ml.Memory, blocks+1) requiredMemory.CPU.Cache = make([]ml.Memory, blocks+1) @@ -150,6 +153,9 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { }) btDeviceMemory[bt] = &requiredMemory.GPUs[i] requiredMemory.GPUs[i].Name = C.GoString(C.ggml_backend_dev_name(d)) + var props C.struct_ggml_backend_dev_props + C.ggml_backend_dev_get_props(d, &props) + requiredMemory.GPUs[i].UUID = C.GoString(props.uuid) requiredMemory.GPUs[i].Weights = make([]ml.Memory, blocks+1) requiredMemory.GPUs[i].Cache = make([]ml.Memory, blocks+1) } diff --git a/ml/backend/ggml/ggml/include/ggml-backend.h b/ml/backend/ggml/ggml/include/ggml-backend.h index 74e46716..a880df33 100644 --- a/ml/backend/ggml/ggml/include/ggml-backend.h +++ b/ml/backend/ggml/ggml/include/ggml-backend.h @@ -152,6 +152,7 @@ extern "C" { struct ggml_backend_dev_props { const char * name; const char * description; + const char * uuid; size_t memory_free; size_t memory_total; enum ggml_backend_dev_type type; diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu index cb0d8528..4c829153 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2884,6 +2884,7 @@ struct ggml_backend_cuda_device_context { int device; std::string name; std::string description; + std::string uuid; }; static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) { @@ -2896,6 +2897,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t return ctx->description.c_str(); } +static const char * ggml_backend_cuda_device_get_uuid(ggml_backend_dev_t dev) { + ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; + return ctx->uuid.c_str(); +} + static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; ggml_cuda_set_device(ctx->device); @@ -2910,6 +2916,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) { props->name = ggml_backend_cuda_device_get_name(dev); props->description = ggml_backend_cuda_device_get_description(dev); + props->uuid = ggml_backend_cuda_device_get_uuid(dev); props->type = ggml_backend_cuda_device_get_type(dev); ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total); @@ -3458,6 +3465,32 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { CUDA_CHECK(cudaGetDeviceProperties(&prop, i)); dev_ctx->description = prop.name; + #if !defined(GGML_USE_HIP) + char uuid[64]; + snprintf(uuid, sizeof(uuid), + "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", + (unsigned char)prop.uuid.bytes[0], + (unsigned char)prop.uuid.bytes[1], + (unsigned char)prop.uuid.bytes[2], + (unsigned char)prop.uuid.bytes[3], + 
(unsigned char)prop.uuid.bytes[4], + (unsigned char)prop.uuid.bytes[5], + (unsigned char)prop.uuid.bytes[6], + (unsigned char)prop.uuid.bytes[7], + (unsigned char)prop.uuid.bytes[8], + (unsigned char)prop.uuid.bytes[9], + (unsigned char)prop.uuid.bytes[10], + (unsigned char)prop.uuid.bytes[11], + (unsigned char)prop.uuid.bytes[12], + (unsigned char)prop.uuid.bytes[13], + (unsigned char)prop.uuid.bytes[14], + (unsigned char)prop.uuid.bytes[15] + ); + dev_ctx->uuid = uuid; + #else + dev_ctx->uuid = "GPU-" + std::string(prop.uuid.bytes, 16); + #endif + ggml_backend_dev_t dev = new ggml_backend_device { /* .iface = */ ggml_backend_cuda_device_interface, /* .reg = */ ®, diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m index 1b56f858..ee4f2dcb 100644 --- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m @@ -5703,6 +5703,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { props->name = ggml_backend_metal_device_get_name(dev); props->description = ggml_backend_metal_device_get_description(dev); + props->uuid = "0"; props->type = ggml_backend_metal_device_get_type(dev); ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total); props->caps = (struct ggml_backend_dev_caps) { From 65f10c2823d540837a9e79202522957194377735 Mon Sep 17 00:00:00 2001 From: Parth Sareen Date: Fri, 30 May 2025 15:18:09 -0700 Subject: [PATCH 072/108] tools: resiliency upgrade to name and arg extraction from template (#10917) --- tools/tools_utils.go | 37 +++++------ tools/tools_utils_test.go | 131 ++++++++++++++++++++++++-------------- 2 files changed, 98 insertions(+), 70 deletions(-) diff --git a/tools/tools_utils.go b/tools/tools_utils.go index 48531b78..b6f80729 100644 --- a/tools/tools_utils.go +++ b/tools/tools_utils.go @@ -166,31 +166,26 @@ func extractToolArgs(tmpl *gotmpl.Template) (name, arguments string, err error) return "", "", err } - var obj any - err = json.Unmarshal(b.Bytes(), &obj) - if err != nil { + // Extract JSON object between curly braces + // JSON arrays are also valid as they will not be repeated in the template + output := b.String() + start := strings.Index(output, "{") + end := strings.LastIndex(output, "}") + if start == -1 || end == -1 || start > end { + return "", "", errors.New("no valid JSON object found in template output") + } + jsonStr := output[start : end+1] + + var obj map[string]any + if err := json.Unmarshal([]byte(jsonStr), &obj); err != nil { return "", "", err } - var objs []map[string]any - switch v := obj.(type) { - case map[string]any: - objs = []map[string]any{v} - case []map[string]any: - objs = v - case []any: - objs = collect(v) - } - if len(objs) == 0 { - return "", "", errors.New("no template objects found") - } - - // find the keys that correspond to the name and arguments fields - for k, v := range objs[0] { - switch v.(type) { - case string: + // Find name and arguments fields + for k, v := range obj { + if str, ok := v.(string); ok && str == "@@name@@" { name = k - case map[string]any: + } else if _, ok := v.(map[string]any); ok { arguments = k } } diff --git a/tools/tools_utils_test.go b/tools/tools_utils_test.go index 769183b7..e346117a 100644 --- a/tools/tools_utils_test.go +++ b/tools/tools_utils_test.go @@ -271,74 +271,99 @@ func TestExtractToolArgs(t *testing.T) { cases := 
[]struct { name string template string - want string - ok bool + wantName string + wantArgs string + wantErr bool }{ { - name: "basic tool call with text after", - template: `{{if .ToolCalls}}tool response{{end}}`, - want: "tool response", - ok: true, + name: "basic tool call", + template: `{{ range .ToolCalls }} +{"name": "{{ .Function.Name }}", "parameters": {{ .Function.Arguments }}}{{ end }}`, + wantName: "name", + wantArgs: "parameters", + wantErr: false, }, { - name: "tool call with mixed content after", - template: `{{if .ToolCalls}}{{.Something}}{{end}}`, - want: "", - ok: true, + name: "tool call with whitespace", + template: `{{range .ToolCalls}} + {"name": "{{.Function.Name}}", "parameters": {{.Function.Arguments}}} +{{end}}`, + wantName: "name", + wantArgs: "parameters", + wantErr: false, }, { - name: "tool call with no text after", - template: `{{if .ToolCalls}}{{.Something}}{{end}}`, - want: "", - ok: true, - }, - { - name: "nested tool call", - template: `{{if .Something}}{{if .ToolCalls}}[TOOL_CALL]{{end}}{{end}}`, - want: "[TOOL_CALL]", - ok: true, + name: "tool call with extra content", + template: `Before {{range .ToolCalls}} +{"name": "{{.Function.Name}}", "arguments": {{.Function.Arguments}}}{{end}} After`, + wantName: "name", + wantArgs: "arguments", + wantErr: false, }, { name: "no tool calls", template: `{{if .Something}}no tools here{{end}}`, - want: "", - ok: false, + wantName: "", + wantArgs: "", + wantErr: true, }, { name: "empty template", template: ``, - want: "", - ok: false, + wantName: "", + wantArgs: "", + wantErr: true, }, { - name: "multiple tool calls sections", - template: `{{if .ToolCalls}}first{{end}}{{if .ToolCalls}}second{{end}}`, - want: "first", - ok: true, + name: "prefix within tool call", + template: `{{- if .ToolCalls }} +{{ range .ToolCalls }} + +{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}} +{{ end }}{{- end }}`, + wantName: "name", + wantArgs: "arguments", + wantErr: false, }, { - name: "range over tool calls", - template: `{{if .ToolCalls}}{{range .ToolCalls}}tool{{end}}{{end}}`, - want: "", - ok: true, + name: "JSON array", + template: `{{ range .ToolCalls }} +[{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}]{{ end }}`, + wantName: "name", + wantArgs: "arguments", + wantErr: false, }, { - name: "tool calls with pipe delimiters", - template: `{{if .ToolCalls}}<|tool|>{{end}}`, - want: "<|tool|>", - ok: true, + name: "invalid JSON", + template: `{{ range .ToolCalls }} +{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}, invalid}{{ end }}`, + wantName: "", + wantArgs: "", + wantErr: true, }, { - name: "tool calls with nested template", - template: `{{if .ToolCalls}}{{template "tool" .}}{{end}}`, - want: "", - ok: true, + name: "missing name field", + template: `{{ range .ToolCalls }} +{"parameters": {{ .Function.Arguments }}}{{ end }}`, + wantName: "", + wantArgs: "", + wantErr: true, }, { - name: "tool calls with whitespace variations", - template: `{{if .ToolCalls}} tool {{end}}`, - want: " tool ", - ok: true, + name: "missing arguments field", + template: `{{ range .ToolCalls }} +{"name": "{{ .Function.Name }}"}{{ end }}`, + wantName: "", + wantArgs: "", + wantErr: true, + }, + { + name: "malformed JSON", + template: `{{ range .ToolCalls }} +{"name": {{ .Function.Name }}, "arguments": {{ .Function.Arguments }}{{ end }}`, + wantName: "", + wantArgs: "", + wantErr: true, }, } @@ -349,12 +374,20 @@ func TestExtractToolArgs(t *testing.T) { t.Fatalf("failed to parse 
template: %v", err) } - got, ok := extractToolCallsFormat(tmpl) - if got != tt.want { - t.Errorf("TextAfterToolCalls() got = %q, want %q", got, tt.want) + gotName, gotArgs, err := extractToolArgs(tmpl) + if (err != nil) != tt.wantErr { + t.Errorf("extractToolArgs() error = %v, wantErr %v", err, tt.wantErr) + return } - if ok != tt.ok { - t.Errorf("TextAfterToolCalls() ok = %v, want %v", ok, tt.ok) + if err != nil { + return + } + + if gotName != tt.wantName { + t.Errorf("extractToolArgs() gotName = %q, want %q", gotName, tt.wantName) + } + if gotArgs != tt.wantArgs { + t.Errorf("extractToolArgs() gotArgs = %q, want %q", gotArgs, tt.wantArgs) } }) } From 5c42800fca4da07d1c362c0f190429993e53c3b5 Mon Sep 17 00:00:00 2001 From: HardCodeDev Date: Sat, 31 May 2025 06:50:16 +0400 Subject: [PATCH 073/108] readme: add SimpleOllamaUnity to community integrations (#10817) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index d7cf5bfe..22de41e6 100644 --- a/README.md +++ b/README.md @@ -587,6 +587,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Simple-Discord-AI](https://github.com/zyphixor/simple-discord-ai) - [LLM Telegram Bot](https://github.com/innightwolfsleep/llm_telegram_bot) (telegram bot, primary for RP. Oobabooga-like buttons, [A1111](https://github.com/AUTOMATIC1111/stable-diffusion-webui) API integration e.t.c) - [mcp-llm](https://github.com/sammcj/mcp-llm) (MCP Server to allow LLMs to call other LLMs) +- [SimpleOllamaUnity](https://github.com/HardCodeDev777/SimpleOllamaUnity) (Unity Engine extension for communicating with Ollama in a few lines of code. Also works at runtime) - [UnityCodeLama](https://github.com/HardCodeDev777/UnityCodeLama) (Unity Edtior tool to analyze scripts via Ollama) ### Supported backends From 09430011936652cf55925184aaed6f2cebf62a75 Mon Sep 17 00:00:00 2001 From: JasonHonKL <148705846+JasonHonKL@users.noreply.github.com> Date: Thu, 5 Jun 2025 02:39:48 +0800 Subject: [PATCH 074/108] server: add model capabilities to the list endpoint (#10174) --- api/types.go | 13 +++++++------ docs/api.md | 29 +++++++++++++++++++---------- server/routes.go | 14 +++++++++++--- 3 files changed, 37 insertions(+), 19 deletions(-) diff --git a/api/types.go b/api/types.go index 94d49200..a1f896db 100644 --- a/api/types.go +++ b/api/types.go @@ -457,12 +457,13 @@ type ProcessResponse struct { // ListModelResponse is a single model description in [ListResponse]. type ListModelResponse struct { - Name string `json:"name"` - Model string `json:"model"` - ModifiedAt time.Time `json:"modified_at"` - Size int64 `json:"size"` - Digest string `json:"digest"` - Details ModelDetails `json:"details,omitempty"` + Name string `json:"name"` + Model string `json:"model"` + ModifiedAt time.Time `json:"modified_at"` + Size int64 `json:"size"` + Digest string `json:"digest"` + Capabilities []model.Capability `json:"capabilities,omitempty"` + Details ModelDetails `json:"details,omitempty"` } // ProcessModelResponse is a single model description in [ProcessResponse]. diff --git a/docs/api.md b/docs/api.md index 11eaf73a..31e18bd5 100644 --- a/docs/api.md +++ b/docs/api.md @@ -1157,11 +1157,15 @@ A single JSON object will be returned. 
{ "models": [ { - "name": "deepseek-r1:latest", - "model": "deepseek-r1:latest", - "modified_at": "2025-05-10T08:06:48.639712648-07:00", - "size": 4683075271, - "digest": "0a8c266910232fd3291e71e5ba1e058cc5af9d411192cf88b6d30e92b6e73163", + + "model": "codellama:13b", + "modified_at": "2023-11-04T14:56:49.277302595-07:00", + "size": 7365960935, + "digest": "9f438cb9cd581fc025612d27f7c1a6669ff83a8bb0ed86c94fcf4c5440555697", + "capabilities": [ + "completion" + ], + "details": { "parent_model": "", "format": "gguf", @@ -1174,11 +1178,16 @@ A single JSON object will be returned. } }, { - "name": "llama3.2:latest", - "model": "llama3.2:latest", - "modified_at": "2025-05-04T17:37:44.706015396-07:00", - "size": 2019393189, - "digest": "a80c4f17acd55265feec403c7aef86be0c25983ab279d83f3bcd3abbcb5b8b72", + + "model": "llama4:latest", + "modified_at": "2023-12-07T09:32:18.757212583-08:00", + "size": 3825819519, + "digest": "fe938a131f40e6f6d40083c9f0f430a515233eb2edaa6d72eb85c50d64f2300e", + "capabilities": [ + "completion", + "vision" + ], + "details": { "parent_model": "", "format": "gguf", diff --git a/server/routes.go b/server/routes.go index 236f92e2..924ba06c 100644 --- a/server/routes.go +++ b/server/routes.go @@ -928,8 +928,7 @@ func (s *Server) ListHandler(c *gin.Context) { } } - // tag should never be masked - models = append(models, api.ListModelResponse{ + r := api.ListModelResponse{ Model: n.DisplayShortest(), Name: n.DisplayShortest(), Size: m.Size(), @@ -942,7 +941,16 @@ func (s *Server) ListHandler(c *gin.Context) { ParameterSize: cf.ModelType, QuantizationLevel: cf.FileType, }, - }) + } + + model, err := GetModel(n.String()) + if err != nil { + slog.Warn("bad model details", "name", n, "error", err) + } else { + r.Capabilities = model.Capabilities() + } + + models = append(models, r) } slices.SortStableFunc(models, func(i, j api.ListModelResponse) int { From 0683efa6379ba69384bb5876f1013d9ef38f1ab0 Mon Sep 17 00:00:00 2001 From: Devon Rifkin Date: Thu, 5 Jun 2025 10:22:32 -0700 Subject: [PATCH 075/108] export ThinkingParser --- server/routes.go | 28 ++++++++++++++-------------- server/thinking.go | 26 +++++++++++++------------- server/thinking_test.go | 16 ++++++++-------- 3 files changed, 35 insertions(+), 35 deletions(-) diff --git a/server/routes.go b/server/routes.go index 924ba06c..d03ac2ec 100644 --- a/server/routes.go +++ b/server/routes.go @@ -282,12 +282,12 @@ func (s *Server) GenerateHandler(c *gin.Context) { prompt = b.String() } - var thinkingState *thinkingParser + var thinkingState *ThinkingParser openingTag, closingTag := inferThinkingTags(m.Template.Template) if req.Think != nil && *req.Think && openingTag != "" && closingTag != "" { - thinkingState = &thinkingParser{ - openingTag: openingTag, - closingTag: closingTag, + thinkingState = &ThinkingParser{ + OpeningTag: openingTag, + ClosingTag: closingTag, } } @@ -316,7 +316,7 @@ func (s *Server) GenerateHandler(c *gin.Context) { } if thinkingState != nil { - thinking, content := thinkingState.addContent(cr.Content) + thinking, content := thinkingState.AddContent(cr.Content) res.Thinking = thinking res.Response = content } @@ -1522,12 +1522,12 @@ func (s *Server) ChatHandler(c *gin.Context) { return } - var thinkingState *thinkingParser + var thinkingState *ThinkingParser openingTag, closingTag := inferThinkingTags(m.Template.Template) if req.Think != nil && *req.Think && openingTag != "" && closingTag != "" { - thinkingState = &thinkingParser{ - openingTag: openingTag, - closingTag: closingTag, + thinkingState = 
&ThinkingParser{ + OpeningTag: openingTag, + ClosingTag: closingTag, } } @@ -1565,7 +1565,7 @@ func (s *Server) ChatHandler(c *gin.Context) { } if thinkingState != nil { - thinkingContent, remainingContent := thinkingState.addContent(res.Message.Content) + thinkingContent, remainingContent := thinkingState.AddContent(res.Message.Content) if thinkingContent == "" && remainingContent == "" && !r.Done { // need to accumulate more to decide what to send return @@ -1676,11 +1676,11 @@ func filterThinkTags(msgs []api.Message, m *Model) []api.Message { // change the user output), we should probably perform this filtering // for all thinking models (not just qwen3 & deepseek-r1) since it tends // to save tokens and improve quality. - thinkingState := &thinkingParser{ - openingTag: "", - closingTag: "", + thinkingState := &ThinkingParser{ + OpeningTag: "", + ClosingTag: "", } - _, content := thinkingState.addContent(msg.Content) + _, content := thinkingState.AddContent(msg.Content) msgs[i].Content = content } } diff --git a/server/thinking.go b/server/thinking.go index 2213b6b6..4ef3c184 100644 --- a/server/thinking.go +++ b/server/thinking.go @@ -46,17 +46,17 @@ func (s thinkingState) String() string { } } -type thinkingParser struct { +type ThinkingParser struct { state thinkingState - openingTag string - closingTag string + OpeningTag string + ClosingTag string acc strings.Builder } -// addContent returns the thinking content and the non-thinking content that +// AddContent returns the thinking content and the non-thinking content that // should be immediately sent to the user. It will internally buffer if it needs // to see more raw content to disambiguate -func (s *thinkingParser) addContent(content string) (string, string) { +func (s *ThinkingParser) AddContent(content string) (string, string) { s.acc.WriteString(content) var thinkingSb, remainingSb strings.Builder @@ -76,12 +76,12 @@ func (s *thinkingParser) addContent(content string) (string, string) { } // the additional bool return is true iff we should continue eating -func eat(s *thinkingParser) (string, string, bool) { +func eat(s *ThinkingParser) (string, string, bool) { switch s.state { case thinkingState_LookingForOpening: trimmed := strings.TrimLeftFunc(s.acc.String(), unicode.IsSpace) - if strings.HasPrefix(trimmed, s.openingTag) { - after := strings.Join(strings.Split(trimmed, s.openingTag)[1:], s.openingTag) + if strings.HasPrefix(trimmed, s.OpeningTag) { + after := strings.Join(strings.Split(trimmed, s.OpeningTag)[1:], s.OpeningTag) after = strings.TrimLeftFunc(after, unicode.IsSpace) // after might contain more than just thinking tokens, so we continue // parsing instead of returning it as thinking tokens here @@ -93,7 +93,7 @@ func eat(s *thinkingParser) (string, string, bool) { s.state = thinkingState_Thinking } return "", "", true - } else if strings.HasPrefix(s.openingTag, trimmed) { + } else if strings.HasPrefix(s.OpeningTag, trimmed) { // partial opening seen, so let's keep accumulating return "", "", false } else if trimmed == "" { @@ -119,10 +119,10 @@ func eat(s *thinkingParser) (string, string, bool) { } case thinkingState_Thinking: acc := s.acc.String() - if strings.Contains(acc, s.closingTag) { - split := strings.Split(acc, s.closingTag) + if strings.Contains(acc, s.ClosingTag) { + split := strings.Split(acc, s.ClosingTag) thinking := split[0] - remaining := strings.Join(split[1:], s.closingTag) + remaining := strings.Join(split[1:], s.ClosingTag) remaining = strings.TrimLeftFunc(remaining, unicode.IsSpace) 
s.acc.Reset() if remaining == "" { @@ -131,7 +131,7 @@ func eat(s *thinkingParser) (string, string, bool) { s.state = thinkingState_ThinkingDone } return thinking, remaining, false - } else if overlapLen := overlap(acc, s.closingTag); overlapLen > 0 { + } else if overlapLen := overlap(acc, s.ClosingTag); overlapLen > 0 { thinking := acc[:len(acc)-overlapLen] remaining := acc[len(acc)-overlapLen:] s.acc.Reset() diff --git a/server/thinking_test.go b/server/thinking_test.go index a2055635..90d3f961 100644 --- a/server/thinking_test.go +++ b/server/thinking_test.go @@ -26,11 +26,11 @@ func TestExtractThinking(t *testing.T) { }, } for i, tt := range tests { - parser := thinkingParser{ - openingTag: "", - closingTag: "", + parser := ThinkingParser{ + OpeningTag: "", + ClosingTag: "", } - gotThinking, gotContent := parser.addContent(tt.in) + gotThinking, gotContent := parser.AddContent(tt.in) if gotContent != tt.wantContent || gotThinking != tt.wantThink { t.Errorf("case %d: got (%q,%q), want (%q,%q)", i, gotThinking, gotContent, tt.wantThink, tt.wantContent) } @@ -259,15 +259,15 @@ func TestThinkingStreaming(t *testing.T) { } for _, c := range cases { - parser := thinkingParser{ - openingTag: "", - closingTag: "", + parser := ThinkingParser{ + OpeningTag: "", + ClosingTag: "", } if c.skip { continue } for i, step := range c.steps { - thinking, content := parser.addContent(step.input) + thinking, content := parser.AddContent(step.input) if content != step.wantContent || thinking != step.wantThinking { t.Errorf("case %q (step %d): got (%q,%q), want (%q,%q)", c.desc, i, content, thinking, step.wantContent, step.wantThinking) } From c6a6d7294dd50b9216918fe72fd92bc4ae572ac0 Mon Sep 17 00:00:00 2001 From: Hunter Wittenborn Date: Fri, 6 Jun 2025 11:07:29 -0500 Subject: [PATCH 076/108] docs: fix typo in development.md (#10998) --- docs/development.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/development.md b/docs/development.md index cf6d91e2..24bcba19 100644 --- a/docs/development.md +++ b/docs/development.md @@ -118,7 +118,7 @@ To run tests, use `go test`: go test ./... ``` -> NOTE: In rare cirumstances, you may nedd to change a package using the new +> NOTE: In rare cirumstances, you may need to change a package using the new > "synctest" package in go1.24. 
> > If you do not have the "synctest" package enabled, you will not see build or From a3b6886b7da0339e63ebf41e6ba5c6b06438a123 Mon Sep 17 00:00:00 2001 From: Devon Rifkin Date: Fri, 6 Jun 2025 12:02:20 -0700 Subject: [PATCH 077/108] move thinking logic into its own package (#10990) move thinking logic into its own package --- server/images.go | 3 +- server/routes.go | 15 +- server/thinking.go => thinking/parser.go | 137 +----------------- .../parser_test.go | 131 +---------------- thinking/template.go | 134 +++++++++++++++++ thinking/template_test.go | 130 +++++++++++++++++ 6 files changed, 281 insertions(+), 269 deletions(-) rename server/thinking.go => thinking/parser.go (59%) rename server/thinking_test.go => thinking/parser_test.go (66%) create mode 100644 thinking/template.go create mode 100644 thinking/template_test.go diff --git a/server/images.go b/server/images.go index 58fb87dc..d6cceff4 100644 --- a/server/images.go +++ b/server/images.go @@ -26,6 +26,7 @@ import ( "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/parser" "github.com/ollama/ollama/template" + "github.com/ollama/ollama/thinking" "github.com/ollama/ollama/types/model" "github.com/ollama/ollama/version" ) @@ -113,7 +114,7 @@ func (m *Model) Capabilities() []model.Capability { } // Check for thinking capability - openingTag, closingTag := inferThinkingTags(m.Template.Template) + openingTag, closingTag := thinking.InferTags(m.Template.Template) if openingTag != "" && closingTag != "" { capabilities = append(capabilities, model.CapabilityThinking) } diff --git a/server/routes.go b/server/routes.go index d03ac2ec..70cb6cef 100644 --- a/server/routes.go +++ b/server/routes.go @@ -37,6 +37,7 @@ import ( "github.com/ollama/ollama/server/internal/client/ollama" "github.com/ollama/ollama/server/internal/registry" "github.com/ollama/ollama/template" + "github.com/ollama/ollama/thinking" "github.com/ollama/ollama/tools" "github.com/ollama/ollama/types/errtypes" "github.com/ollama/ollama/types/model" @@ -282,10 +283,10 @@ func (s *Server) GenerateHandler(c *gin.Context) { prompt = b.String() } - var thinkingState *ThinkingParser - openingTag, closingTag := inferThinkingTags(m.Template.Template) + var thinkingState *thinking.Parser + openingTag, closingTag := thinking.InferTags(m.Template.Template) if req.Think != nil && *req.Think && openingTag != "" && closingTag != "" { - thinkingState = &ThinkingParser{ + thinkingState = &thinking.Parser{ OpeningTag: openingTag, ClosingTag: closingTag, } @@ -1522,10 +1523,10 @@ func (s *Server) ChatHandler(c *gin.Context) { return } - var thinkingState *ThinkingParser - openingTag, closingTag := inferThinkingTags(m.Template.Template) + var thinkingState *thinking.Parser + openingTag, closingTag := thinking.InferTags(m.Template.Template) if req.Think != nil && *req.Think && openingTag != "" && closingTag != "" { - thinkingState = &ThinkingParser{ + thinkingState = &thinking.Parser{ OpeningTag: openingTag, ClosingTag: closingTag, } @@ -1676,7 +1677,7 @@ func filterThinkTags(msgs []api.Message, m *Model) []api.Message { // change the user output), we should probably perform this filtering // for all thinking models (not just qwen3 & deepseek-r1) since it tends // to save tokens and improve quality. 
- thinkingState := &ThinkingParser{ + thinkingState := &thinking.Parser{ OpeningTag: "", ClosingTag: "", } diff --git a/server/thinking.go b/thinking/parser.go similarity index 59% rename from server/thinking.go rename to thinking/parser.go index 4ef3c184..a4d05e35 100644 --- a/server/thinking.go +++ b/thinking/parser.go @@ -1,9 +1,7 @@ -package server +package thinking import ( "strings" - "text/template" - "text/template/parse" "unicode" ) @@ -46,7 +44,7 @@ func (s thinkingState) String() string { } } -type ThinkingParser struct { +type Parser struct { state thinkingState OpeningTag string ClosingTag string @@ -56,7 +54,7 @@ type ThinkingParser struct { // AddContent returns the thinking content and the non-thinking content that // should be immediately sent to the user. It will internally buffer if it needs // to see more raw content to disambiguate -func (s *ThinkingParser) AddContent(content string) (string, string) { +func (s *Parser) AddContent(content string) (string, string) { s.acc.WriteString(content) var thinkingSb, remainingSb strings.Builder @@ -76,7 +74,7 @@ func (s *ThinkingParser) AddContent(content string) (string, string) { } // the additional bool return is true iff we should continue eating -func eat(s *ThinkingParser) (string, string, bool) { +func eat(s *Parser) (string, string, bool) { switch s.state { case thinkingState_LookingForOpening: trimmed := strings.TrimLeftFunc(s.acc.String(), unicode.IsSpace) @@ -171,130 +169,3 @@ func overlap(s, delim string) int { } return 0 } - -func templateVisit(n parse.Node, enterFn func(parse.Node) bool, exitFn func(parse.Node)) { - if n == nil { - return - } - shouldContinue := enterFn(n) - if !shouldContinue { - return - } - switch x := n.(type) { - case *parse.ListNode: - for _, c := range x.Nodes { - templateVisit(c, enterFn, exitFn) - } - case *parse.BranchNode: - if x.Pipe != nil { - templateVisit(x.Pipe, enterFn, exitFn) - } - if x.List != nil { - templateVisit(x.List, enterFn, exitFn) - } - if x.ElseList != nil { - templateVisit(x.ElseList, enterFn, exitFn) - } - case *parse.ActionNode: - templateVisit(x.Pipe, enterFn, exitFn) - case *parse.WithNode: - templateVisit(&x.BranchNode, enterFn, exitFn) - case *parse.RangeNode: - templateVisit(&x.BranchNode, enterFn, exitFn) - case *parse.IfNode: - templateVisit(&x.BranchNode, enterFn, exitFn) - case *parse.TemplateNode: - templateVisit(x.Pipe, enterFn, exitFn) - case *parse.PipeNode: - for _, c := range x.Cmds { - templateVisit(c, enterFn, exitFn) - } - case *parse.CommandNode: - for _, a := range x.Args { - templateVisit(a, enterFn, exitFn) - } - // text, field, number, etc. are leaves – nothing to recurse into - } - if exitFn != nil { - exitFn(n) - } -} - -// We use a heuristic to infer the tags that surround thinking traces: -// We look for a range node that iterates over "Messages" and then look for a -// reference to "Thinking" like `{{.Thinking}}`. We then go up to the nearest -// ListNode and take the first and last TextNodes as the opening and closing -// tags. 
-func inferThinkingTags(t *template.Template) (string, string) { - ancestors := []parse.Node{} - - openingTag := "" - closingTag := "" - - enterFn := func(n parse.Node) bool { - ancestors = append(ancestors, n) - - switch x := n.(type) { - case *parse.FieldNode: - if len(x.Ident) > 0 && x.Ident[0] == "Thinking" { - var mostRecentRange *parse.RangeNode - for i := len(ancestors) - 1; i >= 0; i-- { - if r, ok := ancestors[i].(*parse.RangeNode); ok { - mostRecentRange = r - break - } - } - if mostRecentRange == nil || !rangeUsesField(mostRecentRange, "Messages") { - return true - } - - // TODO(drifkin): to be more robust, check that it's in the action - // part, not the `if`'s pipeline part. We do match on the nearest list - // that starts and ends with text nodes, which makes this not strictly - // necessary for our heuristic - - // go up to the nearest ancestor that is a *parse.ListNode - for i := len(ancestors) - 1; i >= 0; i-- { - if l, ok := ancestors[i].(*parse.ListNode); ok { - firstNode := l.Nodes[0] - if t, ok := firstNode.(*parse.TextNode); ok { - openingTag = strings.TrimSpace(t.String()) - } - lastNode := l.Nodes[len(l.Nodes)-1] - if t, ok := lastNode.(*parse.TextNode); ok { - closingTag = strings.TrimSpace(t.String()) - } - - break - } - } - } - } - - return true - } - - exitFn := func(n parse.Node) { - ancestors = ancestors[:len(ancestors)-1] - } - - templateVisit(t.Root, enterFn, exitFn) - - return openingTag, closingTag -} - -// checks to see if the given field name is present in the pipeline of the given range node -func rangeUsesField(rangeNode *parse.RangeNode, field string) bool { - found := false - enterFn := func(n parse.Node) bool { - switch x := n.(type) { - case *parse.FieldNode: - if x.Ident[0] == field { - found = true - } - } - return true - } - templateVisit(rangeNode.BranchNode.Pipe, enterFn, nil) - return found -} diff --git a/server/thinking_test.go b/thinking/parser_test.go similarity index 66% rename from server/thinking_test.go rename to thinking/parser_test.go index 90d3f961..78c297cd 100644 --- a/server/thinking_test.go +++ b/thinking/parser_test.go @@ -1,8 +1,7 @@ -package server +package thinking import ( "testing" - "text/template" ) func TestExtractThinking(t *testing.T) { @@ -26,7 +25,7 @@ func TestExtractThinking(t *testing.T) { }, } for i, tt := range tests { - parser := ThinkingParser{ + parser := Parser{ OpeningTag: "", ClosingTag: "", } @@ -259,7 +258,7 @@ func TestThinkingStreaming(t *testing.T) { } for _, c := range cases { - parser := ThinkingParser{ + parser := Parser{ OpeningTag: "", ClosingTag: "", } @@ -277,127 +276,3 @@ func TestThinkingStreaming(t *testing.T) { } } } - -func TestInferThinkingTags(t *testing.T) { - cases := []struct { - desc string - tmplString string - wantOpeningTag string - wantClosingTag string - }{ - { - desc: "basic", - tmplString: ` - {{ if .Thinking}} - /think - {{ end }} - {{- range $i, $_ := .Messages }} - {{- $last := eq (len (slice $.Messages $i)) 1 -}} - {{ if and $last .Thinking }} - {{ .Thinking }} - {{ end }} - {{ end }} - `, - wantOpeningTag: "", - wantClosingTag: "", - }, - { - desc: "doubly nested range", - tmplString: ` - {{ if .Thinking}} - /think - {{ end }} - {{- range $i, $_ := .Messages }} - {{- range $j, $_ := .NotMessages }} - {{- $last := eq (len (slice $.Messages $i)) 1 -}} - {{ if and $last .Thinking }} - {{ .Thinking }} - {{ end }} - {{ end }} - {{ end }} - `, - wantOpeningTag: "", - wantClosingTag: "", - }, - { - desc: "whitespace is trimmed", - tmplString: ` - {{ if .Thinking}} - /think - 
{{ end }} - {{- range $i, $_ := .Messages }} - {{- $last := eq (len (slice $.Messages $i)) 1 -}} - {{ if and $last .Thinking }} - Some text before {{ .Thinking }} Some text after - {{ end }} - {{ end }} - `, - wantOpeningTag: "Some text before", - wantClosingTag: "Some text after", - }, - { - desc: "qwen3", - tmplString: ` -{{- if or .System .Tools .Thinking }}<|im_start|>system -{{- if .System }} -{{ .System }} -{{- end }} -{{- if .Tools }} - -# Tools - -You may call one or more functions to assist with the user query. - -You are provided with function signatures within XML tags: - -{{- range .Tools }} -{"type": "function", "function": {{ .Function }}} -{{- end }} - - -For each function call, return a json object with function name and arguments within XML tags: - -{"name": , "arguments": } - -{{- end }} -{{- if .Thinking }} -/think -{{- else }} -/no_think -{{- end }}<|im_end|> -{{ end }} -{{- range $i, $_ := .Messages }} -{{- $last := eq (len (slice $.Messages $i)) 1 -}} -{{- if eq .Role "user" }}<|im_start|>user -{{ .Content }}<|im_end|> -{{ else if eq .Role "assistant" }}<|im_start|>assistant -{{ if and $last .Thinking }} -{{ .Thinking }} -{{ end }} -{{ if .Content }}{{ .Content }} -{{- else if .ToolCalls }} -{{ range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}} -{{ end }} -{{- end }}{{ if not $last }}<|im_end|> -{{ end }} -{{- else if eq .Role "tool" }}<|im_start|>user - -{{ .Content }} -<|im_end|> -{{ end }} -{{- if and (ne .Role "assistant") $last }}<|im_start|>assistant -{{ end }} -{{- end }} - `, - wantOpeningTag: "", - wantClosingTag: "", - }, - } - for _, c := range cases { - tmpl := template.Must(template.New("test").Parse(c.tmplString)) - openingTag, closingTag := inferThinkingTags(tmpl) - if openingTag != c.wantOpeningTag || closingTag != c.wantClosingTag { - t.Errorf("case %q: got (%q,%q), want (%q,%q)", c.desc, openingTag, closingTag, c.wantOpeningTag, c.wantClosingTag) - } - } -} diff --git a/thinking/template.go b/thinking/template.go new file mode 100644 index 00000000..20bd65ec --- /dev/null +++ b/thinking/template.go @@ -0,0 +1,134 @@ +package thinking + +import ( + "strings" + "text/template" + "text/template/parse" +) + +func templateVisit(n parse.Node, enterFn func(parse.Node) bool, exitFn func(parse.Node)) { + if n == nil { + return + } + shouldContinue := enterFn(n) + if !shouldContinue { + return + } + switch x := n.(type) { + case *parse.ListNode: + for _, c := range x.Nodes { + templateVisit(c, enterFn, exitFn) + } + case *parse.BranchNode: + if x.Pipe != nil { + templateVisit(x.Pipe, enterFn, exitFn) + } + if x.List != nil { + templateVisit(x.List, enterFn, exitFn) + } + if x.ElseList != nil { + templateVisit(x.ElseList, enterFn, exitFn) + } + case *parse.ActionNode: + templateVisit(x.Pipe, enterFn, exitFn) + case *parse.WithNode: + templateVisit(&x.BranchNode, enterFn, exitFn) + case *parse.RangeNode: + templateVisit(&x.BranchNode, enterFn, exitFn) + case *parse.IfNode: + templateVisit(&x.BranchNode, enterFn, exitFn) + case *parse.TemplateNode: + templateVisit(x.Pipe, enterFn, exitFn) + case *parse.PipeNode: + for _, c := range x.Cmds { + templateVisit(c, enterFn, exitFn) + } + case *parse.CommandNode: + for _, a := range x.Args { + templateVisit(a, enterFn, exitFn) + } + // text, field, number, etc. 
are leaves – nothing to recurse into + } + if exitFn != nil { + exitFn(n) + } +} + +// InferTags uses a heuristic to infer the tags that surround thinking traces: +// We look for a range node that iterates over "Messages" and then look for a +// reference to "Thinking" like `{{.Thinking}}`. We then go up to the nearest +// ListNode and take the first and last TextNodes as the opening and closing +// tags. +func InferTags(t *template.Template) (string, string) { + ancestors := []parse.Node{} + + openingTag := "" + closingTag := "" + + enterFn := func(n parse.Node) bool { + ancestors = append(ancestors, n) + + switch x := n.(type) { + case *parse.FieldNode: + if len(x.Ident) > 0 && x.Ident[0] == "Thinking" { + var mostRecentRange *parse.RangeNode + for i := len(ancestors) - 1; i >= 0; i-- { + if r, ok := ancestors[i].(*parse.RangeNode); ok { + mostRecentRange = r + break + } + } + if mostRecentRange == nil || !rangeUsesField(mostRecentRange, "Messages") { + return true + } + + // TODO(drifkin): to be more robust, check that it's in the action + // part, not the `if`'s pipeline part. We do match on the nearest list + // that starts and ends with text nodes, which makes this not strictly + // necessary for our heuristic + + // go up to the nearest ancestor that is a *parse.ListNode + for i := len(ancestors) - 1; i >= 0; i-- { + if l, ok := ancestors[i].(*parse.ListNode); ok { + firstNode := l.Nodes[0] + if t, ok := firstNode.(*parse.TextNode); ok { + openingTag = strings.TrimSpace(t.String()) + } + lastNode := l.Nodes[len(l.Nodes)-1] + if t, ok := lastNode.(*parse.TextNode); ok { + closingTag = strings.TrimSpace(t.String()) + } + + break + } + } + } + } + + return true + } + + exitFn := func(n parse.Node) { + ancestors = ancestors[:len(ancestors)-1] + } + + templateVisit(t.Root, enterFn, exitFn) + + return openingTag, closingTag +} + +// checks to see if the given field name is present in the pipeline of the given range node +func rangeUsesField(rangeNode *parse.RangeNode, field string) bool { + found := false + enterFn := func(n parse.Node) bool { + switch x := n.(type) { + case *parse.FieldNode: + if x.Ident[0] == field { + found = true + } + } + return true + } + templateVisit(rangeNode.BranchNode.Pipe, enterFn, nil) + return found +} diff --git a/thinking/template_test.go b/thinking/template_test.go new file mode 100644 index 00000000..e63558e2 --- /dev/null +++ b/thinking/template_test.go @@ -0,0 +1,130 @@ +package thinking + +import ( + "testing" + "text/template" +) + +func TestInferThinkingTags(t *testing.T) { + cases := []struct { + desc string + tmplString string + wantOpeningTag string + wantClosingTag string + }{ + { + desc: "basic", + tmplString: ` + {{ if .Thinking}} + /think + {{ end }} + {{- range $i, $_ := .Messages }} + {{- $last := eq (len (slice $.Messages $i)) 1 -}} + {{ if and $last .Thinking }} + {{ .Thinking }} + {{ end }} + {{ end }} + `, + wantOpeningTag: "", + wantClosingTag: "", + }, + { + desc: "doubly nested range", + tmplString: ` + {{ if .Thinking}} + /think + {{ end }} + {{- range $i, $_ := .Messages }} + {{- range $j, $_ := .NotMessages }} + {{- $last := eq (len (slice $.Messages $i)) 1 -}} + {{ if and $last .Thinking }} + {{ .Thinking }} + {{ end }} + {{ end }} + {{ end }} + `, + wantOpeningTag: "", + wantClosingTag: "", + }, + { + desc: "whitespace is trimmed", + tmplString: ` + {{ if .Thinking}} + /think + {{ end }} + {{- range $i, $_ := .Messages }} + {{- $last := eq (len (slice $.Messages $i)) 1 -}} + {{ if and $last .Thinking }} + Some text before {{ 
.Thinking }} Some text after + {{ end }} + {{ end }} + `, + wantOpeningTag: "Some text before", + wantClosingTag: "Some text after", + }, + { + desc: "qwen3", + tmplString: ` +{{- if or .System .Tools .Thinking }}<|im_start|>system +{{- if .System }} +{{ .System }} +{{- end }} +{{- if .Tools }} + +# Tools + +You may call one or more functions to assist with the user query. + +You are provided with function signatures within XML tags: + +{{- range .Tools }} +{"type": "function", "function": {{ .Function }}} +{{- end }} + + +For each function call, return a json object with function name and arguments within XML tags: + +{"name": , "arguments": } + +{{- end }} +{{- if .Thinking }} +/think +{{- else }} +/no_think +{{- end }}<|im_end|> +{{ end }} +{{- range $i, $_ := .Messages }} +{{- $last := eq (len (slice $.Messages $i)) 1 -}} +{{- if eq .Role "user" }}<|im_start|>user +{{ .Content }}<|im_end|> +{{ else if eq .Role "assistant" }}<|im_start|>assistant +{{ if and $last .Thinking }} +{{ .Thinking }} +{{ end }} +{{ if .Content }}{{ .Content }} +{{- else if .ToolCalls }} +{{ range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}} +{{ end }} +{{- end }}{{ if not $last }}<|im_end|> +{{ end }} +{{- else if eq .Role "tool" }}<|im_start|>user + +{{ .Content }} +<|im_end|> +{{ end }} +{{- if and (ne .Role "assistant") $last }}<|im_start|>assistant +{{ end }} +{{- end }} + `, + wantOpeningTag: "", + wantClosingTag: "", + }, + } + for _, c := range cases { + tmpl := template.Must(template.New("test").Parse(c.tmplString)) + openingTag, closingTag := InferTags(tmpl) + if openingTag != c.wantOpeningTag || closingTag != c.wantClosingTag { + t.Errorf("case %q: got (%q,%q), want (%q,%q)", c.desc, openingTag, closingTag, c.wantOpeningTag, c.wantClosingTag) + } + } +} From 2ae65ae471c9d51d343f401da16c05b98b99a842 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 6 Jun 2025 14:06:09 -0700 Subject: [PATCH 078/108] win: handle more than 2048 processes (#10997) Fix an array out of bounds crash --- cmd/start_windows.go | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/cmd/start_windows.go b/cmd/start_windows.go index bcc51057..1b648d9d 100644 --- a/cmd/start_windows.go +++ b/cmd/start_windows.go @@ -74,7 +74,16 @@ func isProcRunning(procName string) []uint32 { slog.Debug("failed to check for running installers", "error", err) return nil } - pids = pids[:ret] + if ret > uint32(len(pids)) { + pids = make([]uint32, ret+10) + if err := windows.EnumProcesses(pids, &ret); err != nil || ret == 0 { + slog.Debug("failed to check for running installers", "error", err) + return nil + } + } + if ret < uint32(len(pids)) { + pids = pids[:ret] + } var matches []uint32 for _, pid := range pids { if pid == 0 { From a8ed68bd9383ffec346fd1b3cf60d94c032bbec8 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 6 Jun 2025 14:06:29 -0700 Subject: [PATCH 079/108] launch app hidden (#10962) When starting the app in the background, start it hidden. 
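As a rough illustration of the Windows path changed below: a hidden launch amounts to setting `HideWindow` and the `CREATE_NO_WINDOW` creation flag (0x08000000) on the child process. This is only a sketch under those assumptions — the package and helper name are made up for illustration, not the exact code in this patch:

```go
// Sketch only (Windows build): start an executable without flashing a
// console window. 0x08000000 is CREATE_NO_WINDOW; HideWindow suppresses
// the window for the spawned process.
package cmdutil

import (
	"os"
	"os/exec"
	"strings"
	"syscall"
)

func startHidden(appExe string, args ...string) error {
	cmd := exec.Command(`c:\Windows\system32\cmd.exe`, append([]string{"/c", appExe}, args...)...)
	cmd.SysProcAttr = &syscall.SysProcAttr{CreationFlags: 0x08000000, HideWindow: true}
	cmd.Stdin = strings.NewReader("")
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr
	return cmd.Start()
}
```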
--- cmd/start_darwin.go | 2 +- cmd/start_windows.go | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/cmd/start_darwin.go b/cmd/start_darwin.go index 1a9a1ae8..3cb726ea 100644 --- a/cmd/start_darwin.go +++ b/cmd/start_darwin.go @@ -23,7 +23,7 @@ func startApp(ctx context.Context, client *api.Client) error { return errors.New("could not find ollama app") } path := strings.Split(link, "Ollama.app") - if err := exec.Command("/usr/bin/open", "-a", path[0]+"Ollama.app").Run(); err != nil { + if err := exec.Command("/usr/bin/open", "-j", "-a", path[0]+"Ollama.app").Run(); err != nil { return err } return waitForServer(ctx, client) diff --git a/cmd/start_windows.go b/cmd/start_windows.go index 1b648d9d..635b5077 100644 --- a/cmd/start_windows.go +++ b/cmd/start_windows.go @@ -45,14 +45,11 @@ func startApp(ctx context.Context, client *api.Client) error { } } } - // log.Printf("XXX attempting to start app %s", appExe) cmd_path := "c:\\Windows\\system32\\cmd.exe" - cmd := exec.Command(cmd_path, "/c", appExe) - // TODO - these hide flags aren't working - still pops up a command window for some reason + cmd := exec.Command(cmd_path, "/c", appExe, "hidden") cmd.SysProcAttr = &syscall.SysProcAttr{CreationFlags: 0x08000000, HideWindow: true} - // TODO this didn't help either... cmd.Stdin = strings.NewReader("") cmd.Stdout = os.Stdout cmd.Stderr = os.Stderr From 09d308d6b6c7995e3fb565e0ecfa184d49205f00 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Fri, 6 Jun 2025 23:29:14 -0400 Subject: [PATCH 080/108] Revert "server: add model capabilities to the list endpoint (#10174)" (#11004) This reverts commit 09430011936652cf55925184aaed6f2cebf62a75. --- api/types.go | 13 ++++++------- docs/api.md | 29 ++++++++++------------------- server/routes.go | 14 +++----------- 3 files changed, 19 insertions(+), 37 deletions(-) diff --git a/api/types.go b/api/types.go index a1f896db..94d49200 100644 --- a/api/types.go +++ b/api/types.go @@ -457,13 +457,12 @@ type ProcessResponse struct { // ListModelResponse is a single model description in [ListResponse]. type ListModelResponse struct { - Name string `json:"name"` - Model string `json:"model"` - ModifiedAt time.Time `json:"modified_at"` - Size int64 `json:"size"` - Digest string `json:"digest"` - Capabilities []model.Capability `json:"capabilities,omitempty"` - Details ModelDetails `json:"details,omitempty"` + Name string `json:"name"` + Model string `json:"model"` + ModifiedAt time.Time `json:"modified_at"` + Size int64 `json:"size"` + Digest string `json:"digest"` + Details ModelDetails `json:"details,omitempty"` } // ProcessModelResponse is a single model description in [ProcessResponse]. diff --git a/docs/api.md b/docs/api.md index 31e18bd5..11eaf73a 100644 --- a/docs/api.md +++ b/docs/api.md @@ -1157,15 +1157,11 @@ A single JSON object will be returned. { "models": [ { - - "model": "codellama:13b", - "modified_at": "2023-11-04T14:56:49.277302595-07:00", - "size": 7365960935, - "digest": "9f438cb9cd581fc025612d27f7c1a6669ff83a8bb0ed86c94fcf4c5440555697", - "capabilities": [ - "completion" - ], - + "name": "deepseek-r1:latest", + "model": "deepseek-r1:latest", + "modified_at": "2025-05-10T08:06:48.639712648-07:00", + "size": 4683075271, + "digest": "0a8c266910232fd3291e71e5ba1e058cc5af9d411192cf88b6d30e92b6e73163", "details": { "parent_model": "", "format": "gguf", @@ -1178,16 +1174,11 @@ A single JSON object will be returned. 
} }, { - - "model": "llama4:latest", - "modified_at": "2023-12-07T09:32:18.757212583-08:00", - "size": 3825819519, - "digest": "fe938a131f40e6f6d40083c9f0f430a515233eb2edaa6d72eb85c50d64f2300e", - "capabilities": [ - "completion", - "vision" - ], - + "name": "llama3.2:latest", + "model": "llama3.2:latest", + "modified_at": "2025-05-04T17:37:44.706015396-07:00", + "size": 2019393189, + "digest": "a80c4f17acd55265feec403c7aef86be0c25983ab279d83f3bcd3abbcb5b8b72", "details": { "parent_model": "", "format": "gguf", diff --git a/server/routes.go b/server/routes.go index 70cb6cef..8eda5c73 100644 --- a/server/routes.go +++ b/server/routes.go @@ -929,7 +929,8 @@ func (s *Server) ListHandler(c *gin.Context) { } } - r := api.ListModelResponse{ + // tag should never be masked + models = append(models, api.ListModelResponse{ Model: n.DisplayShortest(), Name: n.DisplayShortest(), Size: m.Size(), @@ -942,16 +943,7 @@ func (s *Server) ListHandler(c *gin.Context) { ParameterSize: cf.ModelType, QuantizationLevel: cf.FileType, }, - } - - model, err := GetModel(n.String()) - if err != nil { - slog.Warn("bad model details", "name", n, "error", err) - } else { - r.Capabilities = model.Capabilities() - } - - models = append(models, r) + }) } slices.SortStableFunc(models, func(i, j api.ListModelResponse) int { From fc0309615e42c32989e060e733d871e16617874e Mon Sep 17 00:00:00 2001 From: Krzysztof Jeziorny <872730+krzysztofjeziorny@users.noreply.github.com> Date: Sat, 7 Jun 2025 05:30:04 +0200 Subject: [PATCH 081/108] docs: update link to AMD drivers in linux.md (#10973) --- docs/linux.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/linux.md b/docs/linux.md index 2dda87f3..72a5ff01 100644 --- a/docs/linux.md +++ b/docs/linux.md @@ -112,8 +112,8 @@ sudo systemctl status ollama > While AMD has contributed the `amdgpu` driver upstream to the official linux > kernel source, the version is older and may not support all ROCm features. We > recommend you install the latest driver from -> https://www.amd.com/en/support/linux-drivers for best support of your Radeon -> GPU. +> [AMD](https://www.amd.com/en/support/download/linux-drivers.html) for best support +> of your Radeon GPU. ## Customizing From feeabdadd2b272b40747f3e7e74957c40ba2800c Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Sun, 8 Jun 2025 09:34:52 -0700 Subject: [PATCH 082/108] spawn desktop quickly (#11011) Give the desktop app a hint to start fast. 
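On macOS the hint is simply forwarded through `open`; a minimal sketch under the same assumption the diff below makes (that the app honors a `--fast-startup` flag — the package and function name here are hypothetical):

```go
// Sketch: launch the app bundle hidden (-j) and pass startup hints to it.
// Everything after --args is handed to the application itself.
package appstart

import "os/exec"

func openAppFast(appPath string) error {
	return exec.Command("/usr/bin/open", "-j", "-a", appPath, "--args", "--fast-startup").Run()
}
```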
--- cmd/start_darwin.go | 2 +- cmd/start_windows.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cmd/start_darwin.go b/cmd/start_darwin.go index 3cb726ea..83af1235 100644 --- a/cmd/start_darwin.go +++ b/cmd/start_darwin.go @@ -23,7 +23,7 @@ func startApp(ctx context.Context, client *api.Client) error { return errors.New("could not find ollama app") } path := strings.Split(link, "Ollama.app") - if err := exec.Command("/usr/bin/open", "-j", "-a", path[0]+"Ollama.app").Run(); err != nil { + if err := exec.Command("/usr/bin/open", "-j", "-a", path[0]+"Ollama.app", "--args", "--fast-startup").Run(); err != nil { return err } return waitForServer(ctx, client) diff --git a/cmd/start_windows.go b/cmd/start_windows.go index 635b5077..9505e1bb 100644 --- a/cmd/start_windows.go +++ b/cmd/start_windows.go @@ -47,7 +47,7 @@ func startApp(ctx context.Context, client *api.Client) error { } cmd_path := "c:\\Windows\\system32\\cmd.exe" - cmd := exec.Command(cmd_path, "/c", appExe, "hidden") + cmd := exec.Command(cmd_path, "/c", appExe, "--hide", "--fast-startup") cmd.SysProcAttr = &syscall.SysProcAttr{CreationFlags: 0x08000000, HideWindow: true} cmd.Stdin = strings.NewReader("") From 82ad1dbc07dc2db39c6f502eb148ff3ce00d96b8 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 9 Jun 2025 16:29:57 -0700 Subject: [PATCH 083/108] mac: handle "keep" named apps (#11031) When a user elects to keep the existing app, the new Ollama is named `Ollama 2.app` This fixes the app startup flow to handle this naming pattern. --- cmd/start_darwin.go | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cmd/start_darwin.go b/cmd/start_darwin.go index 83af1235..05a4551e 100644 --- a/cmd/start_darwin.go +++ b/cmd/start_darwin.go @@ -5,7 +5,7 @@ import ( "errors" "os" "os/exec" - "strings" + "regexp" "github.com/ollama/ollama/api" ) @@ -19,11 +19,12 @@ func startApp(ctx context.Context, client *api.Client) error { if err != nil { return err } - if !strings.Contains(link, "Ollama.app") { + r := regexp.MustCompile(`^.*/Ollama\s?\d*.app`) + m := r.FindStringSubmatch(link) + if len(m) != 1 { return errors.New("could not find ollama app") } - path := strings.Split(link, "Ollama.app") - if err := exec.Command("/usr/bin/open", "-j", "-a", path[0]+"Ollama.app", "--args", "--fast-startup").Run(); err != nil { + if err := exec.Command("/usr/bin/open", "-j", "-a", m[0], "--args", "--fast-startup").Run(); err != nil { return err } return waitForServer(ctx, client) From f63d7f68eb206cf403fb9c7dca7978d16204e268 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Tue, 10 Jun 2025 09:33:54 -0700 Subject: [PATCH 084/108] readme: update quickstart example to Gemma 3 --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 22de41e6..af064a63 100644 --- a/README.md +++ b/README.md @@ -40,10 +40,10 @@ The official [Ollama Docker image](https://hub.docker.com/r/ollama/ollama) `olla ## Quickstart -To run and chat with [Llama 3.2](https://ollama.com/library/llama3.2): +To run and chat with [Llama 3.2](https://ollama.com/library/gemma3): ```shell -ollama run llama3.2 +ollama run gemma3 ``` ## Model library From af21a5ac397c1d2ce62881e0962bbff9d6da31f2 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Tue, 10 Jun 2025 09:34:23 -0700 Subject: [PATCH 085/108] readme: update quickstart link text to Gemma 3 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index af064a63..90e41be8 100644 --- 
a/README.md +++ b/README.md @@ -40,7 +40,7 @@ The official [Ollama Docker image](https://hub.docker.com/r/ollama/ollama) `olla ## Quickstart -To run and chat with [Llama 3.2](https://ollama.com/library/gemma3): +To run and chat with [Gemma 3](https://ollama.com/library/gemma3): ```shell ollama run gemma3 From deaabe292d86b712e061bebe7fdd6be6690f539b Mon Sep 17 00:00:00 2001 From: Attogram Project Date: Tue, 10 Jun 2025 23:14:51 +0200 Subject: [PATCH 086/108] readme: add ollama-multirun to community integrations (#11038) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 90e41be8..ffaec628 100644 --- a/README.md +++ b/README.md @@ -451,6 +451,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [orca-cli](https://github.com/molbal/orca-cli) Ollama Registry CLI Application - Browse, pull, and download models from Ollama Registry in your terminal. - [GGUF-to-Ollama](https://github.com/jonathanhecl/gguf-to-ollama) - Importing GGUF to Ollama made easy (multiplatform) - [AWS-Strands-With-Ollama](https://github.com/rapidarchitect/ollama_strands) - AWS Strands Agents with Ollama Examples +- [ollama-multirun](https://github.com/attogram/ollama-multirun) - A bash shell script to run a single prompt against any or all of your locally installed ollama models, saving the output and performance statistics as easily navigable web pages. ([Demo](https://attogram.github.io/ai_test_zone/)) ### Apple Vision Pro From 2e77aa1ae70372388bd4b08b9957e5198d566a22 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 11 Jun 2025 12:10:15 -0700 Subject: [PATCH 087/108] use nn.Linear in place of ml.Tensor (#11049) while nn.Linear.Forward isn't applicable for sparse MLP, it's still a nice container for the tensors --- model/models/llama4/model_text.go | 12 ++++++------ model/models/qwen3/model.go | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/model/models/llama4/model_text.go b/model/models/llama4/model_text.go index 27935f40..045ab403 100644 --- a/model/models/llama4/model_text.go +++ b/model/models/llama4/model_text.go @@ -63,9 +63,9 @@ func (mlp *TextMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *TextOp } type TextExperts struct { - Gate ml.Tensor `gguf:"ffn_gate_exps.weight"` - Up ml.Tensor `gguf:"ffn_up_exps.weight"` - Down ml.Tensor `gguf:"ffn_down_exps.weight"` + Gate *nn.Linear `gguf:"ffn_gate_exps"` + Up *nn.Linear `gguf:"ffn_up_exps"` + Down *nn.Linear `gguf:"ffn_down_exps"` } func (e *TextExperts) Forward(ctx ml.Context, hiddenStates, routerLogits ml.Tensor, opts *TextOptions) ml.Tensor { @@ -76,9 +76,9 @@ func (e *TextExperts) Forward(ctx ml.Context, hiddenStates, routerLogits ml.Tens hiddenStates = hiddenStates.Repeat(ctx, 1, opts.numExpertsUsed) hiddenStates = hiddenStates.Mul(ctx, scores) - upStates := e.Up.MulmatID(ctx, hiddenStates, experts) - gateStates := e.Gate.MulmatID(ctx, hiddenStates, experts) - downStates := e.Down.MulmatID(ctx, upStates.Mul(ctx, gateStates.SILU(ctx)), experts) + upStates := e.Up.Weight.MulmatID(ctx, hiddenStates, experts) + gateStates := e.Gate.Weight.MulmatID(ctx, hiddenStates, experts) + downStates := e.Down.Weight.MulmatID(ctx, upStates.Mul(ctx, gateStates.SILU(ctx)), experts) nextStates := downStates.View(ctx, 0, hiddenStates.Dim(0), downStates.Stride(2), hiddenStates.Dim(2)) for i := 1; i < opts.numExpertsUsed; i++ { diff --git a/model/models/qwen3/model.go b/model/models/qwen3/model.go index 1930da7e..7a83e0d0 100644 --- a/model/models/qwen3/model.go +++ 
b/model/models/qwen3/model.go @@ -66,9 +66,9 @@ type MLP interface { type sparse struct { Router *nn.Linear `gguf:"ffn_gate_inp"` - Gate ml.Tensor `gguf:"ffn_gate_exps.weight"` - Up ml.Tensor `gguf:"ffn_up_exps.weight"` - Down ml.Tensor `gguf:"ffn_down_exps.weight"` + Gate *nn.Linear `gguf:"ffn_gate_exps"` + Up *nn.Linear `gguf:"ffn_up_exps"` + Down *nn.Linear `gguf:"ffn_down_exps"` } func (mlp *sparse) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *Options) ml.Tensor { @@ -87,13 +87,13 @@ func (mlp *sparse) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *Options hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0), 1, hiddenStates.Dim(1)) - upStates := mlp.Up.MulmatID(ctx, hiddenStates, selectedExperts) + upStates := mlp.Up.Weight.MulmatID(ctx, hiddenStates, selectedExperts) - hiddenStates = mlp.Gate.MulmatID(ctx, hiddenStates, selectedExperts) + hiddenStates = mlp.Gate.Weight.MulmatID(ctx, hiddenStates, selectedExperts) hiddenStates = hiddenStates.SILU(ctx) hiddenStates = hiddenStates.Mul(ctx, upStates) - experts := mlp.Down.MulmatID(ctx, hiddenStates, selectedExperts) + experts := mlp.Down.Weight.MulmatID(ctx, hiddenStates, selectedExperts) experts = experts.Mul(ctx, routingWeights) nextStates := experts.View(ctx, 0, experts.Dim(0), experts.Stride(2), experts.Dim(2)) From 0dabb4ef6a1aab240a59b6bb4ef82372d335e3a9 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 11 Jun 2025 12:10:35 -0700 Subject: [PATCH 088/108] skip tokenizer.model if possible (#11050) if tokenizer.json is already copied, skip tokenizer.model --- parser/parser.go | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/parser/parser.go b/parser/parser.go index 96eae9c0..d40a79c2 100644 --- a/parser/parser.go +++ b/parser/parser.go @@ -292,13 +292,18 @@ func filesForModel(path string) ([]string, error) { } files = append(files, js...) - if tks, _ := glob(filepath.Join(path, "tokenizer.model"), "application/octet-stream"); len(tks) > 0 { - // add tokenizer.model if it exists, tokenizer.json is automatically picked up by the previous glob - // tokenizer.model might be a unresolved git lfs reference; error if it is - files = append(files, tks...) - } else if tks, _ := glob(filepath.Join(path, "**/tokenizer.model"), "text/plain"); len(tks) > 0 { - // some times tokenizer.model is in a subdirectory (e.g. meta-llama/Meta-Llama-3-8B) - files = append(files, tks...) + // only include tokenizer.model is tokenizer.json is not present + if !slices.ContainsFunc(files, func(s string) bool { + return slices.Contains(strings.Split(s, string(os.PathSeparator)), "tokenizer.json") + }) { + if tks, _ := glob(filepath.Join(path, "tokenizer.model"), "application/octet-stream"); len(tks) > 0 { + // add tokenizer.model if it exists, tokenizer.json is automatically picked up by the previous glob + // tokenizer.model might be a unresolved git lfs reference; error if it is + files = append(files, tks...) + } else if tks, _ := glob(filepath.Join(path, "**/tokenizer.model"), "text/plain"); len(tks) > 0 { + // some times tokenizer.model is in a subdirectory (e.g. meta-llama/Meta-Llama-3-8B) + files = append(files, tks...) + } } return files, nil From 45f56355d557b7130c7c07bbd6e1b634a758d946 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 11 Jun 2025 12:10:54 -0700 Subject: [PATCH 089/108] feat: uneven splits (#11048) The current splitDim function only operates on tensors that are split evenly which isn't always the case, e.g. a QKV tensor. 
This change allows the function to be used for arbitrary splits --- convert/convert_qwen25vl.go | 10 +- convert/tensor.go | 50 ++++-- convert/tensor_test.go | 304 ++++++++++++++++++++++++++++++++++++ 3 files changed, 344 insertions(+), 20 deletions(-) create mode 100644 convert/tensor_test.go diff --git a/convert/convert_qwen25vl.go b/convert/convert_qwen25vl.go index c2d5a633..6e4c9640 100644 --- a/convert/convert_qwen25vl.go +++ b/convert/convert_qwen25vl.go @@ -65,17 +65,17 @@ func (q *qwen25VLModel) Tensors(ts []Tensor) []*ggml.Tensor { for _, t := range ts { if strings.Contains(t.Name(), "patch_embed.proj") { for t := range splitDim(t, 2, - strings.NewReplacer("patch_embed.proj", "patch_embd_0"), - strings.NewReplacer("patch_embed.proj", "patch_embd_1"), + split{Replacer: strings.NewReplacer("patch_embed.proj", "patch_embd_0")}, + split{Replacer: strings.NewReplacer("patch_embed.proj", "patch_embd_1")}, ) { t.Shape = slices.DeleteFunc(t.Shape, func(i uint64) bool { return i == 1 }) out = append(out, t) } } else if strings.Contains(t.Name(), "attn.qkv") { out = append(out, slices.Collect(splitDim(t, 0, - strings.NewReplacer("attn.qkv", "attn_q"), - strings.NewReplacer("attn.qkv", "attn_k"), - strings.NewReplacer("attn.qkv", "attn_v"), + split{Replacer: strings.NewReplacer("attn.qkv", "attn_q")}, + split{Replacer: strings.NewReplacer("attn.qkv", "attn_k")}, + split{Replacer: strings.NewReplacer("attn.qkv", "attn_v")}, ))...) } else { out = append(out, &ggml.Tensor{ diff --git a/convert/tensor.go b/convert/tensor.go index ffb22ead..9d6919e3 100644 --- a/convert/tensor.go +++ b/convert/tensor.go @@ -1,53 +1,73 @@ package convert import ( + "cmp" "iter" "slices" "strings" - "github.com/ollama/ollama/fs/ggml" "github.com/pdevine/tensor" "github.com/pdevine/tensor/native" + + "github.com/ollama/ollama/fs/ggml" ) +type split struct { + *strings.Replacer + dim int + + // fn is an optional function to apply to the tensor after slicing + fn func(tensor.Tensor) (tensor.Tensor, error) +} + // splitDim splits a tensor along a specified dimension into multiple tensors. The dimension -// is split evenly based on the number of replacers provided. -func splitDim(t Tensor, dim int, replacers ...*strings.Replacer) iter.Seq[*ggml.Tensor] { +// is split evenly based on the number of replacers provided unless a specific count is given. +func splitDim(t Tensor, dim int, splits ...split) iter.Seq[*ggml.Tensor] { return func(yield func(*ggml.Tensor) bool) { - for i, replacer := range replacers { + var offset int + for _, split := range splits { + t := t.Clone() shape := slices.Clone(t.Shape()) - shape[dim] = shape[dim] / uint64(len(replacers)) + shape[dim] = cmp.Or(uint64(split.dim), shape[dim]/uint64(len(splits))) slice := slices.Repeat([]tensor.Slice{nil}, len(shape)) - slice[dim] = tensor.S(i*int(shape[dim]), (i+1)*int(shape[dim])) + slice[dim] = tensor.S(offset, offset+int(shape[dim])) + offset += int(shape[dim]) - tt := t.Clone() - tt.SetRepacker(func(_ string, data []float32, shape []uint64) ([]float32, error) { + t.SetRepacker(func(_ string, data []float32, shape []uint64) ([]float32, error) { dims := make([]int, len(shape)) for i := range shape { dims[i] = int(shape[i]) } - var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data)) - t, err := t.Slice(slice...) + var tt tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data)) + tt, err := tt.Slice(slice...) 
if err != nil { return nil, err } - t = tensor.Materialize(t) + tt = tensor.Materialize(tt) + + if split.fn != nil { + tt, err = split.fn(tt) + if err != nil { + return nil, err + } + } + // flatten tensor so it can be written as a vector - if err := t.Reshape(t.Shape().TotalSize()); err != nil { + if err := tt.Reshape(tt.Shape().TotalSize()); err != nil { return nil, err } - return native.VectorF32(t.(*tensor.Dense)) + return native.VectorF32(tt.(*tensor.Dense)) }) if !yield(&ggml.Tensor{ - Name: replacer.Replace(t.Name()), + Name: split.Replace(t.Name()), Kind: t.Kind(), Shape: shape, - WriterTo: tt, + WriterTo: t, }) { break } diff --git a/convert/tensor_test.go b/convert/tensor_test.go new file mode 100644 index 00000000..ea12d0f5 --- /dev/null +++ b/convert/tensor_test.go @@ -0,0 +1,304 @@ +package convert + +import ( + "bytes" + "encoding/binary" + "io" + "iter" + "slices" + "strings" + "testing" + + "github.com/pdevine/tensor" +) + +type fakeTensor struct { + name string + shape []uint64 + data []float32 + + repacker Repacker +} + +func (f fakeTensor) Name() string { + return f.name +} + +func (f fakeTensor) Shape() []uint64 { + return f.shape +} + +func (f fakeTensor) Kind() uint32 { + return 0 +} + +func (f *fakeTensor) SetRepacker(fn Repacker) { + f.repacker = fn +} + +func (f fakeTensor) Clone() Tensor { + return &fakeTensor{ + name: f.name, + shape: slices.Clone(f.shape), + data: slices.Clone(f.data), + repacker: f.repacker, + } +} + +func (f fakeTensor) WriteTo(w io.Writer) (n int64, err error) { + data := f.data + if f.repacker != nil { + data, err = f.repacker(f.name, data, f.shape) + if err != nil { + return 0, err + } + } + + if err := binary.Write(w, binary.LittleEndian, data); err != nil { + return 0, err + } + + return int64(len(data) * 4), nil +} + +func mul(shape []uint64) int { + n := 1 + for _, dim := range shape { + n *= int(dim) + } + return n +} + +func TestSplitDim(t *testing.T) { + r := fakeTensor{ + name: "a.b", + shape: []uint64{3, 4}, + data: []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, + } + + t.Run("no split", func(t *testing.T) { + for tt := range splitDim(&r, 0, split{Replacer: strings.NewReplacer("a", "x")}) { + if tt.Name != "x.b" { + t.Fatalf("expected name 'x', got '%s'", tt.Name) + } + + if !slices.Equal(tt.Shape, []uint64{3, 4}) { + t.Fatalf("expected shape [3, 4], got %v", tt.Shape) + } + + var b bytes.Buffer + if _, err := tt.WriteTo(&b); err != nil { + t.Fatal(err) + } + + f32s := make([]float32, mul(tt.Shape)) + if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil { + t.Fatal(err) + } + + if !slices.Equal(f32s, []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}) { + t.Fatalf("expected data [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], got %v", f32s) + } + } + }) + + t.Run("even split", func(t *testing.T) { + next, stop := iter.Pull(splitDim(&r, 1, + split{Replacer: strings.NewReplacer("a", "x")}, + split{Replacer: strings.NewReplacer("b", "y")}, + )) + defer stop() + + { + tt, ok := next() + if !ok { + t.Fatal("expected at least one split") + } + + if tt.Name != "x.b" { + t.Fatal("expected name 'x.b', got", tt.Name) + } + + if !slices.Equal(tt.Shape, []uint64{3, 2}) { + t.Fatal("expected shape [3, 2], got", tt.Shape) + } + + var b bytes.Buffer + if _, err := tt.WriteTo(&b); err != nil { + t.Fatal(err) + } + + f32s := make([]float32, mul(tt.Shape)) + if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil { + t.Fatal(err) + } + + if !slices.Equal(f32s, []float32{0, 1, 4, 5, 8, 9}) { + t.Fatal("expected data [0, 1, 4, 5, 8, 9], 
got", f32s) + } + } + + { + tt, ok := next() + if !ok { + t.Fatal("expected at least one split") + } + + if tt.Name != "a.y" { + t.Fatal("expected name 'a.y', got", tt.Name) + } + + if !slices.Equal(tt.Shape, []uint64{3, 2}) { + t.Fatal("expected shape [3, 2], got", tt.Shape) + } + + var b bytes.Buffer + if _, err := tt.WriteTo(&b); err != nil { + t.Fatal(err) + } + + f32s := make([]float32, mul(tt.Shape)) + if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil { + t.Fatal(err) + } + + if !slices.Equal(f32s, []float32{2, 3, 6, 7, 10, 11}) { + t.Fatal("expected data [2, 3, 6, 7, 10, 11], got", f32s) + } + } + }) + + t.Run("uneven split", func(t *testing.T) { + next, stop := iter.Pull(splitDim(&r, 0, + split{Replacer: strings.NewReplacer("a", "x"), dim: 2}, + split{Replacer: strings.NewReplacer("b", "y"), dim: 1}, + )) + defer stop() + + { + tt, ok := next() + if !ok { + t.Fatal("expected at least one split") + } + + if tt.Name != "x.b" { + t.Fatal("expected name 'x.b', got", tt.Name) + } + + if !slices.Equal(tt.Shape, []uint64{2, 4}) { + t.Fatal("expected shape [2, 4], got", tt.Shape) + } + + var b bytes.Buffer + if _, err := tt.WriteTo(&b); err != nil { + t.Fatal(err) + } + + f32s := make([]float32, mul(tt.Shape)) + if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil { + t.Fatal(err) + } + + if !slices.Equal(f32s, []float32{0, 1, 2, 3, 4, 5, 6, 7}) { + t.Fatal("expected data [0, 1, 2, 3, 4, 5, 6, 7], got", f32s) + } + } + + { + tt, ok := next() + if !ok { + t.Fatal("expected at least one split") + } + + if tt.Name != "a.y" { + t.Fatal("expected name 'a.y', got", tt.Name) + } + + if !slices.Equal(tt.Shape, []uint64{1, 4}) { + t.Fatal("expected shape [1, 4], got", tt.Shape) + } + + var b bytes.Buffer + if _, err := tt.WriteTo(&b); err != nil { + t.Fatal(err) + } + + f32s := make([]float32, mul(tt.Shape)) + if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil { + t.Fatal(err) + } + + if !slices.Equal(f32s, []float32{8, 9, 10, 11}) { + t.Fatal("expected data [8, 9, 10, 11], got", f32s) + } + } + }) + + t.Run("split with transpose", func(t *testing.T) { + next, stop := iter.Pull(splitDim(&r, 1, + split{Replacer: strings.NewReplacer("a", "x")}, + split{Replacer: strings.NewReplacer("b", "y"), fn: func(tt tensor.Tensor) (tensor.Tensor, error) { + return tensor.Transpose(tt, 1, 0) + }}, + )) + defer stop() + + { + tt, ok := next() + if !ok { + t.Fatal("expected at least one split") + } + + if tt.Name != "x.b" { + t.Fatal("expected name 'x.b', got", tt.Name) + } + + if !slices.Equal(tt.Shape, []uint64{3, 2}) { + t.Fatal("expected shape [3, 2], got", tt.Shape) + } + + var b bytes.Buffer + if _, err := tt.WriteTo(&b); err != nil { + t.Fatal(err) + } + + f32s := make([]float32, mul(tt.Shape)) + if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil { + t.Fatal(err) + } + + if !slices.Equal(f32s, []float32{0, 1, 4, 5, 8, 9}) { + t.Fatal("expected data [0, 1, 4, 5, 8, 9], got", f32s) + } + } + + { + tt, ok := next() + if !ok { + t.Fatal("expected at least one split") + } + + if tt.Name != "a.y" { + t.Fatal("expected name 'a.y', got", tt.Name) + } + + if !slices.Equal(tt.Shape, []uint64{3, 2}) { + t.Fatal("expected shape [3, 2], got", tt.Shape) + } + + var b bytes.Buffer + if _, err := tt.WriteTo(&b); err != nil { + t.Fatal(err) + } + + f32s := make([]float32, mul(tt.Shape)) + if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil { + t.Fatal(err) + } + + if !slices.Equal(f32s, []float32{2, 6, 10, 3, 7, 11}) { + t.Fatal("expected data [2, 6, 
10, 3, 7, 11], got", f32s) + } + } + }) +} From 6b04cad7e816d1a119559e092d59f4fbaa6c3a0b Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Thu, 12 Jun 2025 11:04:11 -0700 Subject: [PATCH 090/108] feat: incremental gguf parser (#10822) * incremental gguf parser * gguf: update test to not rely on gguf on disc * re-use existing create gguf * read capabilities from gguf kv * kv exists * update tests * s/doneFunc/successFunc/g * new buffered reader --------- Co-authored-by: Bruce MacDonald --- fs/gguf/gguf.go | 347 ++++++++++++++++++++++++++++++++++++ fs/gguf/gguf_test.go | 249 ++++++++++++++++++++++++++ fs/gguf/keyvalue.go | 90 ++++++++++ fs/gguf/keyvalue_test.go | 208 +++++++++++++++++++++ fs/gguf/lazy.go | 89 +++++++++ fs/gguf/reader.go | 23 +++ fs/gguf/tensor.go | 288 ++++++++++++++++++++++++++++++ go.mod | 2 +- go.sum | 4 +- server/images.go | 24 ++- server/images_test.go | 165 +++++------------ server/quantization_test.go | 12 +- server/sched_test.go | 20 +-- 13 files changed, 1357 insertions(+), 164 deletions(-) create mode 100644 fs/gguf/gguf.go create mode 100644 fs/gguf/gguf_test.go create mode 100644 fs/gguf/keyvalue.go create mode 100644 fs/gguf/keyvalue_test.go create mode 100644 fs/gguf/lazy.go create mode 100644 fs/gguf/reader.go create mode 100644 fs/gguf/tensor.go diff --git a/fs/gguf/gguf.go b/fs/gguf/gguf.go new file mode 100644 index 00000000..ebb9286f --- /dev/null +++ b/fs/gguf/gguf.go @@ -0,0 +1,347 @@ +package gguf + +import ( + "bytes" + "cmp" + "encoding/binary" + "errors" + "fmt" + "io" + "iter" + "os" + "slices" + "strings" +) + +const ( + typeUint8 uint32 = iota + typeInt8 + typeUint16 + typeInt16 + typeUint32 + typeInt32 + typeFloat32 + typeBool + typeString + typeArray + typeUint64 + typeInt64 + typeFloat64 +) + +var ErrUnsupported = errors.New("unsupported") + +type File struct { + Magic [4]byte + Version uint32 + + keyValues *lazy[KeyValue] + tensors *lazy[TensorInfo] + offset int64 + + file *os.File + reader *bufferedReader + bts []byte +} + +func Open(path string) (f *File, err error) { + f = &File{bts: make([]byte, 4096)} + f.file, err = os.Open(path) + if err != nil { + return nil, err + } + + f.reader = newBufferedReader(f.file, 32<<10) + + if err := binary.Read(f.reader, binary.LittleEndian, &f.Magic); err != nil { + return nil, err + } + + if bytes.Equal(f.Magic[:], []byte("gguf")) { + return nil, fmt.Errorf("%w file type %v", ErrUnsupported, f.Magic) + } + + if err := binary.Read(f.reader, binary.LittleEndian, &f.Version); err != nil { + return nil, err + } + + if f.Version != 3 { + return nil, fmt.Errorf("%w version %v", ErrUnsupported, f.Version) + } + + f.tensors, err = newLazy(f, f.readTensor) + if err != nil { + return nil, err + } + + f.tensors.successFunc = func() error { + offset := f.reader.offset + + alignment := cmp.Or(f.KeyValue("general.alignment").Int(), 32) + f.offset = offset + (alignment-offset%alignment)%alignment + return nil + } + + f.keyValues, err = newLazy(f, f.readKeyValue) + if err != nil { + return nil, err + } + + return f, nil +} + +func (f *File) readTensor() (TensorInfo, error) { + name, err := readString(f) + if err != nil { + return TensorInfo{}, err + } + + dims, err := read[uint32](f) + if err != nil { + return TensorInfo{}, err + } + + shape := make([]uint64, dims) + for i := range dims { + shape[i], err = read[uint64](f) + if err != nil { + return TensorInfo{}, err + } + } + + type_, err := read[uint32](f) + if err != nil { + return TensorInfo{}, err + } + + offset, err := read[uint64](f) + if err != nil { + return 
TensorInfo{}, err + } + + return TensorInfo{ + Name: name, + Offset: offset, + Shape: shape, + Type: TensorType(type_), + }, nil +} + +func (f *File) readKeyValue() (KeyValue, error) { + key, err := readString(f) + if err != nil { + return KeyValue{}, err + } + + t, err := read[uint32](f) + if err != nil { + return KeyValue{}, err + } + + value, err := func() (any, error) { + switch t { + case typeUint8: + return read[uint8](f) + case typeInt8: + return read[int8](f) + case typeUint16: + return read[uint16](f) + case typeInt16: + return read[int16](f) + case typeUint32: + return read[uint32](f) + case typeInt32: + return read[int32](f) + case typeUint64: + return read[uint64](f) + case typeInt64: + return read[int64](f) + case typeFloat32: + return read[float32](f) + case typeFloat64: + return read[float64](f) + case typeBool: + return read[bool](f) + case typeString: + return readString(f) + case typeArray: + return readArray(f) + default: + return nil, fmt.Errorf("%w type %d", ErrUnsupported, t) + } + }() + if err != nil { + return KeyValue{}, err + } + + return KeyValue{ + Key: key, + Value: Value{value}, + }, nil +} + +func read[T any](f *File) (t T, err error) { + err = binary.Read(f.reader, binary.LittleEndian, &t) + return t, err +} + +func readString(f *File) (string, error) { + n, err := read[uint64](f) + if err != nil { + return "", err + } + + if int(n) > len(f.bts) { + f.bts = make([]byte, n) + } + + bts := f.bts[:n] + if _, err := io.ReadFull(f.reader, bts); err != nil { + return "", err + } + defer clear(bts) + + return string(bts), nil +} + +func readArray(f *File) (any, error) { + t, err := read[uint32](f) + if err != nil { + return nil, err + } + + n, err := read[uint64](f) + if err != nil { + return nil, err + } + + switch t { + case typeUint8: + return readArrayData[uint8](f, n) + case typeInt8: + return readArrayData[int8](f, n) + case typeUint16: + return readArrayData[uint16](f, n) + case typeInt16: + return readArrayData[int16](f, n) + case typeUint32: + return readArrayData[uint32](f, n) + case typeInt32: + return readArrayData[int32](f, n) + case typeUint64: + return readArrayData[uint64](f, n) + case typeInt64: + return readArrayData[int64](f, n) + case typeFloat32: + return readArrayData[float32](f, n) + case typeFloat64: + return readArrayData[float64](f, n) + case typeBool: + return readArrayData[bool](f, n) + case typeString: + return readArrayString(f, n) + default: + return nil, fmt.Errorf("%w type %d", ErrUnsupported, t) + } +} + +func readArrayData[T any](f *File, n uint64) (s []T, err error) { + s = make([]T, n) + for i := range n { + e, err := read[T](f) + if err != nil { + return nil, err + } + + s[i] = e + } + + return s, nil +} + +func readArrayString(f *File, n uint64) (s []string, err error) { + s = make([]string, n) + for i := range n { + e, err := readString(f) + if err != nil { + return nil, err + } + + s[i] = e + } + + return s, nil +} + +func (f *File) Close() error { + f.keyValues.stop() + f.tensors.stop() + return f.file.Close() +} + +func (f *File) KeyValue(key string) KeyValue { + if !strings.HasPrefix(key, "general.") && !strings.HasPrefix(key, "tokenizer.") { + key = f.KeyValue("general.architecture").String() + "." 
+ key + } + + if index := slices.IndexFunc(f.keyValues.values, func(kv KeyValue) bool { + return kv.Key == key + }); index >= 0 { + return f.keyValues.values[index] + } + + for keyValue, ok := f.keyValues.next(); ok; keyValue, ok = f.keyValues.next() { + if keyValue.Key == key { + return keyValue + } + } + + return KeyValue{} +} + +func (f *File) NumKeyValues() int { + return int(f.keyValues.count) +} + +func (f *File) KeyValues() iter.Seq2[int, KeyValue] { + return f.keyValues.All() +} + +func (f *File) TensorInfo(name string) TensorInfo { + if index := slices.IndexFunc(f.tensors.values, func(t TensorInfo) bool { + return t.Name == name + }); index >= 0 { + return f.tensors.values[index] + } + + // fast-forward through key values if we haven't already + _ = f.keyValues.rest() + for tensor, ok := f.tensors.next(); ok; tensor, ok = f.tensors.next() { + if tensor.Name == name { + return tensor + } + } + + return TensorInfo{} +} + +func (f *File) NumTensors() int { + return int(f.tensors.count) +} + +func (f *File) TensorInfos() iter.Seq2[int, TensorInfo] { + // fast forward through key values if we haven't already + f.keyValues.rest() + return f.tensors.All() +} + +func (f *File) TensorReader(name string) (TensorInfo, io.Reader, error) { + t := f.TensorInfo(name) + if t.NumBytes() == 0 { + return TensorInfo{}, nil, fmt.Errorf("tensor %s not found", name) + } + + // fast forward through tensor info if we haven't already + _ = f.tensors.rest() + return t, io.NewSectionReader(f.file, f.offset+int64(t.Offset), t.NumBytes()), nil +} diff --git a/fs/gguf/gguf_test.go b/fs/gguf/gguf_test.go new file mode 100644 index 00000000..eea28a48 --- /dev/null +++ b/fs/gguf/gguf_test.go @@ -0,0 +1,249 @@ +package gguf_test + +import ( + "bytes" + "os" + "strconv" + "strings" + "testing" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + "github.com/ollama/ollama/fs/ggml" + "github.com/ollama/ollama/fs/gguf" +) + +func createBinFile(tb testing.TB) string { + tb.Helper() + f, err := os.CreateTemp(tb.TempDir(), "") + if err != nil { + tb.Fatal(err) + } + defer f.Close() + + kv := ggml.KV{ + "general.architecture": "llama", + "llama.block_count": uint32(8), + "llama.embedding_length": uint32(3), + "llama.attention.head_count": uint32(2), + "llama.attention.head_count_kv": uint32(2), + "llama.attention.key_length": uint32(3), + "llama.rope.dimension_count": uint32(4), + "llama.rope.freq_base": float32(10000.0), + "llama.rope.freq_scale": float32(1.0), + "llama.attention.layer_norm_rms_epsilon": float32(1e-6), + "tokenizer.ggml.eos_token_id": uint32(0), + "tokenizer.ggml.eos_token_ids": []int32{1, 2, 3}, + "tokenizer.ggml.tokens": []string{"hello", "world"}, + "tokenizer.ggml.scores": []float32{0, 1}, + } + + tensors := []*ggml.Tensor{ + { + Name: "token_embd.weight", + Kind: 0, + Shape: []uint64{2, 3}, + WriterTo: bytes.NewBuffer(make([]byte, 4*2*3)), + }, + { + Name: "output.weight", + Kind: 0, + Shape: []uint64{3, 2}, + WriterTo: bytes.NewBuffer(make([]byte, 4*3*2)), + }, + } + + for i := range 8 { + tensors = append(tensors, &ggml.Tensor{ + Name: "blk." + strconv.Itoa(i) + ".attn_q.weight", + Kind: 0, + Shape: []uint64{3, 3}, + WriterTo: bytes.NewBuffer(make([]byte, 4*3*3)), + }, &ggml.Tensor{ + Name: "blk." + strconv.Itoa(i) + ".attn_k.weight", + Kind: 0, + Shape: []uint64{3, 3}, + WriterTo: bytes.NewBuffer(make([]byte, 4*3*3)), + }, &ggml.Tensor{ + Name: "blk." 
+ strconv.Itoa(i) + ".attn_v.weight", + Kind: 0, + Shape: []uint64{3, 3}, + WriterTo: bytes.NewBuffer(make([]byte, 4*3*3)), + }, &ggml.Tensor{ + Name: "blk." + strconv.Itoa(i) + ".attn_output.weight", + Kind: 0, + Shape: []uint64{3, 3}, + WriterTo: bytes.NewBuffer(make([]byte, 4*3*3)), + }) + } + + if err := ggml.WriteGGUF(f, kv, tensors); err != nil { + tb.Fatal(err) + } + + return f.Name() +} + +func TestRead(t *testing.T) { + f, err := gguf.Open(createBinFile(t)) + if err != nil { + t.Fatal(err) + } + defer f.Close() + + if got := f.KeyValue("does.not.exist").Valid(); got { + t.Errorf(`KeyValue("does.not.exist").Exists() = %v, want false`, got) + } + + if got := f.KeyValue("general.architecture").String(); got != "llama" { + t.Errorf(`KeyValue("general.architecture").String() = %q, want %q`, got, "llama") + } + + if got := f.TensorInfo("token_embd.weight"); got.Name != "token_embd.weight" { + t.Errorf(`TensorInfo("token_embd.weight").Name = %q, want %q`, got.Name, "token_embd.weight") + } else if diff := cmp.Diff(got.Shape, []uint64{2, 3}); diff != "" { + t.Errorf(`TensorInfo("token_embd.weight").Shape mismatch (-got +want):\n%s`, diff) + } else if got.Type != gguf.TensorTypeF32 { + t.Errorf(`TensorInfo("token_embd.weight").Type = %d, want %d`, got.Type, gguf.TensorTypeF32) + } + + if got := f.KeyValue("block_count").Uint(); got != 8 { + t.Errorf(`KeyValue("block_count").Uint() = %d, want %d`, got, 8) + } + + if diff := cmp.Diff(f.KeyValue("tokenizer.ggml.tokens").Strings(), []string{"hello", "world"}); diff != "" { + t.Errorf("KeyValue(\"tokenizer.ggml.tokens\").Strings() mismatch (-got +want):\n%s", diff) + } + + if diff := cmp.Diff(f.KeyValue("tokenizer.ggml.scores").Floats(), []float64{0, 1}); diff != "" { + t.Errorf("KeyValue(\"tokenizer.ggml.scores\").Ints() mismatch (-got +want):\n%s", diff) + } + + var kvs []string + for _, kv := range f.KeyValues() { + if !kv.Valid() { + t.Error("found invalid key-value pair:", kv) + } + + kvs = append(kvs, kv.Key) + } + + if len(kvs) != f.NumKeyValues() { + t.Errorf("iterated key count = %d, want %d", len(kvs), f.NumKeyValues()) + } + + if diff := cmp.Diff(kvs, []string{ + "general.architecture", + "llama.block_count", + "llama.embedding_length", + "llama.attention.head_count", + "llama.attention.head_count_kv", + "llama.attention.key_length", + "llama.rope.dimension_count", + "llama.rope.freq_base", + "llama.rope.freq_scale", + "llama.attention.layer_norm_rms_epsilon", + "tokenizer.ggml.eos_token_id", + "tokenizer.ggml.eos_token_ids", + "tokenizer.ggml.tokens", + "tokenizer.ggml.scores", + }, cmpopts.SortSlices(strings.Compare)); diff != "" { + t.Errorf("KeyValues() mismatch (-got +want):\n%s", diff) + } + + var tis []string + for _, ti := range f.TensorInfos() { + if !ti.Valid() { + t.Error("found invalid tensor info:", ti) + } + + tis = append(tis, ti.Name) + } + + if len(tis) != f.NumTensors() { + t.Errorf("iterated tensor count = %d, want %d", len(tis), f.NumTensors()) + } + + if diff := cmp.Diff(tis, []string{ + "token_embd.weight", + "output.weight", + "blk.0.attn_q.weight", + "blk.0.attn_k.weight", + "blk.0.attn_v.weight", + "blk.0.attn_output.weight", + "blk.1.attn_q.weight", + "blk.1.attn_k.weight", + "blk.1.attn_v.weight", + "blk.1.attn_output.weight", + "blk.2.attn_q.weight", + "blk.2.attn_k.weight", + "blk.2.attn_v.weight", + "blk.2.attn_output.weight", + "blk.3.attn_q.weight", + "blk.3.attn_k.weight", + "blk.3.attn_v.weight", + "blk.3.attn_output.weight", + "blk.4.attn_q.weight", + "blk.4.attn_k.weight", + "blk.4.attn_v.weight", 
+ "blk.4.attn_output.weight", + "blk.5.attn_q.weight", + "blk.5.attn_k.weight", + "blk.5.attn_v.weight", + "blk.5.attn_output.weight", + "blk.6.attn_q.weight", + "blk.6.attn_k.weight", + "blk.6.attn_v.weight", + "blk.6.attn_output.weight", + "blk.7.attn_q.weight", + "blk.7.attn_k.weight", + "blk.7.attn_v.weight", + "blk.7.attn_output.weight", + }, cmpopts.SortSlices(strings.Compare)); diff != "" { + t.Errorf("TensorInfos() mismatch (-got +want):\n%s", diff) + } + + ti, r, err := f.TensorReader("output.weight") + if err != nil { + t.Fatalf(`TensorReader("output.weight") error: %v`, err) + } + + if ti.Name != "output.weight" { + t.Errorf(`TensorReader("output.weight").Name = %q, want %q`, ti.Name, "output.weight") + } else if diff := cmp.Diff(ti.Shape, []uint64{3, 2}); diff != "" { + t.Errorf(`TensorReader("output.weight").Shape mismatch (-got +want):\n%s`, diff) + } else if ti.Type != gguf.TensorTypeF32 { + t.Errorf(`TensorReader("output.weight").Type = %d, want %d`, ti.Type, gguf.TensorTypeF32) + } + + var b bytes.Buffer + if _, err := b.ReadFrom(r); err != nil { + t.Fatalf(`ReadFrom TensorReader("output.weight") error: %v`, err) + } + + if b.Len() != int(ti.NumBytes()) { + t.Errorf(`ReadFrom TensorReader("output.weight") length = %d, want %d`, b.Len(), ti.NumBytes()) + } +} + +func BenchmarkRead(b *testing.B) { + b.ReportAllocs() + + p := createBinFile(b) + for b.Loop() { + f, err := gguf.Open(p) + if err != nil { + b.Fatal(err) + } + + if got := f.KeyValue("general.architecture").String(); got != "llama" { + b.Errorf("got = %q, want %q", got, "llama") + } + + // Iterate through some tensors + for range f.TensorInfos() { + } + + f.Close() + } +} diff --git a/fs/gguf/keyvalue.go b/fs/gguf/keyvalue.go new file mode 100644 index 00000000..5843326c --- /dev/null +++ b/fs/gguf/keyvalue.go @@ -0,0 +1,90 @@ +package gguf + +import ( + "reflect" + "slices" +) + +type KeyValue struct { + Key string + Value +} + +func (kv KeyValue) Valid() bool { + return kv.Key != "" && kv.Value.value != nil +} + +type Value struct { + value any +} + +func value[T any](v Value, kinds ...reflect.Kind) (t T) { + vv := reflect.ValueOf(v.value) + if slices.Contains(kinds, vv.Kind()) { + t = vv.Convert(reflect.TypeOf(t)).Interface().(T) + } + return +} + +func values[T any](v Value, kinds ...reflect.Kind) (ts []T) { + switch vv := reflect.ValueOf(v.value); vv.Kind() { + case reflect.Slice: + if slices.Contains(kinds, vv.Type().Elem().Kind()) { + ts = make([]T, vv.Len()) + for i := range vv.Len() { + ts[i] = vv.Index(i).Convert(reflect.TypeOf(ts[i])).Interface().(T) + } + } + } + return +} + +// Int returns Value as a signed integer. If it is not a signed integer, it returns 0. +func (v Value) Int() int64 { + return value[int64](v, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64) +} + +// Ints returns Value as a signed integer slice. If it is not a signed integer slice, it returns nil. +func (v Value) Ints() (i64s []int64) { + return values[int64](v, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64) +} + +// Uint converts an unsigned integer value to uint64. If the value is not a unsigned integer, it returns 0. +func (v Value) Uint() uint64 { + return value[uint64](v, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64) +} + +// Uints returns Value as a unsigned integer slice. If it is not a unsigned integer slice, it returns nil. 
+func (v Value) Uints() (u64s []uint64) { + return values[uint64](v, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64) +} + +// Float returns Value as a float. If it is not a float, it returns 0. +func (v Value) Float() float64 { + return value[float64](v, reflect.Float32, reflect.Float64) +} + +// Floats returns Value as a float slice. If it is not a float slice, it returns nil. +func (v Value) Floats() (f64s []float64) { + return values[float64](v, reflect.Float32, reflect.Float64) +} + +// Bool returns Value as a boolean. If it is not a boolean, it returns false. +func (v Value) Bool() bool { + return value[bool](v, reflect.Bool) +} + +// Bools returns Value as a boolean slice. If it is not a boolean slice, it returns nil. +func (v Value) Bools() (bools []bool) { + return values[bool](v, reflect.Bool) +} + +// String returns Value as a string. If it is not a string, it returns an empty string. +func (v Value) String() string { + return value[string](v, reflect.String) +} + +// Strings returns Value as a string slice. If it is not a string slice, it returns nil. +func (v Value) Strings() (strings []string) { + return values[string](v, reflect.String) +} diff --git a/fs/gguf/keyvalue_test.go b/fs/gguf/keyvalue_test.go new file mode 100644 index 00000000..2caacc53 --- /dev/null +++ b/fs/gguf/keyvalue_test.go @@ -0,0 +1,208 @@ +package gguf + +import ( + "testing" + + "github.com/google/go-cmp/cmp" +) + +func split(name string, values map[string][]any) (matched []any, unmatched []any) { + for key, value := range values { + if key == name { + matched = value + } else { + unmatched = append(unmatched, value...) + } + } + return +} + +func TestValue(t *testing.T) { + values := map[string][]any{ + "int64": {int(42), int8(42), int16(42), int32(42), int64(42)}, + "uint64": {uint(42), uint8(42), uint16(42), uint32(42), uint64(42)}, + "float64": {float32(42), float64(42)}, + "string": {"42", "hello"}, + "bool": {true, false}, + } + + t.Run("int64", func(t *testing.T) { + matched, unmatched := split("int64", values) + for _, v := range matched { + kv := KeyValue{"key", Value{v}} + if i64 := kv.Int(); i64 != 42 { + t.Errorf("expected 42, got %d", i64) + } + } + + for _, v := range unmatched { + kv := KeyValue{"key", Value{v}} + if i64 := kv.Int(); i64 != 0 { + t.Errorf("expected 42, got %d", i64) + } + } + }) + + t.Run("uint64", func(t *testing.T) { + matched, unmatched := split("uint64", values) + for _, v := range matched { + kv := KeyValue{"key", Value{v}} + if u64 := kv.Uint(); u64 != 42 { + t.Errorf("expected 42, got %d", u64) + } + } + + for _, v := range unmatched { + kv := KeyValue{"key", Value{v}} + if u64 := kv.Uint(); u64 != 0 { + t.Errorf("expected 42, got %d", u64) + } + } + }) + + t.Run("float64", func(t *testing.T) { + matched, unmatched := split("float64", values) + for _, v := range matched { + kv := KeyValue{"key", Value{v}} + if f64 := kv.Float(); f64 != 42 { + t.Errorf("expected 42, got %f", f64) + } + } + + for _, v := range unmatched { + kv := KeyValue{"key", Value{v}} + if f64 := kv.Float(); f64 != 0 { + t.Errorf("expected 42, got %f", f64) + } + } + }) + + t.Run("string", func(t *testing.T) { + matched, unmatched := split("string", values) + for _, v := range matched { + kv := KeyValue{"key", Value{v}} + if s := kv.String(); s != v { + t.Errorf("expected 42, got %s", s) + } + } + + for _, v := range unmatched { + kv := KeyValue{"key", Value{v}} + if s := kv.String(); s != "" { + t.Errorf("expected 42, got %s", s) + } + } + }) + + t.Run("bool", func(t 
*testing.T) { + matched, unmatched := split("bool", values) + for _, v := range matched { + kv := KeyValue{"key", Value{v}} + if b := kv.Bool(); b != v { + t.Errorf("expected true, got %v", b) + } + } + + for _, v := range unmatched { + kv := KeyValue{"key", Value{v}} + if b := kv.Bool(); b != false { + t.Errorf("expected false, got %v", b) + } + } + }) +} + +func TestValues(t *testing.T) { + values := map[string][]any{ + "int64s": {[]int{42}, []int8{42}, []int16{42}, []int32{42}, []int64{42}}, + "uint64s": {[]uint{42}, []uint8{42}, []uint16{42}, []uint32{42}, []uint64{42}}, + "float64s": {[]float32{42}, []float64{42}}, + "strings": {[]string{"42"}, []string{"hello"}}, + "bools": {[]bool{true}, []bool{false}}, + } + + t.Run("int64s", func(t *testing.T) { + matched, unmatched := split("int64s", values) + for _, v := range matched { + kv := KeyValue{"key", Value{v}} + if diff := cmp.Diff(kv.Ints(), []int64{42}); diff != "" { + t.Errorf("diff: %s", diff) + } + } + + for _, v := range unmatched { + kv := KeyValue{"key", Value{v}} + if i64s := kv.Ints(); i64s != nil { + t.Errorf("expected nil, got %v", i64s) + } + } + }) + + t.Run("uint64s", func(t *testing.T) { + matched, unmatched := split("uint64s", values) + for _, v := range matched { + kv := KeyValue{"key", Value{v}} + if diff := cmp.Diff(kv.Uints(), []uint64{42}); diff != "" { + t.Errorf("diff: %s", diff) + } + } + + for _, v := range unmatched { + kv := KeyValue{"key", Value{v}} + if u64s := kv.Uints(); u64s != nil { + t.Errorf("expected nil, got %v", u64s) + } + } + }) + + t.Run("float64s", func(t *testing.T) { + matched, unmatched := split("float64s", values) + for _, v := range matched { + kv := KeyValue{"key", Value{v}} + if diff := cmp.Diff(kv.Floats(), []float64{42}); diff != "" { + t.Errorf("diff: %s", diff) + } + } + + for _, v := range unmatched { + kv := KeyValue{"key", Value{v}} + if f64s := kv.Floats(); f64s != nil { + t.Errorf("expected nil, got %v", f64s) + } + } + }) + + t.Run("strings", func(t *testing.T) { + matched, unmatched := split("strings", values) + for _, v := range matched { + kv := KeyValue{"key", Value{v}} + if diff := cmp.Diff(kv.Strings(), v); diff != "" { + t.Errorf("diff: %s", diff) + } + } + + for _, v := range unmatched { + kv := KeyValue{"key", Value{v}} + if s := kv.Strings(); s != nil { + t.Errorf("expected nil, got %v", s) + } + } + }) + + t.Run("bools", func(t *testing.T) { + matched, unmatched := split("bools", values) + for _, v := range matched { + kv := KeyValue{"key", Value{v}} + if diff := cmp.Diff(kv.Bools(), v); diff != "" { + t.Errorf("diff: %s", diff) + } + } + + for _, v := range unmatched { + kv := KeyValue{"key", Value{v}} + if b := kv.Bools(); b != nil { + t.Errorf("expected nil, got %v", b) + } + } + }) +} diff --git a/fs/gguf/lazy.go b/fs/gguf/lazy.go new file mode 100644 index 00000000..16ab9909 --- /dev/null +++ b/fs/gguf/lazy.go @@ -0,0 +1,89 @@ +package gguf + +import ( + "encoding/binary" + "iter" + "log/slog" +) + +type lazy[T any] struct { + count uint64 + next func() (T, bool) + stop func() + values []T + + // successFunc is called when all values have been successfully read. 
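+	// For example, the tensor-info reader uses it to compute the aligned tensor data offset once the full tensor list has been read.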
+ successFunc func() error +} + +func newLazy[T any](f *File, fn func() (T, error)) (*lazy[T], error) { + it := lazy[T]{} + if err := binary.Read(f.reader, binary.LittleEndian, &it.count); err != nil { + return nil, err + } + + it.values = make([]T, 0) + it.next, it.stop = iter.Pull(func(yield func(T) bool) { + for i := range it.count { + t, err := fn() + if err != nil { + slog.Error("error reading tensor", "index", i, "error", err) + return + } + + it.values = append(it.values, t) + if !yield(t) { + break + } + } + + if it.successFunc != nil { + it.successFunc() + } + }) + + return &it, nil +} + +func (g *lazy[T]) Values() iter.Seq[T] { + return func(yield func(T) bool) { + for _, v := range g.All() { + if !yield(v) { + break + } + } + } +} + +func (g *lazy[T]) All() iter.Seq2[int, T] { + return func(yield func(int, T) bool) { + for i := range int(g.count) { + if i < len(g.values) { + if !yield(i, g.values[i]) { + break + } + } else { + t, ok := g.next() + if !ok { + break + } + + if !yield(i, t) { + break + } + } + } + } +} + +func (g *lazy[T]) rest() (collected bool) { + for { + _, ok := g.next() + collected = collected || ok + if !ok { + break + } + } + + return collected +} diff --git a/fs/gguf/reader.go b/fs/gguf/reader.go new file mode 100644 index 00000000..0bd76184 --- /dev/null +++ b/fs/gguf/reader.go @@ -0,0 +1,23 @@ +package gguf + +import ( + "bufio" + "io" +) + +type bufferedReader struct { + offset int64 + *bufio.Reader +} + +func newBufferedReader(rs io.ReadSeeker, size int) *bufferedReader { + return &bufferedReader{ + Reader: bufio.NewReaderSize(rs, size), + } +} + +func (rs *bufferedReader) Read(p []byte) (n int, err error) { + n, err = rs.Reader.Read(p) + rs.offset += int64(n) + return n, err +} diff --git a/fs/gguf/tensor.go b/fs/gguf/tensor.go new file mode 100644 index 00000000..194c1d73 --- /dev/null +++ b/fs/gguf/tensor.go @@ -0,0 +1,288 @@ +package gguf + +import ( + "log/slog" + "strings" +) + +type TensorInfo struct { + Name string + Offset uint64 + Shape []uint64 + Type TensorType +} + +func (ti TensorInfo) Valid() bool { + return ti.Name != "" && ti.NumBytes() > 0 +} + +func (ti TensorInfo) NumValues() int64 { + var numItems int64 = 1 + for _, dim := range ti.Shape { + numItems *= int64(dim) + } + return numItems +} + +// NumBytes returns the number of bytes in the tensor. 
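+// It is the value count multiplied by the per-value size of the tensor type, which can be fractional for block-quantized types.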
+func (ti TensorInfo) NumBytes() int64 { + return int64(float64(ti.NumValues()) * ti.Type.NumBytes()) +} + +func (ti TensorInfo) LogValue() slog.Value { + return slog.GroupValue( + slog.String("name", ti.Name), + slog.Int64("offset", int64(ti.Offset)), + slog.Any("shape", ti.Shape), + slog.Int64("num_values", ti.NumValues()), + slog.Int64("num_bytes", ti.NumBytes()), + slog.Any("type", ti.Type), + ) +} + +type TensorType uint32 + +const ( + TensorTypeF32 TensorType = iota + TensorTypeF16 + TensorTypeQ4_0 + TensorTypeQ4_1 + + // unexported // unused in gguf + tensorTypeQ4_2 + tensorTypeQ4_3 + + TensorTypeQ5_0 + TensorTypeQ5_1 + TensorTypeQ8_0 + TensorTypeQ8_1 + TensorTypeQ2_K + TensorTypeQ3_K + TensorTypeQ4_K + TensorTypeQ5_K + TensorTypeQ6_K + TensorTypeQ8_K + + // unexported // unquantizable by ollama + tensorTypeIQ2_XXS + tensorTypeIQ2_XS + tensorTypeIQ3_XXS + tensorTypeIQ1_S + tensorTypeIQ4_NL + tensorTypeIQ3_S + tensorTypeIQ2_S + tensorTypeIQ4_XS + + TensorTypeI8 + TensorTypeI16 + TensorTypeI32 + TensorTypeI64 + TensorTypeF64 + + // unexported // unquantizable by ollama + tensorTypeIQ1_M + + TensorTypeBF16 + + // unexported // unused in gguf + tensorTypeQ4_0_4_4 + tensorTypeQ4_0_4_8 + tensorTypeQ4_0_8_8 + + // unexported // unquantizable by ollama + tensorTypeTQ1_0 + tensorTypeTQ2_0 + + // unexported // unused in gguf + tensorTypeIQ4_NL_4_4 + tensorTypeIQ4_NL_4_8 + tensorTypeIQ4_NL_8_8 +) + +func (tt TensorType) NumBytes() float64 { + return float64(tt.typeSize()) / float64(tt.blockSize()) +} + +func (tt TensorType) typeSize() int64 { + switch tt { + case TensorTypeF32: + return 4 + case TensorTypeF16: + return 2 + case TensorTypeQ4_0: + return 2 + tt.blockSize()/2 + case TensorTypeQ4_1: + return 2 + 2 + tt.blockSize()/2 + case TensorTypeQ5_0: + return 2 + 4 + tt.blockSize()/2 + case TensorTypeQ5_1: + return 2 + 2 + 4 + tt.blockSize()/2 + case TensorTypeQ8_0: + return 2 + tt.blockSize() + case TensorTypeQ8_1: + return 2 + 2 + tt.blockSize() + case TensorTypeQ2_K: + return tt.blockSize()/16 + tt.blockSize()/4 + 2 + 2 + case TensorTypeQ3_K: + return tt.blockSize()/8 + tt.blockSize()/4 + 12 + 2 + case TensorTypeQ4_K: + return 2 + 2 + 12 + tt.blockSize()/2 + case TensorTypeQ5_K: + return 2 + 2 + 12 + tt.blockSize()/8 + tt.blockSize()/2 + case TensorTypeQ6_K: + return tt.blockSize()/2 + tt.blockSize()/4 + tt.blockSize()/16 + 2 + case TensorTypeQ8_K: + return 4 + tt.blockSize() + 2*tt.blockSize()/16 + case tensorTypeIQ2_XXS: + return 2 + 2*tt.blockSize()/8 + case tensorTypeIQ2_XS: + return 2 + 2*tt.blockSize()/8 + tt.blockSize()/32 + case tensorTypeIQ3_XXS: + return 2 + tt.blockSize()/4 + tt.blockSize()/8 + case tensorTypeIQ1_S: + return 2 + tt.blockSize()/8 + tt.blockSize()/16 + case tensorTypeIQ4_NL: + return 2 + tt.blockSize()/2 + case tensorTypeIQ3_S: + return 2 + tt.blockSize()/4 + tt.blockSize()/8 + tt.blockSize()/32 + 4 + case tensorTypeIQ2_S: + return 2 + tt.blockSize()/4 + tt.blockSize()/16 + case tensorTypeIQ4_XS: + return 2 + 2 + tt.blockSize()/2 + tt.blockSize()/64 + case TensorTypeI8: + return 1 + case TensorTypeI16: + return 2 + case TensorTypeI32: + return 4 + case TensorTypeI64: + return 8 + case TensorTypeF64: + return 8 + case tensorTypeIQ1_M: + return tt.blockSize()/8 + tt.blockSize()/16 + tt.blockSize()/32 + case TensorTypeBF16: + return 2 + default: + return 0 + } +} + +func (tt TensorType) blockSize() int64 { + switch tt { + case TensorTypeF32, + TensorTypeF16, + TensorTypeI8, + TensorTypeI16, + TensorTypeI32, + TensorTypeI64, + TensorTypeF64, + TensorTypeBF16: + return 
1 + case TensorTypeQ4_0, + TensorTypeQ4_1, + TensorTypeQ5_0, + TensorTypeQ5_1, + TensorTypeQ8_0, + TensorTypeQ8_1, + tensorTypeIQ4_NL: + return 32 + default: + return 256 + } +} + +func (tt TensorType) String() string { + switch tt { + case TensorTypeF32: + return "f32" + case TensorTypeF16: + return "f16" + case TensorTypeQ4_0: + return "q4_0" + case TensorTypeQ4_1: + return "q4_1" + case tensorTypeQ4_2: + return "q4_2" + case tensorTypeQ4_3: + return "q4_3" + case TensorTypeQ5_0: + return "q5_0" + case TensorTypeQ5_1: + return "q5_1" + case TensorTypeQ8_0: + return "q8_0" + case TensorTypeQ8_1: + return "q8_1" + case TensorTypeQ2_K: + return "q2_k" + case TensorTypeQ3_K: + return "q3_k" + case TensorTypeQ4_K: + return "q4_k" + case TensorTypeQ5_K: + return "q5_k" + case TensorTypeQ6_K: + return "q6_k" + case TensorTypeQ8_K: + return "q8_k" + case tensorTypeIQ2_XXS: + return "iq2_xxs" + case tensorTypeIQ2_XS: + return "iq2_xs" + case tensorTypeIQ3_XXS: + return "iq3_xxs" + case tensorTypeIQ1_S: + return "iq1_s" + case tensorTypeIQ4_NL: + return "iq4_nl" + case tensorTypeIQ3_S: + return "iq3_s" + case tensorTypeIQ2_S: + return "iq2_s" + case tensorTypeIQ4_XS: + return "iq4_xs" + case TensorTypeI8: + return "i8" + case TensorTypeI16: + return "i16" + case TensorTypeI32: + return "i32" + case TensorTypeI64: + return "i64" + case TensorTypeF64: + return "f64" + case tensorTypeIQ1_M: + return "iq1_m" + case TensorTypeBF16: + return "bf16" + case tensorTypeQ4_0_4_4: + return "q4_0_4_4" + case tensorTypeQ4_0_4_8: + return "q4_0_4_8" + case tensorTypeQ4_0_8_8: + return "q4_0_8_8" + case tensorTypeTQ1_0: + return "tq1_0" + case tensorTypeTQ2_0: + return "tq2_0" + case tensorTypeIQ4_NL_4_4: + return "iq4_nl_4_4" + case tensorTypeIQ4_NL_4_8: + return "iq4_nl_4_8" + case tensorTypeIQ4_NL_8_8: + return "iq4_nl_8_8" + default: + return "unknown" + } +} + +func (tt TensorType) LogValue() slog.Value { + return slog.GroupValue( + slog.Uint64("value", uint64(tt)), + slog.String("name", strings.ToUpper(tt.String())), + slog.Int64("size", tt.typeSize()), + slog.Int64("block_size", tt.blockSize()), + slog.Float64("num_bytes", tt.NumBytes()), + ) +} diff --git a/go.mod b/go.mod index 283286b7..6de5959b 100644 --- a/go.mod +++ b/go.mod @@ -19,7 +19,7 @@ require ( github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1 github.com/dlclark/regexp2 v1.11.4 github.com/emirpasic/gods/v2 v2.0.0-alpha - github.com/google/go-cmp v0.6.0 + github.com/google/go-cmp v0.7.0 github.com/mattn/go-runewidth v0.0.14 github.com/nlpodyssey/gopickle v0.3.0 github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c diff --git a/go.sum b/go.sum index 5755616f..c0ab53aa 100644 --- a/go.sum +++ b/go.sum @@ -112,8 +112,8 @@ github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/ github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= -github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/uuid v1.1.2/go.mod 
h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= diff --git a/server/images.go b/server/images.go index d6cceff4..38505cc5 100644 --- a/server/images.go +++ b/server/images.go @@ -23,7 +23,7 @@ import ( "github.com/ollama/ollama/api" "github.com/ollama/ollama/envconfig" - "github.com/ollama/ollama/fs/ggml" + "github.com/ollama/ollama/fs/gguf" "github.com/ollama/ollama/parser" "github.com/ollama/ollama/template" "github.com/ollama/ollama/thinking" @@ -73,22 +73,18 @@ func (m *Model) Capabilities() []model.Capability { capabilities := []model.Capability{} // Check for completion capability - r, err := os.Open(m.ModelPath) + f, err := gguf.Open(m.ModelPath) if err == nil { - defer r.Close() + defer f.Close() - f, err := ggml.Decode(r, 1024) - if err == nil { - if _, ok := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]; ok { - capabilities = append(capabilities, model.CapabilityEmbedding) - } else { - capabilities = append(capabilities, model.CapabilityCompletion) - } - if _, ok := f.KV()[fmt.Sprintf("%s.vision.block_count", f.KV().Architecture())]; ok { - capabilities = append(capabilities, model.CapabilityVision) - } + if f.KeyValue("pooling_type").Valid() { + capabilities = append(capabilities, model.CapabilityEmbedding) } else { - slog.Error("couldn't decode ggml", "error", err) + // If no embedding is specified, we assume the model supports completion + capabilities = append(capabilities, model.CapabilityCompletion) + } + if f.KeyValue("vision.block_count").Valid() { + capabilities = append(capabilities, model.CapabilityVision) } } else { slog.Error("couldn't open model file", "error", err) diff --git a/server/images_test.go b/server/images_test.go index 363b298e..a2fba8d9 100644 --- a/server/images_test.go +++ b/server/images_test.go @@ -1,123 +1,42 @@ package server import ( - "bytes" - "encoding/binary" - "errors" - "os" - "path/filepath" "strings" "testing" + "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/template" "github.com/ollama/ollama/types/model" ) -// Constants for GGUF magic bytes and version -var ( - ggufMagic = []byte{0x47, 0x47, 0x55, 0x46} // "GGUF" - ggufVer = uint32(3) // Version 3 -) - -// Helper function to create mock GGUF data -func createMockGGUFData(architecture string, vision bool) []byte { - var buf bytes.Buffer - - // Write GGUF header - buf.Write(ggufMagic) - binary.Write(&buf, binary.LittleEndian, ggufVer) - - // Write tensor count (0 for our test) - var numTensors uint64 = 0 - binary.Write(&buf, binary.LittleEndian, numTensors) - - // Calculate number of metadata entries - numMetaEntries := uint64(1) // architecture entry - if vision { - numMetaEntries++ - } - // Add embedding entry if architecture is "bert" - if architecture == "bert" { - numMetaEntries++ - } - binary.Write(&buf, binary.LittleEndian, numMetaEntries) - - // Write architecture metadata - archKey := "general.architecture" - keyLen := uint64(len(archKey)) - binary.Write(&buf, binary.LittleEndian, keyLen) - buf.WriteString(archKey) - - // String type (8) - var strType uint32 = 8 - binary.Write(&buf, binary.LittleEndian, strType) - - // String length - strLen := uint64(len(architecture)) - binary.Write(&buf, binary.LittleEndian, strLen) - buf.WriteString(architecture) - - if vision { - visionKey := architecture + ".vision.block_count" - keyLen = uint64(len(visionKey)) - binary.Write(&buf, binary.LittleEndian, keyLen) - buf.WriteString(visionKey) - - // uint32 type (4) - var uint32Type uint32 
= 4 - binary.Write(&buf, binary.LittleEndian, uint32Type) - - // uint32 value (1) - var countVal uint32 = 1 - binary.Write(&buf, binary.LittleEndian, countVal) - } - // Write embedding metadata if architecture is "bert" - if architecture == "bert" { - poolKey := architecture + ".pooling_type" - keyLen = uint64(len(poolKey)) - binary.Write(&buf, binary.LittleEndian, keyLen) - buf.WriteString(poolKey) - - // uint32 type (4) - var uint32Type uint32 = 4 - binary.Write(&buf, binary.LittleEndian, uint32Type) - - // uint32 value (1) - var poolingVal uint32 = 1 - binary.Write(&buf, binary.LittleEndian, poolingVal) - } - - return buf.Bytes() -} - func TestModelCapabilities(t *testing.T) { - // Create a temporary directory for test files - tempDir := t.TempDir() + // Create completion model (llama architecture without vision) + completionModelPath, _ := createBinFile(t, ggml.KV{ + "general.architecture": "llama", + }, []*ggml.Tensor{}) - // Create different types of mock model files - completionModelPath := filepath.Join(tempDir, "model.bin") - visionModelPath := filepath.Join(tempDir, "vision_model.bin") - embeddingModelPath := filepath.Join(tempDir, "embedding_model.bin") - // Create a simple model file for tests that don't depend on GGUF content - simpleModelPath := filepath.Join(tempDir, "simple_model.bin") + // Create vision model (llama architecture with vision block count) + visionModelPath, _ := createBinFile(t, ggml.KV{ + "general.architecture": "llama", + "llama.vision.block_count": uint32(1), + }, []*ggml.Tensor{}) - if err := errors.Join( - os.WriteFile(completionModelPath, createMockGGUFData("llama", false), 0o644), - os.WriteFile(visionModelPath, createMockGGUFData("llama", true), 0o644), - os.WriteFile(embeddingModelPath, createMockGGUFData("bert", false), 0o644), - os.WriteFile(simpleModelPath, []byte("dummy model data"), 0o644), - ); err != nil { - t.Fatalf("Failed to create model files: %v", err) - } + // Create embedding model (bert architecture with pooling type) + embeddingModelPath, _ := createBinFile(t, ggml.KV{ + "general.architecture": "bert", + "bert.pooling_type": uint32(1), + }, []*ggml.Tensor{}) toolsInsertTemplate, err := template.Parse("{{ .prompt }}{{ if .tools }}{{ .tools }}{{ end }}{{ if .suffix }}{{ .suffix }}{{ end }}") if err != nil { t.Fatalf("Failed to parse template: %v", err) } + chatTemplate, err := template.Parse("{{ .prompt }}") if err != nil { t.Fatalf("Failed to parse template: %v", err) } + toolsTemplate, err := template.Parse("{{ .prompt }}{{ if .tools }}{{ .tools }}{{ end }}") if err != nil { t.Fatalf("Failed to parse template: %v", err) @@ -145,21 +64,13 @@ func TestModelCapabilities(t *testing.T) { }, expectedCaps: []model.Capability{model.CapabilityCompletion, model.CapabilityTools, model.CapabilityInsert}, }, - { - name: "model with tools and insert capability", - model: Model{ - ModelPath: simpleModelPath, - Template: toolsInsertTemplate, - }, - expectedCaps: []model.Capability{model.CapabilityTools, model.CapabilityInsert}, - }, { name: "model with tools capability", model: Model{ - ModelPath: simpleModelPath, + ModelPath: completionModelPath, Template: toolsTemplate, }, - expectedCaps: []model.Capability{model.CapabilityTools}, + expectedCaps: []model.Capability{model.CapabilityCompletion, model.CapabilityTools}, }, { name: "model with vision capability", @@ -224,29 +135,33 @@ func TestModelCapabilities(t *testing.T) { } func TestModelCheckCapabilities(t *testing.T) { - // Create a temporary directory for test files - tempDir := t.TempDir() + 
// Create simple model file for tests that don't depend on GGUF content + completionModelPath, _ := createBinFile(t, ggml.KV{ + "general.architecture": "llama", + }, []*ggml.Tensor{}) - visionModelPath := filepath.Join(tempDir, "vision_model.bin") - simpleModelPath := filepath.Join(tempDir, "model.bin") - embeddingModelPath := filepath.Join(tempDir, "embedding_model.bin") + // Create vision model (llama architecture with vision block count) + visionModelPath, _ := createBinFile(t, ggml.KV{ + "general.architecture": "llama", + "llama.vision.block_count": uint32(1), + }, []*ggml.Tensor{}) - if err := errors.Join( - os.WriteFile(simpleModelPath, []byte("dummy model data"), 0o644), - os.WriteFile(visionModelPath, createMockGGUFData("llama", true), 0o644), - os.WriteFile(embeddingModelPath, createMockGGUFData("bert", false), 0o644), - ); err != nil { - t.Fatalf("Failed to create model files: %v", err) - } + // Create embedding model (bert architecture with pooling type) + embeddingModelPath, _ := createBinFile(t, ggml.KV{ + "general.architecture": "bert", + "bert.pooling_type": uint32(1), + }, []*ggml.Tensor{}) toolsInsertTemplate, err := template.Parse("{{ .prompt }}{{ if .tools }}{{ .tools }}{{ end }}{{ if .suffix }}{{ .suffix }}{{ end }}") if err != nil { t.Fatalf("Failed to parse template: %v", err) } + chatTemplate, err := template.Parse("{{ .prompt }}") if err != nil { t.Fatalf("Failed to parse template: %v", err) } + toolsTemplate, err := template.Parse("{{ .prompt }}{{ if .tools }}{{ .tools }}{{ end }}") if err != nil { t.Fatalf("Failed to parse template: %v", err) @@ -261,7 +176,7 @@ func TestModelCheckCapabilities(t *testing.T) { { name: "completion model without tools capability", model: Model{ - ModelPath: simpleModelPath, + ModelPath: completionModelPath, Template: chatTemplate, }, checkCaps: []model.Capability{model.CapabilityTools}, @@ -270,7 +185,7 @@ func TestModelCheckCapabilities(t *testing.T) { { name: "model with all needed capabilities", model: Model{ - ModelPath: simpleModelPath, + ModelPath: completionModelPath, Template: toolsInsertTemplate, }, checkCaps: []model.Capability{model.CapabilityTools, model.CapabilityInsert}, @@ -278,7 +193,7 @@ func TestModelCheckCapabilities(t *testing.T) { { name: "model missing insert capability", model: Model{ - ModelPath: simpleModelPath, + ModelPath: completionModelPath, Template: toolsTemplate, }, checkCaps: []model.Capability{model.CapabilityInsert}, @@ -287,7 +202,7 @@ func TestModelCheckCapabilities(t *testing.T) { { name: "model missing vision capability", model: Model{ - ModelPath: simpleModelPath, + ModelPath: completionModelPath, Template: toolsTemplate, }, checkCaps: []model.Capability{model.CapabilityVision}, @@ -312,7 +227,7 @@ func TestModelCheckCapabilities(t *testing.T) { { name: "unknown capability", model: Model{ - ModelPath: simpleModelPath, + ModelPath: completionModelPath, Template: chatTemplate, }, checkCaps: []model.Capability{"unknown"}, diff --git a/server/quantization_test.go b/server/quantization_test.go index 4f717c2c..8b726c83 100644 --- a/server/quantization_test.go +++ b/server/quantization_test.go @@ -257,16 +257,8 @@ func TestQuantizeModel(t *testing.T) { for _, tt := range cases { t.Run(tt.name, func(t *testing.T) { - f, err := os.CreateTemp(t.TempDir(), tt.name) - if err != nil { - t.Fatal(err.Error()) - } - defer f.Close() - err = fsggml.WriteGGUF(f, tt.kv, tt.tensors) - if err != nil { - t.Fatalf("failed to create initial model: %s", err) - } - fp, err := os.Open(f.Name()) + p, _ := createBinFile(t, 
tt.kv, tt.tensors) + fp, err := os.Open(p) if err != nil { t.Fatal(err.Error()) } diff --git a/server/sched_test.go b/server/sched_test.go index 01fb9a70..3892fbba 100644 --- a/server/sched_test.go +++ b/server/sched_test.go @@ -112,11 +112,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est b.ctx, b.ctxDone = context.WithCancel(ctx) t.Helper() - f, err := os.CreateTemp(t.TempDir(), modelName) - require.NoError(t, err) - defer f.Close() - - require.NoError(t, ggml.WriteGGUF(f, ggml.KV{ + p, _ := createBinFile(t, ggml.KV{ "general.architecture": "llama", "llama.context_length": uint32(32), "llama.embedding_length": uint32(4096), @@ -129,14 +125,14 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est }, []*ggml.Tensor{ {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, {Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, - })) - require.NoError(t, err) - - fname := f.Name() - model := &Model{Name: modelName, ModelPath: fname} - b.f, err = llm.LoadModel(model.ModelPath, 0) - require.NoError(t, err) + }) + model := &Model{Name: modelName, ModelPath: p} + f, err := llm.LoadModel(model.ModelPath, 0) + if err != nil { + t.Fatal(err) + } + b.f = f if duration == nil { duration = &api.Duration{Duration: 5 * time.Millisecond} } From 9f8a18ec050ef67fca11d4f9bea0508eece93a68 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Thu, 12 Jun 2025 14:18:54 -0700 Subject: [PATCH 091/108] tools: loosen tool parsing to allow for more formats (#11030) --- server/routes.go | 8 +- tools/template.go | 156 +++ tools/template_test.go | 139 +++ tools/testdata/command-r-plus.gotmpl | 67 -- tools/testdata/command-r-plus.out | 39 - tools/testdata/firefunction.gotmpl | 31 - tools/testdata/firefunction.out | 17 - tools/testdata/llama3-groq-tool-use.gotmpl | 43 - tools/testdata/llama3-groq-tool-use.out | 24 - tools/testdata/llama3.2.gotmpl | 44 - tools/testdata/llama3.2.out | 24 - tools/testdata/messages.json | 39 - tools/testdata/mistral.gotmpl | 15 - tools/testdata/mistral.out | 3 - tools/testdata/nemotron.gotmpl | 33 - tools/testdata/nemotron.out | 18 - tools/testdata/qwen2.5.gotmpl | 51 - tools/testdata/qwen2.5.out | 31 - tools/testdata/qwen3.gotmpl | 50 - tools/testdata/qwen3.out | 31 - tools/testdata/tools.json | 30 - tools/testdata/xlam.gotmpl | 45 - tools/testdata/xlam.out | 40 - tools/tools.go | 470 ++++---- tools/tools_test.go | 1246 +++++++++++--------- tools/tools_utils.go | 222 ---- tools/tools_utils_test.go | 497 -------- 27 files changed, 1238 insertions(+), 2175 deletions(-) create mode 100644 tools/template.go create mode 100644 tools/template_test.go delete mode 100644 tools/testdata/command-r-plus.gotmpl delete mode 100644 tools/testdata/command-r-plus.out delete mode 100644 tools/testdata/firefunction.gotmpl delete mode 100644 tools/testdata/firefunction.out delete mode 100644 tools/testdata/llama3-groq-tool-use.gotmpl delete mode 100644 tools/testdata/llama3-groq-tool-use.out delete mode 100644 tools/testdata/llama3.2.gotmpl delete mode 100644 tools/testdata/llama3.2.out delete mode 100644 tools/testdata/messages.json delete mode 100644 tools/testdata/mistral.gotmpl delete mode 100644 tools/testdata/mistral.out delete mode 100644 tools/testdata/nemotron.gotmpl delete mode 100644 tools/testdata/nemotron.out delete mode 100644 tools/testdata/qwen2.5.gotmpl delete mode 
100644 tools/testdata/qwen2.5.out delete mode 100644 tools/testdata/qwen3.gotmpl delete mode 100644 tools/testdata/qwen3.out delete mode 100644 tools/testdata/tools.json delete mode 100644 tools/testdata/xlam.gotmpl delete mode 100644 tools/testdata/xlam.out delete mode 100644 tools/tools_utils.go delete mode 100644 tools/tools_utils_test.go diff --git a/server/routes.go b/server/routes.go index 8eda5c73..cb46cef1 100644 --- a/server/routes.go +++ b/server/routes.go @@ -1526,12 +1526,7 @@ func (s *Server) ChatHandler(c *gin.Context) { var toolParser *tools.Parser if len(req.Tools) > 0 { - toolParser, err = tools.NewParser(m.Template.Template) - if err != nil { - slog.Error("failed to create tool parser", "error", err) - c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) - return - } + toolParser = tools.NewParser(m.Template.Template, req.Tools) } ch := make(chan any) @@ -1584,6 +1579,7 @@ func (s *Server) ChatHandler(c *gin.Context) { // don't return } else { if r.Done { + res.Message.Content = toolParser.Content() ch <- res } return diff --git a/tools/template.go b/tools/template.go new file mode 100644 index 00000000..e22f0675 --- /dev/null +++ b/tools/template.go @@ -0,0 +1,156 @@ +package tools + +import ( + "bytes" + "log/slog" + "slices" + "strings" + "text/template" + "text/template/parse" +) + +// parseTag finds the tool calling tag from a Go template +// often [TOOL_CALL] or similar by finding the +// first text node after .ToolCalls and returning the content +// if no tag is found, return "{" to indicate that json objects +// should be attempted to be parsed as tool calls +func parseTag(tmpl *template.Template) string { + if tmpl == nil || tmpl.Tree == nil { + slog.Debug("template or tree is nil") + return "{" + } + + tc := findToolCallNode(tmpl.Tree.Root.Nodes) + if tc == nil { + return "{" + } + + tn := findTextNode(tc.List.Nodes) + if tn == nil { + return "{" + } + + tag := string(tn.Text) + tag = strings.ReplaceAll(tag, "\r\n", "\n") + + // avoid parsing { onwards as this may be a tool call + // however keep '{' as a prefix if there is no tag + // so that all json objects will be attempted to + // be parsed as tool calls + tag, _, _ = strings.Cut(tag, "{") + tag = strings.TrimSpace(tag) + if tag == "" { + tag = "{" + } + + return tag +} + +// findToolCallNode searches for and returns an IfNode with .ToolCalls +func findToolCallNode(nodes []parse.Node) *parse.IfNode { + isToolCallsNode := func(n *parse.IfNode) bool { + for _, cmd := range n.Pipe.Cmds { + for _, arg := range cmd.Args { + if field, ok := arg.(*parse.FieldNode); ok { + if slices.Contains(field.Ident, "ToolCalls") { + return true + } + } + } + } + return false + } + + for _, node := range nodes { + switch n := node.(type) { + case *parse.IfNode: + if isToolCallsNode(n) { + return n + } + // Recursively search in nested IfNodes + if result := findToolCallNode(n.List.Nodes); result != nil { + return result + } + if n.ElseList != nil { + if result := findToolCallNode(n.ElseList.Nodes); result != nil { + return result + } + } + case *parse.ListNode: + if result := findToolCallNode(n.Nodes); result != nil { + return result + } + case *parse.RangeNode: + if result := findToolCallNode(n.List.Nodes); result != nil { + return result + } + if n.ElseList != nil { + if result := findToolCallNode(n.ElseList.Nodes); result != nil { + return result + } + } + case *parse.WithNode: + if result := findToolCallNode(n.List.Nodes); result != nil { + return result + } + if n.ElseList != nil { + if result := 
findToolCallNode(n.ElseList.Nodes); result != nil { + return result + } + } + } + } + return nil +} + +// findTextNode does a depth-first search for the first text content in nodes, +// stopping at template constructs to avoid parsing text after the tool calls +func findTextNode(nodes []parse.Node) *parse.TextNode { + for _, node := range nodes { + switch n := node.(type) { + case *parse.TextNode: + // skip whitespace-only text nodes + if len(bytes.TrimSpace(n.Text)) == 0 { + continue + } + return n + case *parse.IfNode: + if text := findTextNode(n.List.Nodes); text != nil { + return text + } + if n.ElseList != nil { + if text := findTextNode(n.ElseList.Nodes); text != nil { + return text + } + } + return nil + case *parse.ListNode: + if text := findTextNode(n.Nodes); text != nil { + return text + } + case *parse.RangeNode: + if text := findTextNode(n.List.Nodes); text != nil { + return text + } + if n.ElseList != nil { + if text := findTextNode(n.ElseList.Nodes); text != nil { + return text + } + } + return nil + case *parse.WithNode: + if text := findTextNode(n.List.Nodes); text != nil { + return text + } + if n.ElseList != nil { + if text := findTextNode(n.ElseList.Nodes); text != nil { + return text + } + } + return nil + case *parse.ActionNode: + return nil + } + } + return nil +} diff --git a/tools/template_test.go b/tools/template_test.go new file mode 100644 index 00000000..970c0d59 --- /dev/null +++ b/tools/template_test.go @@ -0,0 +1,139 @@ +package tools + +import ( + "testing" + "text/template" +) + +func TestParseTag(t *testing.T) { + cases := []struct { + name string + template string + want string + }{ + { + name: "empty", + template: "", + want: "{", + }, + { + name: "no tag", + template: "{{if .ToolCalls}}{{end}}", + want: "{", + }, + { + name: "no tag with range", + template: "{{if .ToolCalls}}{{range .ToolCalls}}{{ . }}{{end}}{{end}}", + want: "{", + }, + { + name: "tool call with json format", + template: "{{if .ToolCalls}}```json\n{{end}}", + want: "```json", + }, + { + name: "square brackets", + template: "{{if .ToolCalls}}[{{range .ToolCalls}}{{ . }}{{end}}]{{end}}", + want: "[", + }, + { + name: "square brackets with whitespace", + template: "{{if .ToolCalls}}\n [ {{range .ToolCalls}}{{ . }}{{end}}]{{end}}", + want: "[", + }, + { + name: "tailing ]", + template: "{{if .ToolCalls}}{{range .ToolCalls}}{{ . }}{{end}}]{{end}}", + want: "{", + }, + { + name: "whitespace only", + template: "{{if .ToolCalls}} {{range .ToolCalls}}{{ . }}{{end}}{{end}}", + want: "{", + }, + { + name: "whitespace only in range", + template: "{{if .ToolCalls}}{{range .ToolCalls}}\n{{ . 
}}\n{{end}}{{end}}", + want: "{", + }, + { + name: "json objects", + template: `{{if .ToolCalls}}{{range .ToolCalls}}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}{{end}}{{end}}`, + want: "{", + }, + { + name: "json objects with whitespace", + template: "{{if .ToolCalls}}{{range .ToolCalls}}\n{\"name\": \"{{ .Function.Name }}\", \"arguments\": {{ .Function.Arguments }}}{{end}}{{end}}", + want: "{", + }, + { + name: "json objects with CRLF", + template: "{{if .ToolCalls}}{{range .ToolCalls}}\r\n{\"name\": \"{{ .Function.Name }}\", \"arguments\": {{ .Function.Arguments }}}{{end}}{{end}}", + want: "{", + }, + { + name: "json objects with whitespace before and after range", + template: "{{if .ToolCalls}}\n{{range .ToolCalls}}\n{\"name\": \"{{ .Function.Name }}\", \"arguments\": {{ .Function.Arguments }}}\r\n{{end}}\r\n{{end}}", + want: "{", + }, + { + name: "before and after range", + template: "{{if .ToolCalls}}<|tool▁calls▁begin|>{{range .ToolCalls}}<|tool▁call▁begin|>functionget_current_weather\n```json\n{\"location\": \"Tokyo\"}\n```<|tool▁call▁end|>\n{{end}}<|tool▁calls▁end|>{{end}}", + want: "<|tool▁calls▁begin|>", + }, + { + name: "after range", + template: "{{if .ToolCalls}}{{range .ToolCalls}}{\"name\": \"{{ .Function.Name }}\", \"arguments\": {{ .Function.Arguments }}}{{end}}{{end}}", + want: "", + }, + { + name: "after range with leading whitespace before range", + template: "{{if .ToolCalls}}\n{{range .ToolCalls}}{\"name\": \"{{ .Function.Name }}\", \"arguments\": {{ .Function.Arguments }}}{{end}}{{end}}", + want: "", + }, + { + name: "tool call in range with {", + template: `{{if .ToolCalls}}{{range .ToolCalls}}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}{{end}}{{end}}`, + want: "", + }, + { + name: "tool call with multiple text nodes", + template: "{{if .ToolCalls}}First text{{if .Something}}inner{{end}}Second text{{end}}", + want: "First text", + }, + { + name: "action tag", + template: "{{if .ToolCalls}}Action: ```json{{end}}", + want: "Action: ```json", + }, + { + name: "incomplete functools bracket", + template: "{{if .ToolCalls}}functools[{{end}}", + want: "functools[", + }, + { + name: "uppercase tool call with incomplete bracket", + template: "{{if .ToolCalls}}[TOOL_CALL] [{{end}}", + want: "[TOOL_CALL] [", + }, + { + name: "uppercase tool call with adjacent bracket", + template: "{{if .ToolCalls}}[TOOL_CALL][{{end}}", + want: "[TOOL_CALL][", + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + tmpl, err := template.New("test").Parse(tc.template) + if err != nil && tc.template != "" { + t.Fatalf("failed to parse template: %v", err) + } + + got := parseTag(tmpl) + if got != tc.want { + t.Errorf("got text %q, want %q", got, tc.want) + } + }) + } +} diff --git a/tools/testdata/command-r-plus.gotmpl b/tools/testdata/command-r-plus.gotmpl deleted file mode 100644 index f30124e3..00000000 --- a/tools/testdata/command-r-plus.gotmpl +++ /dev/null @@ -1,67 +0,0 @@ -{{- if or .Tools .System }}<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|> -{{- if .Tools }}# Safety Preamble -The instructions in this section override those in the task description and style guide sections. Don't answer questions that are harmful or immoral. - -# System Preamble -## Basic Rules -You are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. 
You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user's requests, you cite your sources in your answers, according to those instructions. - -{{ if .System }}# User Preamble -{{ .System }} -{{- end }} - -## Available Tools -Here is a list of tools that you have available to you: -{{- range .Tools }} - -```python -def {{ .Function.Name }}( -{{- range $name, $property := .Function.Parameters.Properties }}{{ $name }}: {{ $property.Type }}, {{ end }}) -> List[Dict]: - """{{ .Function.Description }} - -{{- if .Function.Parameters.Properties }} - - Args: -{{- range $name, $property := .Function.Parameters.Properties }} - {{ $name }} ({{ $property.Type }}): {{ $property.Description }} -{{- end }} -{{- end }} - """ - pass -``` -{{- end }} -{{- else if .System }}{{ .System }} -{{- end }}<|END_OF_TURN_TOKEN|> -{{- end }} -{{- range .Messages }} -{{- if eq .Role "system" }} -{{- continue }} -{{- end }}<|START_OF_TURN_TOKEN|> -{{- if eq .Role "user" }}<|USER_TOKEN|>{{ .Content }} -{{- else if eq .Role "assistant" }}<|CHATBOT_TOKEN|> -{{- if .Content }}{{ .Content }} -{{- else if .ToolCalls }} -Action: ```json -[ -{{- range .ToolCalls }} - { - "tool_name": "{{ .Function.Name }}", - "parameters": {{ .Function.Arguments }} - } -{{- end }} -]``` -{{ continue }} -{{ end }} -{{- else if eq .Role "tool" }}<|SYSTEM_TOKEN|> -{{ .Content }} -{{- end }}<|END_OF_TURN_TOKEN|> -{{- end }} -{{- if .Tools }}<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>Write 'Action:' followed by a json-formatted list of actions that you want to perform in order to produce a good response to the user's last input. You can use any of the supplied tools any number of times, but you should aim to execute the minimum number of necessary actions for the input. You should use the `directly-answer` tool if calling the other tools is unnecessary. The list of actions you want to call should be formatted as a list of json objects, for example: -```json -[ - { - "tool_name": title of the tool in the specification, - "parameters": a dict of parameters to input into the tool as they are defined in the specs, or {} if it takes no parameters - } -]``` -{{- end }}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|> \ No newline at end of file diff --git a/tools/testdata/command-r-plus.out b/tools/testdata/command-r-plus.out deleted file mode 100644 index 8193d40c..00000000 --- a/tools/testdata/command-r-plus.out +++ /dev/null @@ -1,39 +0,0 @@ -<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|># Safety Preamble -The instructions in this section override those in the task description and style guide sections. Don't answer questions that are harmful or immoral. - -# System Preamble -## Basic Rules -You are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user's requests, you cite your sources in your answers, according to those instructions. - -# User Preamble -You are a knowledgeable assistant. You can answer questions and perform tasks. 
- -## Available Tools -Here is a list of tools that you have available to you: - -```python -def get_current_weather(format: string, location: string, ) -> List[Dict]: - """Get the current weather - - Args: - format (string): The temperature unit to use. Infer this from the user's location. - location (string): The city and state, e.g. San Francisco, CA - """ - pass -```<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>What's the weather like today in Paris?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|> -Action: ```json -[ - { - "tool_name": "get_current_weather", - "parameters": {"format":"celsius","location":"Paris, France"} - } -]``` -<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|> -22<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>The current temperature in Paris, France is 22 degrees Celsius.<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>What's the weather like today in San Francisco and Toronto?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>Write 'Action:' followed by a json-formatted list of actions that you want to perform in order to produce a good response to the user's last input. You can use any of the supplied tools any number of times, but you should aim to execute the minimum number of necessary actions for the input. You should use the `directly-answer` tool if calling the other tools is unnecessary. The list of actions you want to call should be formatted as a list of json objects, for example: -```json -[ - { - "tool_name": title of the tool in the specification, - "parameters": a dict of parameters to input into the tool as they are defined in the specs, or {} if it takes no parameters - } -]```<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|> \ No newline at end of file diff --git a/tools/testdata/firefunction.gotmpl b/tools/testdata/firefunction.gotmpl deleted file mode 100644 index 312be205..00000000 --- a/tools/testdata/firefunction.gotmpl +++ /dev/null @@ -1,31 +0,0 @@ -{{- if or .System .Tools }}<|start_header_id|>system<|end_header_id|> -{{- if .System }} -{{ .System }} -{{- end }} -In addition to plain text responses, you can chose to call one or more of the provided functions. - -Use the following rule to decide when to call a function: - * if the response can be generated from your internal knowledge (e.g., as in the case of queries like "What is the capital of Poland?"), do so - * if you need external information that can be obtained by calling one or more of the provided functions, generate a function calls - -If you decide to call functions: - * prefix function calls with functools marker (no closing marker required) - * all function calls should be generated in a single JSON list formatted as functools[{"name": [function name], "arguments": [function arguments as JSON]}, ...] - * follow the provided JSON schema. Do not hallucinate arguments or values. Do to blindly copy values from the provided samples - * respect the argument type formatting. 
E.g., if the type if number and format is float, write value 7 as 7.0 - * make sure you pick the right functions that match the user intent - -Available functions as JSON spec: -{{- if .Tools }} -{{ .Tools }} -{{- end }}<|eot_id|> -{{- end }} -{{- range .Messages }}<|start_header_id|> -{{- if or (eq .Role "user") (eq .Role "assistant") (eq .Role "tool") }}{{ .Role }} -{{- end }}<|end_header_id|> -{{- if .Content }}{{ .Content }} -{{- else if .ToolCalls }} functools[ -{{- range .ToolCalls }}{{ "{" }}"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}{{ "}" }} -{{- end }}] -{{- end }}<|eot_id|> -{{- end }}<|start_header_id|>assistant<|end_header_id|> \ No newline at end of file diff --git a/tools/testdata/firefunction.out b/tools/testdata/firefunction.out deleted file mode 100644 index 144f5e42..00000000 --- a/tools/testdata/firefunction.out +++ /dev/null @@ -1,17 +0,0 @@ -<|start_header_id|>system<|end_header_id|> -You are a knowledgeable assistant. You can answer questions and perform tasks. -In addition to plain text responses, you can chose to call one or more of the provided functions. - -Use the following rule to decide when to call a function: - * if the response can be generated from your internal knowledge (e.g., as in the case of queries like "What is the capital of Poland?"), do so - * if you need external information that can be obtained by calling one or more of the provided functions, generate a function calls - -If you decide to call functions: - * prefix function calls with functools marker (no closing marker required) - * all function calls should be generated in a single JSON list formatted as functools[{"name": [function name], "arguments": [function arguments as JSON]}, ...] - * follow the provided JSON schema. Do not hallucinate arguments or values. Do to blindly copy values from the provided samples - * respect the argument type formatting. E.g., if the type if number and format is float, write value 7 as 7.0 - * make sure you pick the right functions that match the user intent - -Available functions as JSON spec: -[{"type":"function","function":{"name":"get_current_weather","description":"Get the current weather","parameters":{"type":"object","required":["location","format"],"properties":{"format":{"type":"string","description":"The temperature unit to use. Infer this from the user's location.","enum":["celsius","fahrenheit"]},"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"}}}}}]<|eot_id|><|start_header_id|><|end_header_id|>You are a knowledgeable assistant. 
You can answer questions and perform tasks.<|eot_id|><|start_header_id|>user<|end_header_id|>What's the weather like today in Paris?<|eot_id|><|start_header_id|>assistant<|end_header_id|> functools[{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Paris, France"}}]<|eot_id|><|start_header_id|>tool<|end_header_id|>22<|eot_id|><|start_header_id|>assistant<|end_header_id|>The current temperature in Paris, France is 22 degrees Celsius.<|eot_id|><|start_header_id|>user<|end_header_id|>What's the weather like today in San Francisco and Toronto?<|eot_id|><|start_header_id|>assistant<|end_header_id|> \ No newline at end of file diff --git a/tools/testdata/llama3-groq-tool-use.gotmpl b/tools/testdata/llama3-groq-tool-use.gotmpl deleted file mode 100644 index 45e9b462..00000000 --- a/tools/testdata/llama3-groq-tool-use.gotmpl +++ /dev/null @@ -1,43 +0,0 @@ -{{- if .Messages }} -{{- if or .System .Tools }}<|start_header_id|>system<|end_header_id|> - -{{ .System }} -{{- if .Tools }} You are provided with function signatures within XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. For each function call return a json object with function name and arguments within XML tags as follows: - -{"name": ,"arguments": } - - -Here are the available tools: - -{{- range .Tools }} {{ .Function }} -{{- end }} -{{- end }} -{{- end }}<|eot_id|> -{{- range .Messages }} -{{- if ne .Role "system" }}<|start_header_id|>{{ .Role }}<|end_header_id|> - -{{ if eq .Role "user" }}{{ .Content }} -{{- else if eq .Role "assistant" }} -{{- if .Content }}{{ .Content }} -{{- else if .ToolCalls }} -{{ range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}} -{{- end }} - -{{- end }} -{{- else if eq .Role "tool" }} -{{ .Content }} - -{{- end }}<|eot_id|> -{{- end }} -{{- end }}<|start_header_id|>assistant<|end_header_id|> - -{{ else }} -{{ if .System }}<|start_header_id|>system<|end_header_id|> - -{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|> - -{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|> - -{{ end }}{{ .Response }} -{{- if .Response }}<|eot_id|> -{{- end }} \ No newline at end of file diff --git a/tools/testdata/llama3-groq-tool-use.out b/tools/testdata/llama3-groq-tool-use.out deleted file mode 100644 index 912ad11c..00000000 --- a/tools/testdata/llama3-groq-tool-use.out +++ /dev/null @@ -1,24 +0,0 @@ -<|start_header_id|>system<|end_header_id|> - -You are a knowledgeable assistant. You can answer questions and perform tasks. You are provided with function signatures within XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. For each function call return a json object with function name and arguments within XML tags as follows: - -{"name": ,"arguments": } - - -Here are the available tools: - {"name":"get_current_weather","description":"Get the current weather","parameters":{"type":"object","required":["location","format"],"properties":{"format":{"type":"string","description":"The temperature unit to use. Infer this from the user's location.","enum":["celsius","fahrenheit"]},"location":{"type":"string","description":"The city and state, e.g. 
San Francisco, CA"}}}} <|eot_id|><|start_header_id|>user<|end_header_id|> - -What's the weather like today in Paris?<|eot_id|><|start_header_id|>assistant<|end_header_id|> - - -{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Paris, France"}} -<|eot_id|><|start_header_id|>tool<|end_header_id|> - - -22 -<|eot_id|><|start_header_id|>assistant<|end_header_id|> - -The current temperature in Paris, France is 22 degrees Celsius.<|eot_id|><|start_header_id|>user<|end_header_id|> - -What's the weather like today in San Francisco and Toronto?<|eot_id|><|start_header_id|>assistant<|end_header_id|> - diff --git a/tools/testdata/llama3.2.gotmpl b/tools/testdata/llama3.2.gotmpl deleted file mode 100644 index b132423e..00000000 --- a/tools/testdata/llama3.2.gotmpl +++ /dev/null @@ -1,44 +0,0 @@ -<|start_header_id|>system<|end_header_id|> - -Cutting Knowledge Date: December 2023 - -{{ if .System }}{{ .System }} -{{- end }} -{{- if .Tools }}When you receive a tool call response, use the output to format an answer to the orginal user question. - -You are a helpful assistant with tool calling capabilities. -{{- end }}<|eot_id|> -{{- range $i, $_ := .Messages }} -{{- $last := eq (len (slice $.Messages $i)) 1 }} -{{- if eq .Role "user" }}<|start_header_id|>user<|end_header_id|> -{{- if and $.Tools $last }} - -Given the following functions, please respond with a JSON for a function call with its proper arguments that best answers the given prompt. - -Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}. Do not use variables. - -{{ range $.Tools }} -{{- . }} -{{ end }} -{{ .Content }}<|eot_id|> -{{- else }} - -{{ .Content }}<|eot_id|> -{{- end }}{{ if $last }}<|start_header_id|>assistant<|end_header_id|> - -{{ end }} -{{- else if eq .Role "assistant" }}<|start_header_id|>assistant<|end_header_id|> -{{- if .ToolCalls }} -{{ range .ToolCalls }} -{"name": "{{ .Function.Name }}", "parameters": {{ .Function.Arguments }}}{{ end }} -{{- else }} - -{{ .Content }} -{{- end }}{{ if not $last }}<|eot_id|>{{ end }} -{{- else if eq .Role "tool" }}<|start_header_id|>ipython<|end_header_id|> - -{{ .Content }}<|eot_id|>{{ if $last }}<|start_header_id|>assistant<|end_header_id|> - -{{ end }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/tools/testdata/llama3.2.out b/tools/testdata/llama3.2.out deleted file mode 100644 index a27c6eaf..00000000 --- a/tools/testdata/llama3.2.out +++ /dev/null @@ -1,24 +0,0 @@ -<|start_header_id|>system<|end_header_id|> - -Cutting Knowledge Date: December 2023 - -You are a knowledgeable assistant. You can answer questions and perform tasks.When you receive a tool call response, use the output to format an answer to the orginal user question. - -You are a helpful assistant with tool calling capabilities.<|eot_id|><|start_header_id|>user<|end_header_id|> - -What's the weather like today in Paris?<|eot_id|><|start_header_id|>assistant<|end_header_id|> - -{"name": "get_current_weather", "parameters": {"format":"celsius","location":"Paris, France"}}<|eot_id|><|start_header_id|>ipython<|end_header_id|> - -22<|eot_id|><|start_header_id|>assistant<|end_header_id|> - -The current temperature in Paris, France is 22 degrees Celsius.<|eot_id|><|start_header_id|>user<|end_header_id|> - -Given the following functions, please respond with a JSON for a function call with its proper arguments that best answers the given prompt. 
- -Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}. Do not use variables. - -{"type":"function","function":{"name":"get_current_weather","description":"Get the current weather","parameters":{"type":"object","required":["location","format"],"properties":{"format":{"type":"string","description":"The temperature unit to use. Infer this from the user's location.","enum":["celsius","fahrenheit"]},"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"}}}}} - -What's the weather like today in San Francisco and Toronto?<|eot_id|><|start_header_id|>assistant<|end_header_id|> - diff --git a/tools/testdata/messages.json b/tools/testdata/messages.json deleted file mode 100644 index 42de4711..00000000 --- a/tools/testdata/messages.json +++ /dev/null @@ -1,39 +0,0 @@ -[ - { - "role": "system", - "content": "You are a knowledgeable assistant. You can answer questions and perform tasks." - }, - { - "role": "user", - "content": "What's the weather like today in Paris?" - }, - { - "role": "assistant", - "tool_calls": [ - { - "id": "89a1e453-0bce-4de3-a456-c54bed09c520", - "type": "function", - "function": { - "name": "get_current_weather", - "arguments": { - "location": "Paris, France", - "format": "celsius" - } - } - } - ] - }, - { - "role": "tool", - "tool_call_id": "89a1e453-0bce-4de3-a456-c54bed09c520", - "content": "22" - }, - { - "role": "assistant", - "content": "The current temperature in Paris, France is 22 degrees Celsius." - }, - { - "role": "user", - "content": "What's the weather like today in San Francisco and Toronto?" - } -] diff --git a/tools/testdata/mistral.gotmpl b/tools/testdata/mistral.gotmpl deleted file mode 100644 index b08d6c2c..00000000 --- a/tools/testdata/mistral.gotmpl +++ /dev/null @@ -1,15 +0,0 @@ -{{- range $index, $_ := .Messages }} -{{- if eq .Role "user" }} -{{- if and (eq (len (slice $.Messages $index)) 1) $.Tools }}[AVAILABLE_TOOLS] {{ $.Tools }}[/AVAILABLE_TOOLS] -{{- end }}[INST] {{ if and (eq (len (slice $.Messages $index)) 1) $.System }}{{ $.System }} - -{{ end }}{{ .Content }}[/INST] -{{- else if eq .Role "assistant" }} -{{- if .Content }} {{ .Content }} -{{- else if .ToolCalls }}[TOOL_CALLS] [ -{{- range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}} -{{- end }}] -{{- end }} -{{- else if eq .Role "tool" }}[TOOL_RESULTS] {"content": {{ .Content }}}[/TOOL_RESULTS] -{{- end }} -{{- end }} \ No newline at end of file diff --git a/tools/testdata/mistral.out b/tools/testdata/mistral.out deleted file mode 100644 index 6956e392..00000000 --- a/tools/testdata/mistral.out +++ /dev/null @@ -1,3 +0,0 @@ -[INST] What's the weather like today in Paris?[/INST][TOOL_CALLS] [{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Paris, France"}}][TOOL_RESULTS] {"content": 22}[/TOOL_RESULTS] The current temperature in Paris, France is 22 degrees Celsius.[AVAILABLE_TOOLS] [{"type":"function","function":{"name":"get_current_weather","description":"Get the current weather","parameters":{"type":"object","required":["location","format"],"properties":{"format":{"type":"string","description":"The temperature unit to use. Infer this from the user's location.","enum":["celsius","fahrenheit"]},"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"}}}}}][/AVAILABLE_TOOLS][INST] You are a knowledgeable assistant. You can answer questions and perform tasks. 
- -What's the weather like today in San Francisco and Toronto?[/INST] \ No newline at end of file diff --git a/tools/testdata/nemotron.gotmpl b/tools/testdata/nemotron.gotmpl deleted file mode 100644 index 1b6b89ec..00000000 --- a/tools/testdata/nemotron.gotmpl +++ /dev/null @@ -1,33 +0,0 @@ -{{- if (or .Tools .System) }}System -{{ if .System }}{{ .System }} - - -{{ end }} -{{- if .Tools }} -{{- range .Tools }} {{ . }} {{ end }} - - -{{ end }} -{{- end }} -{{- range $i, $m := .Messages }} -{{- $last := eq (len (slice $.Messages $i)) 1 -}} -{{- if eq .Role "user" }}User -{{ .Content }} -{{- if $last }} -Assistant -{{- end }} -{{ else if eq .Role "tool" }}Tool -{{ .Content }} -{{- if $last }} -Assistant -{{- end }} -{{ else if eq .Role "assistant" }}Assistant -{{- if .ToolCalls }} -{{ range .ToolCalls }} {"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}} {{ end }} -{{ else }} -{{ .Content }} -{{- if not $last }} -{{ end }} -{{- end }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/tools/testdata/nemotron.out b/tools/testdata/nemotron.out deleted file mode 100644 index 486889ca..00000000 --- a/tools/testdata/nemotron.out +++ /dev/null @@ -1,18 +0,0 @@ -System -You are a knowledgeable assistant. You can answer questions and perform tasks. - - - {"type":"function","function":{"name":"get_current_weather","description":"Get the current weather","parameters":{"type":"object","required":["location","format"],"properties":{"format":{"type":"string","description":"The temperature unit to use. Infer this from the user's location.","enum":["celsius","fahrenheit"]},"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"}}}}} - - -User -What's the weather like today in Paris? -Assistant - {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Paris, France"}} -Tool -22 -Assistant -The current temperature in Paris, France is 22 degrees Celsius. -User -What's the weather like today in San Francisco and Toronto? -Assistant diff --git a/tools/testdata/qwen2.5.gotmpl b/tools/testdata/qwen2.5.gotmpl deleted file mode 100644 index cbd7302c..00000000 --- a/tools/testdata/qwen2.5.gotmpl +++ /dev/null @@ -1,51 +0,0 @@ -{{- if .Suffix }}<|fim_prefix|>{{ .Prompt }}<|fim_suffix|>{{ .Suffix }}<|fim_middle|> -{{- else if .Messages }} -{{- if or .System .Tools }}<|im_start|>system -{{- if .System }} -{{ .System }} -{{- end }} -{{- if .Tools }} - -# Tools - -You may call one or more functions to assist with the user query. 
- -You are provided with function signatures within XML tags: - -{{- range .Tools }} -{"type": "function", "function": {{ .Function }}} -{{- end }} - - -For each function call, return a json object with function name and arguments within XML tags: - -{"name": , "arguments": } - -{{- end }}<|im_end|> -{{ end }} -{{- range $i, $_ := .Messages }} -{{- $last := eq (len (slice $.Messages $i)) 1 -}} -{{- if eq .Role "user" }}<|im_start|>user -{{ .Content }}<|im_end|> -{{ else if eq .Role "assistant" }}<|im_start|>assistant -{{ if .Content }}{{ .Content }} -{{- else if .ToolCalls }} -{{ range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}} -{{ end }} -{{- end }}{{ if not $last }}<|im_end|> -{{ end }} -{{- else if eq .Role "tool" }}<|im_start|>user - -{{ .Content }} -<|im_end|> -{{ end }} -{{- if and (ne .Role "assistant") $last }}<|im_start|>assistant -{{ end }} -{{- end }} -{{- else }} -{{- if .System }}<|im_start|>system -{{ .System }}<|im_end|> -{{ end }}{{ if .Prompt }}<|im_start|>user -{{ .Prompt }}<|im_end|> -{{ end }}<|im_start|>assistant -{{ end }}{{ .Response }}{{ if .Response }}<|im_end|>{{ end }} \ No newline at end of file diff --git a/tools/testdata/qwen2.5.out b/tools/testdata/qwen2.5.out deleted file mode 100644 index 76bfbfa9..00000000 --- a/tools/testdata/qwen2.5.out +++ /dev/null @@ -1,31 +0,0 @@ -<|im_start|>system -You are a knowledgeable assistant. You can answer questions and perform tasks. - -# Tools - -You may call one or more functions to assist with the user query. - -You are provided with function signatures within XML tags: - -{"type": "function", "function": {"name":"get_current_weather","description":"Get the current weather","parameters":{"type":"object","required":["location","format"],"properties":{"format":{"type":"string","description":"The temperature unit to use. Infer this from the user's location.","enum":["celsius","fahrenheit"]},"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"}}}}} - - -For each function call, return a json object with function name and arguments within XML tags: - -{"name": , "arguments": } -<|im_end|> -<|im_start|>user -What's the weather like today in Paris?<|im_end|> -<|im_start|>assistant - -{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Paris, France"}} -<|im_end|> -<|im_start|>user - -22 -<|im_end|> -<|im_start|>assistant -The current temperature in Paris, France is 22 degrees Celsius.<|im_end|> -<|im_start|>user -What's the weather like today in San Francisco and Toronto?<|im_end|> -<|im_start|>assistant diff --git a/tools/testdata/qwen3.gotmpl b/tools/testdata/qwen3.gotmpl deleted file mode 100644 index 26f6656f..00000000 --- a/tools/testdata/qwen3.gotmpl +++ /dev/null @@ -1,50 +0,0 @@ -{{- if .Messages }} -{{- if or .System .Tools }}<|im_start|>system -{{- if .System }} -{{ .System }} -{{- end }} -{{- if .Tools }} - -# Tools - -You may call one or more functions to assist with the user query. 
- -You are provided with function signatures within XML tags: - -{{- range .Tools }} -{"type": "function", "function": {{ .Function }}} -{{- end }} - - -For each function call, return a json object with function name and arguments within XML tags: - -{"name": , "arguments": } - -{{- end }}<|im_end|> -{{ end }} -{{- range $i, $_ := .Messages }} -{{- $last := eq (len (slice $.Messages $i)) 1 -}} -{{- if eq .Role "user" }}<|im_start|>user -{{ .Content }}<|im_end|> -{{ else if eq .Role "assistant" }}<|im_start|>assistant -{{ if .Content }}{{ .Content }} -{{- else if .ToolCalls }} -{{ range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}} -{{ end }} -{{- end }}{{ if not $last }}<|im_end|> -{{ end }} -{{- else if eq .Role "tool" }}<|im_start|>user - -{{ .Content }} -<|im_end|> -{{ end }} -{{- if and (ne .Role "assistant") $last }}<|im_start|>assistant -{{ end }} -{{- end }} -{{- else }} -{{- if .System }}<|im_start|>system -{{ .System }}<|im_end|> -{{ end }}{{ if .Prompt }}<|im_start|>user -{{ .Prompt }}<|im_end|> -{{ end }}<|im_start|>assistant -{{ end }}{{ .Response }}{{ if .Response }}<|im_end|>{{ end }} \ No newline at end of file diff --git a/tools/testdata/qwen3.out b/tools/testdata/qwen3.out deleted file mode 100644 index 76bfbfa9..00000000 --- a/tools/testdata/qwen3.out +++ /dev/null @@ -1,31 +0,0 @@ -<|im_start|>system -You are a knowledgeable assistant. You can answer questions and perform tasks. - -# Tools - -You may call one or more functions to assist with the user query. - -You are provided with function signatures within XML tags: - -{"type": "function", "function": {"name":"get_current_weather","description":"Get the current weather","parameters":{"type":"object","required":["location","format"],"properties":{"format":{"type":"string","description":"The temperature unit to use. Infer this from the user's location.","enum":["celsius","fahrenheit"]},"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"}}}}} - - -For each function call, return a json object with function name and arguments within XML tags: - -{"name": , "arguments": } -<|im_end|> -<|im_start|>user -What's the weather like today in Paris?<|im_end|> -<|im_start|>assistant - -{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Paris, France"}} -<|im_end|> -<|im_start|>user - -22 -<|im_end|> -<|im_start|>assistant -The current temperature in Paris, France is 22 degrees Celsius.<|im_end|> -<|im_start|>user -What's the weather like today in San Francisco and Toronto?<|im_end|> -<|im_start|>assistant diff --git a/tools/testdata/tools.json b/tools/testdata/tools.json deleted file mode 100644 index edde4ae0..00000000 --- a/tools/testdata/tools.json +++ /dev/null @@ -1,30 +0,0 @@ -[ - { - "type": "function", - "function": { - "name": "get_current_weather", - "description": "Get the current weather", - "parameters": { - "type": "object", - "properties": { - "location": { - "type": "string", - "description": "The city and state, e.g. San Francisco, CA" - }, - "format": { - "type": "string", - "enum": [ - "celsius", - "fahrenheit" - ], - "description": "The temperature unit to use. Infer this from the user's location." 
- } - }, - "required": [ - "location", - "format" - ] - } - } - } -] diff --git a/tools/testdata/xlam.gotmpl b/tools/testdata/xlam.gotmpl deleted file mode 100644 index 51513d69..00000000 --- a/tools/testdata/xlam.gotmpl +++ /dev/null @@ -1,45 +0,0 @@ -{{- if .System }}{{ .System }} -{{ end }} -{{- range $i, $_ := .Messages }} -{{- if eq .Role "user" }}### Instruction: -{{- if and $.Tools (le (len (slice $.Messages $i)) 2) }} -[BEGIN OF TASK INSTRUCTION] -You are an expert in composing functions. You are given a question and a set of possible functions. -Based on the question, you will need to make one or more function/tool calls to achieve the purpose. -If none of the functions can be used, point it out and refuse to answer. -If the given question lacks the parameters required by the function, also point it out. -[END OF TASK INSTRUCTION] - -[BEGIN OF AVAILABLE TOOLS] -{{ $.Tools }} -[END OF AVAILABLE TOOLS] - -[BEGIN OF FORMAT INSTRUCTION] -The output MUST strictly adhere to the following JSON format, and NO other text MUST be included. -The example format is as follows. Please make sure the parameter type is correct. If no function call is needed, please make tool_calls an empty list '[]'. -``` -{ - "tool_calls": [ - {"name": "func_name1", "arguments": {"argument1": "value1", "argument2": "value2"}}, - ... (more tool calls as required) - ] -} -``` -[END OF FORMAT INSTRUCTION] - -[BEGIN OF QUERY] -{{ .Content }} -[END OF QUERY] - - -{{ else }} -{{ .Content }} -{{ end }} -{{- else if .ToolCalls }}### Response: -{"tool_calls": [{{ range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}{{ end }}]} -<|EOT|> -{{ else if eq .Role "assistant" }}### Response: -{{ .Content }} -<|EOT|> -{{ end }} -{{- end }}### Response: \ No newline at end of file diff --git a/tools/testdata/xlam.out b/tools/testdata/xlam.out deleted file mode 100644 index 5d806532..00000000 --- a/tools/testdata/xlam.out +++ /dev/null @@ -1,40 +0,0 @@ -You are a knowledgeable assistant. You can answer questions and perform tasks. -### Instruction: -What's the weather like today in Paris? -### Response: -{"tool_calls": [{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Paris, France"}}]} -<|EOT|> -### Response: -The current temperature in Paris, France is 22 degrees Celsius. -<|EOT|> -### Instruction: -[BEGIN OF TASK INSTRUCTION] -You are an expert in composing functions. You are given a question and a set of possible functions. -Based on the question, you will need to make one or more function/tool calls to achieve the purpose. -If none of the functions can be used, point it out and refuse to answer. -If the given question lacks the parameters required by the function, also point it out. -[END OF TASK INSTRUCTION] - -[BEGIN OF AVAILABLE TOOLS] -[{"type":"function","function":{"name":"get_current_weather","description":"Get the current weather","parameters":{"type":"object","required":["location","format"],"properties":{"format":{"type":"string","description":"The temperature unit to use. Infer this from the user's location.","enum":["celsius","fahrenheit"]},"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"}}}}}] -[END OF AVAILABLE TOOLS] - -[BEGIN OF FORMAT INSTRUCTION] -The output MUST strictly adhere to the following JSON format, and NO other text MUST be included. -The example format is as follows. Please make sure the parameter type is correct. If no function call is needed, please make tool_calls an empty list '[]'. 
-``` -{ - "tool_calls": [ - {"name": "func_name1", "arguments": {"argument1": "value1", "argument2": "value2"}}, - ... (more tool calls as required) - ] -} -``` -[END OF FORMAT INSTRUCTION] - -[BEGIN OF QUERY] -What's the weather like today in San Francisco and Toronto? -[END OF QUERY] - - -### Response: \ No newline at end of file diff --git a/tools/tools.go b/tools/tools.go index 914a5eaf..efeaeee0 100644 --- a/tools/tools.go +++ b/tools/tools.go @@ -1,253 +1,287 @@ package tools import ( + "bytes" "encoding/json" - "errors" - "log/slog" "strings" - gotmpl "text/template" + "text/template" "github.com/ollama/ollama/api" - "github.com/ollama/ollama/template" ) -var ( - errInvalidToolCall = errors.New("invalid tool call format") - errAccumulateMore = errors.New("need to accumulate more content") +type toolsState int + +const ( + toolsState_LookingForTag toolsState = iota + toolsState_ToolCalling + toolsState_Done ) type Parser struct { - greedyParseJSON bool - prefix string - prefixFound bool - tmpl gotmpl.Template - sb strings.Builder - index int - name string - arguments string + tag string + names []string + properties []string + + state toolsState + buffer []byte + n int } -// parseJSONToolCalls attempts to parse a JSON string into a slice of ToolCalls. -// -// Parameters: -// - s: The string to parse -// - name: The field name from template that identifies the tool call name -// - arguments: The field name from template that identifies the tool call arguments -// -// Returns: -// - []api.ToolCall: The parsed tool calls if successful -// - error: ErrAccumulateMore if braces unbalanced, ErrInvalidToolCall if invalid, or nil if successful -func parseJSONToolCalls(s string, name, arguments string, prefix string) ([]api.ToolCall, error) { - // Check for balanced braces before attempting to parse - braceCount := 0 - squareCount := 0 - startIndex := -1 - var rawToolCalls []string - s = strings.TrimSpace(s) - - // Only track these if we don't have a prefix as it will be cut off from the prefix. Also track in the parseLeadingJSON case. 
- trackSquareBrackets := prefix == "" || !strings.HasSuffix(prefix, "[") || strings.HasPrefix(s, "[") - for i, c := range s { - switch c { - case '{': - braceCount++ - if startIndex == -1 { - startIndex = i - } - case '}': - braceCount-- - if braceCount == 0 { - rawToolCalls = append(rawToolCalls, s[startIndex:i+1]) - startIndex = -1 - } - case '[': - if trackSquareBrackets { - squareCount++ - } - case ']': - if trackSquareBrackets { - squareCount-- - } - } - - // Negative means we have an extra closing brace/bracket - if braceCount < 0 || squareCount < 0 { - return nil, errInvalidToolCall - } - } - - // If braces/brackets aren't balanced, need more input - if braceCount > 0 || squareCount > 0 { - return nil, errAccumulateMore - } - - t := strings.TrimSpace(s) - if len(t) == 0 { - return nil, errAccumulateMore - } - // If the input is a single square bracket, it's not a valid tool call - if t[0] == '[' && len(t) == 1 { - return nil, errAccumulateMore - } - - // Attempt full unmarshal of the JSON - var toolCalls []api.ToolCall - for _, rawToolCall := range rawToolCalls { - var resp map[string]any - if err := json.Unmarshal([]byte(rawToolCall), &resp); err != nil { - continue - } - - // Collect nested objects that could contain tool calls - objs := collect(resp) - if len(objs) == 0 { - continue - } - - // Extract tool calls from objects - for _, kv := range objs { - n, nok := kv[name].(string) - a, aok := kv[arguments].(map[string]any) - if nok && aok { - toolCalls = append(toolCalls, api.ToolCall{ - Function: api.ToolCallFunction{ - Name: n, - Arguments: a, - }, - }) - } else { - slog.Debug("No valid tool call found in object.", "object", kv) - } - } - } - - // Valid JSON, no tool calls found - if len(toolCalls) == 0 { - slog.Debug("No valid tool calls found in any raw tool calls.", "rawToolCalls", rawToolCalls) - return nil, errInvalidToolCall - } - - return toolCalls, nil +// NewParser creates a new tool call parser from a model's chat +// template and a list of provided tools. +func NewParser(tmpl *template.Template, tools []api.Tool) *Parser { + return NewParserWithTag(tools, parseTag(tmpl)) } -// checkPrefix processes a string to find and handle a prefix pattern. 
-// -// Returns: -// - The processed string with prefix removed if found -// - error: ErrAccumulateMore if prefix is incomplete, or nil if successful -func (p *Parser) checkPrefix(s string) (string, error) { - if s == "" || p.prefix == "" { - return s, nil +func NewParserWithTag(tools []api.Tool, tag string) *Parser { + var p Parser + for _, t := range tools { + p.names = append(p.names, t.Function.Name) + for r := range t.Function.Parameters.Properties { + p.properties = append(p.properties, r) + } } - - // Check for prefix at start of string - if cut, hasPrefix := strings.CutPrefix(s, p.prefix); hasPrefix { - // Found prefix at start - accumulate for potential tool - p.prefixFound = true - return cut, nil - } - - // Check if prefix overlaps end of string - if idx := suffixOverlap(s, p.prefix); idx != -1 { - // Return everything except overlapping portion - p.sb.Reset() - p.sb.WriteString(s[idx:]) - return s[:idx], errAccumulateMore - } - - // Check if prefix appears in middle of string - if idx := strings.Index(s, p.prefix); idx != -1 { - // Save remainder starting at prefix for next pass - p.sb.Reset() - p.sb.WriteString(strings.TrimSpace(s[idx:])) - // Return everything before prefix - return s[:idx], errAccumulateMore - } - - // No partial prefix found - return s, nil + p.tag = tag + return &p } -// Add processes a string input to parse tool calls and content. -// It handles prefix detection and JSON parsing to extract tool calls. -// -// Returns: -// - tools: Any parsed tool calls -// - content: Non-tool call content -func (p *Parser) Add(s string) (tools []api.ToolCall, content string) { - p.sb.WriteString(s) - s = p.sb.String() - - // Check for prefix pattern in input - s, err := p.checkPrefix(s) - if err != nil { - // Need more input to complete prefix +// Add processes a string input to parse tool calls and content that +// should be sent back to the user. +func (p *Parser) Add(s string) (calls []api.ToolCall, content string) { + if p.state == toolsState_Done { return nil, s } - // Exit if prefix exists in template, greedy parsing is off, and prefix not found - if !p.greedyParseJSON && !p.prefixFound { - p.sb.Reset() - return nil, s + p.buffer = append(p.buffer, s...) 
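	// A minimal sketch of how a caller is expected to drive this parser while
	// streaming model output; the NewParser and Content calls mirror the
	// server/routes.go changes in this patch, while stream, chunk and trailing
	// are illustrative names only:
	//
	//	p := tools.NewParser(m.Template.Template, req.Tools)
	//	for chunk := range stream {
	//		calls, content := p.Add(chunk) // tool calls plus user-visible text
	//		// forward calls and content to the client as they arrive
	//	}
	//	trailing := p.Content() // remaining buffered text that should still reach the user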
+ + if p.state == toolsState_LookingForTag { + i, found := p.findTag() + if i == -1 { + content = string(p.buffer) + p.buffer = []byte{} + } else { + content = string(p.buffer[:i]) + p.buffer = p.buffer[i:] + } + + // for models where { or [ are used as tool calling + // tags, we only support parsing tools if the first non- + // whitespace character is { or [ + if p.tag == "{" || p.tag == "[" { + if strings.TrimSpace(content) != "" { + p.state = toolsState_Done + return nil, content + string(p.buffer) + } + } + + if !found { + return nil, content + } + + p.state = toolsState_ToolCalling } - toolCalls, err := parseJSONToolCalls(s, p.name, p.arguments, p.prefix) - if err != nil { - if errors.Is(err, errAccumulateMore) { - return nil, "" + for { + call := p.parseToolCall() + if call == nil { + break } - p.sb.Reset() - // Only do greedy JSON parsing if there is no prefix from template - if p.prefix != "" { - p.greedyParseJSON = false - } - if p.index != 0 && p.prefix == "" { - return nil, "" - } - if p.prefixFound { - // Drop tokens since prefix was found - return nil, "" - } - return nil, s + + calls = append(calls, *call) } - for _, tc := range toolCalls { - tc.Function.Index = p.index - p.index++ + if p.done() { + p.state = toolsState_Done + content = string(p.buffer) + p.buffer = []byte{} } - p.sb.Reset() - return toolCalls, "" + return calls, content } -// NewParser creates a new tool call parser from a template. It extracts the tool call format, -// prefix, and field names from the template to use for parsing tool calls from model output. -// -// Returns an error if the template does not contain valid tool call formatting. -func NewParser(templateToProcess *gotmpl.Template) (*Parser, error) { - parsed, err := template.Parse(templateToProcess.Root.String()) - if err != nil { - return nil, err +// findTag searches the buffer to find and handle a tool calling tag +// returning true if the tag was found and false otherwise, and +// a string content signaling any content that should be sent back to the user +func (p *Parser) findTag() (int, bool) { + // First check for complete substring anywhere in s + if i := bytes.Index(p.buffer, []byte(p.tag)); i > -1 { + return i, true } - tt, err := toolTemplate(parsed) - if err != nil { - return nil, err + // Then check for partial suffix overlap + max := min(len(p.buffer), len(p.tag)) + for i := max; i > 0; i-- { + if bytes.HasSuffix(p.buffer, []byte(p.tag[:i])) { + return len(p.buffer) - i, false + } } - - tp := toolPrefix(templateToProcess) - - name, arguments, err := extractToolArgs(tt) - if err != nil { - return nil, err - } - - return &Parser{ - tmpl: *tt, - sb: strings.Builder{}, - prefix: tp, - greedyParseJSON: true, - name: name, - arguments: arguments, - }, nil + return -1, false +} + +// parseToolCall finds the next complete tool call in the buffer +// incrementing n and advancing the buffer. 
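// For example, with get_temperature registered as a tool that has a "city"
// property, a buffer holding
//	{"name": "get_temperature", "arguments": {"city": "Tokyo"}}
// yields one api.ToolCall with Name "get_temperature" and Arguments
// {"city": "Tokyo"}, and the consumed bytes are dropped from the buffer
// (a sketch of the behavior exercised by the "json" test case below).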
+func (p *Parser) parseToolCall() *api.ToolCall { + var name string + var args map[string]any + var end int = len(p.buffer) + + // find tool name + var i int + for _, n := range p.names { + if i = bytes.Index(p.buffer, []byte(n)); i != -1 { + if i+len(n) < end { + name = n + end = i + len(n) + } + } + } + + if name == "" { + return nil + } + + if args, i = p.findArguments(); args == nil { + return nil + } + + if i > end { + end = i + } + + tc := &api.ToolCall{ + Function: api.ToolCallFunction{ + Name: name, + Arguments: args, + Index: p.n, + }, + } + + p.n++ + p.buffer = p.buffer[end:] + return tc +} + +// findArguments returns the first object that appears to be +// arguments and the position where the arguments end, returning nil and 0 if +// an invalid JSON object or non-arguments object is found first +func (p *Parser) findArguments() (map[string]any, int) { + if len(p.buffer) == 0 { + return nil, 0 + } + + var braces int + var start int = -1 + var end int + var object []byte + + // find any outer json object + for i, c := range p.buffer { + if c == '{' { + braces++ + if start == -1 { + start = i + } + } + + if c == '}' { + braces-- + if braces == 0 && start != -1 { + end = i + 1 + object = p.buffer[start:end] + break + } + } + } + + if braces > 0 { + return nil, 0 + } + + var data map[string]any + + // not valid json + if err := json.Unmarshal(object, &data); err != nil { + return nil, 0 + } + + var find func(obj any) map[string]any + find = func(obj any) map[string]any { + switch v := obj.(type) { + case map[string]any: + // check if the object keys are valid tool properties + // TODO (jmorganca): check only sets of properties that + // go together instead of the entire set + for _, prop := range p.properties { + if _, exists := v[prop]; exists { + return v + } + } + + for _, value := range v { + if result := find(value); result != nil { + return result + } + } + case []any: + for _, item := range v { + if result := find(item); result != nil { + return result + } + } + } + + return nil + } + + result := find(data) + if result != nil { + return result, end + } + + return nil, 0 +} + +// done checks if the parser is done parsing by looking +// for closing tag. currently only } and ] are supported +// for closing tags as {} or [] pairs may not always +// represent tool calls and we need to send the content back +func (p *Parser) done() bool { + var open, close rune + switch p.tag { + case "{": + open, close = '{', '}' + case "[": + open, close = '[', ']' + default: + return false + } + + var count int + for _, c := range p.buffer { + if c == byte(open) { + count++ + } else if c == byte(close) { + count-- + if count == 0 { + return true + } + } + } + + return false +} + +// Content returns any remaining content that +// should be sent to the user. 
This should be the empty string +// string unless the tag is { or [ and a tool call was not found +func (p *Parser) Content() string { + if p.n > 0 { + return "" + } + + if p.tag == "{" || p.tag == "[" { + return string(p.buffer) + } + + return "" } diff --git a/tools/tools_test.go b/tools/tools_test.go index 5fee8f57..67864168 100644 --- a/tools/tools_test.go +++ b/tools/tools_test.go @@ -1,673 +1,805 @@ package tools import ( - "bytes" - "encoding/json" - "fmt" - "os" - "path/filepath" - "strings" "testing" + "text/template" "github.com/google/go-cmp/cmp" - "github.com/ollama/ollama/api" - "github.com/ollama/ollama/template" ) -func readFile(t *testing.T, base, name string) *bytes.Buffer { - t.Helper() - - bts, err := os.ReadFile(filepath.Join(base, name)) +func TestParser(t *testing.T) { + qwen, err := template.New("qwen").Parse(`{{if .ToolCalls}}{{range .ToolCalls}}{"name": "{{.Function.Name}}", "arguments": {{.Function.Arguments}}}{{end}}{{end}}`) if err != nil { - t.Fatal(err) + t.Fatalf("Failed to parse template: %v", err) } - return bytes.NewBuffer(bts) -} + deepseek, err := template.New("deepseek").Parse("{{if .ToolCalls}}<|tool▁calls▁begin|>{{range .ToolCalls}}<|tool▁call▁begin|>function<|tool▁sep|>get_current_weather\n```json\n{\"location\": \"Tokyo\"}\n```<|tool▁call▁end|>{{end}}<|tool▁calls▁end|><|end▁of▁sentence|>{{end}}") + if err != nil { + t.Fatalf("Failed to parse template: %v", err) + } + + json, err := template.New("json").Parse(`{{if .ToolCalls}}{{range .ToolCalls}}{"name": "{{.Function.Name}}", "arguments": {{.Function.Arguments}}}{{end}}{{end}}`) + if err != nil { + t.Fatalf("Failed to parse template: %v", err) + } + + mistral, err := template.New("mistral").Parse(`{{if .ToolCalls}}[TOOL_CALLS] [{{range .ToolCalls}}{"name": "{{.Function.Name}}", "arguments": {{.Function.Arguments}}}{{end}}][/TOOL_CALLS]{{end}}`) + if err != nil { + t.Fatalf("Failed to parse template: %v", err) + } + + list, err := template.New("list").Parse(`{{if .ToolCalls}}[{{range .ToolCalls}}{"name": "{{.Function.Name}}", "arguments": {{.Function.Arguments}}}{{end}}]{{end}}`) + if err != nil { + t.Fatalf("Failed to parse template: %v", err) + } + + tools := []api.Tool{ + { + Type: "function", + Function: api.ToolFunction{ + Name: "get_temperature", + Description: "Retrieve the temperature for a given location", + Parameters: struct { + Type string `json:"type"` + Defs any `json:"$defs,omitempty"` + Items any `json:"items,omitempty"` + Required []string `json:"required"` + Properties map[string]struct { + Type api.PropertyType `json:"type"` + Items any `json:"items,omitempty"` + Description string `json:"description"` + Enum []any `json:"enum,omitempty"` + } `json:"properties"` + }{ + Type: "object", + Properties: map[string]struct { + Type api.PropertyType `json:"type"` + Items any `json:"items,omitempty"` + Description string `json:"description"` + Enum []any `json:"enum,omitempty"` + }{ + "format": { + Type: api.PropertyType{"string"}, + Description: "The format to return the temperature in", + Enum: []any{"fahrenheit", "celsius"}, + }, + "city": { + Type: api.PropertyType{"string"}, + Description: "The city to get the temperature for", + }, + }, + }, + }, + }, + { + Type: "function", + Function: api.ToolFunction{ + Name: "get_conditions", + Description: "Retrieve the current weather conditions for a given location", + Parameters: struct { + Type string `json:"type"` + Defs any `json:"$defs,omitempty"` + Items any `json:"items,omitempty"` + Required []string `json:"required"` + Properties 
map[string]struct { + Type api.PropertyType `json:"type"` + Items any `json:"items,omitempty"` + Description string `json:"description"` + Enum []any `json:"enum,omitempty"` + } `json:"properties"` + }{ + Type: "object", + Properties: map[string]struct { + Type api.PropertyType `json:"type"` + Items any `json:"items,omitempty"` + Description string `json:"description"` + Enum []any `json:"enum,omitempty"` + }{ + "location": { + Type: api.PropertyType{"string"}, + Description: "The location to get the weather conditions for", + }, + }, + }, + }, + }, + } -func TestParseJSONToolCalls(t *testing.T) { tests := []struct { - name string - input string - nameField string - argsField string - wantToolCalls []api.ToolCall - wantErr error - prefix string + name string + inputs []string + tmpl *template.Template + content string + calls []api.ToolCall }{ { - name: "valid single tool call", - input: `{"name": "test_tool", "arguments": {"arg1": "value1"}}`, - nameField: "name", - argsField: "arguments", - wantToolCalls: []api.ToolCall{ + name: "no tool calls - just text", + inputs: []string{"Hello, how can I help you today?"}, + content: "Hello, how can I help you today?", + tmpl: qwen, + calls: nil, + }, + { + name: "empty input", + inputs: []string{""}, + content: "", + tmpl: qwen, + calls: nil, + }, + { + name: "tool call", + inputs: []string{`{"name": "get_conditions", "arguments": {"location": "San Francisco"}}`}, + content: "", + tmpl: qwen, + calls: []api.ToolCall{ { Function: api.ToolCallFunction{ - Name: "test_tool", - Arguments: map[string]any{ - "arg1": "value1", + Index: 0, + Name: "get_conditions", + Arguments: api.ToolCallFunctionArguments{ + "location": "San Francisco", }, }, }, }, - wantErr: nil, - prefix: "", }, { - name: "incomplete JSON", - input: `{"name": "test_tool", "arguments": {"arg1": `, - nameField: "name", - argsField: "arguments", - wantToolCalls: nil, - wantErr: errAccumulateMore, - prefix: "", - }, - { - name: "invalid JSON", - input: `not json at all`, - nameField: "name", - argsField: "arguments", - wantToolCalls: nil, - wantErr: errInvalidToolCall, - prefix: "", - }, - { - name: "missing required fields", - input: `{"other": "field"}`, - nameField: "name", - argsField: "arguments", - wantToolCalls: nil, - wantErr: errInvalidToolCall, - prefix: "", - }, - { - name: "multiple tool calls in array", - input: `[ - {"name": "tool1", "arguments": {"arg1": 1}}, - {"name": "tool2", "arguments": {"arg2": "value"}} - ]`, - nameField: "name", - argsField: "arguments", - wantToolCalls: []api.ToolCall{ + name: "text before tool call", + inputs: []string{`Let me check the weather. {"name": "get_temperature", "arguments": {"city": "New York"}}`}, + content: "Let me check the weather. 
", + tmpl: qwen, + calls: []api.ToolCall{ { Function: api.ToolCallFunction{ - Name: "tool1", - Arguments: map[string]any{ - "arg1": float64(1), - }, - }, - }, - { - Function: api.ToolCallFunction{ - Name: "tool2", - Arguments: map[string]any{ - "arg2": "value", + Index: 0, + Name: "get_temperature", + Arguments: api.ToolCallFunctionArguments{ + "city": "New York", }, }, }, }, - wantErr: nil, - prefix: "", }, { - name: "multiple tool calls without array", - input: ` - {"name": "tool1", "arguments": {"arg1": 1}}, - {"name": "tool2", "arguments": {"arg2": "value"}} - `, - nameField: "name", - argsField: "arguments", - wantToolCalls: []api.ToolCall{ + name: "two tool calls in a list", + inputs: []string{`[TOOL_CALLS] [{"name": "get_temperature", "arguments": {"city": "London", "format": "fahrenheit"}}, {"name": "get_conditions", "arguments": {"location": "Tokyo"}}][/TOOL_CALLS]`}, + content: "", + tmpl: mistral, + calls: []api.ToolCall{ { Function: api.ToolCallFunction{ - Name: "tool1", - Arguments: map[string]any{ - "arg1": float64(1), + Index: 0, + Name: "get_temperature", + Arguments: api.ToolCallFunctionArguments{ + "city": "London", + "format": "fahrenheit", }, }, }, { Function: api.ToolCallFunction{ - Name: "tool2", - Arguments: map[string]any{ - "arg2": "value", + Index: 1, + Name: "get_conditions", + Arguments: api.ToolCallFunctionArguments{ + "location": "Tokyo", }, }, }, }, - wantErr: nil, - prefix: "", }, { - name: "multiple tool calls with text after", - input: ` - {"name": "tool1", "arguments": {"arg1": 1}} text - {"name": "tool2", "arguments": {"arg2": "value"}} text - `, - nameField: "name", - argsField: "arguments", - wantToolCalls: []api.ToolCall{ + name: "two tool calls", + inputs: []string{`Okay, let's call both tools! {"name": "get_temperature", "arguments": {"city": "London", "format": "fahrenheit"}}{"name": "get_conditions", "arguments": {"location": "Tokyo"}}`}, + content: "Okay, let's call both tools! 
", + tmpl: qwen, + calls: []api.ToolCall{ { Function: api.ToolCallFunction{ - Name: "tool1", - Arguments: map[string]any{ - "arg1": float64(1), + Index: 0, + Name: "get_temperature", + Arguments: api.ToolCallFunctionArguments{ + "city": "London", + "format": "fahrenheit", }, }, }, { Function: api.ToolCallFunction{ - Name: "tool2", - Arguments: map[string]any{ - "arg2": "value", + Index: 1, + Name: "get_conditions", + Arguments: api.ToolCallFunctionArguments{ + "location": "Tokyo", }, }, }, }, - wantErr: nil, - prefix: "", }, { - name: "second tool call in array", - input: ` - , {"name": "tool2", "arguments": {"arg2": "value"}} - `, - nameField: "name", - argsField: "arguments", - wantToolCalls: []api.ToolCall{ + name: "deepseek", + inputs: []string{"Wait, I need to call a tool<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>get_temperature\n```json\n{\"city\": \"Tokyo\"}\n```<|tool▁call▁end|><|tool▁calls▁end|><|end▁of▁sentence|>"}, + content: "Wait, I need to call a tool", + tmpl: deepseek, + calls: []api.ToolCall{ { Function: api.ToolCallFunction{ - Name: "tool2", - Arguments: map[string]any{ - "arg2": "value", + Index: 0, + Name: "get_temperature", + Arguments: api.ToolCallFunctionArguments{ + "city": "Tokyo", }, }, }, }, - wantErr: nil, - prefix: "", - }, - // a bad JSON would not return any tool calls or content as it would always accumulate more - { - name: "unbalanced square brackets", - input: `[{"name": "tool1", "arguments": {"arg1": [1, 2}]`, - nameField: "name", - argsField: "arguments", - wantToolCalls: nil, - wantErr: errAccumulateMore, - prefix: "", }, { - name: "incomplete square brackets", - input: `[{"name": "tool1", "arguments": {"arg1": [1, 2, 3`, - nameField: "name", - argsField: "arguments", - wantToolCalls: nil, - wantErr: errAccumulateMore, - prefix: "", - }, - { - name: "nested arrays in arguments", - input: `{"name": "tool1", "arguments": {"arg1": [1, 2, ["nested", "array"]]}}`, - nameField: "name", - argsField: "arguments", - wantToolCalls: []api.ToolCall{ + name: "deepseek incremental", + inputs: []string{ + "Wait", + ", I need", + " to call", + " a tool<|too", + "l▁calls▁begin", + "|>", + "<|tool▁call▁begin|>function<|tool▁sep|>get_temperature\n", + "```json\n", + "{\"city\": \"Tokyo\"}\n", + "```", + "<|tool▁c", "all▁end|>", + "<|tool▁calls▁end|>", + "<|end▁of▁sentence|>", + }, + content: "Wait, I need to call a tool", + tmpl: deepseek, + calls: []api.ToolCall{ { Function: api.ToolCallFunction{ - Name: "tool1", - Arguments: map[string]any{ - "arg1": []any{float64(1), float64(2), []any{"nested", "array"}}, + Index: 0, + Name: "get_temperature", + Arguments: api.ToolCallFunctionArguments{ + "city": "Tokyo", }, }, }, }, - wantErr: nil, - prefix: "", + }, + { + name: "json", + inputs: []string{ + "{", + "\"name\": \"get_temperature\",", + "\"arguments\": {", + "\"city\": \"Tokyo\"", + "}", + "}", + }, + content: "", + tmpl: json, + calls: []api.ToolCall{ + { + Function: api.ToolCallFunction{ + Index: 0, + Name: "get_temperature", + Arguments: api.ToolCallFunctionArguments{ + "city": "Tokyo", + }, + }, + }, + }, + }, + { + name: "json maybe a tool call", + inputs: []string{ + "{", + "\"name\": \"get_temperature\",", + "\"arguments\": {", + }, + content: "", + tmpl: json, + calls: nil, + }, + { + name: "json not a tool call", + inputs: []string{ + "{", + "\"name\": \"search\", ", + "\"arguments\": {", + "\"query\": \"What is the capital of Canada?\"", + "}", + "}", + }, + content: "{\"name\": \"search\", \"arguments\": {\"query\": \"What is the capital of 
Canada?\"}}", + tmpl: json, + calls: nil, + }, + { + name: "json object followed by tool call", + inputs: []string{ + "{\"name\": \"jeff\"}", + "{\"name\": \"get_conditions\", \"arguments\": {\"location\": \"San Francisco\"}}", + }, + content: "{\"name\": \"jeff\"}{\"name\": \"get_conditions\", \"arguments\": {\"location\": \"San Francisco\"}}", + tmpl: json, + }, + { + name: "json object followed by tool call split", + inputs: []string{ + "{\"name\": \"jeff\"} {", + "\"name\": \"get_conditions\", \"arguments\": {\"location\": \"San Francisco\"}}", + }, + content: "{\"name\": \"jeff\"} {\"name\": \"get_conditions\", \"arguments\": {\"location\": \"San Francisco\"}}", + tmpl: json, + }, + { + name: "json code", + inputs: []string{ + "for { fmt.Println(\"hello\") }", + }, + content: "for { fmt.Println(\"hello\") }", + tmpl: json, + }, + { + name: "list multiple", + inputs: []string{ + "[", + "{", + "\"name\": \"get_temperature\", ", + "\"arguments\": {", + "\"city\": \"London\"", + "}", + "},", + "{", + "\"name\": \"get_conditions\", ", + "\"arguments\": {", + "\"location\": \"Tokyo\"", + "}", + "}]", + }, + content: "", + tmpl: list, + calls: []api.ToolCall{ + { + Function: api.ToolCallFunction{ + Index: 0, + Name: "get_temperature", + Arguments: api.ToolCallFunctionArguments{ + "city": "London", + }, + }, + }, + { + Function: api.ToolCallFunction{ + Index: 1, + Name: "get_conditions", + Arguments: api.ToolCallFunctionArguments{ + "location": "Tokyo", + }, + }, + }, + }, + }, + { + name: "list partial", + inputs: []string{ + "[", + "{", + "\"name\": \"search\", ", + "\"arguments\": {", + "\"query\": \"What is the capital of Canada?\"", + "}", + "}", + }, + content: "", + tmpl: list, + calls: nil, + }, + { + name: "list not a tool call", + inputs: []string{ + "[special", + " del", + "ivery]", + }, + content: "[special delivery]", + tmpl: list, + calls: nil, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - gotCalls, err := parseJSONToolCalls(tt.input, tt.nameField, tt.argsField, tt.prefix) + parser := NewParser(tt.tmpl, tools) - if err != tt.wantErr { - t.Errorf("parseJSONToolCalls() error = %v, want %v", err, tt.wantErr) + var calls []api.ToolCall + var content string + for _, input := range tt.inputs { + tcs, c := parser.Add(input) + calls = append(calls, tcs...) 
+ content += c } - if len(gotCalls) != 0 && tt.wantErr != nil { - t.Errorf("parseJSONToolCalls() valid = %v, want %v", len(gotCalls) == 0, tt.wantErr == nil) + if content != tt.content { + t.Errorf("Expected content %q, got %q", tt.content, content) } - if diff := cmp.Diff(gotCalls, tt.wantToolCalls); diff != "" { - t.Errorf("parseJSONToolCalls() tool calls mismatch (-got +want):\n%s", diff) + if len(calls) != len(tt.calls) { + t.Fatalf("Expected %d tool calls, got %d", len(tt.calls), len(calls)) + } + + for i, want := range tt.calls { + if diff := cmp.Diff(calls[i], want); diff != "" { + t.Errorf("Tool call %d mismatch (-got +want):\n%s", i, diff) + } } }) } } -func TestParseToolCalls(t *testing.T) { - p := filepath.Join("testdata") - t1 := api.ToolCall{ - Function: api.ToolCallFunction{ - Name: "get_current_weather", - Arguments: api.ToolCallFunctionArguments{ - "format": "fahrenheit", - "location": "San Francisco, CA", - }, - }, - } - t2 := api.ToolCall{ - Function: api.ToolCallFunction{ - Name: "get_current_weather", - Arguments: api.ToolCallFunctionArguments{ - "format": "celsius", - "location": "Toronto, Canada", - }, - }, - } - - cases := []struct { - name string - model string - output string - expectedToolCall []api.ToolCall - expectedTokens string +func TestDone(t *testing.T) { + tests := []struct { + name string + tag string + buffer []byte + want bool }{ { - name: "mistral malformed json with tool calls prefix", - model: "mistral", - output: `[TOOL_CALLS] [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_curren}]`, - expectedToolCall: []api.ToolCall{t1}, - expectedTokens: "", + name: "empty", + tag: "", + buffer: []byte{}, + want: false, }, { - name: "mistral multiple tool calls without prefix", - model: "mistral", - output: `[{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}} ]`, - expectedToolCall: []api.ToolCall{t1, t2}, - expectedTokens: "", + name: "empty", + tag: "", + buffer: []byte{}, + want: false, }, { - name: "mistral tool calls with text between no prefix", - model: "mistral", - output: `[{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}] - model outputs more tokens here and then [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, - expectedToolCall: []api.ToolCall{t1, t2}, - expectedTokens: `model outputs more tokens here and then [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, + name: "json open", + tag: "{", + buffer: []byte("{\"name\": \"get_weather\""), + want: false, }, { - name: "mistral valid json with tool calls prefix", - model: "mistral", - output: `[TOOL_CALLS] [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, - expectedToolCall: []api.ToolCall{t1, t2}, - expectedTokens: "", + name: "json closed", + tag: "{", + buffer: []byte("{\"name\": \"get_weather\"}"), 
+ want: true, }, { - name: "mistral multiple tool calls with text between and prefix", - model: "mistral", - output: `[TOOL_CALLS] [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}] - model outputs more tokens here and then [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, - expectedToolCall: []api.ToolCall{t1, t2, t1, t2}, - expectedTokens: "", + name: "json empty", + tag: "{", + buffer: []byte("{}"), + want: true, }, { - name: "mistral incomplete json with tool calls prefix", - model: "mistral", - output: `[TOOL_CALLS] [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, `, - expectedToolCall: []api.ToolCall{}, - expectedTokens: "", + name: "list open", + tag: "[", + buffer: []byte("[{\"name\": \"get_weather\""), + want: false, }, { - name: "mistral invalid tool call with explanatory text no prefix", - model: "mistral", - output: `I'm not aware of that information. However, I can suggest searching for the weather using the "get_current_weather" function: - - [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, - expectedToolCall: []api.ToolCall{}, - expectedTokens: `I'm not aware of that information. However, I can suggest searching for the weather using the "get_current_weather" function: [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, + name: "list closed", + tag: "[", + buffer: []byte("[{\"name\": \"get_weather\"}]"), + want: true, }, { - name: "mistral tool calls without prefix", - model: "mistral", - output: `[{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, - expectedToolCall: []api.ToolCall{t1, t2}, - expectedTokens: "", - }, - { - name: "command r plus tool calls with json block format", - model: "command-r-plus", - output: "Action: ```json" + ` - [ - { - "tool_name": "get_current_weather", - "parameters": { - "format": "fahrenheit", - "location": "San Francisco, CA" - } - }, - { - "tool_name": "get_current_weather", - "parameters": { - "format": "celsius", - "location": "Toronto, Canada" - } - } - ] - ` + "```", - expectedToolCall: []api.ToolCall{t1, t2}, - expectedTokens: "", - }, - { - name: "firefunction tool calls with functools prefix", - model: "firefunction", - output: ` functools[{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, - expectedToolCall: []api.ToolCall{t1, t2}, - expectedTokens: "", - }, - { - name: "llama3 groq single tool call with xml tags", - model: "llama3-groq-tool-use", - output: ` - {"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} - `, - expectedToolCall: []api.ToolCall{t1}, - expectedTokens: "", - }, - { - name: "xlam tool calls with wrapper object", - model: "xlam", - 
output: `{"tool_calls": [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]}`, - expectedToolCall: []api.ToolCall{t1, t2}, - expectedTokens: "", - }, - { - name: "qwen2.5 single tool call with prefix", - model: "qwen2.5", - output: `{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}}`, - expectedToolCall: []api.ToolCall{t1}, - expectedTokens: "", - }, - { - name: "qwen2.5 multiple tool calls with and without prefix", - model: "qwen2.5", - output: `{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}`, - expectedToolCall: []api.ToolCall{t1, t1, t2}, - expectedTokens: "", - }, - { - name: "qwen2.5 plain text response no tool calls", - model: "qwen2.5", - output: "The weather in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.", - expectedToolCall: []api.ToolCall{}, - expectedTokens: "The weather in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.", - }, - { - name: "qwen2.5 tool calls with trailing text", - model: "qwen2.5", - output: `[{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}}, {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}] some tokens after call`, - expectedToolCall: []api.ToolCall{t1, t2}, - expectedTokens: "some tokens after call", - }, - { - name: "qwen2.5 tool calls with initial text", - model: "qwen2.5", - output: `some tokens before call [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}}, {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, - expectedToolCall: []api.ToolCall{}, - expectedTokens: `some tokens before call [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}}, {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, - }, - { - name: "qwen2.5 tool calls with prefix and trailing text", - model: "qwen2.5", - output: ` [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}}, {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}] some tokens after call`, - expectedToolCall: []api.ToolCall{t1, t2}, - expectedTokens: "", - }, - { - name: "qwen2.5 tool calls with prefix and initial text", - model: "qwen2.5", - output: `some tokens before call [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}}, {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}] `, - expectedToolCall: []api.ToolCall{t1, t2}, - expectedTokens: "some tokens before call", - }, - { - name: "qwen2.5 tool calls without and with prefix", - model: "qwen2.5", - output: `{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}`, - expectedToolCall: []api.ToolCall{t1, t2}, - expectedTokens: "", - }, - { - name: "qwen2.5 tool calls without and with prefix and text 
between", - model: "qwen2.5", - output: `{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} some tokens between {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}} some tokens after call`, - expectedToolCall: []api.ToolCall{t1, t2}, - expectedTokens: "some tokens between", - }, - { - name: "qwen2.5 tool calls without prefix and invalid tool call with other tokens", - model: "qwen2.5", - output: `hi [{"options": "foo"}]`, - expectedToolCall: []api.ToolCall{}, - expectedTokens: `hi [{"options": "foo"}]`, - }, - { - name: "qwen2.5 tool calls with prefix and invalid tool call", - model: "qwen2.5", - output: ` [{"options": "foo"}] `, - expectedToolCall: []api.ToolCall{}, - expectedTokens: ``, - }, - { - name: "qwen3 tool call with think prefix and tool prefix (sent as a single token)", - model: "qwen3", - output: `Okay, let me think what tool we should use...{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}}`, - expectedToolCall: []api.ToolCall{t1}, - expectedTokens: "Okay, let me think what tool we should use...", - }, - { - name: "qwen3 tool call with think prefix, tool prefix, and whitespace (sent as separate tokens)", - model: "qwen3", - output: `Okay, let me think what tool we should use... { "name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} `, - expectedToolCall: []api.ToolCall{t1}, - expectedTokens: "Okay, let me think what tool we should use...", - }, - { - name: "qwen3 empty think prefix without tool prefix and invalid tool call", - model: "qwen3", - output: ` {"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} `, - expectedToolCall: []api.ToolCall{}, - expectedTokens: ` {"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} `, - }, - { - name: "qwen3 empty think prefix with tool prefix and valid tool call", - model: "qwen3", - output: `{ "name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} `, - expectedToolCall: []api.ToolCall{t1}, - expectedTokens: ``, - }, - { - name: "qwen3 invalid tool call with fake tool prefix (single rune suffix match)", - model: "qwen3", - output: `< fakeout {"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} `, - expectedToolCall: []api.ToolCall{}, - expectedTokens: `< fakeout {"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} `, - }, - { - name: "qwen3 invalid tool call with partial tool prefix (multiple rune suffix match)", - model: "qwen3", - output: ``, - expectedToolCall: []api.ToolCall{}, - expectedTokens: ``, - }, - { - name: "qwen3 invalid tool call with malformed tool prefix", - model: "qwen3", - output: ``, - expectedToolCall: []api.ToolCall{}, - expectedTokens: ``, - }, - { - name: "model with prefix in template, no prefix in output", - model: "qwen2.5", - output: `[{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, - expectedToolCall: []api.ToolCall{t1, t2}, - expectedTokens: "", - }, - { - name: "model with prefix in template, prefix in output", - model: "qwen2.5", - output: `[{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, 
CA"}} {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, - expectedToolCall: []api.ToolCall{t1, t2}, - expectedTokens: "", - }, - { - name: "model without prefix in template, no prefix in output", - model: "llama3.2", - output: `[{"name": "get_current_weather", "parameters": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "parameters": {"format":"celsius","location":"Toronto, Canada"}}]`, - expectedToolCall: []api.ToolCall{t1, t2}, - expectedTokens: "", - }, - { - name: "model without prefix in template, no prefix in output, single tool call", - model: "llama3.2", - output: `{"name": "get_current_weather", "parameters": {"format":"fahrenheit","location":"San Francisco, CA"}}`, - expectedToolCall: []api.ToolCall{t1}, - expectedTokens: "", - }, - { - name: "model without prefix in template, prefix in output, multiple tool calls in list", - model: "llama3.2", - output: ` [{"name": "get_current_weather", "parameters": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "parameters": {"format":"celsius","location":"Toronto, Canada"}}]`, - expectedToolCall: []api.ToolCall{t1, t2}, - expectedTokens: ``, - }, - { - name: "model without prefix in template, prefix in output, individual tool calls", - model: "llama3.2", - output: ` {"name": "get_current_weather", "parameters": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "parameters": {"format":"celsius","location":"Toronto, Canada"}}`, - expectedToolCall: []api.ToolCall{t1, t2}, - expectedTokens: ``, - }, - { - name: "model with prefix in template, no prefix in output, tokens before", - model: "qwen2.5", - output: `some tokens before [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, - expectedToolCall: []api.ToolCall{}, - expectedTokens: `some tokens before [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, - }, - { - name: "model with prefix in template, prefix in output, tokens after", - model: "qwen2.5", - output: `[{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}] some tokens after`, - expectedToolCall: []api.ToolCall{t1, t2}, - expectedTokens: "", - }, - { - name: "model without prefix in template, no prefix in output, tokens after", - model: "llama3.2", - output: `[{"name": "get_current_weather", "parameters": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "parameters": {"format":"celsius","location":"Toronto, Canada"}}] some tokens after`, - expectedToolCall: []api.ToolCall{t1, t2}, - expectedTokens: "", - }, - { - name: "model without prefix in template, no prefix in output, tokens before", - model: "llama3.2", - output: `some tokens before [{"name": "get_current_weather", "parameters": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "parameters": {"format":"celsius","location":"Toronto, Canada"}}]`, - expectedToolCall: []api.ToolCall{t1, t2}, - expectedTokens: `some tokens before`, - }, - { - name: "model without prefix in template, prefix in output, tokens 
after", - model: "llama3.2", - output: ` - [{"name": "get_current_weather", "parameters": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "parameters": {"format":"celsius","location":"Toronto, Canada"}}] some tokens after`, - expectedToolCall: []api.ToolCall{t1, t2}, - expectedTokens: ``, - }, - { - name: "model without without prefix, match all jsons", - model: "llama3.2", - output: `model outputs some text [{"name": "get_current_weather", "parameters": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "parameters": {"format":"celsius","location":"Toronto, Canada"}}] some tokens after`, - expectedToolCall: []api.ToolCall{t1, t2}, - expectedTokens: "model outputs some text", - }, - { - name: "model flushes tokens if tool call doesn't match", - model: "llama3.2", - output: `{ "user": {"id": 12345, "name": "Alice", "preferences": {"theme": "dark", "notifications": true}, "stats": {"points": 987, "level": 42}}}`, - expectedToolCall: []api.ToolCall{}, - expectedTokens: `{ "user": {"id": 12345, "name": "Alice", "preferences": {"theme": "dark", "notifications": true}, "stats": {"points": 987, "level": 42}}}`, - }, - { - name: "model flushes tokens if tool call doesn't match array", - model: "llama3.2", - output: `[ { "user": {"id": 12345, "name": "Alice", "preferences": {"theme": "dark", "notifications": true}, "stats": {"points": 987, "level": 42}}}]`, - expectedToolCall: []api.ToolCall{}, - expectedTokens: `[ { "user": {"id": 12345, "name": "Alice", "preferences": {"theme": "dark", "notifications": true}, "stats": {"points": 987, "level": 42}}}]`, + name: "list empty", + tag: "[", + buffer: []byte("[]"), + want: true, }, } - var tools []api.Tool - if err := json.Unmarshal(readFile(t, p, "tools.json").Bytes(), &tools); err != nil { - t.Fatal(err) + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + parser := &Parser{ + tag: tt.tag, + buffer: tt.buffer, + } + got := parser.done() + if got != tt.want { + t.Errorf("done() = %t, want %t", got, tt.want) + } + }) + } +} + +func TestContent(t *testing.T) { + tests := []struct { + name string + tag string + content []byte + want string + n int + }{ + { + name: "empty", + content: []byte{}, + tag: "{", + want: "", + n: 0, + }, + { + name: "tag", + tag: "", + content: []byte("{\"name\": \"get_temperature\""), + want: "", + n: 0, + }, + { + name: "json object", + tag: "{", + content: []byte("{\"name\": \"get_temperature\"}"), + want: "{\"name\": \"get_temperature\"}", + n: 0, + }, + { + name: "json object after called", + tag: "{", + content: []byte("{\"hello\": \"world\"}"), + want: "{\"hello\": \"world\"}", + n: 0, + }, + { + name: "json object after called", + tag: "{", + content: []byte("{\"hello\": \"world\"}"), + want: "", + n: 1, + }, + { + name: "list", + tag: "[", + content: []byte("[{\"name\": \"get_temperature\"}]"), + want: "[{\"name\": \"get_temperature\"}]", + n: 0, + }, + { + name: "code", + tag: "{", + content: []byte("{ fmt.Println(\"hello\")"), + want: "{ fmt.Println(\"hello\")", + n: 0, + }, } - var messages []api.Message - if err := json.Unmarshal(readFile(t, p, "messages.json").Bytes(), &messages); err != nil { - t.Fatal(err) + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + parser := &Parser{ + tag: tt.tag, + buffer: tt.content, + n: tt.n, + } + got := parser.Content() + if got != tt.want { + t.Errorf("Content() = %q, want %q", got, tt.want) + } + }) + } +} + +func TestFindTag(t *testing.T) { + cases := []struct { + name 
string + buffer []byte + tag string + i int + found bool + }{ + { + name: "no overlap", + buffer: []byte("hello world"), + tag: "", + i: -1, + found: false, + }, + { + name: "full overlap", + buffer: []byte(""), + tag: "", + i: 0, + found: true, + }, + { + name: "whitespace", + buffer: []byte(" \n {\"name\": \"bob\"}"), + tag: "", + i: 4, + found: true, + }, + { + name: "over", + buffer: []byte("{\"name\""), + tag: "", + i: 0, + found: true, + }, + { + name: "partial overlap", + buffer: []byte("text "), + tag: "", + i: 5, + found: true, + }, + { + name: "overlap with extra", + buffer: []byte(""), + tag: "", + i: 0, + found: true, + }, + { + name: "delimiter longer than string", + buffer: []byte(""), + tag: "", + i: -1, + found: false, + }, + { + name: "empty string", + buffer: []byte{}, + tag: "", + i: -1, + found: false, + }, + { + name: "single char overlap", + buffer: []byte("test<"), + tag: "", + i: 4, + found: false, + }, + { + name: "partial tool call", + buffer: []byte("hello ", + i: 6, + found: false, + }, + { + name: "square bracket", + buffer: []byte("calling tools: ["), + tag: "[", + i: 15, + found: true, + }, + { + name: "bracket", + buffer: []byte("{\"name\": \"bob\""), + tag: "{", + i: 0, + found: true, + }, + { + name: "bracket with whitespace", + buffer: []byte("\n\n{\n\"name\": \"bob\""), + tag: "{", + i: 2, + found: true, + }, } for _, tt := range cases { t.Run(tt.name, func(t *testing.T) { - tmpl, err := template.Parse(readFile(t, p, fmt.Sprintf("%s.gotmpl", tt.model)).String()) - if err != nil { - t.Fatal(err) + parser := &Parser{ + tag: tt.tag, + buffer: tt.buffer, + n: 0, + } + i, found := parser.findTag() + if i != tt.i { + t.Errorf("findTag(%q, %q) = %d; want %d", tt.buffer, tt.tag, i, tt.i) + } + if found != tt.found { + t.Errorf("findTag(%q, %q) = %t; want %t", tt.buffer, tt.tag, found, tt.found) + } + }) + } +} + +func TestFindArguments(t *testing.T) { + tests := []struct { + name string + buffer []byte + want map[string]any + }{ + { + name: "empty string", + buffer: []byte{}, + want: nil, + }, + { + name: "whitespace only", + buffer: []byte(" \n\t "), + want: nil, + }, + { + name: "unbalanced braces - missing closing", + buffer: []byte(`{"format": "fahrenheit", "location": "San Francisco"`), + want: nil, + }, + { + name: "unbalanced braces - extra closing", + buffer: []byte(`{"format": "fahrenheit"}}`), + want: map[string]any{ + "format": "fahrenheit", + }, + }, + { + name: "invalid JSON", + buffer: []byte(`{format: fahrenheit, location: "San Francisco"}`), + want: nil, + }, + { + name: "valid json", + buffer: []byte(`{"name": "get_temperature", "arguments": {"format": "fahrenheit", "location": "San Francisco, CA"}}`), + want: map[string]any{ + "format": "fahrenheit", + "location": "San Francisco, CA", + }, + }, + { + name: "valid arguments with special tokens", + buffer: []byte(`[tool]get_temperature[args]{"format": "fahrenheit", "location": "San Francisco, CA"}[end]`), + want: map[string]any{ + "format": "fahrenheit", + "location": "San Francisco, CA", + }, + }, + { + name: "valid arguments in array", + buffer: []byte(`[{"arguments": {"format": "fahrenheit", "location": "San Francisco, CA"}}`), + want: map[string]any{ + "format": "fahrenheit", + "location": "San Francisco, CA", + }, + }, + { + name: "nested deep", + buffer: []byte(`{"function": {"name": "get_temperature", "arguments": {"format": "fahrenheit", "location": "San Francisco, CA"}}}`), + want: map[string]any{ + "format": "fahrenheit", + "location": "San Francisco, CA", + }, + }, + { + name: "one 
arg", + buffer: []byte(`get_weather({"location": "San Francisco, CA"})`), + want: map[string]any{ + "location": "San Francisco, CA", + }, + }, + { + name: "two args", + buffer: []byte(`[{"name": "get_weather", "arguments": {"location": "San Francisco, CA", "format": "fahrenheit"}}, {"name": "get_weather", "arguments": {"location": "San Francisco, CA", "format": "fahrenheit"}}]`), + want: map[string]any{ + "location": "San Francisco, CA", + "format": "fahrenheit", + }, + }, + { + name: "deepseek", + buffer: []byte("<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>get_current_weather\n```json\n{\"location\": \"Tokyo\"}\n```<|tool▁call▁end|><|tool▁calls▁end|><|end▁of▁sentence|>"), + want: map[string]any{ + "location": "Tokyo", + }, + }, + } + + for _, tt := range tests { + parser := &Parser{ + buffer: tt.buffer, + properties: []string{"format", "location"}, + } + + t.Run(tt.name, func(t *testing.T) { + got, _ := parser.findArguments() + + if diff := cmp.Diff(got, tt.want); diff != "" { + t.Errorf("scanArguments() args mismatch (-got +want):\n%s", diff) } - - t.Run("template", func(t *testing.T) { - actual := &bytes.Buffer{} // Create new buffer for each test - if err := tmpl.Execute(actual, template.Values{Tools: tools, Messages: messages}); err != nil { - t.Fatal(err) - } - - if diff := cmp.Diff(actual.String(), readFile(t, p, fmt.Sprintf("%s.out", tt.model)).String()); diff != "" { - t.Errorf("mismatch (-got +want):\n%s", diff) - } - }) - - t.Run("parse", func(t *testing.T) { - tp, err := NewParser(tmpl.Template) - if err != nil { - t.Fatal(err) - } - got := []api.ToolCall{} - var gotTokens strings.Builder - - tokens := strings.Fields(tt.output) - for _, tok := range tokens { - s := " " + tok - - toolCalls, content := tp.Add(s) - if len(content) > 0 { - gotTokens.WriteString(content) - } else if len(toolCalls) > 0 { - got = append(got, toolCalls...) - } - } - - // Compare tool calls if we expect any - if diff := cmp.Diff(got, tt.expectedToolCall); diff != "" { - t.Errorf("tool calls mismatch (-got +want):\n%s", diff) - } - - // Compare tokens if we expect any - stripped := strings.TrimSpace(gotTokens.String()) - if diff := cmp.Diff(stripped, tt.expectedTokens); diff != "" { - t.Log("actualTokens", stripped, "expectedTokens", tt.expectedTokens) - t.Errorf("tokens mismatch (-got +want):\n%s", diff) - } - }) }) } } diff --git a/tools/tools_utils.go b/tools/tools_utils.go deleted file mode 100644 index b6f80729..00000000 --- a/tools/tools_utils.go +++ /dev/null @@ -1,222 +0,0 @@ -package tools - -import ( - "bytes" - "encoding/json" - "errors" - "log/slog" - "slices" - "strings" - gotmpl "text/template" - "text/template/parse" - - "github.com/ollama/ollama/api" - "github.com/ollama/ollama/template" -) - -// extractToolCallsFormat traverses a template AST to find text that follows a ".ToolCalls" condition. -// It walks the template nodes looking for if-statements containing ".ToolCalls" and extracts any -// immediate text nodes that follow. This is used to identify tool call prefixes and formatting. 
-// -// Returns: -// - string: The extracted text following the first ".ToolCalls" condition found -// - bool: Whether a ".ToolCalls" condition was found in the template -func extractToolCallsFormat(tmpl *gotmpl.Template) (string, bool) { - if tmpl == nil || tmpl.Tree == nil { - slog.Debug("template or tree is nil") - return "", false - } - - var result string - var found bool - - var walk func(nodes []parse.Node) - walk = func(nodes []parse.Node) { - for _, node := range nodes { - if found { - return - } - - switch n := node.(type) { - case *parse.IfNode: - if isToolCallsNode(n) { - // Collect immediate TextNode(s) at start of IfNode's list - var sb strings.Builder - for _, innerNode := range n.List.Nodes { - if tn, ok := innerNode.(*parse.TextNode); ok { - sb.Write(tn.Text) - } else { - // Stop at first non-text node - break - } - } - result = sb.String() - found = true - return - } - // Recurse into child nodes - walk(n.List.Nodes) - if n.ElseList != nil { - walk(n.ElseList.Nodes) - } - case *parse.ListNode: - walk(n.Nodes) - case *parse.RangeNode: - walk(n.List.Nodes) - if n.ElseList != nil { - walk(n.ElseList.Nodes) - } - case *parse.WithNode: - walk(n.List.Nodes) - if n.ElseList != nil { - walk(n.ElseList.Nodes) - } - default: - // Continue to next node - continue - } - } - } - - walk(tmpl.Tree.Root.Nodes) - return result, found -} - -// isToolCallsNode detects if a node's condition includes ".ToolCalls" -func isToolCallsNode(n *parse.IfNode) bool { - for _, cmd := range n.Pipe.Cmds { - for _, arg := range cmd.Args { - if field, ok := arg.(*parse.FieldNode); ok { - if slices.Contains(field.Ident, "ToolCalls") { - return true - } - } - } - } - return false -} - -func toolPrefix(tmpl *gotmpl.Template) string { - tokenText, ok := extractToolCallsFormat(tmpl) - if !ok { - return "" - } - tokenText = strings.TrimSpace(tokenText) - tokenText = strings.ReplaceAll(tokenText, "\r", "") - tokenText = strings.ReplaceAll(tokenText, "\n", " ") - - return tokenText -} - -// toolTemplate creates a subtree from the node that ranges over .ToolCalls -// -// Returns: -// - *gotmpl.Template: The subtree containing the .ToolCalls range -// - error: Error if parsing failed -func toolTemplate(t *template.Template) (*gotmpl.Template, error) { - tmpl := t.Subtree(func(n parse.Node) bool { - if t, ok := n.(*parse.RangeNode); ok { - return slices.Contains(template.Identifiers(t.Pipe), "ToolCalls") - } - - return false - }) - - if tmpl == nil { - return nil, errors.New("failed to find tool template") - } - - return tmpl, nil -} - -// suffixOverlap returns the index in s where the longest suffix overlap with prefix begins -// -// Returns: -// - int: The starting index in s where the suffix overlap begins -func suffixOverlap(s, prefix string) int { - max := min(len(prefix), len(s)) - for i := max; i > 0; i-- { - if strings.HasSuffix(s, prefix[:i]) { - return len(s) - i - } - } - return -1 -} - -// extractToolArgs executes a template with a known tool call format to extract the name and arguments -// -// Returns: -// - string: The name of the tool call -// - string: The arguments of the tool call -// - error: Error if parsing failed -func extractToolArgs(tmpl *gotmpl.Template) (name, arguments string, err error) { - var b bytes.Buffer - if err := tmpl.Execute(&b, map[string][]api.ToolCall{ - "ToolCalls": { - { - Function: api.ToolCallFunction{ - Name: "@@name@@", - Arguments: api.ToolCallFunctionArguments{ - "@@argument@@": 1, - }, - }, - }, - }, - }); err != nil { - return "", "", err - } - - // Extract JSON 
object between curly braces - // JSON arrays are also valid as they will not be repeated in the template - output := b.String() - start := strings.Index(output, "{") - end := strings.LastIndex(output, "}") - if start == -1 || end == -1 || start > end { - return "", "", errors.New("no valid JSON object found in template output") - } - jsonStr := output[start : end+1] - - var obj map[string]any - if err := json.Unmarshal([]byte(jsonStr), &obj); err != nil { - return "", "", err - } - - // Find name and arguments fields - for k, v := range obj { - if str, ok := v.(string); ok && str == "@@name@@" { - name = k - } else if _, ok := v.(map[string]any); ok { - arguments = k - } - } - - if name == "" || arguments == "" { - slog.Debug("missing required fields in tool call template", "name", name, "arguments", arguments) - return "", "", errors.New("missing required fields in tool call template") - } - - return name, arguments, nil -} - -// collect recursively traverses an object to collect all nested maps -// -// Returns: -// - []map[string]any: A slice of all nested maps found in the object -func collect(obj any) []map[string]any { - var all []map[string]any - switch o := obj.(type) { - case map[string]any: - all = append(all, o) - for _, v := range o { - all = append(all, collect(v)...) - } - case []any: - for _, v := range o { - all = append(all, collect(v)...) - } - default: - return nil - } - - return all -} diff --git a/tools/tools_utils_test.go b/tools/tools_utils_test.go deleted file mode 100644 index e346117a..00000000 --- a/tools/tools_utils_test.go +++ /dev/null @@ -1,497 +0,0 @@ -package tools - -import ( - "testing" - gotmpl "text/template" - - "github.com/ollama/ollama/template" -) - -func TestExtractToolCallsFormat(t *testing.T) { - cases := []struct { - name string - template string - want string - found bool - }{ - { - name: "nil template", - template: "", - want: "", - found: false, - }, - { - name: "basic tool call with text", - template: "{{if .ToolCalls}}Hello world{{end}}", - want: "Hello world", - found: true, - }, - { - name: "tool call with json format", - template: "{{if .ToolCalls}}```json\n{{end}}", - want: "```json\n", - found: true, - }, - { - name: "tool call in range", - template: "{{range .ToolCalls}}tool: {{.}}{{end}}", - want: "", - found: false, - }, - { - name: "tool call with multiple text nodes", - template: "{{if .ToolCalls}}First text{{if .Something}}inner{{end}}Second text{{end}}", - want: "First text", - found: true, - }, - { - name: "nested if without tool calls", - template: "{{if .Something}}{{if .OtherThing}}text{{end}}{{end}}", - want: "", - found: false, - }, - } - - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - tmpl, err := gotmpl.New("test").Parse(tc.template) - if err != nil && tc.template != "" { - t.Fatalf("failed to parse template: %v", err) - } - - got, found := extractToolCallsFormat(tmpl) - if got != tc.want { - t.Errorf("got text %q, want %q", got, tc.want) - } - if found != tc.found { - t.Errorf("got found %v, want %v", found, tc.found) - } - }) - } -} - -func TestToolPrefix(t *testing.T) { - cases := []struct { - name string - template string - want string - }{ - { - name: "basic tool call with action prefix", - template: "{{if .ToolCalls}}Action: ```json{{end}}", - want: "Action: ```json", - }, - { - name: "incomplete functools bracket", - template: "{{if .ToolCalls}}functools[{{end}}", - want: "functools[", - }, - { - name: "tool call with angle brackets", - template: "{{if .ToolCalls}}Hello, world! 
{{end}}", - want: "Hello, world! ", - }, - { - name: "multiple tool call formats", - template: "{{if .ToolCalls}}[tool_call] {{end}}", - want: "[tool_call] ", - }, - { - name: "single angle bracket tool call", - template: "{{if .ToolCalls}}{{end}}", - want: "", - }, - { - name: "incomplete angle bracket after tool call", - template: "{{if .ToolCalls}}[tool_call] <{{end}}", - want: "[tool_call] <", - }, - { - name: "angle bracket prefix with tool call", - template: "{{if .ToolCalls}}> {{end}}", - want: "> ", - }, - { - name: "uppercase tool call with incomplete bracket", - template: "{{if .ToolCalls}}[TOOL_CALL] [{{end}}", - want: "[TOOL_CALL] [", - }, - { - name: "uppercase tool call with adjacent bracket", - template: "{{if .ToolCalls}}[TOOL_CALL][{{end}}", - want: "[TOOL_CALL][", - }, - { - name: "tool call with pipe delimiters", - template: "{{if .ToolCalls}}<|tool_call|>{{end}}", - want: "<|tool_call|>", - }, - { - name: "tool with no prefix", - template: "{{if .ToolCalls}}{{end}}", - want: "", - }, - } - - for _, tt := range cases { - t.Run(tt.name, func(t *testing.T) { - tmpl, err := gotmpl.New("test").Parse(tt.template) - if err != nil { - t.Fatalf("failed to parse template: %v", err) - } - got := toolPrefix(tmpl) - if got != tt.want { - t.Errorf("ToolToken(%q) = %q; want %q", tt.template, got, tt.want) - } - }) - } -} - -func TestToolTemplate(t *testing.T) { - cases := []struct { - name string - template string - want bool - }{ - { - name: "basic tool call range", - template: "{{range .ToolCalls}}test{{end}}", - want: true, - }, - { - name: "no tool calls", - template: "{{range .Other}}test{{end}}", - want: false, - }, - { - name: "nested tool calls", - template: "{{range .Outer}}{{range .ToolCalls}}test{{end}}{{end}}", - want: true, - }, - { - name: "empty template", - template: "", - want: false, - }, - { - name: "tool calls in if statement", - template: "{{if .ToolCalls}}test{{end}}", - want: false, - }, - } - - for _, tt := range cases { - t.Run(tt.name, func(t *testing.T) { - tmpl, err := gotmpl.New("test").Parse(tt.template) - if err != nil { - t.Fatalf("failed to parse template: %v", err) - } - - parsed, err := template.Parse(tmpl.Root.String()) - if err != nil { - t.Fatalf("failed to parse template: %v", err) - } - - _, err = toolTemplate(parsed) - if err != nil && tt.want { - t.Errorf("toolTemplate() = %v; want %v", err, tt.want) - } - }) - } -} - -func TestSuffixOverlap(t *testing.T) { - cases := []struct { - name string - s string - d string - want int - }{ - { - name: "no overlap", - s: "hello world", - d: "", - want: -1, - }, - { - name: "full overlap", - s: "", - d: "", - want: 0, - }, - { - name: "partial overlap", - s: "text ", - d: "", - want: 5, - }, - { - name: "delimiter longer than string", - s: "", - d: "", - want: -1, - }, - { - name: "empty string", - s: "", - d: "", - want: -1, - }, - { - name: "empty delimiter", - s: "", - d: "", - want: -1, - }, - { - name: "single char overlap", - s: "test<", - d: "", - want: 4, - }, - { - name: "partial tool call", - s: "hello ", - want: 6, - }, - } - - for _, tt := range cases { - t.Run(tt.name, func(t *testing.T) { - got := suffixOverlap(tt.s, tt.d) - if got != tt.want { - t.Errorf("suffixOverlap(%q, %q) = %d; want %d", tt.s, tt.d, got, tt.want) - } - }) - } -} - -func TestExtractToolArgs(t *testing.T) { - cases := []struct { - name string - template string - wantName string - wantArgs string - wantErr bool - }{ - { - name: "basic tool call", - template: `{{ range .ToolCalls }} -{"name": "{{ .Function.Name }}", 
"parameters": {{ .Function.Arguments }}}{{ end }}`, - wantName: "name", - wantArgs: "parameters", - wantErr: false, - }, - { - name: "tool call with whitespace", - template: `{{range .ToolCalls}} - {"name": "{{.Function.Name}}", "parameters": {{.Function.Arguments}}} -{{end}}`, - wantName: "name", - wantArgs: "parameters", - wantErr: false, - }, - { - name: "tool call with extra content", - template: `Before {{range .ToolCalls}} -{"name": "{{.Function.Name}}", "arguments": {{.Function.Arguments}}}{{end}} After`, - wantName: "name", - wantArgs: "arguments", - wantErr: false, - }, - { - name: "no tool calls", - template: `{{if .Something}}no tools here{{end}}`, - wantName: "", - wantArgs: "", - wantErr: true, - }, - { - name: "empty template", - template: ``, - wantName: "", - wantArgs: "", - wantErr: true, - }, - { - name: "prefix within tool call", - template: `{{- if .ToolCalls }} -{{ range .ToolCalls }} - -{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}} -{{ end }}{{- end }}`, - wantName: "name", - wantArgs: "arguments", - wantErr: false, - }, - { - name: "JSON array", - template: `{{ range .ToolCalls }} -[{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}]{{ end }}`, - wantName: "name", - wantArgs: "arguments", - wantErr: false, - }, - { - name: "invalid JSON", - template: `{{ range .ToolCalls }} -{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}, invalid}{{ end }}`, - wantName: "", - wantArgs: "", - wantErr: true, - }, - { - name: "missing name field", - template: `{{ range .ToolCalls }} -{"parameters": {{ .Function.Arguments }}}{{ end }}`, - wantName: "", - wantArgs: "", - wantErr: true, - }, - { - name: "missing arguments field", - template: `{{ range .ToolCalls }} -{"name": "{{ .Function.Name }}"}{{ end }}`, - wantName: "", - wantArgs: "", - wantErr: true, - }, - { - name: "malformed JSON", - template: `{{ range .ToolCalls }} -{"name": {{ .Function.Name }}, "arguments": {{ .Function.Arguments }}{{ end }}`, - wantName: "", - wantArgs: "", - wantErr: true, - }, - } - - for _, tt := range cases { - t.Run(tt.name, func(t *testing.T) { - tmpl, err := gotmpl.New("test").Parse(tt.template) - if err != nil { - t.Fatalf("failed to parse template: %v", err) - } - - gotName, gotArgs, err := extractToolArgs(tmpl) - if (err != nil) != tt.wantErr { - t.Errorf("extractToolArgs() error = %v, wantErr %v", err, tt.wantErr) - return - } - if err != nil { - return - } - - if gotName != tt.wantName { - t.Errorf("extractToolArgs() gotName = %q, want %q", gotName, tt.wantName) - } - if gotArgs != tt.wantArgs { - t.Errorf("extractToolArgs() gotArgs = %q, want %q", gotArgs, tt.wantArgs) - } - }) - } -} - -func TestCollect(t *testing.T) { - cases := []struct { - name string - obj any - want []map[string]any - }{ - { - name: "simple map", - obj: map[string]any{ - "key": "value", - }, - want: []map[string]any{ - {"key": "value"}, - }, - }, - { - name: "nested map", - obj: map[string]any{ - "outer": map[string]any{ - "inner": "value", - }, - }, - want: []map[string]any{ - {"outer": map[string]any{"inner": "value"}}, - {"inner": "value"}, - }, - }, - { - name: "array of maps", - obj: []any{ - map[string]any{"key1": "val1"}, - map[string]any{"key2": "val2"}, - }, - want: []map[string]any{ - {"key1": "val1"}, - {"key2": "val2"}, - }, - }, - { - name: "deeply nested", - obj: map[string]any{ - "l1": map[string]any{ - "l2": map[string]any{ - "l3": "value", - }, - }, - }, - want: []map[string]any{ - {"l1": map[string]any{"l2": map[string]any{"l3": 
"value"}}}, - {"l2": map[string]any{"l3": "value"}}, - {"l3": "value"}, - }, - }, - { - name: "non-map value", - obj: "string", - want: nil, - }, - } - - for _, tt := range cases { - t.Run(tt.name, func(t *testing.T) { - got := collect(tt.obj) - if len(got) != len(tt.want) { - t.Errorf("collect() got %d maps, want %d", len(got), len(tt.want)) - return - } - - // Compare each map in the result - for i := range tt.want { - if !mapsEqual(got[i], tt.want[i]) { - t.Errorf("collect() map[%d] = %v, want %v", i, got[i], tt.want[i]) - } - } - }) - } -} - -// mapsEqual compares two maps for deep equality -func mapsEqual(m1, m2 map[string]any) bool { - if len(m1) != len(m2) { - return false - } - for k, v1 := range m1 { - v2, ok := m2[k] - if !ok { - return false - } - switch val1 := v1.(type) { - case map[string]any: - val2, ok := v2.(map[string]any) - if !ok || !mapsEqual(val1, val2) { - return false - } - default: - if v1 != v2 { - return false - } - } - } - return true -} From 5a8eb0e1510a5a35b80649f2b88e9231716b6850 Mon Sep 17 00:00:00 2001 From: Phil Date: Sat, 14 Jun 2025 17:54:03 +0200 Subject: [PATCH 092/108] readme: add GPTranslate to community integrations (#11071) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index ffaec628..a216589a 100644 --- a/README.md +++ b/README.md @@ -407,6 +407,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Lumina](https://github.com/cushydigit/lumina.git) (A lightweight, minimal React.js frontend for interacting with Ollama servers) - [Tiny Notepad](https://pypi.org/project/tiny-notepad) (A lightweight, notepad-like interface to chat with ollama available on PyPI) - [macLlama (macOS native)](https://github.com/hellotunamayo/macLlama) (A native macOS GUI application for interacting with Ollama models, featuring a chat interface.) +- [GPTranslate](https://github.com/philberndt/GPTranslate) (A fast and lightweight, AI powered desktop translation application written with Rust and Tauri. Features real-time translation with OpenAI/Azure/Ollama.) ### Cloud From 502028968ddca04bd19c0859a73fb4e0cbeac3e1 Mon Sep 17 00:00:00 2001 From: NGC13009 Date: Mon, 16 Jun 2025 12:27:49 +0800 Subject: [PATCH 093/108] readme: add ollama-launcher to community integrations (#11080) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index a216589a..e148f9af 100644 --- a/README.md +++ b/README.md @@ -408,6 +408,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Tiny Notepad](https://pypi.org/project/tiny-notepad) (A lightweight, notepad-like interface to chat with ollama available on PyPI) - [macLlama (macOS native)](https://github.com/hellotunamayo/macLlama) (A native macOS GUI application for interacting with Ollama models, featuring a chat interface.) - [GPTranslate](https://github.com/philberndt/GPTranslate) (A fast and lightweight, AI powered desktop translation application written with Rust and Tauri. Features real-time translation with OpenAI/Azure/Ollama.) +- [ollama launcher](https://github.com/NGC13009/ollama-launcher) (A launcher for Ollama, aiming to provide users with convenient functions such as ollama server launching, management, or configuration.) 
### Cloud From a6fbfc880c3de9b57e341db374907e2fedda9fa6 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Mon, 16 Jun 2025 10:42:32 -0700 Subject: [PATCH 094/108] gguf: fix write order (#11068) * ggml: test write gguf order * ggml: fix write tensor order --- fs/ggml/gguf.go | 12 ++--- fs/ggml/gguf_test.go | 110 +++++++++++++++++++++++++------------------ 2 files changed, 68 insertions(+), 54 deletions(-) diff --git a/fs/ggml/gguf.go b/fs/ggml/gguf.go index 8e75625e..33b596cc 100644 --- a/fs/ggml/gguf.go +++ b/fs/ggml/gguf.go @@ -527,23 +527,17 @@ func WriteGGUF(f *os.File, kv KV, ts []*Tensor) error { return err } - keys := slices.Collect(maps.Keys(kv)) - slices.Sort(keys) - - for _, key := range keys { + for _, key := range slices.Sorted(maps.Keys(kv)) { if err := ggufWriteKV(f, key, kv[key]); err != nil { return err } } slices.SortStableFunc(ts, func(a, b *Tensor) int { - if i, j := a.block(), b.block(); i < 0 && j > 0 { - return 1 - } else if i > 0 && j < 0 { - return -1 - } else { + if i, j := a.block(), b.block(); i > 0 && j > 0 { return cmp.Compare(i, j) } + return cmp.Compare(a.Name, b.Name) }) var s uint64 diff --git a/fs/ggml/gguf_test.go b/fs/ggml/gguf_test.go index 0e071800..bf767918 100644 --- a/fs/ggml/gguf_test.go +++ b/fs/ggml/gguf_test.go @@ -2,62 +2,82 @@ package ggml import ( "bytes" + "math/rand/v2" "os" - "slices" + "strings" "testing" "github.com/google/go-cmp/cmp" ) func TestWriteGGUF(t *testing.T) { - w, err := os.CreateTemp(t.TempDir(), "*.bin") - if err != nil { - t.Fatal(err) - } - defer w.Close() + r := rand.New(rand.NewPCG(0, 0)) + for range 8 { + t.Run("shuffle", func(t *testing.T) { + t.Parallel() - if err := WriteGGUF(w, KV{ - "general.alignment": uint32(16), - }, []*Tensor{ - {Name: "test.0", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))}, - {Name: "test.1", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))}, - {Name: "test.2", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))}, - {Name: "test.3", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))}, - {Name: "test.4", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))}, - {Name: "test.5", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))}, - }); err != nil { - t.Fatal(err) - } + ts := []*Tensor{ + {Name: "token_embd.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))}, + {Name: "blk.0.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))}, + {Name: "blk.1.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))}, + {Name: "blk.2.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))}, + {Name: "blk.3.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))}, + {Name: "blk.4.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))}, + {Name: "blk.5.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))}, + {Name: "output_norm.weight", Shape: []uint64{3, 2}, WriterTo: bytes.NewBuffer(make([]byte, 3*2))}, + {Name: "output.weight", Shape: []uint64{3, 2}, WriterTo: bytes.NewBuffer(make([]byte, 3*2))}, + } - r, err := os.Open(w.Name()) - if err != nil { - t.Fatal(err) - } - defer r.Close() + r.Shuffle(len(ts), func(i, j int) { + ts[i], ts[j] = ts[j], ts[i] + }) - ff, err := Decode(r, 0) - if err != nil { - 
t.Fatal(err) - } + w, err := os.CreateTemp(t.TempDir(), strings.ReplaceAll(t.Name(), "/", "_")+"*.bin") + if err != nil { + t.Fatal(err) + } + defer w.Close() - if diff := cmp.Diff(ff.KV(), KV{ - "general.alignment": uint32(16), - "general.parameter_count": uint64(36), - }); diff != "" { - t.Errorf("Mismatch (-want +got):\n%s", diff) - } + if err := WriteGGUF(w, KV{ + "general.alignment": uint32(16), + }, ts); err != nil { + t.Fatal(err) + } - if diff := cmp.Diff(ff.Tensors(), Tensors{ - Offset: 336, - items: []*Tensor{ - {Name: "test.0", Offset: 0, Shape: []uint64{2, 3}}, - {Name: "test.1", Offset: 32, Shape: []uint64{2, 3}}, - {Name: "test.2", Offset: 64, Shape: []uint64{2, 3}}, - {Name: "test.3", Offset: 96, Shape: []uint64{2, 3}}, - {Name: "test.4", Offset: 128, Shape: []uint64{2, 3}}, - {Name: "test.5", Offset: 160, Shape: []uint64{2, 3}}, - }, - }, cmp.AllowUnexported(Tensors{})); diff != "" { - t.Errorf("Mismatch (-want +got):\n%s", diff) + r, err := os.Open(w.Name()) + if err != nil { + t.Fatal(err) + } + defer r.Close() + + ff, err := Decode(r, 0) + if err != nil { + t.Fatal(err) + } + + if diff := cmp.Diff(KV{ + "general.alignment": uint32(16), + "general.parameter_count": uint64(54), + }, ff.KV()); diff != "" { + t.Errorf("Mismatch (-want +got):\n%s", diff) + } + + if diff := cmp.Diff(Tensors{ + Offset: 608, + items: []*Tensor{ + {Name: "blk.0.attn_norm.weight", Offset: 0, Shape: []uint64{2, 3}}, + {Name: "blk.1.attn_norm.weight", Offset: 32, Shape: []uint64{2, 3}}, + {Name: "blk.2.attn_norm.weight", Offset: 64, Shape: []uint64{2, 3}}, + {Name: "blk.3.attn_norm.weight", Offset: 96, Shape: []uint64{2, 3}}, + {Name: "blk.4.attn_norm.weight", Offset: 128, Shape: []uint64{2, 3}}, + {Name: "blk.5.attn_norm.weight", Offset: 160, Shape: []uint64{2, 3}}, + {Name: "output.weight", Offset: 192, Shape: []uint64{3, 2}}, + {Name: "output_norm.weight", Offset: 224, Shape: []uint64{3, 2}}, + {Name: "token_embd.weight", Offset: 256, Shape: []uint64{2, 3}}, + }, + }, ff.Tensors(), cmp.AllowUnexported(Tensors{})); diff != "" { + t.Errorf("Mismatch (-want +got):\n%s", diff) + } + }) } } From 9e125d884cf995dfae7fcd74690d525e4326a517 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Mon, 16 Jun 2025 16:03:16 -0700 Subject: [PATCH 095/108] model: treat 'user defined' tokens as special tokens (#11077) --- model/vocabulary.go | 2 +- model/vocabulary_test.go | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) create mode 100644 model/vocabulary_test.go diff --git a/model/vocabulary.go b/model/vocabulary.go index 24adbaca..a86de58d 100644 --- a/model/vocabulary.go +++ b/model/vocabulary.go @@ -87,7 +87,7 @@ func (v *Vocabulary) Decode(id int32) string { func (v *Vocabulary) SpecialVocabulary() []string { v.specialOnce.Do(func() { for i := range v.Values { - if v.Types[i] == TOKEN_TYPE_CONTROL { + if v.Types[i] == TOKEN_TYPE_CONTROL || v.Types[i] == TOKEN_TYPE_USER_DEFINED { v.special = append(v.special, v.Values[i]) } } diff --git a/model/vocabulary_test.go b/model/vocabulary_test.go new file mode 100644 index 00000000..46f0ead2 --- /dev/null +++ b/model/vocabulary_test.go @@ -0,0 +1,16 @@ +package model + +import "testing" + +func TestVocabulary_SpecialVocabulary(t *testing.T) { + vocab := &Vocabulary{ + Values: []string{"<|startoftext|>", "<|endoftext|>", "<|tool_call_start|>", "<|tool_call_end|>", "hi"}, + Types: []int32{TOKEN_TYPE_CONTROL, TOKEN_TYPE_CONTROL, TOKEN_TYPE_USER_DEFINED, TOKEN_TYPE_USER_DEFINED, TOKEN_TYPE_NORMAL}, + } + + specialVocab := vocab.SpecialVocabulary() + 
+ if len(specialVocab) != 4 { + t.Errorf("expected 4 special tokens, got %d", len(specialVocab)) + } +} From 6bda1d24798e40fc9ea1419c6ce22c6cdcc9dfe2 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Tue, 17 Jun 2025 10:51:43 -0700 Subject: [PATCH 096/108] tools: fix parsing tool calls without any parameters (#11101) Fixes issue where tool calls that don't expect any parameters were not being parsed. This also fixes two additional issues: one where 2+ tool calls would not be correctly parsed, and cases where tool calls with invalid parameters would still get parsed --- tools/tools.go | 85 ++++++++------- tools/tools_test.go | 258 ++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 297 insertions(+), 46 deletions(-) diff --git a/tools/tools.go b/tools/tools.go index efeaeee0..a86163ba 100644 --- a/tools/tools.go +++ b/tools/tools.go @@ -18,9 +18,8 @@ const ( ) type Parser struct { - tag string - names []string - properties []string + tag string + tools []api.Tool state toolsState buffer []byte @@ -34,15 +33,10 @@ func NewParser(tmpl *template.Template, tools []api.Tool) *Parser { } func NewParserWithTag(tools []api.Tool, tag string) *Parser { - var p Parser - for _, t := range tools { - p.names = append(p.names, t.Function.Name) - for r := range t.Function.Parameters.Properties { - p.properties = append(p.properties, r) - } + return &Parser{ + tag: tag, + tools: tools, } - p.tag = tag - return &p } // Add processes a string input to parse tool calls and content that @@ -121,36 +115,40 @@ func (p *Parser) findTag() (int, bool) { // parseToolCall finds the next complete tool call in the buffer // incrementing n and advancing the buffer. func (p *Parser) parseToolCall() *api.ToolCall { - var name string var args map[string]any + var tool *api.Tool var end int = len(p.buffer) - // find tool name var i int - for _, n := range p.names { + // find tool name + for _, t := range p.tools { + n := t.Function.Name if i = bytes.Index(p.buffer, []byte(n)); i != -1 { if i+len(n) < end { - name = n + tool = &t end = i + len(n) } } } - if name == "" { + if tool == nil { return nil } - if args, i = p.findArguments(); args == nil { - return nil - } + // only look for arguments if the tool has parameters + if len(tool.Function.Parameters.Properties) > 0 { + if args, i = p.findArguments(*tool); args == nil { + return nil + } - if i > end { - end = i + if i > end { + end = i + } } tc := &api.ToolCall{ Function: api.ToolCallFunction{ - Name: name, + Name: tool.Function.Name, Arguments: args, Index: p.n, }, @@ -162,13 +160,17 @@ func (p *Parser) parseToolCall() *api.ToolCall { } // findArguments returns the first object that appears to be -// arguments and the position where the arguments end, returning nil and 0 if -// an invalid JSON object or non-arguments object is found first -func (p *Parser) findArguments() (map[string]any, int) { +// arguments for the provided tool, returning nil +func (p *Parser) findArguments(tool api.Tool) (map[string]any, int) { if len(p.buffer) == 0 { return nil, 0 } + // no arguments to parse + if len(tool.Function.Parameters.Properties) == 0 { + return nil, 0 + } + var braces int var start int = -1 var end int @@ -184,11 +186,13 @@ func (p *Parser) findArguments() (map[string]any, int) { } if c == '}' { - braces-- - if braces == 0 && start != -1 { - end = i + 1 - object = p.buffer[start:end] - break + if start != -1 { + braces-- + if braces == 0 { + end = i + 1 + object = p.buffer[start:end] + break + } } } } @@ -206,24 +210,27 @@ func (p *Parser) findArguments() 
(map[string]any, int) { var find func(obj any) map[string]any find = func(obj any) map[string]any { - switch v := obj.(type) { + switch obj := obj.(type) { case map[string]any: - // check if the object keys are valid tool properties - // TODO (jmorganca): check only sets of properties that - // go together instead of the entire set - for _, prop := range p.properties { - if _, exists := v[prop]; exists { - return v + found := true + for key := range obj { + if _, exists := tool.Function.Parameters.Properties[key]; !exists { + found = false + break } } - for _, value := range v { + if found { + return obj + } + + for _, value := range obj { if result := find(value); result != nil { return result } } case []any: - for _, item := range v { + for _, item := range obj { if result := find(item); result != nil { return result } diff --git a/tools/tools_test.go b/tools/tools_test.go index 67864168..ebf4ad7d 100644 --- a/tools/tools_test.go +++ b/tools/tools_test.go @@ -104,6 +104,13 @@ func TestParser(t *testing.T) { }, }, }, + { + Type: "function", + Function: api.ToolFunction{ + Name: "say_hello", + Description: "Say hello", + }, + }, } tests := []struct { @@ -144,6 +151,20 @@ func TestParser(t *testing.T) { }, }, }, + { + name: "invalid arguments", + inputs: []string{`{"name": "get_conditions", "arguments": {"city": "San Francisco"}}`}, + content: "", + tmpl: qwen, + calls: nil, + }, + { + name: "missing args", + inputs: []string{`{"name": "get_conditions"}`}, + content: "", + tmpl: qwen, + calls: nil, + }, { name: "text before tool call", inputs: []string{`Let me check the weather. {"name": "get_temperature", "arguments": {"city": "New York"}}`}, @@ -161,6 +182,27 @@ func TestParser(t *testing.T) { }, }, }, + { + name: "qwen no args tool call", + inputs: []string{`Let me say hello to the user. I'll use the say_hello tool {"name": "say_hello"}`}, + content: "Let me say hello to the user. I'll use the say_hello tool ", + tmpl: qwen, + calls: []api.ToolCall{ + { + Function: api.ToolCallFunction{ + Index: 0, + Name: "say_hello", + }, + }, + }, + }, + { + name: "qwen no args with text", + inputs: []string{"Let me say hello to the user. I'll use the say_hello tool. "}, + content: "Let me say hello to the user. I'll use the say_hello tool. ", + tmpl: qwen, + calls: nil, + }, { name: "two tool calls in a list", inputs: []string{`[TOOL_CALLS] [{"name": "get_temperature", "arguments": {"city": "London", "format": "fahrenheit"}}, {"name": "get_conditions", "arguments": {"location": "Tokyo"}}][/TOOL_CALLS]`}, @@ -189,7 +231,7 @@ func TestParser(t *testing.T) { }, }, { - name: "two tool calls", + name: "qwen two tool calls", inputs: []string{`Okay, let's call both tools! {"name": "get_temperature", "arguments": {"city": "London", "format": "fahrenheit"}}{"name": "get_conditions", "arguments": {"location": "Tokyo"}}`}, content: "Okay, let's call both tools! ", tmpl: qwen, @@ -215,6 +257,29 @@ func TestParser(t *testing.T) { }, }, }, + { + name: "qwen two tool calls one with no args", + inputs: []string{`Let me check the weather. {"name": "say_hello"}{"name": "get_conditions", "arguments": {"location": "Tokyo"}}`}, + content: "Let me check the weather. 
", + tmpl: qwen, + calls: []api.ToolCall{ + { + Function: api.ToolCallFunction{ + Index: 0, + Name: "say_hello", + }, + }, + { + Function: api.ToolCallFunction{ + Index: 1, + Name: "get_conditions", + Arguments: api.ToolCallFunctionArguments{ + "location": "Tokyo", + }, + }, + }, + }, + }, { name: "deepseek", inputs: []string{"Wait, I need to call a tool<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>get_temperature\n```json\n{\"city\": \"Tokyo\"}\n```<|tool▁call▁end|><|tool▁calls▁end|><|end▁of▁sentence|>"}, @@ -338,6 +403,50 @@ func TestParser(t *testing.T) { content: "for { fmt.Println(\"hello\") }", tmpl: json, }, + { + name: "json no args tool call", + inputs: []string{ + "{\"name\": \"say_hello\"}", + }, + content: "", + tmpl: json, + calls: []api.ToolCall{ + { + Function: api.ToolCallFunction{ + Index: 0, + Name: "say_hello", + }, + }, + }, + }, + { + name: "json no args no tool call", + inputs: []string{ + "I'll use the say_hello tool to say hello to the user.", + }, + content: "I'll use the say_hello tool to say hello to the user.", + tmpl: json, + calls: nil, + }, + + // TODO (jmorganca): this is a false positive, we should + // not be parsing this as a tool call + { + name: "json no args false positive", + inputs: []string{ + `{say_hello!!!}`, + }, + content: "", + tmpl: json, + calls: []api.ToolCall{ + { + Function: api.ToolCallFunction{ + Index: 0, + Name: "say_hello", + }, + }, + }, + }, { name: "list multiple", inputs: []string{ @@ -380,6 +489,30 @@ func TestParser(t *testing.T) { }, { name: "list partial", + inputs: []string{ + "[{", + "\"name\": \"get_conditions\", ", + "\"arguments\": {", + "\"location\": \"Tokyo\"", + "}", + "}", + }, + content: "", + tmpl: list, + calls: []api.ToolCall{ + { + Function: api.ToolCallFunction{ + Index: 0, + Name: "get_conditions", + Arguments: api.ToolCallFunctionArguments{ + "location": "Tokyo", + }, + }, + }, + }, + }, + { + name: "list invalid", inputs: []string{ "[", "{", @@ -393,6 +526,33 @@ func TestParser(t *testing.T) { tmpl: list, calls: nil, }, + { + name: "list trailing ]", + inputs: []string{ + "[", + "{", + "\"name\": \"get_conditions\", ", + "\"arguments\": {", + "\"location\": \"Tokyo\"", + "}", + "}", + "]", + "]", + }, + content: "", + tmpl: list, + calls: []api.ToolCall{ + { + Function: api.ToolCallFunction{ + Index: 0, + Name: "get_conditions", + Arguments: api.ToolCallFunctionArguments{ + "location": "Tokyo", + }, + }, + }, + }, + }, { name: "list not a tool call", inputs: []string{ @@ -404,6 +564,25 @@ func TestParser(t *testing.T) { tmpl: list, calls: nil, }, + { + name: "list with no arguments", + inputs: []string{ + "[", + "{", + "\"name\": \"say_hello\"", + "}", + }, + content: "", + tmpl: list, + calls: []api.ToolCall{ + { + Function: api.ToolCallFunction{ + Index: 0, + Name: "say_hello", + }, + }, + }, + }, } for _, tt := range tests { @@ -700,25 +879,75 @@ func TestFindTag(t *testing.T) { } func TestFindArguments(t *testing.T) { + tool := api.Tool{ + Type: "function", + Function: api.ToolFunction{ + Name: "get_temperature", + Description: "Retrieve the temperature for a given location", + Parameters: struct { + Type string `json:"type"` + Defs any `json:"$defs,omitempty"` + Items any `json:"items,omitempty"` + Required []string `json:"required"` + Properties map[string]struct { + Type api.PropertyType `json:"type"` + Items any `json:"items,omitempty"` + Description string `json:"description"` + Enum []any `json:"enum,omitempty"` + } `json:"properties"` + }{ + Type: "object", + Properties: 
map[string]struct { + Type api.PropertyType `json:"type"` + Items any `json:"items,omitempty"` + Description string `json:"description"` + Enum []any `json:"enum,omitempty"` + }{ + "format": { + Type: api.PropertyType{"string"}, + Description: "The format to return the temperature in", + Enum: []any{"fahrenheit", "celsius"}, + }, + "location": { + Type: api.PropertyType{"string"}, + Description: "The location to get the temperature for", + }, + }, + }, + }, + } + + tool2 := api.Tool{ + Type: "function", + Function: api.ToolFunction{ + Name: "say_hello", + Description: "Say hello to the user", + }, + } + tests := []struct { name string buffer []byte want map[string]any + tool api.Tool }{ { name: "empty string", buffer: []byte{}, want: nil, + tool: tool, }, { name: "whitespace only", buffer: []byte(" \n\t "), want: nil, + tool: tool, }, { name: "unbalanced braces - missing closing", buffer: []byte(`{"format": "fahrenheit", "location": "San Francisco"`), want: nil, + tool: tool, }, { name: "unbalanced braces - extra closing", @@ -726,11 +955,13 @@ func TestFindArguments(t *testing.T) { want: map[string]any{ "format": "fahrenheit", }, + tool: tool, }, { name: "invalid JSON", buffer: []byte(`{format: fahrenheit, location: "San Francisco"}`), want: nil, + tool: tool, }, { name: "valid json", @@ -739,6 +970,7 @@ func TestFindArguments(t *testing.T) { "format": "fahrenheit", "location": "San Francisco, CA", }, + tool: tool, }, { name: "valid arguments with special tokens", @@ -747,6 +979,7 @@ func TestFindArguments(t *testing.T) { "format": "fahrenheit", "location": "San Francisco, CA", }, + tool: tool, }, { name: "valid arguments in array", @@ -755,6 +988,7 @@ func TestFindArguments(t *testing.T) { "format": "fahrenheit", "location": "San Francisco, CA", }, + tool: tool, }, { name: "nested deep", @@ -763,39 +997,49 @@ func TestFindArguments(t *testing.T) { "format": "fahrenheit", "location": "San Francisco, CA", }, + tool: tool, }, { name: "one arg", - buffer: []byte(`get_weather({"location": "San Francisco, CA"})`), + buffer: []byte(`get_temperature({"location": "San Francisco, CA"})`), want: map[string]any{ "location": "San Francisco, CA", }, + tool: tool, }, { name: "two args", - buffer: []byte(`[{"name": "get_weather", "arguments": {"location": "San Francisco, CA", "format": "fahrenheit"}}, {"name": "get_weather", "arguments": {"location": "San Francisco, CA", "format": "fahrenheit"}}]`), + buffer: []byte(`[{"name": "get_temperature", "arguments": {"location": "San Francisco, CA", "format": "fahrenheit"}}, {"name": "get_weather", "arguments": {"location": "San Francisco, CA", "format": "fahrenheit"}}]`), want: map[string]any{ "location": "San Francisco, CA", "format": "fahrenheit", }, + tool: tool, + }, + { + name: "no args", + buffer: []byte(`{"name": "say_hello"}`), + want: nil, + tool: tool2, }, { name: "deepseek", - buffer: []byte("<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>get_current_weather\n```json\n{\"location\": \"Tokyo\"}\n```<|tool▁call▁end|><|tool▁calls▁end|><|end▁of▁sentence|>"), + buffer: []byte("<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>get_temperature\n```json\n{\"location\": \"Tokyo\"}\n```<|tool▁call▁end|><|tool▁calls▁end|><|end▁of▁sentence|>"), want: map[string]any{ "location": "Tokyo", }, + tool: tool, }, } for _, tt := range tests { parser := &Parser{ - buffer: tt.buffer, - properties: []string{"format", "location"}, + buffer: tt.buffer, + tools: []api.Tool{tool, tool2}, } t.Run(tt.name, func(t *testing.T) { - got, _ := 
parser.findArguments() + got, _ := parser.findArguments(tool) if diff := cmp.Diff(got, tt.want); diff != "" { t.Errorf("scanArguments() args mismatch (-got +want):\n%s", diff) From 55bbf3b4a1766fc99e32ebd39f0dd5749152b55d Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Wed, 18 Jun 2025 05:20:43 -0700 Subject: [PATCH 097/108] tools: return empty arguments object instead of null (#11113) --- tools/tools.go | 4 ++-- tools/tools_test.go | 25 +++++++++++++++---------- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/tools/tools.go b/tools/tools.go index a86163ba..8a983e19 100644 --- a/tools/tools.go +++ b/tools/tools.go @@ -115,11 +115,10 @@ func (p *Parser) findTag() (int, bool) { // parseToolCall finds the next complete tool call in the buffer // incrementing n and advancing the buffer. func (p *Parser) parseToolCall() *api.ToolCall { - var args map[string]any var tool *api.Tool var end int = len(p.buffer) - var i int + // find tool name for _, t := range p.tools { n := t.Function.Name @@ -136,6 +135,7 @@ func (p *Parser) parseToolCall() *api.ToolCall { } // only look for arguments if the tool has parameters + args := map[string]any{} if len(tool.Function.Parameters.Properties) > 0 { if args, i = p.findArguments(*tool); args == nil { return nil diff --git a/tools/tools_test.go b/tools/tools_test.go index ebf4ad7d..35b58343 100644 --- a/tools/tools_test.go +++ b/tools/tools_test.go @@ -190,8 +190,9 @@ func TestParser(t *testing.T) { calls: []api.ToolCall{ { Function: api.ToolCallFunction{ - Index: 0, - Name: "say_hello", + Index: 0, + Name: "say_hello", + Arguments: api.ToolCallFunctionArguments{}, }, }, }, @@ -265,8 +266,9 @@ func TestParser(t *testing.T) { calls: []api.ToolCall{ { Function: api.ToolCallFunction{ - Index: 0, - Name: "say_hello", + Index: 0, + Name: "say_hello", + Arguments: api.ToolCallFunctionArguments{}, }, }, { @@ -413,8 +415,9 @@ func TestParser(t *testing.T) { calls: []api.ToolCall{ { Function: api.ToolCallFunction{ - Index: 0, - Name: "say_hello", + Index: 0, + Name: "say_hello", + Arguments: api.ToolCallFunctionArguments{}, }, }, }, @@ -441,8 +444,9 @@ func TestParser(t *testing.T) { calls: []api.ToolCall{ { Function: api.ToolCallFunction{ - Index: 0, - Name: "say_hello", + Index: 0, + Name: "say_hello", + Arguments: api.ToolCallFunctionArguments{}, }, }, }, @@ -577,8 +581,9 @@ func TestParser(t *testing.T) { calls: []api.ToolCall{ { Function: api.ToolCallFunction{ - Index: 0, - Name: "say_hello", + Index: 0, + Name: "say_hello", + Arguments: api.ToolCallFunctionArguments{}, }, }, }, From 60cfa2a203201e8513b7da26035f74495e498bc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9B=B9=E5=AE=B6=E5=B7=A7?= Date: Wed, 18 Jun 2025 20:21:45 +0800 Subject: [PATCH 098/108] cache: fix comment function name in cache.go (#11110) --- server/internal/cache/blob/cache.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/internal/cache/blob/cache.go b/server/internal/cache/blob/cache.go index a1351538..2f02ca95 100644 --- a/server/internal/cache/blob/cache.go +++ b/server/internal/cache/blob/cache.go @@ -59,7 +59,7 @@ type DiskCache struct { testHookBeforeFinalWrite func(f *os.File) } -// PutString is a convenience function for c.Put(d, strings.NewReader(s), int64(len(s))). +// PutBytes is a convenience function for c.Put(d, strings.NewReader(s), int64(len(s))). 
func PutBytes[S string | []byte](c *DiskCache, d Digest, data S) error { return c.Put(d, bytes.NewReader([]byte(data)), int64(len(data))) } From a6e64fbdf28f0d6cb97cc7f022ca493b905fe895 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Wed, 18 Jun 2025 05:42:44 -0700 Subject: [PATCH 099/108] Revert "feat: incremental gguf parser (#10822)" (#11114) This reverts commit 6b04cad7e816d1a119559e092d59f4fbaa6c3a0b. --- fs/gguf/gguf.go | 347 ------------------------------------ fs/gguf/gguf_test.go | 249 -------------------------- fs/gguf/keyvalue.go | 90 ---------- fs/gguf/keyvalue_test.go | 208 --------------------- fs/gguf/lazy.go | 89 --------- fs/gguf/reader.go | 23 --- fs/gguf/tensor.go | 288 ------------------------------ go.mod | 2 +- go.sum | 4 +- server/images.go | 24 +-- server/images_test.go | 165 ++++++++++++----- server/quantization_test.go | 12 +- server/sched_test.go | 20 ++- 13 files changed, 164 insertions(+), 1357 deletions(-) delete mode 100644 fs/gguf/gguf.go delete mode 100644 fs/gguf/gguf_test.go delete mode 100644 fs/gguf/keyvalue.go delete mode 100644 fs/gguf/keyvalue_test.go delete mode 100644 fs/gguf/lazy.go delete mode 100644 fs/gguf/reader.go delete mode 100644 fs/gguf/tensor.go diff --git a/fs/gguf/gguf.go b/fs/gguf/gguf.go deleted file mode 100644 index ebb9286f..00000000 --- a/fs/gguf/gguf.go +++ /dev/null @@ -1,347 +0,0 @@ -package gguf - -import ( - "bytes" - "cmp" - "encoding/binary" - "errors" - "fmt" - "io" - "iter" - "os" - "slices" - "strings" -) - -const ( - typeUint8 uint32 = iota - typeInt8 - typeUint16 - typeInt16 - typeUint32 - typeInt32 - typeFloat32 - typeBool - typeString - typeArray - typeUint64 - typeInt64 - typeFloat64 -) - -var ErrUnsupported = errors.New("unsupported") - -type File struct { - Magic [4]byte - Version uint32 - - keyValues *lazy[KeyValue] - tensors *lazy[TensorInfo] - offset int64 - - file *os.File - reader *bufferedReader - bts []byte -} - -func Open(path string) (f *File, err error) { - f = &File{bts: make([]byte, 4096)} - f.file, err = os.Open(path) - if err != nil { - return nil, err - } - - f.reader = newBufferedReader(f.file, 32<<10) - - if err := binary.Read(f.reader, binary.LittleEndian, &f.Magic); err != nil { - return nil, err - } - - if bytes.Equal(f.Magic[:], []byte("gguf")) { - return nil, fmt.Errorf("%w file type %v", ErrUnsupported, f.Magic) - } - - if err := binary.Read(f.reader, binary.LittleEndian, &f.Version); err != nil { - return nil, err - } - - if f.Version != 3 { - return nil, fmt.Errorf("%w version %v", ErrUnsupported, f.Version) - } - - f.tensors, err = newLazy(f, f.readTensor) - if err != nil { - return nil, err - } - - f.tensors.successFunc = func() error { - offset := f.reader.offset - - alignment := cmp.Or(f.KeyValue("general.alignment").Int(), 32) - f.offset = offset + (alignment-offset%alignment)%alignment - return nil - } - - f.keyValues, err = newLazy(f, f.readKeyValue) - if err != nil { - return nil, err - } - - return f, nil -} - -func (f *File) readTensor() (TensorInfo, error) { - name, err := readString(f) - if err != nil { - return TensorInfo{}, err - } - - dims, err := read[uint32](f) - if err != nil { - return TensorInfo{}, err - } - - shape := make([]uint64, dims) - for i := range dims { - shape[i], err = read[uint64](f) - if err != nil { - return TensorInfo{}, err - } - } - - type_, err := read[uint32](f) - if err != nil { - return TensorInfo{}, err - } - - offset, err := read[uint64](f) - if err != nil { - return TensorInfo{}, err - } - - return TensorInfo{ - Name: name, - Offset: 
offset, - Shape: shape, - Type: TensorType(type_), - }, nil -} - -func (f *File) readKeyValue() (KeyValue, error) { - key, err := readString(f) - if err != nil { - return KeyValue{}, err - } - - t, err := read[uint32](f) - if err != nil { - return KeyValue{}, err - } - - value, err := func() (any, error) { - switch t { - case typeUint8: - return read[uint8](f) - case typeInt8: - return read[int8](f) - case typeUint16: - return read[uint16](f) - case typeInt16: - return read[int16](f) - case typeUint32: - return read[uint32](f) - case typeInt32: - return read[int32](f) - case typeUint64: - return read[uint64](f) - case typeInt64: - return read[int64](f) - case typeFloat32: - return read[float32](f) - case typeFloat64: - return read[float64](f) - case typeBool: - return read[bool](f) - case typeString: - return readString(f) - case typeArray: - return readArray(f) - default: - return nil, fmt.Errorf("%w type %d", ErrUnsupported, t) - } - }() - if err != nil { - return KeyValue{}, err - } - - return KeyValue{ - Key: key, - Value: Value{value}, - }, nil -} - -func read[T any](f *File) (t T, err error) { - err = binary.Read(f.reader, binary.LittleEndian, &t) - return t, err -} - -func readString(f *File) (string, error) { - n, err := read[uint64](f) - if err != nil { - return "", err - } - - if int(n) > len(f.bts) { - f.bts = make([]byte, n) - } - - bts := f.bts[:n] - if _, err := io.ReadFull(f.reader, bts); err != nil { - return "", err - } - defer clear(bts) - - return string(bts), nil -} - -func readArray(f *File) (any, error) { - t, err := read[uint32](f) - if err != nil { - return nil, err - } - - n, err := read[uint64](f) - if err != nil { - return nil, err - } - - switch t { - case typeUint8: - return readArrayData[uint8](f, n) - case typeInt8: - return readArrayData[int8](f, n) - case typeUint16: - return readArrayData[uint16](f, n) - case typeInt16: - return readArrayData[int16](f, n) - case typeUint32: - return readArrayData[uint32](f, n) - case typeInt32: - return readArrayData[int32](f, n) - case typeUint64: - return readArrayData[uint64](f, n) - case typeInt64: - return readArrayData[int64](f, n) - case typeFloat32: - return readArrayData[float32](f, n) - case typeFloat64: - return readArrayData[float64](f, n) - case typeBool: - return readArrayData[bool](f, n) - case typeString: - return readArrayString(f, n) - default: - return nil, fmt.Errorf("%w type %d", ErrUnsupported, t) - } -} - -func readArrayData[T any](f *File, n uint64) (s []T, err error) { - s = make([]T, n) - for i := range n { - e, err := read[T](f) - if err != nil { - return nil, err - } - - s[i] = e - } - - return s, nil -} - -func readArrayString(f *File, n uint64) (s []string, err error) { - s = make([]string, n) - for i := range n { - e, err := readString(f) - if err != nil { - return nil, err - } - - s[i] = e - } - - return s, nil -} - -func (f *File) Close() error { - f.keyValues.stop() - f.tensors.stop() - return f.file.Close() -} - -func (f *File) KeyValue(key string) KeyValue { - if !strings.HasPrefix(key, "general.") && !strings.HasPrefix(key, "tokenizer.") { - key = f.KeyValue("general.architecture").String() + "." 
+ key - } - - if index := slices.IndexFunc(f.keyValues.values, func(kv KeyValue) bool { - return kv.Key == key - }); index >= 0 { - return f.keyValues.values[index] - } - - for keyValue, ok := f.keyValues.next(); ok; keyValue, ok = f.keyValues.next() { - if keyValue.Key == key { - return keyValue - } - } - - return KeyValue{} -} - -func (f *File) NumKeyValues() int { - return int(f.keyValues.count) -} - -func (f *File) KeyValues() iter.Seq2[int, KeyValue] { - return f.keyValues.All() -} - -func (f *File) TensorInfo(name string) TensorInfo { - if index := slices.IndexFunc(f.tensors.values, func(t TensorInfo) bool { - return t.Name == name - }); index >= 0 { - return f.tensors.values[index] - } - - // fast-forward through key values if we haven't already - _ = f.keyValues.rest() - for tensor, ok := f.tensors.next(); ok; tensor, ok = f.tensors.next() { - if tensor.Name == name { - return tensor - } - } - - return TensorInfo{} -} - -func (f *File) NumTensors() int { - return int(f.tensors.count) -} - -func (f *File) TensorInfos() iter.Seq2[int, TensorInfo] { - // fast forward through key values if we haven't already - f.keyValues.rest() - return f.tensors.All() -} - -func (f *File) TensorReader(name string) (TensorInfo, io.Reader, error) { - t := f.TensorInfo(name) - if t.NumBytes() == 0 { - return TensorInfo{}, nil, fmt.Errorf("tensor %s not found", name) - } - - // fast forward through tensor info if we haven't already - _ = f.tensors.rest() - return t, io.NewSectionReader(f.file, f.offset+int64(t.Offset), t.NumBytes()), nil -} diff --git a/fs/gguf/gguf_test.go b/fs/gguf/gguf_test.go deleted file mode 100644 index eea28a48..00000000 --- a/fs/gguf/gguf_test.go +++ /dev/null @@ -1,249 +0,0 @@ -package gguf_test - -import ( - "bytes" - "os" - "strconv" - "strings" - "testing" - - "github.com/google/go-cmp/cmp" - "github.com/google/go-cmp/cmp/cmpopts" - "github.com/ollama/ollama/fs/ggml" - "github.com/ollama/ollama/fs/gguf" -) - -func createBinFile(tb testing.TB) string { - tb.Helper() - f, err := os.CreateTemp(tb.TempDir(), "") - if err != nil { - tb.Fatal(err) - } - defer f.Close() - - kv := ggml.KV{ - "general.architecture": "llama", - "llama.block_count": uint32(8), - "llama.embedding_length": uint32(3), - "llama.attention.head_count": uint32(2), - "llama.attention.head_count_kv": uint32(2), - "llama.attention.key_length": uint32(3), - "llama.rope.dimension_count": uint32(4), - "llama.rope.freq_base": float32(10000.0), - "llama.rope.freq_scale": float32(1.0), - "llama.attention.layer_norm_rms_epsilon": float32(1e-6), - "tokenizer.ggml.eos_token_id": uint32(0), - "tokenizer.ggml.eos_token_ids": []int32{1, 2, 3}, - "tokenizer.ggml.tokens": []string{"hello", "world"}, - "tokenizer.ggml.scores": []float32{0, 1}, - } - - tensors := []*ggml.Tensor{ - { - Name: "token_embd.weight", - Kind: 0, - Shape: []uint64{2, 3}, - WriterTo: bytes.NewBuffer(make([]byte, 4*2*3)), - }, - { - Name: "output.weight", - Kind: 0, - Shape: []uint64{3, 2}, - WriterTo: bytes.NewBuffer(make([]byte, 4*3*2)), - }, - } - - for i := range 8 { - tensors = append(tensors, &ggml.Tensor{ - Name: "blk." + strconv.Itoa(i) + ".attn_q.weight", - Kind: 0, - Shape: []uint64{3, 3}, - WriterTo: bytes.NewBuffer(make([]byte, 4*3*3)), - }, &ggml.Tensor{ - Name: "blk." + strconv.Itoa(i) + ".attn_k.weight", - Kind: 0, - Shape: []uint64{3, 3}, - WriterTo: bytes.NewBuffer(make([]byte, 4*3*3)), - }, &ggml.Tensor{ - Name: "blk." 
+ strconv.Itoa(i) + ".attn_v.weight", - Kind: 0, - Shape: []uint64{3, 3}, - WriterTo: bytes.NewBuffer(make([]byte, 4*3*3)), - }, &ggml.Tensor{ - Name: "blk." + strconv.Itoa(i) + ".attn_output.weight", - Kind: 0, - Shape: []uint64{3, 3}, - WriterTo: bytes.NewBuffer(make([]byte, 4*3*3)), - }) - } - - if err := ggml.WriteGGUF(f, kv, tensors); err != nil { - tb.Fatal(err) - } - - return f.Name() -} - -func TestRead(t *testing.T) { - f, err := gguf.Open(createBinFile(t)) - if err != nil { - t.Fatal(err) - } - defer f.Close() - - if got := f.KeyValue("does.not.exist").Valid(); got { - t.Errorf(`KeyValue("does.not.exist").Exists() = %v, want false`, got) - } - - if got := f.KeyValue("general.architecture").String(); got != "llama" { - t.Errorf(`KeyValue("general.architecture").String() = %q, want %q`, got, "llama") - } - - if got := f.TensorInfo("token_embd.weight"); got.Name != "token_embd.weight" { - t.Errorf(`TensorInfo("token_embd.weight").Name = %q, want %q`, got.Name, "token_embd.weight") - } else if diff := cmp.Diff(got.Shape, []uint64{2, 3}); diff != "" { - t.Errorf(`TensorInfo("token_embd.weight").Shape mismatch (-got +want):\n%s`, diff) - } else if got.Type != gguf.TensorTypeF32 { - t.Errorf(`TensorInfo("token_embd.weight").Type = %d, want %d`, got.Type, gguf.TensorTypeF32) - } - - if got := f.KeyValue("block_count").Uint(); got != 8 { - t.Errorf(`KeyValue("block_count").Uint() = %d, want %d`, got, 8) - } - - if diff := cmp.Diff(f.KeyValue("tokenizer.ggml.tokens").Strings(), []string{"hello", "world"}); diff != "" { - t.Errorf("KeyValue(\"tokenizer.ggml.tokens\").Strings() mismatch (-got +want):\n%s", diff) - } - - if diff := cmp.Diff(f.KeyValue("tokenizer.ggml.scores").Floats(), []float64{0, 1}); diff != "" { - t.Errorf("KeyValue(\"tokenizer.ggml.scores\").Ints() mismatch (-got +want):\n%s", diff) - } - - var kvs []string - for _, kv := range f.KeyValues() { - if !kv.Valid() { - t.Error("found invalid key-value pair:", kv) - } - - kvs = append(kvs, kv.Key) - } - - if len(kvs) != f.NumKeyValues() { - t.Errorf("iterated key count = %d, want %d", len(kvs), f.NumKeyValues()) - } - - if diff := cmp.Diff(kvs, []string{ - "general.architecture", - "llama.block_count", - "llama.embedding_length", - "llama.attention.head_count", - "llama.attention.head_count_kv", - "llama.attention.key_length", - "llama.rope.dimension_count", - "llama.rope.freq_base", - "llama.rope.freq_scale", - "llama.attention.layer_norm_rms_epsilon", - "tokenizer.ggml.eos_token_id", - "tokenizer.ggml.eos_token_ids", - "tokenizer.ggml.tokens", - "tokenizer.ggml.scores", - }, cmpopts.SortSlices(strings.Compare)); diff != "" { - t.Errorf("KeyValues() mismatch (-got +want):\n%s", diff) - } - - var tis []string - for _, ti := range f.TensorInfos() { - if !ti.Valid() { - t.Error("found invalid tensor info:", ti) - } - - tis = append(tis, ti.Name) - } - - if len(tis) != f.NumTensors() { - t.Errorf("iterated tensor count = %d, want %d", len(tis), f.NumTensors()) - } - - if diff := cmp.Diff(tis, []string{ - "token_embd.weight", - "output.weight", - "blk.0.attn_q.weight", - "blk.0.attn_k.weight", - "blk.0.attn_v.weight", - "blk.0.attn_output.weight", - "blk.1.attn_q.weight", - "blk.1.attn_k.weight", - "blk.1.attn_v.weight", - "blk.1.attn_output.weight", - "blk.2.attn_q.weight", - "blk.2.attn_k.weight", - "blk.2.attn_v.weight", - "blk.2.attn_output.weight", - "blk.3.attn_q.weight", - "blk.3.attn_k.weight", - "blk.3.attn_v.weight", - "blk.3.attn_output.weight", - "blk.4.attn_q.weight", - "blk.4.attn_k.weight", - "blk.4.attn_v.weight", 
- "blk.4.attn_output.weight", - "blk.5.attn_q.weight", - "blk.5.attn_k.weight", - "blk.5.attn_v.weight", - "blk.5.attn_output.weight", - "blk.6.attn_q.weight", - "blk.6.attn_k.weight", - "blk.6.attn_v.weight", - "blk.6.attn_output.weight", - "blk.7.attn_q.weight", - "blk.7.attn_k.weight", - "blk.7.attn_v.weight", - "blk.7.attn_output.weight", - }, cmpopts.SortSlices(strings.Compare)); diff != "" { - t.Errorf("TensorInfos() mismatch (-got +want):\n%s", diff) - } - - ti, r, err := f.TensorReader("output.weight") - if err != nil { - t.Fatalf(`TensorReader("output.weight") error: %v`, err) - } - - if ti.Name != "output.weight" { - t.Errorf(`TensorReader("output.weight").Name = %q, want %q`, ti.Name, "output.weight") - } else if diff := cmp.Diff(ti.Shape, []uint64{3, 2}); diff != "" { - t.Errorf(`TensorReader("output.weight").Shape mismatch (-got +want):\n%s`, diff) - } else if ti.Type != gguf.TensorTypeF32 { - t.Errorf(`TensorReader("output.weight").Type = %d, want %d`, ti.Type, gguf.TensorTypeF32) - } - - var b bytes.Buffer - if _, err := b.ReadFrom(r); err != nil { - t.Fatalf(`ReadFrom TensorReader("output.weight") error: %v`, err) - } - - if b.Len() != int(ti.NumBytes()) { - t.Errorf(`ReadFrom TensorReader("output.weight") length = %d, want %d`, b.Len(), ti.NumBytes()) - } -} - -func BenchmarkRead(b *testing.B) { - b.ReportAllocs() - - p := createBinFile(b) - for b.Loop() { - f, err := gguf.Open(p) - if err != nil { - b.Fatal(err) - } - - if got := f.KeyValue("general.architecture").String(); got != "llama" { - b.Errorf("got = %q, want %q", got, "llama") - } - - // Iterate through some tensors - for range f.TensorInfos() { - } - - f.Close() - } -} diff --git a/fs/gguf/keyvalue.go b/fs/gguf/keyvalue.go deleted file mode 100644 index 5843326c..00000000 --- a/fs/gguf/keyvalue.go +++ /dev/null @@ -1,90 +0,0 @@ -package gguf - -import ( - "reflect" - "slices" -) - -type KeyValue struct { - Key string - Value -} - -func (kv KeyValue) Valid() bool { - return kv.Key != "" && kv.Value.value != nil -} - -type Value struct { - value any -} - -func value[T any](v Value, kinds ...reflect.Kind) (t T) { - vv := reflect.ValueOf(v.value) - if slices.Contains(kinds, vv.Kind()) { - t = vv.Convert(reflect.TypeOf(t)).Interface().(T) - } - return -} - -func values[T any](v Value, kinds ...reflect.Kind) (ts []T) { - switch vv := reflect.ValueOf(v.value); vv.Kind() { - case reflect.Slice: - if slices.Contains(kinds, vv.Type().Elem().Kind()) { - ts = make([]T, vv.Len()) - for i := range vv.Len() { - ts[i] = vv.Index(i).Convert(reflect.TypeOf(ts[i])).Interface().(T) - } - } - } - return -} - -// Int returns Value as a signed integer. If it is not a signed integer, it returns 0. -func (v Value) Int() int64 { - return value[int64](v, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64) -} - -// Ints returns Value as a signed integer slice. If it is not a signed integer slice, it returns nil. -func (v Value) Ints() (i64s []int64) { - return values[int64](v, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64) -} - -// Uint converts an unsigned integer value to uint64. If the value is not a unsigned integer, it returns 0. -func (v Value) Uint() uint64 { - return value[uint64](v, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64) -} - -// Uints returns Value as a unsigned integer slice. If it is not a unsigned integer slice, it returns nil. 
-func (v Value) Uints() (u64s []uint64) { - return values[uint64](v, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64) -} - -// Float returns Value as a float. If it is not a float, it returns 0. -func (v Value) Float() float64 { - return value[float64](v, reflect.Float32, reflect.Float64) -} - -// Floats returns Value as a float slice. If it is not a float slice, it returns nil. -func (v Value) Floats() (f64s []float64) { - return values[float64](v, reflect.Float32, reflect.Float64) -} - -// Bool returns Value as a boolean. If it is not a boolean, it returns false. -func (v Value) Bool() bool { - return value[bool](v, reflect.Bool) -} - -// Bools returns Value as a boolean slice. If it is not a boolean slice, it returns nil. -func (v Value) Bools() (bools []bool) { - return values[bool](v, reflect.Bool) -} - -// String returns Value as a string. If it is not a string, it returns an empty string. -func (v Value) String() string { - return value[string](v, reflect.String) -} - -// Strings returns Value as a string slice. If it is not a string slice, it returns nil. -func (v Value) Strings() (strings []string) { - return values[string](v, reflect.String) -} diff --git a/fs/gguf/keyvalue_test.go b/fs/gguf/keyvalue_test.go deleted file mode 100644 index 2caacc53..00000000 --- a/fs/gguf/keyvalue_test.go +++ /dev/null @@ -1,208 +0,0 @@ -package gguf - -import ( - "testing" - - "github.com/google/go-cmp/cmp" -) - -func split(name string, values map[string][]any) (matched []any, unmatched []any) { - for key, value := range values { - if key == name { - matched = value - } else { - unmatched = append(unmatched, value...) - } - } - return -} - -func TestValue(t *testing.T) { - values := map[string][]any{ - "int64": {int(42), int8(42), int16(42), int32(42), int64(42)}, - "uint64": {uint(42), uint8(42), uint16(42), uint32(42), uint64(42)}, - "float64": {float32(42), float64(42)}, - "string": {"42", "hello"}, - "bool": {true, false}, - } - - t.Run("int64", func(t *testing.T) { - matched, unmatched := split("int64", values) - for _, v := range matched { - kv := KeyValue{"key", Value{v}} - if i64 := kv.Int(); i64 != 42 { - t.Errorf("expected 42, got %d", i64) - } - } - - for _, v := range unmatched { - kv := KeyValue{"key", Value{v}} - if i64 := kv.Int(); i64 != 0 { - t.Errorf("expected 42, got %d", i64) - } - } - }) - - t.Run("uint64", func(t *testing.T) { - matched, unmatched := split("uint64", values) - for _, v := range matched { - kv := KeyValue{"key", Value{v}} - if u64 := kv.Uint(); u64 != 42 { - t.Errorf("expected 42, got %d", u64) - } - } - - for _, v := range unmatched { - kv := KeyValue{"key", Value{v}} - if u64 := kv.Uint(); u64 != 0 { - t.Errorf("expected 42, got %d", u64) - } - } - }) - - t.Run("float64", func(t *testing.T) { - matched, unmatched := split("float64", values) - for _, v := range matched { - kv := KeyValue{"key", Value{v}} - if f64 := kv.Float(); f64 != 42 { - t.Errorf("expected 42, got %f", f64) - } - } - - for _, v := range unmatched { - kv := KeyValue{"key", Value{v}} - if f64 := kv.Float(); f64 != 0 { - t.Errorf("expected 42, got %f", f64) - } - } - }) - - t.Run("string", func(t *testing.T) { - matched, unmatched := split("string", values) - for _, v := range matched { - kv := KeyValue{"key", Value{v}} - if s := kv.String(); s != v { - t.Errorf("expected 42, got %s", s) - } - } - - for _, v := range unmatched { - kv := KeyValue{"key", Value{v}} - if s := kv.String(); s != "" { - t.Errorf("expected 42, got %s", s) - } - } - }) - - t.Run("bool", 
func(t *testing.T) { - matched, unmatched := split("bool", values) - for _, v := range matched { - kv := KeyValue{"key", Value{v}} - if b := kv.Bool(); b != v { - t.Errorf("expected true, got %v", b) - } - } - - for _, v := range unmatched { - kv := KeyValue{"key", Value{v}} - if b := kv.Bool(); b != false { - t.Errorf("expected false, got %v", b) - } - } - }) -} - -func TestValues(t *testing.T) { - values := map[string][]any{ - "int64s": {[]int{42}, []int8{42}, []int16{42}, []int32{42}, []int64{42}}, - "uint64s": {[]uint{42}, []uint8{42}, []uint16{42}, []uint32{42}, []uint64{42}}, - "float64s": {[]float32{42}, []float64{42}}, - "strings": {[]string{"42"}, []string{"hello"}}, - "bools": {[]bool{true}, []bool{false}}, - } - - t.Run("int64s", func(t *testing.T) { - matched, unmatched := split("int64s", values) - for _, v := range matched { - kv := KeyValue{"key", Value{v}} - if diff := cmp.Diff(kv.Ints(), []int64{42}); diff != "" { - t.Errorf("diff: %s", diff) - } - } - - for _, v := range unmatched { - kv := KeyValue{"key", Value{v}} - if i64s := kv.Ints(); i64s != nil { - t.Errorf("expected nil, got %v", i64s) - } - } - }) - - t.Run("uint64s", func(t *testing.T) { - matched, unmatched := split("uint64s", values) - for _, v := range matched { - kv := KeyValue{"key", Value{v}} - if diff := cmp.Diff(kv.Uints(), []uint64{42}); diff != "" { - t.Errorf("diff: %s", diff) - } - } - - for _, v := range unmatched { - kv := KeyValue{"key", Value{v}} - if u64s := kv.Uints(); u64s != nil { - t.Errorf("expected nil, got %v", u64s) - } - } - }) - - t.Run("float64s", func(t *testing.T) { - matched, unmatched := split("float64s", values) - for _, v := range matched { - kv := KeyValue{"key", Value{v}} - if diff := cmp.Diff(kv.Floats(), []float64{42}); diff != "" { - t.Errorf("diff: %s", diff) - } - } - - for _, v := range unmatched { - kv := KeyValue{"key", Value{v}} - if f64s := kv.Floats(); f64s != nil { - t.Errorf("expected nil, got %v", f64s) - } - } - }) - - t.Run("strings", func(t *testing.T) { - matched, unmatched := split("strings", values) - for _, v := range matched { - kv := KeyValue{"key", Value{v}} - if diff := cmp.Diff(kv.Strings(), v); diff != "" { - t.Errorf("diff: %s", diff) - } - } - - for _, v := range unmatched { - kv := KeyValue{"key", Value{v}} - if s := kv.Strings(); s != nil { - t.Errorf("expected nil, got %v", s) - } - } - }) - - t.Run("bools", func(t *testing.T) { - matched, unmatched := split("bools", values) - for _, v := range matched { - kv := KeyValue{"key", Value{v}} - if diff := cmp.Diff(kv.Bools(), v); diff != "" { - t.Errorf("diff: %s", diff) - } - } - - for _, v := range unmatched { - kv := KeyValue{"key", Value{v}} - if b := kv.Bools(); b != nil { - t.Errorf("expected nil, got %v", b) - } - } - }) -} diff --git a/fs/gguf/lazy.go b/fs/gguf/lazy.go deleted file mode 100644 index 16ab9909..00000000 --- a/fs/gguf/lazy.go +++ /dev/null @@ -1,89 +0,0 @@ -package gguf - -import ( - "encoding/binary" - "iter" - "log/slog" -) - -type lazy[T any] struct { - count uint64 - next func() (T, bool) - stop func() - values []T - - // successFunc is called when all values have been successfully read. 
- successFunc func() error -} - -func newLazy[T any](f *File, fn func() (T, error)) (*lazy[T], error) { - it := lazy[T]{} - if err := binary.Read(f.reader, binary.LittleEndian, &it.count); err != nil { - return nil, err - } - - it.values = make([]T, 0) - it.next, it.stop = iter.Pull(func(yield func(T) bool) { - for i := range it.count { - t, err := fn() - if err != nil { - slog.Error("error reading tensor", "index", i, "error", err) - return - } - - it.values = append(it.values, t) - if !yield(t) { - break - } - } - - if it.successFunc != nil { - it.successFunc() - } - }) - - return &it, nil -} - -func (g *lazy[T]) Values() iter.Seq[T] { - return func(yield func(T) bool) { - for _, v := range g.All() { - if !yield(v) { - break - } - } - } -} - -func (g *lazy[T]) All() iter.Seq2[int, T] { - return func(yield func(int, T) bool) { - for i := range int(g.count) { - if i < len(g.values) { - if !yield(i, g.values[i]) { - break - } - } else { - t, ok := g.next() - if !ok { - break - } - - if !yield(i, t) { - break - } - } - } - } -} - -func (g *lazy[T]) rest() (collected bool) { - for { - _, ok := g.next() - collected = collected || ok - if !ok { - break - } - } - - return collected -} diff --git a/fs/gguf/reader.go b/fs/gguf/reader.go deleted file mode 100644 index 0bd76184..00000000 --- a/fs/gguf/reader.go +++ /dev/null @@ -1,23 +0,0 @@ -package gguf - -import ( - "bufio" - "io" -) - -type bufferedReader struct { - offset int64 - *bufio.Reader -} - -func newBufferedReader(rs io.ReadSeeker, size int) *bufferedReader { - return &bufferedReader{ - Reader: bufio.NewReaderSize(rs, size), - } -} - -func (rs *bufferedReader) Read(p []byte) (n int, err error) { - n, err = rs.Reader.Read(p) - rs.offset += int64(n) - return n, err -} diff --git a/fs/gguf/tensor.go b/fs/gguf/tensor.go deleted file mode 100644 index 194c1d73..00000000 --- a/fs/gguf/tensor.go +++ /dev/null @@ -1,288 +0,0 @@ -package gguf - -import ( - "log/slog" - "strings" -) - -type TensorInfo struct { - Name string - Offset uint64 - Shape []uint64 - Type TensorType -} - -func (ti TensorInfo) Valid() bool { - return ti.Name != "" && ti.NumBytes() > 0 -} - -func (ti TensorInfo) NumValues() int64 { - var numItems int64 = 1 - for _, dim := range ti.Shape { - numItems *= int64(dim) - } - return numItems -} - -// NumBytes returns the number of bytes in the tensor. 
-func (ti TensorInfo) NumBytes() int64 { - return int64(float64(ti.NumValues()) * ti.Type.NumBytes()) -} - -func (ti TensorInfo) LogValue() slog.Value { - return slog.GroupValue( - slog.String("name", ti.Name), - slog.Int64("offset", int64(ti.Offset)), - slog.Any("shape", ti.Shape), - slog.Int64("num_values", ti.NumValues()), - slog.Int64("num_bytes", ti.NumBytes()), - slog.Any("type", ti.Type), - ) -} - -type TensorType uint32 - -const ( - TensorTypeF32 TensorType = iota - TensorTypeF16 - TensorTypeQ4_0 - TensorTypeQ4_1 - - // unexported // unused in gguf - tensorTypeQ4_2 - tensorTypeQ4_3 - - TensorTypeQ5_0 - TensorTypeQ5_1 - TensorTypeQ8_0 - TensorTypeQ8_1 - TensorTypeQ2_K - TensorTypeQ3_K - TensorTypeQ4_K - TensorTypeQ5_K - TensorTypeQ6_K - TensorTypeQ8_K - - // unexported // unquantizable by ollama - tensorTypeIQ2_XXS - tensorTypeIQ2_XS - tensorTypeIQ3_XXS - tensorTypeIQ1_S - tensorTypeIQ4_NL - tensorTypeIQ3_S - tensorTypeIQ2_S - tensorTypeIQ4_XS - - TensorTypeI8 - TensorTypeI16 - TensorTypeI32 - TensorTypeI64 - TensorTypeF64 - - // unexported // unquantizable by ollama - tensorTypeIQ1_M - - TensorTypeBF16 - - // unexported // unused in gguf - tensorTypeQ4_0_4_4 - tensorTypeQ4_0_4_8 - tensorTypeQ4_0_8_8 - - // unexported // unquantizable by ollama - tensorTypeTQ1_0 - tensorTypeTQ2_0 - - // unexported // unused in gguf - tensorTypeIQ4_NL_4_4 - tensorTypeIQ4_NL_4_8 - tensorTypeIQ4_NL_8_8 -) - -func (tt TensorType) NumBytes() float64 { - return float64(tt.typeSize()) / float64(tt.blockSize()) -} - -func (tt TensorType) typeSize() int64 { - switch tt { - case TensorTypeF32: - return 4 - case TensorTypeF16: - return 2 - case TensorTypeQ4_0: - return 2 + tt.blockSize()/2 - case TensorTypeQ4_1: - return 2 + 2 + tt.blockSize()/2 - case TensorTypeQ5_0: - return 2 + 4 + tt.blockSize()/2 - case TensorTypeQ5_1: - return 2 + 2 + 4 + tt.blockSize()/2 - case TensorTypeQ8_0: - return 2 + tt.blockSize() - case TensorTypeQ8_1: - return 2 + 2 + tt.blockSize() - case TensorTypeQ2_K: - return tt.blockSize()/16 + tt.blockSize()/4 + 2 + 2 - case TensorTypeQ3_K: - return tt.blockSize()/8 + tt.blockSize()/4 + 12 + 2 - case TensorTypeQ4_K: - return 2 + 2 + 12 + tt.blockSize()/2 - case TensorTypeQ5_K: - return 2 + 2 + 12 + tt.blockSize()/8 + tt.blockSize()/2 - case TensorTypeQ6_K: - return tt.blockSize()/2 + tt.blockSize()/4 + tt.blockSize()/16 + 2 - case TensorTypeQ8_K: - return 4 + tt.blockSize() + 2*tt.blockSize()/16 - case tensorTypeIQ2_XXS: - return 2 + 2*tt.blockSize()/8 - case tensorTypeIQ2_XS: - return 2 + 2*tt.blockSize()/8 + tt.blockSize()/32 - case tensorTypeIQ3_XXS: - return 2 + tt.blockSize()/4 + tt.blockSize()/8 - case tensorTypeIQ1_S: - return 2 + tt.blockSize()/8 + tt.blockSize()/16 - case tensorTypeIQ4_NL: - return 2 + tt.blockSize()/2 - case tensorTypeIQ3_S: - return 2 + tt.blockSize()/4 + tt.blockSize()/8 + tt.blockSize()/32 + 4 - case tensorTypeIQ2_S: - return 2 + tt.blockSize()/4 + tt.blockSize()/16 - case tensorTypeIQ4_XS: - return 2 + 2 + tt.blockSize()/2 + tt.blockSize()/64 - case TensorTypeI8: - return 1 - case TensorTypeI16: - return 2 - case TensorTypeI32: - return 4 - case TensorTypeI64: - return 8 - case TensorTypeF64: - return 8 - case tensorTypeIQ1_M: - return tt.blockSize()/8 + tt.blockSize()/16 + tt.blockSize()/32 - case TensorTypeBF16: - return 2 - default: - return 0 - } -} - -func (tt TensorType) blockSize() int64 { - switch tt { - case TensorTypeF32, - TensorTypeF16, - TensorTypeI8, - TensorTypeI16, - TensorTypeI32, - TensorTypeI64, - TensorTypeF64, - TensorTypeBF16: - return 
1 - case TensorTypeQ4_0, - TensorTypeQ4_1, - TensorTypeQ5_0, - TensorTypeQ5_1, - TensorTypeQ8_0, - TensorTypeQ8_1, - tensorTypeIQ4_NL: - return 32 - default: - return 256 - } -} - -func (tt TensorType) String() string { - switch tt { - case TensorTypeF32: - return "f32" - case TensorTypeF16: - return "f16" - case TensorTypeQ4_0: - return "q4_0" - case TensorTypeQ4_1: - return "q4_1" - case tensorTypeQ4_2: - return "q4_2" - case tensorTypeQ4_3: - return "q4_3" - case TensorTypeQ5_0: - return "q5_0" - case TensorTypeQ5_1: - return "q5_1" - case TensorTypeQ8_0: - return "q8_0" - case TensorTypeQ8_1: - return "q8_1" - case TensorTypeQ2_K: - return "q2_k" - case TensorTypeQ3_K: - return "q3_k" - case TensorTypeQ4_K: - return "q4_k" - case TensorTypeQ5_K: - return "q5_k" - case TensorTypeQ6_K: - return "q6_k" - case TensorTypeQ8_K: - return "q8_k" - case tensorTypeIQ2_XXS: - return "iq2_xxs" - case tensorTypeIQ2_XS: - return "iq2_xs" - case tensorTypeIQ3_XXS: - return "iq3_xxs" - case tensorTypeIQ1_S: - return "iq1_s" - case tensorTypeIQ4_NL: - return "iq4_nl" - case tensorTypeIQ3_S: - return "iq3_s" - case tensorTypeIQ2_S: - return "iq2_s" - case tensorTypeIQ4_XS: - return "iq4_xs" - case TensorTypeI8: - return "i8" - case TensorTypeI16: - return "i16" - case TensorTypeI32: - return "i32" - case TensorTypeI64: - return "i64" - case TensorTypeF64: - return "f64" - case tensorTypeIQ1_M: - return "iq1_m" - case TensorTypeBF16: - return "bf16" - case tensorTypeQ4_0_4_4: - return "q4_0_4_4" - case tensorTypeQ4_0_4_8: - return "q4_0_4_8" - case tensorTypeQ4_0_8_8: - return "q4_0_8_8" - case tensorTypeTQ1_0: - return "tq1_0" - case tensorTypeTQ2_0: - return "tq2_0" - case tensorTypeIQ4_NL_4_4: - return "iq4_nl_4_4" - case tensorTypeIQ4_NL_4_8: - return "iq4_nl_4_8" - case tensorTypeIQ4_NL_8_8: - return "iq4_nl_8_8" - default: - return "unknown" - } -} - -func (tt TensorType) LogValue() slog.Value { - return slog.GroupValue( - slog.Uint64("value", uint64(tt)), - slog.String("name", strings.ToUpper(tt.String())), - slog.Int64("size", tt.typeSize()), - slog.Int64("block_size", tt.blockSize()), - slog.Float64("num_bytes", tt.NumBytes()), - ) -} diff --git a/go.mod b/go.mod index 6de5959b..283286b7 100644 --- a/go.mod +++ b/go.mod @@ -19,7 +19,7 @@ require ( github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1 github.com/dlclark/regexp2 v1.11.4 github.com/emirpasic/gods/v2 v2.0.0-alpha - github.com/google/go-cmp v0.7.0 + github.com/google/go-cmp v0.6.0 github.com/mattn/go-runewidth v0.0.14 github.com/nlpodyssey/gopickle v0.3.0 github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c diff --git a/go.sum b/go.sum index c0ab53aa..5755616f 100644 --- a/go.sum +++ b/go.sum @@ -112,8 +112,8 @@ github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/ github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= -github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/uuid v1.1.2/go.mod 
h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= diff --git a/server/images.go b/server/images.go index 38505cc5..d6cceff4 100644 --- a/server/images.go +++ b/server/images.go @@ -23,7 +23,7 @@ import ( "github.com/ollama/ollama/api" "github.com/ollama/ollama/envconfig" - "github.com/ollama/ollama/fs/gguf" + "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/parser" "github.com/ollama/ollama/template" "github.com/ollama/ollama/thinking" @@ -73,18 +73,22 @@ func (m *Model) Capabilities() []model.Capability { capabilities := []model.Capability{} // Check for completion capability - f, err := gguf.Open(m.ModelPath) + r, err := os.Open(m.ModelPath) if err == nil { - defer f.Close() + defer r.Close() - if f.KeyValue("pooling_type").Valid() { - capabilities = append(capabilities, model.CapabilityEmbedding) + f, err := ggml.Decode(r, 1024) + if err == nil { + if _, ok := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]; ok { + capabilities = append(capabilities, model.CapabilityEmbedding) + } else { + capabilities = append(capabilities, model.CapabilityCompletion) + } + if _, ok := f.KV()[fmt.Sprintf("%s.vision.block_count", f.KV().Architecture())]; ok { + capabilities = append(capabilities, model.CapabilityVision) + } } else { - // If no embedding is specified, we assume the model supports completion - capabilities = append(capabilities, model.CapabilityCompletion) - } - if f.KeyValue("vision.block_count").Valid() { - capabilities = append(capabilities, model.CapabilityVision) + slog.Error("couldn't decode ggml", "error", err) } } else { slog.Error("couldn't open model file", "error", err) diff --git a/server/images_test.go b/server/images_test.go index a2fba8d9..363b298e 100644 --- a/server/images_test.go +++ b/server/images_test.go @@ -1,42 +1,123 @@ package server import ( + "bytes" + "encoding/binary" + "errors" + "os" + "path/filepath" "strings" "testing" - "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/template" "github.com/ollama/ollama/types/model" ) +// Constants for GGUF magic bytes and version +var ( + ggufMagic = []byte{0x47, 0x47, 0x55, 0x46} // "GGUF" + ggufVer = uint32(3) // Version 3 +) + +// Helper function to create mock GGUF data +func createMockGGUFData(architecture string, vision bool) []byte { + var buf bytes.Buffer + + // Write GGUF header + buf.Write(ggufMagic) + binary.Write(&buf, binary.LittleEndian, ggufVer) + + // Write tensor count (0 for our test) + var numTensors uint64 = 0 + binary.Write(&buf, binary.LittleEndian, numTensors) + + // Calculate number of metadata entries + numMetaEntries := uint64(1) // architecture entry + if vision { + numMetaEntries++ + } + // Add embedding entry if architecture is "bert" + if architecture == "bert" { + numMetaEntries++ + } + binary.Write(&buf, binary.LittleEndian, numMetaEntries) + + // Write architecture metadata + archKey := "general.architecture" + keyLen := uint64(len(archKey)) + binary.Write(&buf, binary.LittleEndian, keyLen) + buf.WriteString(archKey) + + // String type (8) + var strType uint32 = 8 + binary.Write(&buf, binary.LittleEndian, strType) + + // String length + strLen := uint64(len(architecture)) + binary.Write(&buf, binary.LittleEndian, strLen) + buf.WriteString(architecture) + + if vision { + visionKey := architecture + ".vision.block_count" + keyLen = uint64(len(visionKey)) + binary.Write(&buf, binary.LittleEndian, keyLen) + buf.WriteString(visionKey) + + // uint32 type (4) + var uint32Type uint32 
= 4 + binary.Write(&buf, binary.LittleEndian, uint32Type) + + // uint32 value (1) + var countVal uint32 = 1 + binary.Write(&buf, binary.LittleEndian, countVal) + } + // Write embedding metadata if architecture is "bert" + if architecture == "bert" { + poolKey := architecture + ".pooling_type" + keyLen = uint64(len(poolKey)) + binary.Write(&buf, binary.LittleEndian, keyLen) + buf.WriteString(poolKey) + + // uint32 type (4) + var uint32Type uint32 = 4 + binary.Write(&buf, binary.LittleEndian, uint32Type) + + // uint32 value (1) + var poolingVal uint32 = 1 + binary.Write(&buf, binary.LittleEndian, poolingVal) + } + + return buf.Bytes() +} + func TestModelCapabilities(t *testing.T) { - // Create completion model (llama architecture without vision) - completionModelPath, _ := createBinFile(t, ggml.KV{ - "general.architecture": "llama", - }, []*ggml.Tensor{}) + // Create a temporary directory for test files + tempDir := t.TempDir() - // Create vision model (llama architecture with vision block count) - visionModelPath, _ := createBinFile(t, ggml.KV{ - "general.architecture": "llama", - "llama.vision.block_count": uint32(1), - }, []*ggml.Tensor{}) + // Create different types of mock model files + completionModelPath := filepath.Join(tempDir, "model.bin") + visionModelPath := filepath.Join(tempDir, "vision_model.bin") + embeddingModelPath := filepath.Join(tempDir, "embedding_model.bin") + // Create a simple model file for tests that don't depend on GGUF content + simpleModelPath := filepath.Join(tempDir, "simple_model.bin") - // Create embedding model (bert architecture with pooling type) - embeddingModelPath, _ := createBinFile(t, ggml.KV{ - "general.architecture": "bert", - "bert.pooling_type": uint32(1), - }, []*ggml.Tensor{}) + if err := errors.Join( + os.WriteFile(completionModelPath, createMockGGUFData("llama", false), 0o644), + os.WriteFile(visionModelPath, createMockGGUFData("llama", true), 0o644), + os.WriteFile(embeddingModelPath, createMockGGUFData("bert", false), 0o644), + os.WriteFile(simpleModelPath, []byte("dummy model data"), 0o644), + ); err != nil { + t.Fatalf("Failed to create model files: %v", err) + } toolsInsertTemplate, err := template.Parse("{{ .prompt }}{{ if .tools }}{{ .tools }}{{ end }}{{ if .suffix }}{{ .suffix }}{{ end }}") if err != nil { t.Fatalf("Failed to parse template: %v", err) } - chatTemplate, err := template.Parse("{{ .prompt }}") if err != nil { t.Fatalf("Failed to parse template: %v", err) } - toolsTemplate, err := template.Parse("{{ .prompt }}{{ if .tools }}{{ .tools }}{{ end }}") if err != nil { t.Fatalf("Failed to parse template: %v", err) @@ -64,13 +145,21 @@ func TestModelCapabilities(t *testing.T) { }, expectedCaps: []model.Capability{model.CapabilityCompletion, model.CapabilityTools, model.CapabilityInsert}, }, + { + name: "model with tools and insert capability", + model: Model{ + ModelPath: simpleModelPath, + Template: toolsInsertTemplate, + }, + expectedCaps: []model.Capability{model.CapabilityTools, model.CapabilityInsert}, + }, { name: "model with tools capability", model: Model{ - ModelPath: completionModelPath, + ModelPath: simpleModelPath, Template: toolsTemplate, }, - expectedCaps: []model.Capability{model.CapabilityCompletion, model.CapabilityTools}, + expectedCaps: []model.Capability{model.CapabilityTools}, }, { name: "model with vision capability", @@ -135,33 +224,29 @@ func TestModelCapabilities(t *testing.T) { } func TestModelCheckCapabilities(t *testing.T) { - // Create simple model file for tests that don't depend on GGUF content - 
completionModelPath, _ := createBinFile(t, ggml.KV{ - "general.architecture": "llama", - }, []*ggml.Tensor{}) + // Create a temporary directory for test files + tempDir := t.TempDir() - // Create vision model (llama architecture with vision block count) - visionModelPath, _ := createBinFile(t, ggml.KV{ - "general.architecture": "llama", - "llama.vision.block_count": uint32(1), - }, []*ggml.Tensor{}) + visionModelPath := filepath.Join(tempDir, "vision_model.bin") + simpleModelPath := filepath.Join(tempDir, "model.bin") + embeddingModelPath := filepath.Join(tempDir, "embedding_model.bin") - // Create embedding model (bert architecture with pooling type) - embeddingModelPath, _ := createBinFile(t, ggml.KV{ - "general.architecture": "bert", - "bert.pooling_type": uint32(1), - }, []*ggml.Tensor{}) + if err := errors.Join( + os.WriteFile(simpleModelPath, []byte("dummy model data"), 0o644), + os.WriteFile(visionModelPath, createMockGGUFData("llama", true), 0o644), + os.WriteFile(embeddingModelPath, createMockGGUFData("bert", false), 0o644), + ); err != nil { + t.Fatalf("Failed to create model files: %v", err) + } toolsInsertTemplate, err := template.Parse("{{ .prompt }}{{ if .tools }}{{ .tools }}{{ end }}{{ if .suffix }}{{ .suffix }}{{ end }}") if err != nil { t.Fatalf("Failed to parse template: %v", err) } - chatTemplate, err := template.Parse("{{ .prompt }}") if err != nil { t.Fatalf("Failed to parse template: %v", err) } - toolsTemplate, err := template.Parse("{{ .prompt }}{{ if .tools }}{{ .tools }}{{ end }}") if err != nil { t.Fatalf("Failed to parse template: %v", err) @@ -176,7 +261,7 @@ func TestModelCheckCapabilities(t *testing.T) { { name: "completion model without tools capability", model: Model{ - ModelPath: completionModelPath, + ModelPath: simpleModelPath, Template: chatTemplate, }, checkCaps: []model.Capability{model.CapabilityTools}, @@ -185,7 +270,7 @@ func TestModelCheckCapabilities(t *testing.T) { { name: "model with all needed capabilities", model: Model{ - ModelPath: completionModelPath, + ModelPath: simpleModelPath, Template: toolsInsertTemplate, }, checkCaps: []model.Capability{model.CapabilityTools, model.CapabilityInsert}, @@ -193,7 +278,7 @@ func TestModelCheckCapabilities(t *testing.T) { { name: "model missing insert capability", model: Model{ - ModelPath: completionModelPath, + ModelPath: simpleModelPath, Template: toolsTemplate, }, checkCaps: []model.Capability{model.CapabilityInsert}, @@ -202,7 +287,7 @@ func TestModelCheckCapabilities(t *testing.T) { { name: "model missing vision capability", model: Model{ - ModelPath: completionModelPath, + ModelPath: simpleModelPath, Template: toolsTemplate, }, checkCaps: []model.Capability{model.CapabilityVision}, @@ -227,7 +312,7 @@ func TestModelCheckCapabilities(t *testing.T) { { name: "unknown capability", model: Model{ - ModelPath: completionModelPath, + ModelPath: simpleModelPath, Template: chatTemplate, }, checkCaps: []model.Capability{"unknown"}, diff --git a/server/quantization_test.go b/server/quantization_test.go index 8b726c83..4f717c2c 100644 --- a/server/quantization_test.go +++ b/server/quantization_test.go @@ -257,8 +257,16 @@ func TestQuantizeModel(t *testing.T) { for _, tt := range cases { t.Run(tt.name, func(t *testing.T) { - p, _ := createBinFile(t, tt.kv, tt.tensors) - fp, err := os.Open(p) + f, err := os.CreateTemp(t.TempDir(), tt.name) + if err != nil { + t.Fatal(err.Error()) + } + defer f.Close() + err = fsggml.WriteGGUF(f, tt.kv, tt.tensors) + if err != nil { + t.Fatalf("failed to create initial model: 
%s", err) + } + fp, err := os.Open(f.Name()) if err != nil { t.Fatal(err.Error()) } diff --git a/server/sched_test.go b/server/sched_test.go index 3892fbba..01fb9a70 100644 --- a/server/sched_test.go +++ b/server/sched_test.go @@ -112,7 +112,11 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est b.ctx, b.ctxDone = context.WithCancel(ctx) t.Helper() - p, _ := createBinFile(t, ggml.KV{ + f, err := os.CreateTemp(t.TempDir(), modelName) + require.NoError(t, err) + defer f.Close() + + require.NoError(t, ggml.WriteGGUF(f, ggml.KV{ "general.architecture": "llama", "llama.context_length": uint32(32), "llama.embedding_length": uint32(4096), @@ -125,14 +129,14 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est }, []*ggml.Tensor{ {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, {Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, - }) + })) + require.NoError(t, err) + + fname := f.Name() + model := &Model{Name: modelName, ModelPath: fname} + b.f, err = llm.LoadModel(model.ModelPath, 0) + require.NoError(t, err) - model := &Model{Name: modelName, ModelPath: p} - f, err := llm.LoadModel(model.ModelPath, 0) - if err != nil { - t.Fatal(err) - } - b.f = f if duration == nil { duration = &api.Duration{Duration: 5 * time.Millisecond} } From ed567ef43b5822423bd165f5f57fb6bad5fce1b3 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Wed, 18 Jun 2025 05:45:00 -0700 Subject: [PATCH 100/108] Revert "ggml: Export GPU UUIDs" (#11115) This reverts commit aaa7818000c42a82fc030212c35ef83f9799efd7. --- .../patches/0017-ggml-Export-GPU-UUIDs.patch | 102 ------------------ ml/backend.go | 8 -- ml/backend/ggml/ggml.go | 6 -- ml/backend/ggml/ggml/include/ggml-backend.h | 1 - .../ggml/ggml/src/ggml-cuda/ggml-cuda.cu | 33 ------ .../ggml/ggml/src/ggml-metal/ggml-metal.m | 1 - 6 files changed, 151 deletions(-) delete mode 100644 llama/patches/0017-ggml-Export-GPU-UUIDs.patch diff --git a/llama/patches/0017-ggml-Export-GPU-UUIDs.patch b/llama/patches/0017-ggml-Export-GPU-UUIDs.patch deleted file mode 100644 index a2539034..00000000 --- a/llama/patches/0017-ggml-Export-GPU-UUIDs.patch +++ /dev/null @@ -1,102 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Jesse Gross -Date: Thu, 24 Apr 2025 14:48:51 -0700 -Subject: [PATCH] ggml: Export GPU UUIDs - -This enables matching up devices and information reported by the backend -with tools (e.g. nvidia-smi) and system management libraries (e.g. nvml). 
---- - ggml/include/ggml-backend.h | 1 + - ggml/src/ggml-cuda/ggml-cuda.cu | 33 ++++++++++++++++++++++++++++++++ - ggml/src/ggml-metal/ggml-metal.m | 1 + - 3 files changed, 35 insertions(+) - -diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h -index 74e46716..a880df33 100644 ---- a/ggml/include/ggml-backend.h -+++ b/ggml/include/ggml-backend.h -@@ -152,6 +152,7 @@ extern "C" { - struct ggml_backend_dev_props { - const char * name; - const char * description; -+ const char * uuid; - size_t memory_free; - size_t memory_total; - enum ggml_backend_dev_type type; -diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index cb0d8528..4c829153 100644 ---- a/ggml/src/ggml-cuda/ggml-cuda.cu -+++ b/ggml/src/ggml-cuda/ggml-cuda.cu -@@ -2884,6 +2884,7 @@ struct ggml_backend_cuda_device_context { - int device; - std::string name; - std::string description; -+ std::string uuid; - }; - - static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) { -@@ -2896,6 +2897,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t - return ctx->description.c_str(); - } - -+static const char * ggml_backend_cuda_device_get_uuid(ggml_backend_dev_t dev) { -+ ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; -+ return ctx->uuid.c_str(); -+} -+ - static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { - ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; - ggml_cuda_set_device(ctx->device); -@@ -2910,6 +2916,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend - static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) { - props->name = ggml_backend_cuda_device_get_name(dev); - props->description = ggml_backend_cuda_device_get_description(dev); -+ props->uuid = ggml_backend_cuda_device_get_uuid(dev); - props->type = ggml_backend_cuda_device_get_type(dev); - ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total); - -@@ -3458,6 +3465,32 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { - CUDA_CHECK(cudaGetDeviceProperties(&prop, i)); - dev_ctx->description = prop.name; - -+ #if !defined(GGML_USE_HIP) -+ char uuid[64]; -+ snprintf(uuid, sizeof(uuid), -+ "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", -+ (unsigned char)prop.uuid.bytes[0], -+ (unsigned char)prop.uuid.bytes[1], -+ (unsigned char)prop.uuid.bytes[2], -+ (unsigned char)prop.uuid.bytes[3], -+ (unsigned char)prop.uuid.bytes[4], -+ (unsigned char)prop.uuid.bytes[5], -+ (unsigned char)prop.uuid.bytes[6], -+ (unsigned char)prop.uuid.bytes[7], -+ (unsigned char)prop.uuid.bytes[8], -+ (unsigned char)prop.uuid.bytes[9], -+ (unsigned char)prop.uuid.bytes[10], -+ (unsigned char)prop.uuid.bytes[11], -+ (unsigned char)prop.uuid.bytes[12], -+ (unsigned char)prop.uuid.bytes[13], -+ (unsigned char)prop.uuid.bytes[14], -+ (unsigned char)prop.uuid.bytes[15] -+ ); -+ dev_ctx->uuid = uuid; -+ #else -+ dev_ctx->uuid = "GPU-" + std::string(prop.uuid.bytes, 16); -+ #endif -+ - ggml_backend_dev_t dev = new ggml_backend_device { - /* .iface = */ ggml_backend_cuda_device_interface, - /* .reg = */ ®, -diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m -index 1b56f858..ee4f2dcb 100644 ---- a/ggml/src/ggml-metal/ggml-metal.m -+++ b/ggml/src/ggml-metal/ggml-metal.m -@@ -5703,6 +5703,7 @@ static enum 
ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen - static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { - props->name = ggml_backend_metal_device_get_name(dev); - props->description = ggml_backend_metal_device_get_description(dev); -+ props->uuid = "0"; - props->type = ggml_backend_metal_device_get_type(dev); - ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total); - props->caps = (struct ggml_backend_dev_caps) { diff --git a/ml/backend.go b/ml/backend.go index 2df6c892..65f16948 100644 --- a/ml/backend.go +++ b/ml/backend.go @@ -124,10 +124,6 @@ type DeviceMemory struct { // may not be persistent across instances of the runner. Name string - // UUID is a unique persistent identifier for the device for matching - // with system management libraries - UUID string - // Weights is the per-layer memory needed for the model weights. Weights []Memory @@ -156,10 +152,6 @@ func (m DeviceMemory) LogValue() slog.Value { attrs = append(attrs, slog.Any("Graph", m.Graph)) } - if len(attrs) > 0 && m.UUID != "" { - attrs = append([]slog.Attr{slog.String("UUID", m.UUID)}, attrs...) - } - return slog.GroupValue(attrs...) } diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index 5a9fe67e..76172ae1 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -136,9 +136,6 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { } requiredMemory.CPU.Name = C.GoString(C.ggml_backend_dev_name(cpuDeviceBufferType.d)) - var props C.struct_ggml_backend_dev_props - C.ggml_backend_dev_get_props(cpuDeviceBufferType.d, &props) - requiredMemory.CPU.UUID = C.GoString(props.uuid) requiredMemory.CPU.Weights = make([]ml.Memory, blocks+1) requiredMemory.CPU.Cache = make([]ml.Memory, blocks+1) @@ -153,9 +150,6 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { }) btDeviceMemory[bt] = &requiredMemory.GPUs[i] requiredMemory.GPUs[i].Name = C.GoString(C.ggml_backend_dev_name(d)) - var props C.struct_ggml_backend_dev_props - C.ggml_backend_dev_get_props(d, &props) - requiredMemory.GPUs[i].UUID = C.GoString(props.uuid) requiredMemory.GPUs[i].Weights = make([]ml.Memory, blocks+1) requiredMemory.GPUs[i].Cache = make([]ml.Memory, blocks+1) } diff --git a/ml/backend/ggml/ggml/include/ggml-backend.h b/ml/backend/ggml/ggml/include/ggml-backend.h index a880df33..74e46716 100644 --- a/ml/backend/ggml/ggml/include/ggml-backend.h +++ b/ml/backend/ggml/ggml/include/ggml-backend.h @@ -152,7 +152,6 @@ extern "C" { struct ggml_backend_dev_props { const char * name; const char * description; - const char * uuid; size_t memory_free; size_t memory_total; enum ggml_backend_dev_type type; diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu index 4c829153..cb0d8528 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2884,7 +2884,6 @@ struct ggml_backend_cuda_device_context { int device; std::string name; std::string description; - std::string uuid; }; static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) { @@ -2897,11 +2896,6 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t return ctx->description.c_str(); } -static const char * ggml_backend_cuda_device_get_uuid(ggml_backend_dev_t dev) { - ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; - return 
ctx->uuid.c_str(); -} - static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; ggml_cuda_set_device(ctx->device); @@ -2916,7 +2910,6 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) { props->name = ggml_backend_cuda_device_get_name(dev); props->description = ggml_backend_cuda_device_get_description(dev); - props->uuid = ggml_backend_cuda_device_get_uuid(dev); props->type = ggml_backend_cuda_device_get_type(dev); ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total); @@ -3465,32 +3458,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { CUDA_CHECK(cudaGetDeviceProperties(&prop, i)); dev_ctx->description = prop.name; - #if !defined(GGML_USE_HIP) - char uuid[64]; - snprintf(uuid, sizeof(uuid), - "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", - (unsigned char)prop.uuid.bytes[0], - (unsigned char)prop.uuid.bytes[1], - (unsigned char)prop.uuid.bytes[2], - (unsigned char)prop.uuid.bytes[3], - (unsigned char)prop.uuid.bytes[4], - (unsigned char)prop.uuid.bytes[5], - (unsigned char)prop.uuid.bytes[6], - (unsigned char)prop.uuid.bytes[7], - (unsigned char)prop.uuid.bytes[8], - (unsigned char)prop.uuid.bytes[9], - (unsigned char)prop.uuid.bytes[10], - (unsigned char)prop.uuid.bytes[11], - (unsigned char)prop.uuid.bytes[12], - (unsigned char)prop.uuid.bytes[13], - (unsigned char)prop.uuid.bytes[14], - (unsigned char)prop.uuid.bytes[15] - ); - dev_ctx->uuid = uuid; - #else - dev_ctx->uuid = "GPU-" + std::string(prop.uuid.bytes, 16); - #endif - ggml_backend_dev_t dev = new ggml_backend_device { /* .iface = */ ggml_backend_cuda_device_interface, /* .reg = */ ®, diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m index ee4f2dcb..1b56f858 100644 --- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m @@ -5703,7 +5703,6 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { props->name = ggml_backend_metal_device_get_name(dev); props->description = ggml_backend_metal_device_get_description(dev); - props->uuid = "0"; props->type = ggml_backend_metal_device_get_type(dev); ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total); props->caps = (struct ggml_backend_dev_caps) { From 6baf1e31e2e5b28c4ce6d145f4524448c9747204 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Wed, 18 Jun 2025 07:30:49 -0700 Subject: [PATCH 101/108] Revert "Revert "ggml: Export GPU UUIDs" (#11115)" (#11117) Reverts PR #11115. 
The original change was mistakingly reverted instead of #10822 --- .../patches/0017-ggml-Export-GPU-UUIDs.patch | 102 ++++++++++++++++++ ml/backend.go | 8 ++ ml/backend/ggml/ggml.go | 6 ++ ml/backend/ggml/ggml/include/ggml-backend.h | 1 + .../ggml/ggml/src/ggml-cuda/ggml-cuda.cu | 33 ++++++ .../ggml/ggml/src/ggml-metal/ggml-metal.m | 1 + 6 files changed, 151 insertions(+) create mode 100644 llama/patches/0017-ggml-Export-GPU-UUIDs.patch diff --git a/llama/patches/0017-ggml-Export-GPU-UUIDs.patch b/llama/patches/0017-ggml-Export-GPU-UUIDs.patch new file mode 100644 index 00000000..a2539034 --- /dev/null +++ b/llama/patches/0017-ggml-Export-GPU-UUIDs.patch @@ -0,0 +1,102 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Jesse Gross +Date: Thu, 24 Apr 2025 14:48:51 -0700 +Subject: [PATCH] ggml: Export GPU UUIDs + +This enables matching up devices and information reported by the backend +with tools (e.g. nvidia-smi) and system management libraries (e.g. nvml). +--- + ggml/include/ggml-backend.h | 1 + + ggml/src/ggml-cuda/ggml-cuda.cu | 33 ++++++++++++++++++++++++++++++++ + ggml/src/ggml-metal/ggml-metal.m | 1 + + 3 files changed, 35 insertions(+) + +diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h +index 74e46716..a880df33 100644 +--- a/ggml/include/ggml-backend.h ++++ b/ggml/include/ggml-backend.h +@@ -152,6 +152,7 @@ extern "C" { + struct ggml_backend_dev_props { + const char * name; + const char * description; ++ const char * uuid; + size_t memory_free; + size_t memory_total; + enum ggml_backend_dev_type type; +diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu +index cb0d8528..4c829153 100644 +--- a/ggml/src/ggml-cuda/ggml-cuda.cu ++++ b/ggml/src/ggml-cuda/ggml-cuda.cu +@@ -2884,6 +2884,7 @@ struct ggml_backend_cuda_device_context { + int device; + std::string name; + std::string description; ++ std::string uuid; + }; + + static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) { +@@ -2896,6 +2897,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t + return ctx->description.c_str(); + } + ++static const char * ggml_backend_cuda_device_get_uuid(ggml_backend_dev_t dev) { ++ ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; ++ return ctx->uuid.c_str(); ++} ++ + static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { + ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; + ggml_cuda_set_device(ctx->device); +@@ -2910,6 +2916,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend + static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) { + props->name = ggml_backend_cuda_device_get_name(dev); + props->description = ggml_backend_cuda_device_get_description(dev); ++ props->uuid = ggml_backend_cuda_device_get_uuid(dev); + props->type = ggml_backend_cuda_device_get_type(dev); + ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total); + +@@ -3458,6 +3465,32 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { + CUDA_CHECK(cudaGetDeviceProperties(&prop, i)); + dev_ctx->description = prop.name; + ++ #if !defined(GGML_USE_HIP) ++ char uuid[64]; ++ snprintf(uuid, sizeof(uuid), ++ "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", ++ (unsigned char)prop.uuid.bytes[0], ++ (unsigned char)prop.uuid.bytes[1], ++ (unsigned 
char)prop.uuid.bytes[2], ++ (unsigned char)prop.uuid.bytes[3], ++ (unsigned char)prop.uuid.bytes[4], ++ (unsigned char)prop.uuid.bytes[5], ++ (unsigned char)prop.uuid.bytes[6], ++ (unsigned char)prop.uuid.bytes[7], ++ (unsigned char)prop.uuid.bytes[8], ++ (unsigned char)prop.uuid.bytes[9], ++ (unsigned char)prop.uuid.bytes[10], ++ (unsigned char)prop.uuid.bytes[11], ++ (unsigned char)prop.uuid.bytes[12], ++ (unsigned char)prop.uuid.bytes[13], ++ (unsigned char)prop.uuid.bytes[14], ++ (unsigned char)prop.uuid.bytes[15] ++ ); ++ dev_ctx->uuid = uuid; ++ #else ++ dev_ctx->uuid = "GPU-" + std::string(prop.uuid.bytes, 16); ++ #endif ++ + ggml_backend_dev_t dev = new ggml_backend_device { + /* .iface = */ ggml_backend_cuda_device_interface, + /* .reg = */ ®, +diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m +index 1b56f858..ee4f2dcb 100644 +--- a/ggml/src/ggml-metal/ggml-metal.m ++++ b/ggml/src/ggml-metal/ggml-metal.m +@@ -5703,6 +5703,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen + static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { + props->name = ggml_backend_metal_device_get_name(dev); + props->description = ggml_backend_metal_device_get_description(dev); ++ props->uuid = "0"; + props->type = ggml_backend_metal_device_get_type(dev); + ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total); + props->caps = (struct ggml_backend_dev_caps) { diff --git a/ml/backend.go b/ml/backend.go index 65f16948..2df6c892 100644 --- a/ml/backend.go +++ b/ml/backend.go @@ -124,6 +124,10 @@ type DeviceMemory struct { // may not be persistent across instances of the runner. Name string + // UUID is a unique persistent identifier for the device for matching + // with system management libraries + UUID string + // Weights is the per-layer memory needed for the model weights. Weights []Memory @@ -152,6 +156,10 @@ func (m DeviceMemory) LogValue() slog.Value { attrs = append(attrs, slog.Any("Graph", m.Graph)) } + if len(attrs) > 0 && m.UUID != "" { + attrs = append([]slog.Attr{slog.String("UUID", m.UUID)}, attrs...) + } + return slog.GroupValue(attrs...) 
} diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index 76172ae1..5a9fe67e 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -136,6 +136,9 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { } requiredMemory.CPU.Name = C.GoString(C.ggml_backend_dev_name(cpuDeviceBufferType.d)) + var props C.struct_ggml_backend_dev_props + C.ggml_backend_dev_get_props(cpuDeviceBufferType.d, &props) + requiredMemory.CPU.UUID = C.GoString(props.uuid) requiredMemory.CPU.Weights = make([]ml.Memory, blocks+1) requiredMemory.CPU.Cache = make([]ml.Memory, blocks+1) @@ -150,6 +153,9 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { }) btDeviceMemory[bt] = &requiredMemory.GPUs[i] requiredMemory.GPUs[i].Name = C.GoString(C.ggml_backend_dev_name(d)) + var props C.struct_ggml_backend_dev_props + C.ggml_backend_dev_get_props(d, &props) + requiredMemory.GPUs[i].UUID = C.GoString(props.uuid) requiredMemory.GPUs[i].Weights = make([]ml.Memory, blocks+1) requiredMemory.GPUs[i].Cache = make([]ml.Memory, blocks+1) } diff --git a/ml/backend/ggml/ggml/include/ggml-backend.h b/ml/backend/ggml/ggml/include/ggml-backend.h index 74e46716..a880df33 100644 --- a/ml/backend/ggml/ggml/include/ggml-backend.h +++ b/ml/backend/ggml/ggml/include/ggml-backend.h @@ -152,6 +152,7 @@ extern "C" { struct ggml_backend_dev_props { const char * name; const char * description; + const char * uuid; size_t memory_free; size_t memory_total; enum ggml_backend_dev_type type; diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu index cb0d8528..4c829153 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2884,6 +2884,7 @@ struct ggml_backend_cuda_device_context { int device; std::string name; std::string description; + std::string uuid; }; static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) { @@ -2896,6 +2897,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t return ctx->description.c_str(); } +static const char * ggml_backend_cuda_device_get_uuid(ggml_backend_dev_t dev) { + ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; + return ctx->uuid.c_str(); +} + static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; ggml_cuda_set_device(ctx->device); @@ -2910,6 +2916,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) { props->name = ggml_backend_cuda_device_get_name(dev); props->description = ggml_backend_cuda_device_get_description(dev); + props->uuid = ggml_backend_cuda_device_get_uuid(dev); props->type = ggml_backend_cuda_device_get_type(dev); ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total); @@ -3458,6 +3465,32 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { CUDA_CHECK(cudaGetDeviceProperties(&prop, i)); dev_ctx->description = prop.name; + #if !defined(GGML_USE_HIP) + char uuid[64]; + snprintf(uuid, sizeof(uuid), + "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", + (unsigned char)prop.uuid.bytes[0], + (unsigned char)prop.uuid.bytes[1], + (unsigned char)prop.uuid.bytes[2], + (unsigned char)prop.uuid.bytes[3], + 
(unsigned char)prop.uuid.bytes[4], + (unsigned char)prop.uuid.bytes[5], + (unsigned char)prop.uuid.bytes[6], + (unsigned char)prop.uuid.bytes[7], + (unsigned char)prop.uuid.bytes[8], + (unsigned char)prop.uuid.bytes[9], + (unsigned char)prop.uuid.bytes[10], + (unsigned char)prop.uuid.bytes[11], + (unsigned char)prop.uuid.bytes[12], + (unsigned char)prop.uuid.bytes[13], + (unsigned char)prop.uuid.bytes[14], + (unsigned char)prop.uuid.bytes[15] + ); + dev_ctx->uuid = uuid; + #else + dev_ctx->uuid = "GPU-" + std::string(prop.uuid.bytes, 16); + #endif + ggml_backend_dev_t dev = new ggml_backend_device { /* .iface = */ ggml_backend_cuda_device_interface, /* .reg = */ ®, diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m index 1b56f858..ee4f2dcb 100644 --- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m @@ -5703,6 +5703,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { props->name = ggml_backend_metal_device_get_name(dev); props->description = ggml_backend_metal_device_get_description(dev); + props->uuid = "0"; props->type = ggml_backend_metal_device_get_type(dev); ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total); props->caps = (struct ggml_backend_dev_caps) { From 8bcb3125c1b416b43aa431b2b3b105d933eca697 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Wed, 18 Jun 2025 12:58:50 -0700 Subject: [PATCH 102/108] benchmark: remove unused benchmark test (#11120) Removes a test under benchmark/ that is unused --- benchmark/server_benchmark_test.go | 178 ----------------------------- docs/benchmark.md | 59 ---------- 2 files changed, 237 deletions(-) delete mode 100644 benchmark/server_benchmark_test.go delete mode 100644 docs/benchmark.md diff --git a/benchmark/server_benchmark_test.go b/benchmark/server_benchmark_test.go deleted file mode 100644 index 4a3c46cd..00000000 --- a/benchmark/server_benchmark_test.go +++ /dev/null @@ -1,178 +0,0 @@ -package benchmark - -import ( - "context" - "flag" - "fmt" - "testing" - "time" - - "github.com/ollama/ollama/api" -) - -// Command line flags -var modelFlag string - -func init() { - flag.StringVar(&modelFlag, "m", "", "Name of the model to benchmark") - flag.Lookup("m").DefValue = "model" -} - -// modelName returns the model name from flags, failing the test if not set -func modelName(b *testing.B) string { - if modelFlag == "" { - b.Fatal("Error: -m flag is required for benchmark tests") - } - return modelFlag -} - -type TestCase struct { - name string - prompt string - maxTokens int -} - -// runGenerateBenchmark contains the common generate and metrics logic -func runGenerateBenchmark(b *testing.B, ctx context.Context, client *api.Client, req *api.GenerateRequest) { - start := time.Now() - var ttft time.Duration - var metrics api.Metrics - - err := client.Generate(ctx, req, func(resp api.GenerateResponse) error { - if ttft == 0 && resp.Response != "" { - ttft = time.Since(start) - } - if resp.Done { - metrics = resp.Metrics - } - return nil - }) - - // Report custom metrics as part of the benchmark results - b.ReportMetric(float64(ttft.Milliseconds()), "ttft_ms") - b.ReportMetric(float64(metrics.LoadDuration.Milliseconds()), "load_ms") - - // Token throughput metrics - promptThroughput := float64(metrics.PromptEvalCount) / 
metrics.PromptEvalDuration.Seconds() - genThroughput := float64(metrics.EvalCount) / metrics.EvalDuration.Seconds() - b.ReportMetric(promptThroughput, "prompt_tok/s") - b.ReportMetric(genThroughput, "gen_tok/s") - - // Token counts - b.ReportMetric(float64(metrics.PromptEvalCount), "prompt_tokens") - b.ReportMetric(float64(metrics.EvalCount), "gen_tokens") - if err != nil { - b.Fatal(err) - } -} - -// BenchmarkColdStart runs benchmarks with model loading from cold state -func BenchmarkColdStart(b *testing.B) { - client := setup(b) - tests := []TestCase{ - {"short_prompt", "Write a long story", 100}, - {"medium_prompt", "Write a detailed economic analysis", 500}, - {"long_prompt", "Write a comprehensive AI research paper", 1000}, - } - m := modelName(b) - - for _, tt := range tests { - b.Run(fmt.Sprintf("%s/cold/%s", m, tt.name), func(b *testing.B) { - ctx := b.Context() - - // Set number of tokens as our throughput metric - b.SetBytes(int64(tt.maxTokens)) - - for b.Loop() { - b.StopTimer() - // Ensure model is unloaded before each iteration - unload(client, m, b) - b.StartTimer() - - req := &api.GenerateRequest{ - Model: m, - Prompt: tt.prompt, - Options: map[string]any{"num_predict": tt.maxTokens, "temperature": 0.1}, - } - - runGenerateBenchmark(b, ctx, client, req) - } - }) - } -} - -// BenchmarkWarmStart runs benchmarks with pre-loaded model -func BenchmarkWarmStart(b *testing.B) { - client := setup(b) - tests := []TestCase{ - {"short_prompt", "Write a long story", 100}, - {"medium_prompt", "Write a detailed economic analysis", 500}, - {"long_prompt", "Write a comprehensive AI research paper", 1000}, - } - m := modelName(b) - - for _, tt := range tests { - b.Run(fmt.Sprintf("%s/warm/%s", m, tt.name), func(b *testing.B) { - ctx := b.Context() - - // Pre-warm the model - warmup(client, m, tt.prompt, b) - - // Set number of tokens as our throughput metric - b.SetBytes(int64(tt.maxTokens)) - - for b.Loop() { - req := &api.GenerateRequest{ - Model: m, - Prompt: tt.prompt, - Options: map[string]any{"num_predict": tt.maxTokens, "temperature": 0.1}, - } - - runGenerateBenchmark(b, ctx, client, req) - } - }) - } -} - -// setup verifies server and model availability -func setup(b *testing.B) *api.Client { - client, err := api.ClientFromEnvironment() - if err != nil { - b.Fatal(err) - } - if _, err := client.Show(b.Context(), &api.ShowRequest{Model: modelName(b)}); err != nil { - b.Fatalf("Model unavailable: %v", err) - } - - return client -} - -// warmup ensures the model is loaded and warmed up -func warmup(client *api.Client, model string, prompt string, b *testing.B) { - for range 3 { - err := client.Generate( - context.Background(), - &api.GenerateRequest{ - Model: model, - Prompt: prompt, - Options: map[string]any{"num_predict": 50, "temperature": 0.1}, - }, - func(api.GenerateResponse) error { return nil }, - ) - if err != nil { - b.Logf("Error during model warm-up: %v", err) - } - } -} - -// unload forces model unloading using KeepAlive: 0 parameter -func unload(client *api.Client, model string, b *testing.B) { - req := &api.GenerateRequest{ - Model: model, - KeepAlive: &api.Duration{Duration: 0}, - } - if err := client.Generate(context.Background(), req, func(api.GenerateResponse) error { return nil }); err != nil { - b.Logf("Unload error: %v", err) - } - time.Sleep(1 * time.Second) -} diff --git a/docs/benchmark.md b/docs/benchmark.md deleted file mode 100644 index a7bed808..00000000 --- a/docs/benchmark.md +++ /dev/null @@ -1,59 +0,0 @@ -# Benchmark - -Go benchmark tests that measure 
end-to-end performance of a running Ollama server. Run these tests to evaluate model inference performance on your hardware and measure the impact of code changes. - -## When to use - -Run these benchmarks when: -- Making changes to the model inference engine -- Modifying model loading/unloading logic -- Changing prompt processing or token generation code -- Implementing a new model architecture -- Testing performance across different hardware setups - -## Prerequisites -- Ollama server running locally with `ollama serve` on `127.0.0.1:11434` -## Usage and Examples - ->[!NOTE] ->All commands must be run from the root directory of the Ollama project. - -Basic syntax: -```bash -go test -bench=. ./benchmark/... -m $MODEL_NAME -``` - -Required flags: -- `-bench=.`: Run all benchmarks -- `-m`: Model name to benchmark - -Optional flags: -- `-count N`: Number of times to run the benchmark (useful for statistical analysis) -- `-timeout T`: Maximum time for the benchmark to run (e.g. "10m" for 10 minutes) - -Common usage patterns: - -Single benchmark run with a model specified: -```bash -go test -bench=. ./benchmark/... -m llama3.3 -``` - -## Output metrics - -The benchmark reports several key metrics: - -- `gen_tok/s`: Generated tokens per second -- `prompt_tok/s`: Prompt processing tokens per second -- `ttft_ms`: Time to first token in milliseconds -- `load_ms`: Model load time in milliseconds -- `gen_tokens`: Total tokens generated -- `prompt_tokens`: Total prompt tokens processed - -Each benchmark runs two scenarios: -- Cold start: Model is loaded from disk for each test -- Warm start: Model is pre-loaded in memory - -Three prompt lengths are tested for each scenario: -- Short prompt (100 tokens) -- Medium prompt (500 tokens) -- Long prompt (1000 tokens) From f2527b08fba57d606e12cb21b583249c11724d7a Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 19 Jun 2025 12:10:19 -0700 Subject: [PATCH 103/108] int: add coverage for older models (#11137) Verified these fail on 0.9.1 and pass on HEAD. --- integration/model_arch_test.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/integration/model_arch_test.go b/integration/model_arch_test.go index 6ce183d7..628b0231 100644 --- a/integration/model_arch_test.go +++ b/integration/model_arch_test.go @@ -45,6 +45,8 @@ var ( "qwen2.5-coder:latest", "qwen:latest", "solar-pro:latest", + "codellama:latest", + "nous-hermes:latest", } ) From 87b7af6ceef2b4d96374dbff5070b41b17d3f138 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Thu, 19 Jun 2025 14:39:20 -0700 Subject: [PATCH 104/108] ggml: Check return status for computation. We don't check the return status after computing the graph, which can silently lead to bad outputs if we try to keep going and future computation succeeds. This appears to happen in certain cases on Apple M2 devices.
Fixes #11070 --- ml/backend/ggml/ggml.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index 5a9fe67e..8aadad86 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -602,7 +602,9 @@ func (c *Context) Forward(tensors ...ml.Tensor) ml.Context { } func (c *Context) Compute(tensors ...ml.Tensor) { - C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph) + if status := C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph); status != C.GGML_STATUS_SUCCESS { + panic(fmt.Errorf("error computing ggml graph: %v", status)) + } C.ggml_backend_sched_reset(c.b.sched) needSync := true From 0a066cfd91abdddc6ee172776974a6720a3072d3 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Fri, 20 Jun 2025 11:11:40 -0700 Subject: [PATCH 105/108] Reapply "feat: incremental gguf parser (#10822)" (#11114) (#11119) * Reapply "feat: incremental gguf parser (#10822)" (#11114) This reverts commit a6e64fbdf28f0d6cb97cc7f022ca493b905fe895. * fix older ggufs --- fs/gguf/gguf.go | 347 ++++++++++++++++++++++++++++++++++++ fs/gguf/gguf_test.go | 249 ++++++++++++++++++++++++++ fs/gguf/keyvalue.go | 90 ++++++++++ fs/gguf/keyvalue_test.go | 208 +++++++++++++++++++++ fs/gguf/lazy.go | 89 +++++++++ fs/gguf/reader.go | 23 +++ fs/gguf/tensor.go | 288 ++++++++++++++++++++++++++++++ go.mod | 2 +- go.sum | 4 +- server/images.go | 24 ++- server/images_test.go | 165 +++++------------ server/quantization_test.go | 12 +- server/sched_test.go | 20 +-- 13 files changed, 1357 insertions(+), 164 deletions(-) create mode 100644 fs/gguf/gguf.go create mode 100644 fs/gguf/gguf_test.go create mode 100644 fs/gguf/keyvalue.go create mode 100644 fs/gguf/keyvalue_test.go create mode 100644 fs/gguf/lazy.go create mode 100644 fs/gguf/reader.go create mode 100644 fs/gguf/tensor.go diff --git a/fs/gguf/gguf.go b/fs/gguf/gguf.go new file mode 100644 index 00000000..bbb9bb41 --- /dev/null +++ b/fs/gguf/gguf.go @@ -0,0 +1,347 @@ +package gguf + +import ( + "bytes" + "cmp" + "encoding/binary" + "errors" + "fmt" + "io" + "iter" + "os" + "slices" + "strings" +) + +const ( + typeUint8 uint32 = iota + typeInt8 + typeUint16 + typeInt16 + typeUint32 + typeInt32 + typeFloat32 + typeBool + typeString + typeArray + typeUint64 + typeInt64 + typeFloat64 +) + +var ErrUnsupported = errors.New("unsupported") + +type File struct { + Magic [4]byte + Version uint32 + + keyValues *lazy[KeyValue] + tensors *lazy[TensorInfo] + offset int64 + + file *os.File + reader *bufferedReader + bts []byte +} + +func Open(path string) (f *File, err error) { + f = &File{bts: make([]byte, 4096)} + f.file, err = os.Open(path) + if err != nil { + return nil, err + } + + f.reader = newBufferedReader(f.file, 32<<10) + + if err := binary.Read(f.reader, binary.LittleEndian, &f.Magic); err != nil { + return nil, err + } + + if bytes.Equal(f.Magic[:], []byte("gguf")) { + return nil, fmt.Errorf("%w file type %v", ErrUnsupported, f.Magic) + } + + if err := binary.Read(f.reader, binary.LittleEndian, &f.Version); err != nil { + return nil, err + } + + if f.Version < 2 { + return nil, fmt.Errorf("%w version %v", ErrUnsupported, f.Version) + } + + f.tensors, err = newLazy(f, f.readTensor) + if err != nil { + return nil, err + } + + f.tensors.successFunc = func() error { + offset := f.reader.offset + + alignment := cmp.Or(f.KeyValue("general.alignment").Int(), 32) + f.offset = offset + (alignment-offset%alignment)%alignment + return nil + } + + f.keyValues, err = newLazy(f, f.readKeyValue) + if err != nil { 
+ return nil, err + } + + return f, nil +} + +func (f *File) readTensor() (TensorInfo, error) { + name, err := readString(f) + if err != nil { + return TensorInfo{}, err + } + + dims, err := read[uint32](f) + if err != nil { + return TensorInfo{}, err + } + + shape := make([]uint64, dims) + for i := range dims { + shape[i], err = read[uint64](f) + if err != nil { + return TensorInfo{}, err + } + } + + type_, err := read[uint32](f) + if err != nil { + return TensorInfo{}, err + } + + offset, err := read[uint64](f) + if err != nil { + return TensorInfo{}, err + } + + return TensorInfo{ + Name: name, + Offset: offset, + Shape: shape, + Type: TensorType(type_), + }, nil +} + +func (f *File) readKeyValue() (KeyValue, error) { + key, err := readString(f) + if err != nil { + return KeyValue{}, err + } + + t, err := read[uint32](f) + if err != nil { + return KeyValue{}, err + } + + value, err := func() (any, error) { + switch t { + case typeUint8: + return read[uint8](f) + case typeInt8: + return read[int8](f) + case typeUint16: + return read[uint16](f) + case typeInt16: + return read[int16](f) + case typeUint32: + return read[uint32](f) + case typeInt32: + return read[int32](f) + case typeUint64: + return read[uint64](f) + case typeInt64: + return read[int64](f) + case typeFloat32: + return read[float32](f) + case typeFloat64: + return read[float64](f) + case typeBool: + return read[bool](f) + case typeString: + return readString(f) + case typeArray: + return readArray(f) + default: + return nil, fmt.Errorf("%w type %d", ErrUnsupported, t) + } + }() + if err != nil { + return KeyValue{}, err + } + + return KeyValue{ + Key: key, + Value: Value{value}, + }, nil +} + +func read[T any](f *File) (t T, err error) { + err = binary.Read(f.reader, binary.LittleEndian, &t) + return t, err +} + +func readString(f *File) (string, error) { + n, err := read[uint64](f) + if err != nil { + return "", err + } + + if int(n) > len(f.bts) { + f.bts = make([]byte, n) + } + + bts := f.bts[:n] + if _, err := io.ReadFull(f.reader, bts); err != nil { + return "", err + } + defer clear(bts) + + return string(bts), nil +} + +func readArray(f *File) (any, error) { + t, err := read[uint32](f) + if err != nil { + return nil, err + } + + n, err := read[uint64](f) + if err != nil { + return nil, err + } + + switch t { + case typeUint8: + return readArrayData[uint8](f, n) + case typeInt8: + return readArrayData[int8](f, n) + case typeUint16: + return readArrayData[uint16](f, n) + case typeInt16: + return readArrayData[int16](f, n) + case typeUint32: + return readArrayData[uint32](f, n) + case typeInt32: + return readArrayData[int32](f, n) + case typeUint64: + return readArrayData[uint64](f, n) + case typeInt64: + return readArrayData[int64](f, n) + case typeFloat32: + return readArrayData[float32](f, n) + case typeFloat64: + return readArrayData[float64](f, n) + case typeBool: + return readArrayData[bool](f, n) + case typeString: + return readArrayString(f, n) + default: + return nil, fmt.Errorf("%w type %d", ErrUnsupported, t) + } +} + +func readArrayData[T any](f *File, n uint64) (s []T, err error) { + s = make([]T, n) + for i := range n { + e, err := read[T](f) + if err != nil { + return nil, err + } + + s[i] = e + } + + return s, nil +} + +func readArrayString(f *File, n uint64) (s []string, err error) { + s = make([]string, n) + for i := range n { + e, err := readString(f) + if err != nil { + return nil, err + } + + s[i] = e + } + + return s, nil +} + +func (f *File) Close() error { + f.keyValues.stop() + f.tensors.stop() + 
return f.file.Close() +} + +func (f *File) KeyValue(key string) KeyValue { + if !strings.HasPrefix(key, "general.") && !strings.HasPrefix(key, "tokenizer.") { + key = f.KeyValue("general.architecture").String() + "." + key + } + + if index := slices.IndexFunc(f.keyValues.values, func(kv KeyValue) bool { + return kv.Key == key + }); index >= 0 { + return f.keyValues.values[index] + } + + for keyValue, ok := f.keyValues.next(); ok; keyValue, ok = f.keyValues.next() { + if keyValue.Key == key { + return keyValue + } + } + + return KeyValue{} +} + +func (f *File) NumKeyValues() int { + return int(f.keyValues.count) +} + +func (f *File) KeyValues() iter.Seq2[int, KeyValue] { + return f.keyValues.All() +} + +func (f *File) TensorInfo(name string) TensorInfo { + if index := slices.IndexFunc(f.tensors.values, func(t TensorInfo) bool { + return t.Name == name + }); index >= 0 { + return f.tensors.values[index] + } + + // fast-forward through key values if we haven't already + _ = f.keyValues.rest() + for tensor, ok := f.tensors.next(); ok; tensor, ok = f.tensors.next() { + if tensor.Name == name { + return tensor + } + } + + return TensorInfo{} +} + +func (f *File) NumTensors() int { + return int(f.tensors.count) +} + +func (f *File) TensorInfos() iter.Seq2[int, TensorInfo] { + // fast forward through key values if we haven't already + f.keyValues.rest() + return f.tensors.All() +} + +func (f *File) TensorReader(name string) (TensorInfo, io.Reader, error) { + t := f.TensorInfo(name) + if t.NumBytes() == 0 { + return TensorInfo{}, nil, fmt.Errorf("tensor %s not found", name) + } + + // fast forward through tensor info if we haven't already + _ = f.tensors.rest() + return t, io.NewSectionReader(f.file, f.offset+int64(t.Offset), t.NumBytes()), nil +} diff --git a/fs/gguf/gguf_test.go b/fs/gguf/gguf_test.go new file mode 100644 index 00000000..eea28a48 --- /dev/null +++ b/fs/gguf/gguf_test.go @@ -0,0 +1,249 @@ +package gguf_test + +import ( + "bytes" + "os" + "strconv" + "strings" + "testing" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + "github.com/ollama/ollama/fs/ggml" + "github.com/ollama/ollama/fs/gguf" +) + +func createBinFile(tb testing.TB) string { + tb.Helper() + f, err := os.CreateTemp(tb.TempDir(), "") + if err != nil { + tb.Fatal(err) + } + defer f.Close() + + kv := ggml.KV{ + "general.architecture": "llama", + "llama.block_count": uint32(8), + "llama.embedding_length": uint32(3), + "llama.attention.head_count": uint32(2), + "llama.attention.head_count_kv": uint32(2), + "llama.attention.key_length": uint32(3), + "llama.rope.dimension_count": uint32(4), + "llama.rope.freq_base": float32(10000.0), + "llama.rope.freq_scale": float32(1.0), + "llama.attention.layer_norm_rms_epsilon": float32(1e-6), + "tokenizer.ggml.eos_token_id": uint32(0), + "tokenizer.ggml.eos_token_ids": []int32{1, 2, 3}, + "tokenizer.ggml.tokens": []string{"hello", "world"}, + "tokenizer.ggml.scores": []float32{0, 1}, + } + + tensors := []*ggml.Tensor{ + { + Name: "token_embd.weight", + Kind: 0, + Shape: []uint64{2, 3}, + WriterTo: bytes.NewBuffer(make([]byte, 4*2*3)), + }, + { + Name: "output.weight", + Kind: 0, + Shape: []uint64{3, 2}, + WriterTo: bytes.NewBuffer(make([]byte, 4*3*2)), + }, + } + + for i := range 8 { + tensors = append(tensors, &ggml.Tensor{ + Name: "blk." + strconv.Itoa(i) + ".attn_q.weight", + Kind: 0, + Shape: []uint64{3, 3}, + WriterTo: bytes.NewBuffer(make([]byte, 4*3*3)), + }, &ggml.Tensor{ + Name: "blk." 
+ strconv.Itoa(i) + ".attn_k.weight", + Kind: 0, + Shape: []uint64{3, 3}, + WriterTo: bytes.NewBuffer(make([]byte, 4*3*3)), + }, &ggml.Tensor{ + Name: "blk." + strconv.Itoa(i) + ".attn_v.weight", + Kind: 0, + Shape: []uint64{3, 3}, + WriterTo: bytes.NewBuffer(make([]byte, 4*3*3)), + }, &ggml.Tensor{ + Name: "blk." + strconv.Itoa(i) + ".attn_output.weight", + Kind: 0, + Shape: []uint64{3, 3}, + WriterTo: bytes.NewBuffer(make([]byte, 4*3*3)), + }) + } + + if err := ggml.WriteGGUF(f, kv, tensors); err != nil { + tb.Fatal(err) + } + + return f.Name() +} + +func TestRead(t *testing.T) { + f, err := gguf.Open(createBinFile(t)) + if err != nil { + t.Fatal(err) + } + defer f.Close() + + if got := f.KeyValue("does.not.exist").Valid(); got { + t.Errorf(`KeyValue("does.not.exist").Exists() = %v, want false`, got) + } + + if got := f.KeyValue("general.architecture").String(); got != "llama" { + t.Errorf(`KeyValue("general.architecture").String() = %q, want %q`, got, "llama") + } + + if got := f.TensorInfo("token_embd.weight"); got.Name != "token_embd.weight" { + t.Errorf(`TensorInfo("token_embd.weight").Name = %q, want %q`, got.Name, "token_embd.weight") + } else if diff := cmp.Diff(got.Shape, []uint64{2, 3}); diff != "" { + t.Errorf(`TensorInfo("token_embd.weight").Shape mismatch (-got +want):\n%s`, diff) + } else if got.Type != gguf.TensorTypeF32 { + t.Errorf(`TensorInfo("token_embd.weight").Type = %d, want %d`, got.Type, gguf.TensorTypeF32) + } + + if got := f.KeyValue("block_count").Uint(); got != 8 { + t.Errorf(`KeyValue("block_count").Uint() = %d, want %d`, got, 8) + } + + if diff := cmp.Diff(f.KeyValue("tokenizer.ggml.tokens").Strings(), []string{"hello", "world"}); diff != "" { + t.Errorf("KeyValue(\"tokenizer.ggml.tokens\").Strings() mismatch (-got +want):\n%s", diff) + } + + if diff := cmp.Diff(f.KeyValue("tokenizer.ggml.scores").Floats(), []float64{0, 1}); diff != "" { + t.Errorf("KeyValue(\"tokenizer.ggml.scores\").Ints() mismatch (-got +want):\n%s", diff) + } + + var kvs []string + for _, kv := range f.KeyValues() { + if !kv.Valid() { + t.Error("found invalid key-value pair:", kv) + } + + kvs = append(kvs, kv.Key) + } + + if len(kvs) != f.NumKeyValues() { + t.Errorf("iterated key count = %d, want %d", len(kvs), f.NumKeyValues()) + } + + if diff := cmp.Diff(kvs, []string{ + "general.architecture", + "llama.block_count", + "llama.embedding_length", + "llama.attention.head_count", + "llama.attention.head_count_kv", + "llama.attention.key_length", + "llama.rope.dimension_count", + "llama.rope.freq_base", + "llama.rope.freq_scale", + "llama.attention.layer_norm_rms_epsilon", + "tokenizer.ggml.eos_token_id", + "tokenizer.ggml.eos_token_ids", + "tokenizer.ggml.tokens", + "tokenizer.ggml.scores", + }, cmpopts.SortSlices(strings.Compare)); diff != "" { + t.Errorf("KeyValues() mismatch (-got +want):\n%s", diff) + } + + var tis []string + for _, ti := range f.TensorInfos() { + if !ti.Valid() { + t.Error("found invalid tensor info:", ti) + } + + tis = append(tis, ti.Name) + } + + if len(tis) != f.NumTensors() { + t.Errorf("iterated tensor count = %d, want %d", len(tis), f.NumTensors()) + } + + if diff := cmp.Diff(tis, []string{ + "token_embd.weight", + "output.weight", + "blk.0.attn_q.weight", + "blk.0.attn_k.weight", + "blk.0.attn_v.weight", + "blk.0.attn_output.weight", + "blk.1.attn_q.weight", + "blk.1.attn_k.weight", + "blk.1.attn_v.weight", + "blk.1.attn_output.weight", + "blk.2.attn_q.weight", + "blk.2.attn_k.weight", + "blk.2.attn_v.weight", + "blk.2.attn_output.weight", + 
"blk.3.attn_q.weight", + "blk.3.attn_k.weight", + "blk.3.attn_v.weight", + "blk.3.attn_output.weight", + "blk.4.attn_q.weight", + "blk.4.attn_k.weight", + "blk.4.attn_v.weight", + "blk.4.attn_output.weight", + "blk.5.attn_q.weight", + "blk.5.attn_k.weight", + "blk.5.attn_v.weight", + "blk.5.attn_output.weight", + "blk.6.attn_q.weight", + "blk.6.attn_k.weight", + "blk.6.attn_v.weight", + "blk.6.attn_output.weight", + "blk.7.attn_q.weight", + "blk.7.attn_k.weight", + "blk.7.attn_v.weight", + "blk.7.attn_output.weight", + }, cmpopts.SortSlices(strings.Compare)); diff != "" { + t.Errorf("TensorInfos() mismatch (-got +want):\n%s", diff) + } + + ti, r, err := f.TensorReader("output.weight") + if err != nil { + t.Fatalf(`TensorReader("output.weight") error: %v`, err) + } + + if ti.Name != "output.weight" { + t.Errorf(`TensorReader("output.weight").Name = %q, want %q`, ti.Name, "output.weight") + } else if diff := cmp.Diff(ti.Shape, []uint64{3, 2}); diff != "" { + t.Errorf(`TensorReader("output.weight").Shape mismatch (-got +want):\n%s`, diff) + } else if ti.Type != gguf.TensorTypeF32 { + t.Errorf(`TensorReader("output.weight").Type = %d, want %d`, ti.Type, gguf.TensorTypeF32) + } + + var b bytes.Buffer + if _, err := b.ReadFrom(r); err != nil { + t.Fatalf(`ReadFrom TensorReader("output.weight") error: %v`, err) + } + + if b.Len() != int(ti.NumBytes()) { + t.Errorf(`ReadFrom TensorReader("output.weight") length = %d, want %d`, b.Len(), ti.NumBytes()) + } +} + +func BenchmarkRead(b *testing.B) { + b.ReportAllocs() + + p := createBinFile(b) + for b.Loop() { + f, err := gguf.Open(p) + if err != nil { + b.Fatal(err) + } + + if got := f.KeyValue("general.architecture").String(); got != "llama" { + b.Errorf("got = %q, want %q", got, "llama") + } + + // Iterate through some tensors + for range f.TensorInfos() { + } + + f.Close() + } +} diff --git a/fs/gguf/keyvalue.go b/fs/gguf/keyvalue.go new file mode 100644 index 00000000..5843326c --- /dev/null +++ b/fs/gguf/keyvalue.go @@ -0,0 +1,90 @@ +package gguf + +import ( + "reflect" + "slices" +) + +type KeyValue struct { + Key string + Value +} + +func (kv KeyValue) Valid() bool { + return kv.Key != "" && kv.Value.value != nil +} + +type Value struct { + value any +} + +func value[T any](v Value, kinds ...reflect.Kind) (t T) { + vv := reflect.ValueOf(v.value) + if slices.Contains(kinds, vv.Kind()) { + t = vv.Convert(reflect.TypeOf(t)).Interface().(T) + } + return +} + +func values[T any](v Value, kinds ...reflect.Kind) (ts []T) { + switch vv := reflect.ValueOf(v.value); vv.Kind() { + case reflect.Slice: + if slices.Contains(kinds, vv.Type().Elem().Kind()) { + ts = make([]T, vv.Len()) + for i := range vv.Len() { + ts[i] = vv.Index(i).Convert(reflect.TypeOf(ts[i])).Interface().(T) + } + } + } + return +} + +// Int returns Value as a signed integer. If it is not a signed integer, it returns 0. +func (v Value) Int() int64 { + return value[int64](v, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64) +} + +// Ints returns Value as a signed integer slice. If it is not a signed integer slice, it returns nil. +func (v Value) Ints() (i64s []int64) { + return values[int64](v, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64) +} + +// Uint converts an unsigned integer value to uint64. If the value is not a unsigned integer, it returns 0. +func (v Value) Uint() uint64 { + return value[uint64](v, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64) +} + +// Uints returns Value as a unsigned integer slice. 
If it is not a unsigned integer slice, it returns nil. +func (v Value) Uints() (u64s []uint64) { + return values[uint64](v, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64) +} + +// Float returns Value as a float. If it is not a float, it returns 0. +func (v Value) Float() float64 { + return value[float64](v, reflect.Float32, reflect.Float64) +} + +// Floats returns Value as a float slice. If it is not a float slice, it returns nil. +func (v Value) Floats() (f64s []float64) { + return values[float64](v, reflect.Float32, reflect.Float64) +} + +// Bool returns Value as a boolean. If it is not a boolean, it returns false. +func (v Value) Bool() bool { + return value[bool](v, reflect.Bool) +} + +// Bools returns Value as a boolean slice. If it is not a boolean slice, it returns nil. +func (v Value) Bools() (bools []bool) { + return values[bool](v, reflect.Bool) +} + +// String returns Value as a string. If it is not a string, it returns an empty string. +func (v Value) String() string { + return value[string](v, reflect.String) +} + +// Strings returns Value as a string slice. If it is not a string slice, it returns nil. +func (v Value) Strings() (strings []string) { + return values[string](v, reflect.String) +} diff --git a/fs/gguf/keyvalue_test.go b/fs/gguf/keyvalue_test.go new file mode 100644 index 00000000..2caacc53 --- /dev/null +++ b/fs/gguf/keyvalue_test.go @@ -0,0 +1,208 @@ +package gguf + +import ( + "testing" + + "github.com/google/go-cmp/cmp" +) + +func split(name string, values map[string][]any) (matched []any, unmatched []any) { + for key, value := range values { + if key == name { + matched = value + } else { + unmatched = append(unmatched, value...) + } + } + return +} + +func TestValue(t *testing.T) { + values := map[string][]any{ + "int64": {int(42), int8(42), int16(42), int32(42), int64(42)}, + "uint64": {uint(42), uint8(42), uint16(42), uint32(42), uint64(42)}, + "float64": {float32(42), float64(42)}, + "string": {"42", "hello"}, + "bool": {true, false}, + } + + t.Run("int64", func(t *testing.T) { + matched, unmatched := split("int64", values) + for _, v := range matched { + kv := KeyValue{"key", Value{v}} + if i64 := kv.Int(); i64 != 42 { + t.Errorf("expected 42, got %d", i64) + } + } + + for _, v := range unmatched { + kv := KeyValue{"key", Value{v}} + if i64 := kv.Int(); i64 != 0 { + t.Errorf("expected 42, got %d", i64) + } + } + }) + + t.Run("uint64", func(t *testing.T) { + matched, unmatched := split("uint64", values) + for _, v := range matched { + kv := KeyValue{"key", Value{v}} + if u64 := kv.Uint(); u64 != 42 { + t.Errorf("expected 42, got %d", u64) + } + } + + for _, v := range unmatched { + kv := KeyValue{"key", Value{v}} + if u64 := kv.Uint(); u64 != 0 { + t.Errorf("expected 42, got %d", u64) + } + } + }) + + t.Run("float64", func(t *testing.T) { + matched, unmatched := split("float64", values) + for _, v := range matched { + kv := KeyValue{"key", Value{v}} + if f64 := kv.Float(); f64 != 42 { + t.Errorf("expected 42, got %f", f64) + } + } + + for _, v := range unmatched { + kv := KeyValue{"key", Value{v}} + if f64 := kv.Float(); f64 != 0 { + t.Errorf("expected 42, got %f", f64) + } + } + }) + + t.Run("string", func(t *testing.T) { + matched, unmatched := split("string", values) + for _, v := range matched { + kv := KeyValue{"key", Value{v}} + if s := kv.String(); s != v { + t.Errorf("expected 42, got %s", s) + } + } + + for _, v := range unmatched { + kv := KeyValue{"key", Value{v}} + if s := kv.String(); s != "" { + 
t.Errorf("expected 42, got %s", s) + } + } + }) + + t.Run("bool", func(t *testing.T) { + matched, unmatched := split("bool", values) + for _, v := range matched { + kv := KeyValue{"key", Value{v}} + if b := kv.Bool(); b != v { + t.Errorf("expected true, got %v", b) + } + } + + for _, v := range unmatched { + kv := KeyValue{"key", Value{v}} + if b := kv.Bool(); b != false { + t.Errorf("expected false, got %v", b) + } + } + }) +} + +func TestValues(t *testing.T) { + values := map[string][]any{ + "int64s": {[]int{42}, []int8{42}, []int16{42}, []int32{42}, []int64{42}}, + "uint64s": {[]uint{42}, []uint8{42}, []uint16{42}, []uint32{42}, []uint64{42}}, + "float64s": {[]float32{42}, []float64{42}}, + "strings": {[]string{"42"}, []string{"hello"}}, + "bools": {[]bool{true}, []bool{false}}, + } + + t.Run("int64s", func(t *testing.T) { + matched, unmatched := split("int64s", values) + for _, v := range matched { + kv := KeyValue{"key", Value{v}} + if diff := cmp.Diff(kv.Ints(), []int64{42}); diff != "" { + t.Errorf("diff: %s", diff) + } + } + + for _, v := range unmatched { + kv := KeyValue{"key", Value{v}} + if i64s := kv.Ints(); i64s != nil { + t.Errorf("expected nil, got %v", i64s) + } + } + }) + + t.Run("uint64s", func(t *testing.T) { + matched, unmatched := split("uint64s", values) + for _, v := range matched { + kv := KeyValue{"key", Value{v}} + if diff := cmp.Diff(kv.Uints(), []uint64{42}); diff != "" { + t.Errorf("diff: %s", diff) + } + } + + for _, v := range unmatched { + kv := KeyValue{"key", Value{v}} + if u64s := kv.Uints(); u64s != nil { + t.Errorf("expected nil, got %v", u64s) + } + } + }) + + t.Run("float64s", func(t *testing.T) { + matched, unmatched := split("float64s", values) + for _, v := range matched { + kv := KeyValue{"key", Value{v}} + if diff := cmp.Diff(kv.Floats(), []float64{42}); diff != "" { + t.Errorf("diff: %s", diff) + } + } + + for _, v := range unmatched { + kv := KeyValue{"key", Value{v}} + if f64s := kv.Floats(); f64s != nil { + t.Errorf("expected nil, got %v", f64s) + } + } + }) + + t.Run("strings", func(t *testing.T) { + matched, unmatched := split("strings", values) + for _, v := range matched { + kv := KeyValue{"key", Value{v}} + if diff := cmp.Diff(kv.Strings(), v); diff != "" { + t.Errorf("diff: %s", diff) + } + } + + for _, v := range unmatched { + kv := KeyValue{"key", Value{v}} + if s := kv.Strings(); s != nil { + t.Errorf("expected nil, got %v", s) + } + } + }) + + t.Run("bools", func(t *testing.T) { + matched, unmatched := split("bools", values) + for _, v := range matched { + kv := KeyValue{"key", Value{v}} + if diff := cmp.Diff(kv.Bools(), v); diff != "" { + t.Errorf("diff: %s", diff) + } + } + + for _, v := range unmatched { + kv := KeyValue{"key", Value{v}} + if b := kv.Bools(); b != nil { + t.Errorf("expected nil, got %v", b) + } + } + }) +} diff --git a/fs/gguf/lazy.go b/fs/gguf/lazy.go new file mode 100644 index 00000000..16ab9909 --- /dev/null +++ b/fs/gguf/lazy.go @@ -0,0 +1,89 @@ +package gguf + +import ( + "encoding/binary" + "iter" + "log/slog" +) + +type lazy[T any] struct { + count uint64 + next func() (T, bool) + stop func() + values []T + + // successFunc is called when all values have been successfully read. 
+ successFunc func() error +} + +func newLazy[T any](f *File, fn func() (T, error)) (*lazy[T], error) { + it := lazy[T]{} + if err := binary.Read(f.reader, binary.LittleEndian, &it.count); err != nil { + return nil, err + } + + it.values = make([]T, 0) + it.next, it.stop = iter.Pull(func(yield func(T) bool) { + for i := range it.count { + t, err := fn() + if err != nil { + slog.Error("error reading tensor", "index", i, "error", err) + return + } + + it.values = append(it.values, t) + if !yield(t) { + break + } + } + + if it.successFunc != nil { + it.successFunc() + } + }) + + return &it, nil +} + +func (g *lazy[T]) Values() iter.Seq[T] { + return func(yield func(T) bool) { + for _, v := range g.All() { + if !yield(v) { + break + } + } + } +} + +func (g *lazy[T]) All() iter.Seq2[int, T] { + return func(yield func(int, T) bool) { + for i := range int(g.count) { + if i < len(g.values) { + if !yield(i, g.values[i]) { + break + } + } else { + t, ok := g.next() + if !ok { + break + } + + if !yield(i, t) { + break + } + } + } + } +} + +func (g *lazy[T]) rest() (collected bool) { + for { + _, ok := g.next() + collected = collected || ok + if !ok { + break + } + } + + return collected +} diff --git a/fs/gguf/reader.go b/fs/gguf/reader.go new file mode 100644 index 00000000..0bd76184 --- /dev/null +++ b/fs/gguf/reader.go @@ -0,0 +1,23 @@ +package gguf + +import ( + "bufio" + "io" +) + +type bufferedReader struct { + offset int64 + *bufio.Reader +} + +func newBufferedReader(rs io.ReadSeeker, size int) *bufferedReader { + return &bufferedReader{ + Reader: bufio.NewReaderSize(rs, size), + } +} + +func (rs *bufferedReader) Read(p []byte) (n int, err error) { + n, err = rs.Reader.Read(p) + rs.offset += int64(n) + return n, err +} diff --git a/fs/gguf/tensor.go b/fs/gguf/tensor.go new file mode 100644 index 00000000..194c1d73 --- /dev/null +++ b/fs/gguf/tensor.go @@ -0,0 +1,288 @@ +package gguf + +import ( + "log/slog" + "strings" +) + +type TensorInfo struct { + Name string + Offset uint64 + Shape []uint64 + Type TensorType +} + +func (ti TensorInfo) Valid() bool { + return ti.Name != "" && ti.NumBytes() > 0 +} + +func (ti TensorInfo) NumValues() int64 { + var numItems int64 = 1 + for _, dim := range ti.Shape { + numItems *= int64(dim) + } + return numItems +} + +// NumBytes returns the number of bytes in the tensor. 
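+// The result is NumValues multiplied by the per-value size of the tensor type,
+// so block-quantized types are sized by their block layout (type size divided
+// by block size).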
+func (ti TensorInfo) NumBytes() int64 { + return int64(float64(ti.NumValues()) * ti.Type.NumBytes()) +} + +func (ti TensorInfo) LogValue() slog.Value { + return slog.GroupValue( + slog.String("name", ti.Name), + slog.Int64("offset", int64(ti.Offset)), + slog.Any("shape", ti.Shape), + slog.Int64("num_values", ti.NumValues()), + slog.Int64("num_bytes", ti.NumBytes()), + slog.Any("type", ti.Type), + ) +} + +type TensorType uint32 + +const ( + TensorTypeF32 TensorType = iota + TensorTypeF16 + TensorTypeQ4_0 + TensorTypeQ4_1 + + // unexported // unused in gguf + tensorTypeQ4_2 + tensorTypeQ4_3 + + TensorTypeQ5_0 + TensorTypeQ5_1 + TensorTypeQ8_0 + TensorTypeQ8_1 + TensorTypeQ2_K + TensorTypeQ3_K + TensorTypeQ4_K + TensorTypeQ5_K + TensorTypeQ6_K + TensorTypeQ8_K + + // unexported // unquantizable by ollama + tensorTypeIQ2_XXS + tensorTypeIQ2_XS + tensorTypeIQ3_XXS + tensorTypeIQ1_S + tensorTypeIQ4_NL + tensorTypeIQ3_S + tensorTypeIQ2_S + tensorTypeIQ4_XS + + TensorTypeI8 + TensorTypeI16 + TensorTypeI32 + TensorTypeI64 + TensorTypeF64 + + // unexported // unquantizable by ollama + tensorTypeIQ1_M + + TensorTypeBF16 + + // unexported // unused in gguf + tensorTypeQ4_0_4_4 + tensorTypeQ4_0_4_8 + tensorTypeQ4_0_8_8 + + // unexported // unquantizable by ollama + tensorTypeTQ1_0 + tensorTypeTQ2_0 + + // unexported // unused in gguf + tensorTypeIQ4_NL_4_4 + tensorTypeIQ4_NL_4_8 + tensorTypeIQ4_NL_8_8 +) + +func (tt TensorType) NumBytes() float64 { + return float64(tt.typeSize()) / float64(tt.blockSize()) +} + +func (tt TensorType) typeSize() int64 { + switch tt { + case TensorTypeF32: + return 4 + case TensorTypeF16: + return 2 + case TensorTypeQ4_0: + return 2 + tt.blockSize()/2 + case TensorTypeQ4_1: + return 2 + 2 + tt.blockSize()/2 + case TensorTypeQ5_0: + return 2 + 4 + tt.blockSize()/2 + case TensorTypeQ5_1: + return 2 + 2 + 4 + tt.blockSize()/2 + case TensorTypeQ8_0: + return 2 + tt.blockSize() + case TensorTypeQ8_1: + return 2 + 2 + tt.blockSize() + case TensorTypeQ2_K: + return tt.blockSize()/16 + tt.blockSize()/4 + 2 + 2 + case TensorTypeQ3_K: + return tt.blockSize()/8 + tt.blockSize()/4 + 12 + 2 + case TensorTypeQ4_K: + return 2 + 2 + 12 + tt.blockSize()/2 + case TensorTypeQ5_K: + return 2 + 2 + 12 + tt.blockSize()/8 + tt.blockSize()/2 + case TensorTypeQ6_K: + return tt.blockSize()/2 + tt.blockSize()/4 + tt.blockSize()/16 + 2 + case TensorTypeQ8_K: + return 4 + tt.blockSize() + 2*tt.blockSize()/16 + case tensorTypeIQ2_XXS: + return 2 + 2*tt.blockSize()/8 + case tensorTypeIQ2_XS: + return 2 + 2*tt.blockSize()/8 + tt.blockSize()/32 + case tensorTypeIQ3_XXS: + return 2 + tt.blockSize()/4 + tt.blockSize()/8 + case tensorTypeIQ1_S: + return 2 + tt.blockSize()/8 + tt.blockSize()/16 + case tensorTypeIQ4_NL: + return 2 + tt.blockSize()/2 + case tensorTypeIQ3_S: + return 2 + tt.blockSize()/4 + tt.blockSize()/8 + tt.blockSize()/32 + 4 + case tensorTypeIQ2_S: + return 2 + tt.blockSize()/4 + tt.blockSize()/16 + case tensorTypeIQ4_XS: + return 2 + 2 + tt.blockSize()/2 + tt.blockSize()/64 + case TensorTypeI8: + return 1 + case TensorTypeI16: + return 2 + case TensorTypeI32: + return 4 + case TensorTypeI64: + return 8 + case TensorTypeF64: + return 8 + case tensorTypeIQ1_M: + return tt.blockSize()/8 + tt.blockSize()/16 + tt.blockSize()/32 + case TensorTypeBF16: + return 2 + default: + return 0 + } +} + +func (tt TensorType) blockSize() int64 { + switch tt { + case TensorTypeF32, + TensorTypeF16, + TensorTypeI8, + TensorTypeI16, + TensorTypeI32, + TensorTypeI64, + TensorTypeF64, + TensorTypeBF16: + return 
1 + case TensorTypeQ4_0, + TensorTypeQ4_1, + TensorTypeQ5_0, + TensorTypeQ5_1, + TensorTypeQ8_0, + TensorTypeQ8_1, + tensorTypeIQ4_NL: + return 32 + default: + return 256 + } +} + +func (tt TensorType) String() string { + switch tt { + case TensorTypeF32: + return "f32" + case TensorTypeF16: + return "f16" + case TensorTypeQ4_0: + return "q4_0" + case TensorTypeQ4_1: + return "q4_1" + case tensorTypeQ4_2: + return "q4_2" + case tensorTypeQ4_3: + return "q4_3" + case TensorTypeQ5_0: + return "q5_0" + case TensorTypeQ5_1: + return "q5_1" + case TensorTypeQ8_0: + return "q8_0" + case TensorTypeQ8_1: + return "q8_1" + case TensorTypeQ2_K: + return "q2_k" + case TensorTypeQ3_K: + return "q3_k" + case TensorTypeQ4_K: + return "q4_k" + case TensorTypeQ5_K: + return "q5_k" + case TensorTypeQ6_K: + return "q6_k" + case TensorTypeQ8_K: + return "q8_k" + case tensorTypeIQ2_XXS: + return "iq2_xxs" + case tensorTypeIQ2_XS: + return "iq2_xs" + case tensorTypeIQ3_XXS: + return "iq3_xxs" + case tensorTypeIQ1_S: + return "iq1_s" + case tensorTypeIQ4_NL: + return "iq4_nl" + case tensorTypeIQ3_S: + return "iq3_s" + case tensorTypeIQ2_S: + return "iq2_s" + case tensorTypeIQ4_XS: + return "iq4_xs" + case TensorTypeI8: + return "i8" + case TensorTypeI16: + return "i16" + case TensorTypeI32: + return "i32" + case TensorTypeI64: + return "i64" + case TensorTypeF64: + return "f64" + case tensorTypeIQ1_M: + return "iq1_m" + case TensorTypeBF16: + return "bf16" + case tensorTypeQ4_0_4_4: + return "q4_0_4_4" + case tensorTypeQ4_0_4_8: + return "q4_0_4_8" + case tensorTypeQ4_0_8_8: + return "q4_0_8_8" + case tensorTypeTQ1_0: + return "tq1_0" + case tensorTypeTQ2_0: + return "tq2_0" + case tensorTypeIQ4_NL_4_4: + return "iq4_nl_4_4" + case tensorTypeIQ4_NL_4_8: + return "iq4_nl_4_8" + case tensorTypeIQ4_NL_8_8: + return "iq4_nl_8_8" + default: + return "unknown" + } +} + +func (tt TensorType) LogValue() slog.Value { + return slog.GroupValue( + slog.Uint64("value", uint64(tt)), + slog.String("name", strings.ToUpper(tt.String())), + slog.Int64("size", tt.typeSize()), + slog.Int64("block_size", tt.blockSize()), + slog.Float64("num_bytes", tt.NumBytes()), + ) +} diff --git a/go.mod b/go.mod index 283286b7..6de5959b 100644 --- a/go.mod +++ b/go.mod @@ -19,7 +19,7 @@ require ( github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1 github.com/dlclark/regexp2 v1.11.4 github.com/emirpasic/gods/v2 v2.0.0-alpha - github.com/google/go-cmp v0.6.0 + github.com/google/go-cmp v0.7.0 github.com/mattn/go-runewidth v0.0.14 github.com/nlpodyssey/gopickle v0.3.0 github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c diff --git a/go.sum b/go.sum index 5755616f..c0ab53aa 100644 --- a/go.sum +++ b/go.sum @@ -112,8 +112,8 @@ github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/ github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= -github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/uuid v1.1.2/go.mod 
h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= diff --git a/server/images.go b/server/images.go index d6cceff4..38505cc5 100644 --- a/server/images.go +++ b/server/images.go @@ -23,7 +23,7 @@ import ( "github.com/ollama/ollama/api" "github.com/ollama/ollama/envconfig" - "github.com/ollama/ollama/fs/ggml" + "github.com/ollama/ollama/fs/gguf" "github.com/ollama/ollama/parser" "github.com/ollama/ollama/template" "github.com/ollama/ollama/thinking" @@ -73,22 +73,18 @@ func (m *Model) Capabilities() []model.Capability { capabilities := []model.Capability{} // Check for completion capability - r, err := os.Open(m.ModelPath) + f, err := gguf.Open(m.ModelPath) if err == nil { - defer r.Close() + defer f.Close() - f, err := ggml.Decode(r, 1024) - if err == nil { - if _, ok := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]; ok { - capabilities = append(capabilities, model.CapabilityEmbedding) - } else { - capabilities = append(capabilities, model.CapabilityCompletion) - } - if _, ok := f.KV()[fmt.Sprintf("%s.vision.block_count", f.KV().Architecture())]; ok { - capabilities = append(capabilities, model.CapabilityVision) - } + if f.KeyValue("pooling_type").Valid() { + capabilities = append(capabilities, model.CapabilityEmbedding) } else { - slog.Error("couldn't decode ggml", "error", err) + // If no embedding is specified, we assume the model supports completion + capabilities = append(capabilities, model.CapabilityCompletion) + } + if f.KeyValue("vision.block_count").Valid() { + capabilities = append(capabilities, model.CapabilityVision) } } else { slog.Error("couldn't open model file", "error", err) diff --git a/server/images_test.go b/server/images_test.go index 363b298e..a2fba8d9 100644 --- a/server/images_test.go +++ b/server/images_test.go @@ -1,123 +1,42 @@ package server import ( - "bytes" - "encoding/binary" - "errors" - "os" - "path/filepath" "strings" "testing" + "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/template" "github.com/ollama/ollama/types/model" ) -// Constants for GGUF magic bytes and version -var ( - ggufMagic = []byte{0x47, 0x47, 0x55, 0x46} // "GGUF" - ggufVer = uint32(3) // Version 3 -) - -// Helper function to create mock GGUF data -func createMockGGUFData(architecture string, vision bool) []byte { - var buf bytes.Buffer - - // Write GGUF header - buf.Write(ggufMagic) - binary.Write(&buf, binary.LittleEndian, ggufVer) - - // Write tensor count (0 for our test) - var numTensors uint64 = 0 - binary.Write(&buf, binary.LittleEndian, numTensors) - - // Calculate number of metadata entries - numMetaEntries := uint64(1) // architecture entry - if vision { - numMetaEntries++ - } - // Add embedding entry if architecture is "bert" - if architecture == "bert" { - numMetaEntries++ - } - binary.Write(&buf, binary.LittleEndian, numMetaEntries) - - // Write architecture metadata - archKey := "general.architecture" - keyLen := uint64(len(archKey)) - binary.Write(&buf, binary.LittleEndian, keyLen) - buf.WriteString(archKey) - - // String type (8) - var strType uint32 = 8 - binary.Write(&buf, binary.LittleEndian, strType) - - // String length - strLen := uint64(len(architecture)) - binary.Write(&buf, binary.LittleEndian, strLen) - buf.WriteString(architecture) - - if vision { - visionKey := architecture + ".vision.block_count" - keyLen = uint64(len(visionKey)) - binary.Write(&buf, binary.LittleEndian, keyLen) - buf.WriteString(visionKey) - - // uint32 type (4) - var uint32Type uint32 
= 4 - binary.Write(&buf, binary.LittleEndian, uint32Type) - - // uint32 value (1) - var countVal uint32 = 1 - binary.Write(&buf, binary.LittleEndian, countVal) - } - // Write embedding metadata if architecture is "bert" - if architecture == "bert" { - poolKey := architecture + ".pooling_type" - keyLen = uint64(len(poolKey)) - binary.Write(&buf, binary.LittleEndian, keyLen) - buf.WriteString(poolKey) - - // uint32 type (4) - var uint32Type uint32 = 4 - binary.Write(&buf, binary.LittleEndian, uint32Type) - - // uint32 value (1) - var poolingVal uint32 = 1 - binary.Write(&buf, binary.LittleEndian, poolingVal) - } - - return buf.Bytes() -} - func TestModelCapabilities(t *testing.T) { - // Create a temporary directory for test files - tempDir := t.TempDir() + // Create completion model (llama architecture without vision) + completionModelPath, _ := createBinFile(t, ggml.KV{ + "general.architecture": "llama", + }, []*ggml.Tensor{}) - // Create different types of mock model files - completionModelPath := filepath.Join(tempDir, "model.bin") - visionModelPath := filepath.Join(tempDir, "vision_model.bin") - embeddingModelPath := filepath.Join(tempDir, "embedding_model.bin") - // Create a simple model file for tests that don't depend on GGUF content - simpleModelPath := filepath.Join(tempDir, "simple_model.bin") + // Create vision model (llama architecture with vision block count) + visionModelPath, _ := createBinFile(t, ggml.KV{ + "general.architecture": "llama", + "llama.vision.block_count": uint32(1), + }, []*ggml.Tensor{}) - if err := errors.Join( - os.WriteFile(completionModelPath, createMockGGUFData("llama", false), 0o644), - os.WriteFile(visionModelPath, createMockGGUFData("llama", true), 0o644), - os.WriteFile(embeddingModelPath, createMockGGUFData("bert", false), 0o644), - os.WriteFile(simpleModelPath, []byte("dummy model data"), 0o644), - ); err != nil { - t.Fatalf("Failed to create model files: %v", err) - } + // Create embedding model (bert architecture with pooling type) + embeddingModelPath, _ := createBinFile(t, ggml.KV{ + "general.architecture": "bert", + "bert.pooling_type": uint32(1), + }, []*ggml.Tensor{}) toolsInsertTemplate, err := template.Parse("{{ .prompt }}{{ if .tools }}{{ .tools }}{{ end }}{{ if .suffix }}{{ .suffix }}{{ end }}") if err != nil { t.Fatalf("Failed to parse template: %v", err) } + chatTemplate, err := template.Parse("{{ .prompt }}") if err != nil { t.Fatalf("Failed to parse template: %v", err) } + toolsTemplate, err := template.Parse("{{ .prompt }}{{ if .tools }}{{ .tools }}{{ end }}") if err != nil { t.Fatalf("Failed to parse template: %v", err) @@ -145,21 +64,13 @@ func TestModelCapabilities(t *testing.T) { }, expectedCaps: []model.Capability{model.CapabilityCompletion, model.CapabilityTools, model.CapabilityInsert}, }, - { - name: "model with tools and insert capability", - model: Model{ - ModelPath: simpleModelPath, - Template: toolsInsertTemplate, - }, - expectedCaps: []model.Capability{model.CapabilityTools, model.CapabilityInsert}, - }, { name: "model with tools capability", model: Model{ - ModelPath: simpleModelPath, + ModelPath: completionModelPath, Template: toolsTemplate, }, - expectedCaps: []model.Capability{model.CapabilityTools}, + expectedCaps: []model.Capability{model.CapabilityCompletion, model.CapabilityTools}, }, { name: "model with vision capability", @@ -224,29 +135,33 @@ func TestModelCapabilities(t *testing.T) { } func TestModelCheckCapabilities(t *testing.T) { - // Create a temporary directory for test files - tempDir := t.TempDir() + 
// Create simple model file for tests that don't depend on GGUF content + completionModelPath, _ := createBinFile(t, ggml.KV{ + "general.architecture": "llama", + }, []*ggml.Tensor{}) - visionModelPath := filepath.Join(tempDir, "vision_model.bin") - simpleModelPath := filepath.Join(tempDir, "model.bin") - embeddingModelPath := filepath.Join(tempDir, "embedding_model.bin") + // Create vision model (llama architecture with vision block count) + visionModelPath, _ := createBinFile(t, ggml.KV{ + "general.architecture": "llama", + "llama.vision.block_count": uint32(1), + }, []*ggml.Tensor{}) - if err := errors.Join( - os.WriteFile(simpleModelPath, []byte("dummy model data"), 0o644), - os.WriteFile(visionModelPath, createMockGGUFData("llama", true), 0o644), - os.WriteFile(embeddingModelPath, createMockGGUFData("bert", false), 0o644), - ); err != nil { - t.Fatalf("Failed to create model files: %v", err) - } + // Create embedding model (bert architecture with pooling type) + embeddingModelPath, _ := createBinFile(t, ggml.KV{ + "general.architecture": "bert", + "bert.pooling_type": uint32(1), + }, []*ggml.Tensor{}) toolsInsertTemplate, err := template.Parse("{{ .prompt }}{{ if .tools }}{{ .tools }}{{ end }}{{ if .suffix }}{{ .suffix }}{{ end }}") if err != nil { t.Fatalf("Failed to parse template: %v", err) } + chatTemplate, err := template.Parse("{{ .prompt }}") if err != nil { t.Fatalf("Failed to parse template: %v", err) } + toolsTemplate, err := template.Parse("{{ .prompt }}{{ if .tools }}{{ .tools }}{{ end }}") if err != nil { t.Fatalf("Failed to parse template: %v", err) @@ -261,7 +176,7 @@ func TestModelCheckCapabilities(t *testing.T) { { name: "completion model without tools capability", model: Model{ - ModelPath: simpleModelPath, + ModelPath: completionModelPath, Template: chatTemplate, }, checkCaps: []model.Capability{model.CapabilityTools}, @@ -270,7 +185,7 @@ func TestModelCheckCapabilities(t *testing.T) { { name: "model with all needed capabilities", model: Model{ - ModelPath: simpleModelPath, + ModelPath: completionModelPath, Template: toolsInsertTemplate, }, checkCaps: []model.Capability{model.CapabilityTools, model.CapabilityInsert}, @@ -278,7 +193,7 @@ func TestModelCheckCapabilities(t *testing.T) { { name: "model missing insert capability", model: Model{ - ModelPath: simpleModelPath, + ModelPath: completionModelPath, Template: toolsTemplate, }, checkCaps: []model.Capability{model.CapabilityInsert}, @@ -287,7 +202,7 @@ func TestModelCheckCapabilities(t *testing.T) { { name: "model missing vision capability", model: Model{ - ModelPath: simpleModelPath, + ModelPath: completionModelPath, Template: toolsTemplate, }, checkCaps: []model.Capability{model.CapabilityVision}, @@ -312,7 +227,7 @@ func TestModelCheckCapabilities(t *testing.T) { { name: "unknown capability", model: Model{ - ModelPath: simpleModelPath, + ModelPath: completionModelPath, Template: chatTemplate, }, checkCaps: []model.Capability{"unknown"}, diff --git a/server/quantization_test.go b/server/quantization_test.go index 4f717c2c..8b726c83 100644 --- a/server/quantization_test.go +++ b/server/quantization_test.go @@ -257,16 +257,8 @@ func TestQuantizeModel(t *testing.T) { for _, tt := range cases { t.Run(tt.name, func(t *testing.T) { - f, err := os.CreateTemp(t.TempDir(), tt.name) - if err != nil { - t.Fatal(err.Error()) - } - defer f.Close() - err = fsggml.WriteGGUF(f, tt.kv, tt.tensors) - if err != nil { - t.Fatalf("failed to create initial model: %s", err) - } - fp, err := os.Open(f.Name()) + p, _ := createBinFile(t, 
tt.kv, tt.tensors) + fp, err := os.Open(p) if err != nil { t.Fatal(err.Error()) } diff --git a/server/sched_test.go b/server/sched_test.go index 01fb9a70..3892fbba 100644 --- a/server/sched_test.go +++ b/server/sched_test.go @@ -112,11 +112,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est b.ctx, b.ctxDone = context.WithCancel(ctx) t.Helper() - f, err := os.CreateTemp(t.TempDir(), modelName) - require.NoError(t, err) - defer f.Close() - - require.NoError(t, ggml.WriteGGUF(f, ggml.KV{ + p, _ := createBinFile(t, ggml.KV{ "general.architecture": "llama", "llama.context_length": uint32(32), "llama.embedding_length": uint32(4096), @@ -129,14 +125,14 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est }, []*ggml.Tensor{ {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, {Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, - })) - require.NoError(t, err) - - fname := f.Name() - model := &Model{Name: modelName, ModelPath: fname} - b.f, err = llm.LoadModel(model.ModelPath, 0) - require.NoError(t, err) + }) + model := &Model{Name: modelName, ModelPath: p} + f, err := llm.LoadModel(model.ModelPath, 0) + if err != nil { + t.Fatal(err) + } + b.f = f if duration == nil { duration = &api.Duration{Duration: 5 * time.Millisecond} } From c088ac0e79a4a995e8a5a3733f7db2a981ac3364 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Fri, 20 Jun 2025 11:12:01 -0700 Subject: [PATCH 106/108] convert: utility for merging tensors (#11069) --- convert/convert_mixtral.go | 76 +++++++++-------------------- convert/tensor.go | 53 +++++++++++++++++++++ convert/tensor_test.go | 98 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 174 insertions(+), 53 deletions(-) diff --git a/convert/convert_mixtral.go b/convert/convert_mixtral.go index 17580ff8..7d60146b 100644 --- a/convert/convert_mixtral.go +++ b/convert/convert_mixtral.go @@ -2,9 +2,6 @@ package convert import ( "fmt" - "io" - "slices" - "strings" "github.com/ollama/ollama/fs/ggml" ) @@ -30,65 +27,38 @@ func (p *mixtralModel) KV(t *Tokenizer) ggml.KV { } func (p *mixtralModel) Tensors(ts []Tensor) []*ggml.Tensor { - oldnew := []string{ - "model.layers", "blk", - "w1", "ffn_gate_exps", - "w2", "ffn_down_exps", - "w3", "ffn_up_exps", - } - - for i := range p.NumLocalExperts { - oldnew = append(oldnew, fmt.Sprintf(".block_sparse_moe.experts.%d.", i), ".") - } - - // group experts of the same layer (model.layers.%d) and type (w[123]) into a single tensor - namer := strings.NewReplacer(oldnew...) 
- experts := make(map[string]experts) - - // merge experts into a single tensor while removing them from ts - ts = slices.DeleteFunc(ts, func(t Tensor) bool { - if !strings.Contains(t.Name(), ".block_sparse_moe.experts.") { - return false - } - - name := namer.Replace(t.Name()) - experts[name] = append(experts[name], t) - return true - }) - - var out []*ggml.Tensor - for n, e := range experts { - // TODO(mxyng): sanity check experts - out = append(out, &ggml.Tensor{ - Name: n, - Kind: e[0].Kind(), - Shape: append([]uint64{uint64(len(e))}, e[0].Shape()...), - WriterTo: e, + merges := make([]merge, 0, p.NumHiddenLayers*6) + for i := range p.NumHiddenLayers { + merges = append(merges, merge{ + fmt.Sprintf("blk.%d.*.w1.weight", i), + fmt.Sprintf("blk.%d.ffn_gate_exps.weight", i), + }, merge{ + fmt.Sprintf("blk.%d.*.w1.bias", i), + fmt.Sprintf("blk.%d.ffn_gate_exps.bias", i), + }, merge{ + fmt.Sprintf("blk.%d.*.w2.weight", i), + fmt.Sprintf("blk.%d.ffn_up_exps.weight", i), + }, merge{ + fmt.Sprintf("blk.%d.*.w2.bias", i), + fmt.Sprintf("blk.%d.ffn_up_exps.bias", i), + }, merge{ + fmt.Sprintf("blk.%d.*.w3.weight", i), + fmt.Sprintf("blk.%d.ffn_down_exps.weight", i), + }, merge{ + fmt.Sprintf("blk.%d.*.w3.bias", i), + fmt.Sprintf("blk.%d.ffn_down_exps.bias", i), }) } + out, ts := mergeTensors(ts, merges...) return append(out, p.llamaModel.Tensors(ts)...) } func (p *mixtralModel) Replacements() []string { return append( p.llamaModel.Replacements(), + "model.layers", "blk", "block_sparse_moe.gate", "ffn_gate_inp", + "block_sparse_moe.experts.", ".", ) } - -type experts []Tensor - -func (e experts) WriteTo(w io.Writer) (int64, error) { - // TODO(mxyng): experts _should_ be numerically sorted by expert but this should check - for _, t := range e { - // the canonical merged experts tensor stacks all experts along a new, 0 axis, - // e.g. `tensor.Stack(0, e[0], e[1:]...)`, which requires allocating temporary buffers - // this accomplishes the same thing by writing each expert tensor in sequence - if _, err := t.WriteTo(w); err != nil { - return 0, err - } - } - - return 0, nil -} diff --git a/convert/tensor.go b/convert/tensor.go index 9d6919e3..c9565ed4 100644 --- a/convert/tensor.go +++ b/convert/tensor.go @@ -2,7 +2,9 @@ package convert import ( "cmp" + "io" "iter" + "path" "slices" "strings" @@ -74,3 +76,54 @@ func splitDim(t Tensor, dim int, splits ...split) iter.Seq[*ggml.Tensor] { } } } + +type merge struct { + pattern, name string +} + +// mergeTensors merges tensors that match a given pattern into a single tensor. +func mergeTensors(unmatched []Tensor, merges ...merge) (out []*ggml.Tensor, _ []Tensor) { + var matched []Tensor + for i := range merges { + matched, unmatched = slicesSplitFunc(unmatched, func(t Tensor) bool { + matched, _ := path.Match(merges[i].pattern, t.Name()) + return matched + }) + + if len(matched) > 0 { + out = append(out, &ggml.Tensor{ + Name: merges[i].name, + Kind: matched[0].Kind(), + Shape: append([]uint64{uint64(len(matched))}, matched[0].Shape()...), + WriterTo: mergeGroup(matched), + }) + } + } + + return out, unmatched +} + +// slicesSplitFunc splits a slice into two slices based on a predicate function. 
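+// Elements for which fn returns true are collected into matched and the rest
+// into unmatched, preserving their original relative order.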
+func slicesSplitFunc[S ~[]E, E comparable](s S, fn func(e E) bool) (matched, unmatched S) { + for _, e := range s { + if fn(e) { + matched = append(matched, e) + } else { + unmatched = append(unmatched, e) + } + } + + return matched, unmatched +} + +type mergeGroup []Tensor + +func (g mergeGroup) WriteTo(w io.Writer) (int64, error) { + for _, t := range g { + if _, err := t.WriteTo(w); err != nil { + return 0, err + } + } + + return 0, nil +} diff --git a/convert/tensor_test.go b/convert/tensor_test.go index ea12d0f5..0b2db5ba 100644 --- a/convert/tensor_test.go +++ b/convert/tensor_test.go @@ -9,6 +9,8 @@ import ( "strings" "testing" + "github.com/google/go-cmp/cmp" + "github.com/ollama/ollama/fs/ggml" "github.com/pdevine/tensor" ) @@ -302,3 +304,99 @@ func TestSplitDim(t *testing.T) { } }) } + +func TestMerge(t *testing.T) { + unmatched := []Tensor{ + &fakeTensor{ + name: "a.0.b", + shape: []uint64{5, 2}, + data: []float32{10, 11, 12, 13, 14, 15, 16, 17, 18, 19}, + }, + &fakeTensor{ + name: "a.1.b", + shape: []uint64{5, 2}, + data: []float32{20, 21, 22, 23, 24, 25, 26, 27, 28, 29}, + }, + &fakeTensor{ + name: "c.0.d", + shape: []uint64{5, 2}, + data: []float32{30, 31, 32, 33, 34, 35, 36, 37, 38, 39}, + }, + &fakeTensor{ + name: "c.1.d", + shape: []uint64{5, 2}, + data: []float32{40, 41, 42, 43, 44, 45, 46, 47, 48, 49}, + }, + &fakeTensor{ + name: "e.0.f", + shape: []uint64{5, 2}, + data: []float32{50, 51, 52, 53, 54, 55, 56, 57, 58, 59}, + }, + } + + checkMatched := func(t *testing.T, n int, matched []*ggml.Tensor) { + for i := range n { + got := matched[i] + if diff := cmp.Diff([]uint64{2, 5, 2}, got.Shape); diff != "" { + t.Errorf("unexpected (-want +got):\n%s", diff) + } + + var b bytes.Buffer + if _, err := got.WriteTo(&b); err != nil { + t.Fatal(err) + } + + f32s := make([]float32, 20) + if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil { + t.Fatal(err) + } + + offset := 10 + (i * 20) + want := make([]float32, 20) + for j := range 20 { + want[j] = float32(offset + j) + } + + if diff := cmp.Diff(want, f32s); diff != "" { + t.Errorf("unexpected data (-want +got):\n%s", diff) + } + } + } + + t.Run("single merge", func(t *testing.T) { + matched, unmatched := mergeTensors(unmatched, merge{"a.*.b", "a.b"}) + if len(unmatched) != 3 { + t.Error("expected 3 remaining tensors, got", len(unmatched)) + } + + if len(matched) != 1 { + t.Error("expected 1 merged tensor, got", len(matched)) + } + + checkMatched(t, 1, matched) + }) + + t.Run("multiple merges", func(t *testing.T) { + matched, unmatched := mergeTensors(unmatched, merge{"a.*.b", "a.b"}, merge{"c.*.d", "c.d"}) + if len(unmatched) != 1 { + t.Error("expected 1 remaining tensors, got", len(unmatched)) + } + + if len(matched) != 2 { + t.Error("expected 2 merged tensor, got", len(matched)) + } + + checkMatched(t, 2, matched) + }) + + t.Run("no match", func(t *testing.T) { + matched, unmatched := mergeTensors(unmatched, merge{"x.*.y", "x.y"}) + if len(unmatched) != 5 { + t.Error("expected 5 remaining tensors, got", len(unmatched)) + } + + if len(matched) != 0 { + t.Error("expected no merged tensors, got", len(matched)) + } + }) +} From 65bff664cb39ed16a1fa814b0228e4e48d7234ba Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 20 Jun 2025 12:32:51 -0700 Subject: [PATCH 107/108] build speedups (#11142) Enable parallel building of the GPU architectures. 
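The `-t 2` flag maps to nvcc's --threads option, so the listed CUDA
architectures are compiled in parallel rather than sequentially, and
`-parallel-jobs=4` appears to be the hipcc equivalent for the ROCm targets;
both trade higher peak build memory for shorter compile times.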
--- CMakePresets.json | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CMakePresets.json b/CMakePresets.json index 0b70d8ba..3234ce2c 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -22,7 +22,7 @@ "inherits": [ "CUDA" ], "cacheVariables": { "CMAKE_CUDA_ARCHITECTURES": "50;52;53;60;61;70;75;80;86", - "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets" + "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets -t 2" } }, { @@ -30,7 +30,7 @@ "inherits": [ "CUDA" ], "cacheVariables": { "CMAKE_CUDA_ARCHITECTURES": "50;60;61;70;75;80;86;87;89;90;90a;120", - "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets" + "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets -t 2" } }, { @@ -58,6 +58,7 @@ "name": "ROCm 6", "inherits": [ "ROCm" ], "cacheVariables": { + "CMAKE_HIP_FLAGS": "-parallel-jobs=4", "AMDGPU_TARGETS": "gfx900;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-" } } From 2bb69b40c7f6f2783290f0b7e4f7d5ec0a41f69c Mon Sep 17 00:00:00 2001 From: AJ Date: Mon, 23 Jun 2025 21:51:12 +0530 Subject: [PATCH 108/108] readme: add ai-hub to community integrations (#11169) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index e148f9af..366fe94b 100644 --- a/README.md +++ b/README.md @@ -409,6 +409,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [macLlama (macOS native)](https://github.com/hellotunamayo/macLlama) (A native macOS GUI application for interacting with Ollama models, featuring a chat interface.) - [GPTranslate](https://github.com/philberndt/GPTranslate) (A fast and lightweight, AI powered desktop translation application written with Rust and Tauri. Features real-time translation with OpenAI/Azure/Ollama.) - [ollama launcher](https://github.com/NGC13009/ollama-launcher) (A launcher for Ollama, aiming to provide users with convenient functions such as ollama server launching, management, or configuration.) +- [ai-hub](https://github.com/Aj-Seven/ai-hub) (AI Hub supports multiple models via API keys and Chat support via Ollama API.) ### Cloud