Merge branch 'main' into drifkin/array-head-count-simple

2025-12-06 12:19:56 +01:00 · 2025-06-23 10:37:31 -07:00 · 2025-06-23 10:37:31 -07:00 · b2b270ad5d
commit b2b270ad5d
parent 20c5fd39c8 2bb69b40c7
289 changed files with 17986 additions and 11657 deletions
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@ -103,6 +103,11 @@ jobs:
        arch: [amd64]
        preset: ['CPU']
        include:
          - os: windows
            arch: amd64
            preset: 'CUDA 11'
            install: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe
            cuda-version: '11.3'
          - os: windows
            arch: amd64
            preset: 'CUDA 12'
@ -319,6 +324,7 @@ jobs:
            case "$COMPONENT" in
              bin/ollama)               echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
              lib/ollama/*.so)          echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
              lib/ollama/cuda_v11)      echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
              lib/ollama/cuda_v12)      echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
              lib/ollama/cuda_jetpack5) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;;
              lib/ollama/cuda_jetpack6) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;;
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@ -46,7 +46,7 @@ jobs:
        include:
          - preset: CPU
          - preset: CUDA
-            container: nvidia/cuda:12.8.1-devel-ubuntu22.04
+            container: nvidia/cuda:11.8.0-devel-ubuntu22.04
            flags: '-DCMAKE_CUDA_ARCHITECTURES=87'
          - preset: ROCm
            container: rocm/dev-ubuntu-22.04:6.1.2
@ -78,7 +78,7 @@ jobs:
        include:
          - preset: CPU
          - preset: CUDA
-            install: https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_571.96_windows.exe
+            install: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe
            flags: '-DCMAKE_CUDA_ARCHITECTURES=80'
          - preset: ROCm
            install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe
@ -102,7 +102,7 @@ jobs:
          $ErrorActionPreference = "Stop"
          if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {
            Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
-            Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_12.8", "nvcc_12.8", "cublas_12.8", "cublas_dev_12.8")) -NoNewWindow -Wait
+            Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_11.3", "nvcc_11.3", "cublas_11.3", "cublas_dev_11.3")) -NoNewWindow -Wait
          }
          $cudaPath = (Resolve-Path "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*").path
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -51,6 +51,8 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/include
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu/amx)
 add_compile_definitions(NDEBUG)
 set(GGML_CPU ON)
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src)
 set_property(TARGET ggml PROPERTY EXCLUDE_FROM_ALL TRUE)
--- a/CMakePresets.json
+++ b/CMakePresets.json
@ -17,12 +17,20 @@
      "name": "CUDA",
      "inherits": [ "Default" ]
    },
    {
      "name": "CUDA 11",
      "inherits": [ "CUDA" ],
      "cacheVariables": {
        "CMAKE_CUDA_ARCHITECTURES": "50;52;53;60;61;70;75;80;86",
        "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets -t 2"
      }
    },
    {
      "name": "CUDA 12",
      "inherits": [ "CUDA" ],
      "cacheVariables": {
        "CMAKE_CUDA_ARCHITECTURES": "50;60;61;70;75;80;86;87;89;90;90a;120",
-        "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets"
+        "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets -t 2"
      }
    },
    {
@ -50,6 +58,7 @@
      "name": "ROCm 6",
      "inherits": [ "ROCm" ],
      "cacheVariables": {
        "CMAKE_HIP_FLAGS": "-parallel-jobs=4",
        "AMDGPU_TARGETS": "gfx900;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-"
      }
    }
@ -70,6 +79,11 @@
      "configurePreset": "CUDA",
      "targets": [ "ggml-cuda" ]
    },
    {
      "name": "CUDA 11",
      "inherits": [ "CUDA" ],
      "configurePreset": "CUDA 11"
    },
    {
      "name": "CUDA 12",
      "inherits": [ "CUDA" ],
--- a/17
+++ b/17
@ -7,10 +7,14 @@ ARG JETPACK5VERSION=r35.4.1
 ARG JETPACK6VERSION=r36.4.0
 ARG CMAKEVERSION=3.31.2
 # CUDA v11 requires gcc v10.  v10.3 has regressions, so the rockylinux 8.5 AppStream has the latest compatible version
 FROM --platform=linux/amd64 rocm/dev-almalinux-8:${ROCMVERSION}-complete AS base-amd64
 RUN yum install -y yum-utils \
-    && dnf install -y ccache \
+    && yum-config-manager --add-repo https://dl.rockylinux.org/vault/rocky/8.5/AppStream/\$basearch/os/ \
    && rpm --import https://dl.rockylinux.org/pub/rocky/RPM-GPG-KEY-Rocky-8 \
    && dnf install -y yum-utils ccache gcc-toolset-10-gcc-10.2.1-8.2.el8 gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 gcc-toolset-10-binutils-2.35-11.el8 \
    && yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
 ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH
 FROM --platform=linux/arm64 almalinux:8 AS base-arm64
 # install epel-release for ccache
@ -34,6 +38,15 @@ RUN --mount=type=cache,target=/root/.ccache \
        && cmake --build --parallel --preset 'CPU' \
        && cmake --install build --component CPU --strip --parallel 8
 FROM base AS cuda-11
 ARG CUDA11VERSION=11.3
 RUN dnf install -y cuda-toolkit-${CUDA11VERSION//./-}
 ENV PATH=/usr/local/cuda-11/bin:$PATH
 RUN --mount=type=cache,target=/root/.ccache \
    cmake --preset 'CUDA 11' \
        && cmake --build --parallel --preset 'CUDA 11' \
        && cmake --install build --component CUDA --strip --parallel 8
 FROM base AS cuda-12
 ARG CUDA12VERSION=12.8
 RUN dnf install -y cuda-toolkit-${CUDA12VERSION//./-}
@ -85,9 +98,11 @@ RUN --mount=type=cache,target=/root/.cache/go-build \
    go build -trimpath -buildmode=pie -o /bin/ollama .
 FROM --platform=linux/amd64 scratch AS amd64
 COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11
 COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12
 FROM --platform=linux/arm64 scratch AS arm64
 COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11
 COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12
 COPY --from=jetpack-5 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_jetpack5
 COPY --from=jetpack-6 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_jetpack6
--- a/Makefile.sync
+++ b/Makefile.sync
@ -1,6 +1,6 @@
 UPSTREAM=https://github.com/ggerganov/llama.cpp.git
 WORKDIR=llama/vendor
-FETCH_HEAD=e1e8e0991ffd9e99a445c6812bb519d5bac9f4b5
+FETCH_HEAD=de4c07f93783a1a96456a44dc16b9db538ee1618
 .PHONY: help
 help:
@ -15,11 +15,13 @@ help:
 	@echo "    make -f $(lastword $(MAKEFILE_LIST)) clean sync"
 .PHONY: sync
-sync: llama/build-info.cpp llama/llama.cpp ml/backend/ggml/ggml
+sync: llama/build-info.cpp ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal
-.PHONY: llama/build-info.cpp
+llama/build-info.cpp: llama/build-info.cpp.in llama/llama.cpp
-llama/build-info.cpp: llama/build-info.cpp.in
+	sed -e 's|@FETCH_HEAD@|$(FETCH_HEAD)|' <$< >$@
-	sed -e 's|@FETCH_HEAD@|$(FETCH_HEAD)|' $< > $@
+
 ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal: ml/backend/ggml/ggml
 	go generate ./$(@D)
 .PHONY: llama/llama.cpp
 llama/llama.cpp: llama/vendor/
@ -30,12 +32,13 @@ ml/backend/ggml/ggml: llama/vendor/ggml/
 	rsync -arvzc -f "merge $@/.rsync-filter" $< $@
 PATCHES=$(wildcard llama/patches/*.patch)
 PATCHED=$(join $(dir $(PATCHES)), $(addsuffix ed, $(addprefix ., $(notdir $(PATCHES)))))
 .PHONY: apply-patches
 .NOTPARALLEL:
-apply-patches: $(addsuffix ed, $(PATCHES))
+apply-patches: $(PATCHED)
-%.patched: %.patch
+llama/patches/.%.patched: llama/patches/%.patch
 	@if git -c user.name=nobody -c 'user.email=<>' -C $(WORKDIR) am -3 $(realpath $<); then touch $@; else git -C $(WORKDIR) am --abort; exit 1; fi
 .PHONY: checkout
@ -57,4 +60,4 @@ format-patches: llama/patches
 # Declare clean as phony so it runs even if a file named "clean" exists.
 .PHONY: clean
 clean: checkout
-	$(RM) $(addsuffix ed, $(PATCHES))
+	$(RM) llama/patches/.*.patched
--- a/README.md
+++ b/README.md
@ -40,10 +40,10 @@ The official [Ollama Docker image](https://hub.docker.com/r/ollama/ollama) `olla
 ## Quickstart
-To run and chat with [Llama 3.2](https://ollama.com/library/llama3.2):
+To run and chat with [Gemma 3](https://ollama.com/library/gemma3):
 ```shell
-ollama run llama3.2
+ollama run gemma3
 ```
 ## Model library
@ -315,6 +315,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Ollama-chats RPG](https://github.com/drazdra/ollama-chats)
 - [IntelliBar](https://intellibar.app/) (AI-powered assistant for macOS)
 - [Jirapt](https://github.com/AliAhmedNada/jirapt) (Jira Integration to generate issues, tasks, epics)
 - [ojira](https://github.com/AliAhmedNada/ojira) (Jira chrome plugin to easily generate descriptions for tasks)
 - [QA-Pilot](https://github.com/reid41/QA-Pilot) (Interactive chat tool that can leverage Ollama models for rapid understanding and navigation of GitHub code repositories)
 - [ChatOllama](https://github.com/sugarforever/chat-ollama) (Open Source Chatbot based on Ollama with Knowledge Bases)
 - [CRAG Ollama Chat](https://github.com/Nagi-ovo/CRAG-Ollama-Chat) (Simple Web Search with Corrective RAG)
@ -404,6 +405,11 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Writeopia](https://github.com/Writeopia/Writeopia) (Text editor with integration with Ollama)
 - [AppFlowy](https://github.com/AppFlowy-IO/AppFlowy) (AI collaborative workspace with Ollama, cross-platform and self-hostable)
 - [Lumina](https://github.com/cushydigit/lumina.git) (A lightweight, minimal React.js frontend for interacting with Ollama servers)
 - [Tiny Notepad](https://pypi.org/project/tiny-notepad) (A lightweight, notepad-like interface to chat with ollama available on PyPI)
 - [macLlama (macOS native)](https://github.com/hellotunamayo/macLlama) (A native macOS GUI application for interacting with Ollama models, featuring a chat interface.) 
 - [GPTranslate](https://github.com/philberndt/GPTranslate) (A fast and lightweight, AI powered desktop translation application written with Rust and Tauri. Features real-time translation with OpenAI/Azure/Ollama.)
 - [ollama launcher](https://github.com/NGC13009/ollama-launcher) (A launcher for Ollama, aiming to provide users with convenient functions such as ollama server launching, management, or configuration.)
 - [ai-hub](https://github.com/Aj-Seven/ai-hub) (AI Hub supports multiple models via API keys and Chat support via Ollama API.)
 ### Cloud
@ -447,6 +453,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [orbiton](https://github.com/xyproto/orbiton) Configuration-free text editor and IDE with support for tab completion with Ollama.
 - [orca-cli](https://github.com/molbal/orca-cli) Ollama Registry CLI Application - Browse, pull, and download models from Ollama Registry in your terminal.
 - [GGUF-to-Ollama](https://github.com/jonathanhecl/gguf-to-ollama) - Importing GGUF to Ollama made easy (multiplatform)
 - [AWS-Strands-With-Ollama](https://github.com/rapidarchitect/ollama_strands) - AWS Strands Agents with Ollama Examples
 - [ollama-multirun](https://github.com/attogram/ollama-multirun) - A bash shell script to run a single prompt against any or all of your locally installed ollama models, saving the output and performance statistics as easily navigable web pages. ([Demo](https://attogram.github.io/ai_test_zone/))
 ### Apple Vision Pro
@ -526,6 +534,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Abso](https://github.com/lunary-ai/abso) (OpenAI-compatible TypeScript SDK for any LLM provider)
 - [Nichey](https://github.com/goodreasonai/nichey) is a Python package for generating custom wikis for your research topic
 - [Ollama for D](https://github.com/kassane/ollama-d)
 - [OllamaPlusPlus](https://github.com/HardCodeDev777/OllamaPlusPlus) (Very simple C++ library for Ollama)
 ### Mobile
@ -582,6 +591,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Simple-Discord-AI](https://github.com/zyphixor/simple-discord-ai)
 - [LLM Telegram Bot](https://github.com/innightwolfsleep/llm_telegram_bot) (Telegram bot, primarily for RP. Oobabooga-like buttons, [A1111](https://github.com/AUTOMATIC1111/stable-diffusion-webui) API integration, etc.)
 - [mcp-llm](https://github.com/sammcj/mcp-llm) (MCP Server to allow LLMs to call other LLMs)
 - [SimpleOllamaUnity](https://github.com/HardCodeDev777/SimpleOllamaUnity) (Unity Engine extension for communicating with Ollama in a few lines of code. Also works at runtime)
 - [UnityCodeLama](https://github.com/HardCodeDev777/UnityCodeLama) (Unity Editor tool to analyze scripts via Ollama)
 ### Supported backends
--- a/api/client.go
+++ b/api/client.go
@ -24,7 +24,10 @@ import (
 	"net/http"
 	"net/url"
 	"runtime"
 	"strconv"
 	"time"
 	"github.com/ollama/ollama/auth"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/version"
@ -76,6 +79,14 @@ func NewClient(base *url.URL, http *http.Client) *Client {
 	}
 }
 // getAuthorizationToken signs the challenge string with auth.Sign and returns
 // the resulting token. If signing fails, it returns an empty token together
 // with the signing error.
 func getAuthorizationToken(ctx context.Context, challenge string) (string, error) {
 	signed, signErr := auth.Sign(ctx, []byte(challenge))
 	if signErr == nil {
 		return signed, nil
 	}
 	return "", signErr
 }
 func (c *Client) do(ctx context.Context, method, path string, reqData, respData any) error {
 	var reqBody io.Reader
 	var data []byte
@ -97,6 +108,21 @@ func (c *Client) do(ctx context.Context, method, path string, reqData, respData
 	}
 	requestURL := c.base.JoinPath(path)
 	var token string
 	if envconfig.UseAuth() || c.base.Hostname() == "ollama.com" {
 		now := strconv.FormatInt(time.Now().Unix(), 10)
 		chal := fmt.Sprintf("%s,%s?ts=%s", method, path, now)
 		token, err = getAuthorizationToken(ctx, chal)
 		if err != nil {
 			return err
 		}
 		q := requestURL.Query()
 		q.Set("ts", now)
 		requestURL.RawQuery = q.Encode()
 	}
 	request, err := http.NewRequestWithContext(ctx, method, requestURL.String(), reqBody)
 	if err != nil {
 		return err
@ -106,6 +132,10 @@ func (c *Client) do(ctx context.Context, method, path string, reqData, respData
 	request.Header.Set("Accept", "application/json")
 	request.Header.Set("User-Agent", fmt.Sprintf("ollama/%s (%s %s) Go/%s", version.Version, runtime.GOARCH, runtime.GOOS, runtime.Version()))
 	if token != "" {
 		request.Header.Set("Authorization", token)
 	}
 	respObj, err := c.http.Do(request)
 	if err != nil {
 		return err
@ -143,6 +173,22 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
 	}
 	requestURL := c.base.JoinPath(path)
 	var token string
 	if envconfig.UseAuth() || c.base.Hostname() == "ollama.com" {
 		var err error
 		now := strconv.FormatInt(time.Now().Unix(), 10)
 		chal := fmt.Sprintf("%s,%s?ts=%s", method, path, now)
 		token, err = getAuthorizationToken(ctx, chal)
 		if err != nil {
 			return err
 		}
 		q := requestURL.Query()
 		q.Set("ts", now)
 		requestURL.RawQuery = q.Encode()
 	}
 	request, err := http.NewRequestWithContext(ctx, method, requestURL.String(), buf)
 	if err != nil {
 		return err
@ -152,6 +198,10 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
 	request.Header.Set("Accept", "application/x-ndjson")
 	request.Header.Set("User-Agent", fmt.Sprintf("ollama/%s (%s %s) Go/%s", version.Version, runtime.GOARCH, runtime.GOOS, runtime.Version()))
 	if token != "" {
 		request.Header.Set("Authorization", token)
 	}
 	response, err := c.http.Do(request)
 	if err != nil {
 		return err
--- a/api/types.go
+++ b/api/types.go
@ -83,6 +83,12 @@ type GenerateRequest struct {
 	// Options lists model-specific options. For example, temperature can be
 	// set through this field, if the model supports it.
 	Options map[string]any `json:"options"`
 	// Think controls whether thinking/reasoning models will think before
 	// responding. Needs to be a pointer so we can distinguish between false
 	// (request that thinking _not_ be used) and unset (use the old behavior
 	// before this option was introduced)
 	Think *bool `json:"think,omitempty"`
 }
 // ChatRequest describes a request sent by [Client.Chat].
@ -108,6 +114,10 @@ type ChatRequest struct {
 	// Options lists model-specific options.
 	Options map[string]any `json:"options"`
 	// Think controls whether thinking/reasoning models will think before
 	// responding
 	Think *bool `json:"think,omitempty"`
 }
 type Tools []Tool
@ -126,8 +136,11 @@ func (t Tool) String() string {
 // role ("system", "user", or "assistant"), the content and an optional list
 // of images.
 type Message struct {
-	Role      string      `json:"role"`
+	Role    string `json:"role"`
-	Content   string      `json:"content"`
+	Content string `json:"content"`
 	// Thinking contains the text that was inside thinking tags in the
 	// original model output when ChatRequest.Think is enabled.
 	Thinking  string      `json:"thinking,omitempty"`
 	Images    []ImageData `json:"images,omitempty"`
 	ToolCalls []ToolCall  `json:"tool_calls,omitempty"`
 }
@ -478,6 +491,10 @@ type GenerateResponse struct {
 	// Response is the textual response itself.
 	Response string `json:"response"`
 	// Thinking contains the text that was inside thinking tags in the
 	// original model output when ChatRequest.Think is enabled.
 	Thinking string `json:"thinking,omitempty"`
 	// Done specifies if the response is complete.
 	Done bool `json:"done"`
--- a/api/types_test.go
+++ b/api/types_test.go
@ -372,3 +372,50 @@ func TestPropertyType_MarshalJSON(t *testing.T) {
 		})
 	}
 }
 // TestThinking_UnmarshalJSON verifies how the "think" field of a
 // GenerateRequest is decoded from JSON: boolean literals yield a non-nil
 // pointer to the corresponding value, an absent field leaves the pointer nil,
 // and a non-boolean value (here a string) makes decoding fail.
 func TestThinking_UnmarshalJSON(t *testing.T) {
 	// boolPtr returns a pointer to a fresh copy of v.
 	boolPtr := func(v bool) *bool { return &v }
 	cases := []struct {
 		name      string
 		payload   string
 		wantThink *bool
 		wantErr   bool
 	}{
 		{name: "true", payload: `{ "think": true }`, wantThink: boolPtr(true)},
 		{name: "false", payload: `{ "think": false }`, wantThink: boolPtr(false)},
 		{name: "unset", payload: `{ }`},
 		{name: "invalid", payload: `{ "think": "true" }`, wantErr: true},
 	}
 	for _, tc := range cases {
 		t.Run(tc.name, func(t *testing.T) {
 			var decoded GenerateRequest
 			err := json.Unmarshal([]byte(tc.payload), &decoded)
 			if tc.wantErr {
 				require.Error(t, err)
 				return
 			}
 			require.NoError(t, err)
 			assert.Equal(t, tc.wantThink, decoded.Think)
 		})
 	}
 }
--- a/app/lifecycle/logging.go
+++ b/app/lifecycle/logging.go
@ -4,20 +4,14 @@ import (
 	"fmt"
 	"log/slog"
 	"os"
 	"path/filepath"
 	"strconv"
 	"strings"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/logutil"
 )
 func InitLogging() {
 	level := slog.LevelInfo
 	if envconfig.Debug() {
 		level = slog.LevelDebug
 	}
 	var logFile *os.File
 	var err error
 	// Detect if we're a GUI app on windows, and if not, send logs to console
@ -33,20 +27,8 @@ func InitLogging() {
 			return
 		}
 	}
 	handler := slog.NewTextHandler(logFile, &slog.HandlerOptions{
 		Level:     level,
 		AddSource: true,
 		ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr {
 			if attr.Key == slog.SourceKey {
 				source := attr.Value.Any().(*slog.Source)
 				source.File = filepath.Base(source.File)
 			}
 			return attr
 		},
 	})
 	slog.SetDefault(slog.New(handler))
 	slog.SetDefault(logutil.NewLogger(logFile, envconfig.LogLevel()))
 	slog.Info("ollama app started")
 }
--- a/benchmark/server_benchmark_test.go
+++ b/benchmark/server_benchmark_test.go
@ -1,178 +0,0 @@
 package benchmark
 import (
 	"context"
 	"flag"
 	"fmt"
 	"testing"
 	"time"
 	"github.com/ollama/ollama/api"
 )
 // Command line flags
 // modelFlag holds the value of the -m flag (the model to benchmark).
 var modelFlag string
 // init registers the -m flag with an empty default value and then overrides
 // the flag's DefValue string with "model".
 func init() {
 	flag.StringVar(&modelFlag, "m", "", "Name of the model to benchmark")
 	// NOTE(review): DefValue presumably only affects the usage/help text, not
 	// the effective default (which stays "") — confirm against the flag package.
 	flag.Lookup("m").DefValue = "model"
 }
 // modelName returns the model name supplied through the -m flag. When the
 // flag is empty it aborts the calling benchmark via b.Fatal.
 func modelName(b *testing.B) string {
 	if name := modelFlag; name != "" {
 		return name
 	}
 	b.Fatal("Error: -m flag is required for benchmark tests")
 	return modelFlag
 }
 // TestCase describes one benchmark scenario used by the cold- and warm-start
 // benchmarks in this file.
 type TestCase struct {
 	// name is used as the last component of the sub-benchmark name.
 	name      string
 	// prompt is sent as the Prompt of the generate request.
 	prompt    string
 	// maxTokens is passed as the "num_predict" option and to b.SetBytes.
 	maxTokens int
 }
 // runGenerateBenchmark contains the common generate and metrics logic.
 //
 // It sends req through client.Generate, records the time until the first
 // non-empty response chunk (ttft) and captures the metrics attached to the
 // final (Done) response. If the request fails, the benchmark is aborted with
 // b.Fatal before any metric is reported, so a failed run never publishes
 // zero-valued or partial numbers. Otherwise it reports ttft, load time,
 // prompt/generation throughput and token counts as custom benchmark metrics.
 func runGenerateBenchmark(b *testing.B, ctx context.Context, client *api.Client, req *api.GenerateRequest) {
 	start := time.Now()
 	var ttft time.Duration
 	var metrics api.Metrics
 	err := client.Generate(ctx, req, func(resp api.GenerateResponse) error {
 		// Only the first non-empty chunk defines time-to-first-token.
 		if ttft == 0 && resp.Response != "" {
 			ttft = time.Since(start)
 		}
 		if resp.Done {
 			metrics = resp.Metrics
 		}
 		return nil
 	})
 	// Check the error first: metrics from a failed request are meaningless.
 	if err != nil {
 		b.Fatal(err)
 	}
 	// rate returns tokens per second, or 0 when no time was recorded, which
 	// avoids reporting +Inf/NaN from a division by zero.
 	rate := func(tokens, seconds float64) float64 {
 		if seconds <= 0 {
 			return 0
 		}
 		return tokens / seconds
 	}
 	// Report custom metrics as part of the benchmark results
 	b.ReportMetric(float64(ttft.Milliseconds()), "ttft_ms")
 	b.ReportMetric(float64(metrics.LoadDuration.Milliseconds()), "load_ms")
 	// Token throughput metrics
 	promptThroughput := rate(float64(metrics.PromptEvalCount), metrics.PromptEvalDuration.Seconds())
 	genThroughput := rate(float64(metrics.EvalCount), metrics.EvalDuration.Seconds())
 	b.ReportMetric(promptThroughput, "prompt_tok/s")
 	b.ReportMetric(genThroughput, "gen_tok/s")
 	// Token counts
 	b.ReportMetric(float64(metrics.PromptEvalCount), "prompt_tokens")
 	b.ReportMetric(float64(metrics.EvalCount), "gen_tokens")
 }
 // BenchmarkColdStart runs benchmarks with model loading from cold state.
 //
 // For each test case it runs a sub-benchmark named "<model>/cold/<case>".
 // Every iteration first unloads the model with the timer stopped and then
 // times a single generate request via runGenerateBenchmark.
 func BenchmarkColdStart(b *testing.B) {
 	// setup fails the benchmark if the client or model is unavailable.
 	client := setup(b)
 	tests := []TestCase{
 		{"short_prompt", "Write a long story", 100},
 		{"medium_prompt", "Write a detailed economic analysis", 500},
 		{"long_prompt", "Write a comprehensive AI research paper", 1000},
 	}
 	m := modelName(b)
 	for _, tt := range tests {
 		b.Run(fmt.Sprintf("%s/cold/%s", m, tt.name), func(b *testing.B) {
 			ctx := b.Context()
 			// Set number of tokens as our throughput metric
 			b.SetBytes(int64(tt.maxTokens))
 			for b.Loop() {
 				// Exclude the unload (including the sleep inside unload) from
 				// the measured time.
 				b.StopTimer()
 				// Ensure model is unloaded before each iteration
 				unload(client, m, b)
 				b.StartTimer()
 				req := &api.GenerateRequest{
 					Model:   m,
 					Prompt:  tt.prompt,
 					Options: map[string]any{"num_predict": tt.maxTokens, "temperature": 0.1},
 				}
 				runGenerateBenchmark(b, ctx, client, req)
 			}
 		})
 	}
 }
 // BenchmarkWarmStart runs benchmarks with pre-loaded model.
 //
 // For each test case it runs a sub-benchmark named "<model>/warm/<case>".
 // The model is warmed up once per sub-benchmark, then every iteration times
 // a single generate request via runGenerateBenchmark.
 func BenchmarkWarmStart(b *testing.B) {
 	// setup fails the benchmark if the client or model is unavailable.
 	client := setup(b)
 	tests := []TestCase{
 		{"short_prompt", "Write a long story", 100},
 		{"medium_prompt", "Write a detailed economic analysis", 500},
 		{"long_prompt", "Write a comprehensive AI research paper", 1000},
 	}
 	m := modelName(b)
 	for _, tt := range tests {
 		b.Run(fmt.Sprintf("%s/warm/%s", m, tt.name), func(b *testing.B) {
 			ctx := b.Context()
 			// Pre-warm the model
 			// NOTE(review): this runs before the b.Loop below; presumably it is
 			// excluded from the measured time — confirm against b.Loop semantics.
 			warmup(client, m, tt.prompt, b)
 			// Set number of tokens as our throughput metric
 			b.SetBytes(int64(tt.maxTokens))
 			for b.Loop() {
 				req := &api.GenerateRequest{
 					Model:   m,
 					Prompt:  tt.prompt,
 					Options: map[string]any{"num_predict": tt.maxTokens, "temperature": 0.1},
 				}
 				runGenerateBenchmark(b, ctx, client, req)
 			}
 		})
 	}
 }
 // setup builds an API client from the environment and confirms that the model
 // named by the -m flag can be shown by the server. Any failure aborts the
 // benchmark; on success the client is returned.
 func setup(b *testing.B) *api.Client {
 	client, err := api.ClientFromEnvironment()
 	if err != nil {
 		b.Fatal(err)
 	}
 	ctx := b.Context()
 	showReq := &api.ShowRequest{Model: modelName(b)}
 	_, showErr := client.Show(ctx, showReq)
 	if showErr != nil {
 		b.Fatalf("Model unavailable: %v", showErr)
 	}
 	return client
 }
 // warmup issues three short generate requests (50 predicted tokens each) for
 // the given model and prompt so the model is loaded before measurements.
 // Errors are logged on b and do not abort the benchmark.
 func warmup(client *api.Client, model string, prompt string, b *testing.B) {
 	const rounds = 3
 	// Responses are irrelevant during warm-up; drop them.
 	discard := func(api.GenerateResponse) error { return nil }
 	for i := 0; i < rounds; i++ {
 		req := &api.GenerateRequest{
 			Model:   model,
 			Prompt:  prompt,
 			Options: map[string]any{"num_predict": 50, "temperature": 0.1},
 		}
 		if err := client.Generate(context.Background(), req, discard); err != nil {
 			b.Logf("Error during model warm-up: %v", err)
 		}
 	}
 }
 // unload requests that the model be unloaded by sending a generate request
 // with a zero KeepAlive, then sleeps for one second. A failed request is
 // logged on b rather than treated as fatal.
 func unload(client *api.Client, model string, b *testing.B) {
 	discard := func(api.GenerateResponse) error { return nil }
 	err := client.Generate(context.Background(), &api.GenerateRequest{
 		Model:     model,
 		KeepAlive: &api.Duration{Duration: 0},
 	}, discard)
 	if err != nil {
 		b.Logf("Unload error: %v", err)
 	}
 	// NOTE(review): presumably gives the server time to finish unloading — confirm.
 	time.Sleep(1 * time.Second)
 }
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@ -39,6 +39,7 @@ import (
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/progress"
 	"github.com/ollama/ollama/readline"
 	"github.com/ollama/ollama/runner"
 	"github.com/ollama/ollama/server"
 	"github.com/ollama/ollama/types/model"
@ -46,6 +47,23 @@ import (
 	"github.com/ollama/ollama/version"
 )
 // ensureThinkingSupport prints a warning to stderr when the model called name
 // does not list model.CapabilityThinking among its capabilities. It does
 // nothing when name is empty or when the Show request fails.
 func ensureThinkingSupport(ctx context.Context, client *api.Client, name string) {
 	if name == "" {
 		return
 	}
 	info, showErr := client.Show(ctx, &api.ShowRequest{Model: name})
 	if showErr != nil {
 		// Lookup failures are ignored: no warning is printed.
 		return
 	}
 	if !slices.Contains(info.Capabilities, model.CapabilityThinking) {
 		fmt.Fprintf(os.Stderr, "warning: model %q does not support thinking output\n", name)
 	}
 }
 var errModelfileNotFound = errors.New("specified Modelfile wasn't found")
 func getModelfileName(cmd *cobra.Command) (string, error) {
@ -265,6 +283,9 @@ func loadOrUnloadModel(cmd *cobra.Command, opts *runOptions) error {
 	req := &api.GenerateRequest{
 		Model:     opts.Model,
 		KeepAlive: opts.KeepAlive,
 		// pass Think here so we fail before getting to the chat prompt if the model doesn't support it
 		Think: opts.Think,
 	}
 	return client.Generate(cmd.Context(), req, func(api.GenerateResponse) error { return nil })
@ -299,6 +320,22 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 	}
 	opts.Format = format
 	thinkFlag := cmd.Flags().Lookup("think")
 	if thinkFlag.Changed {
 		think, err := cmd.Flags().GetBool("think")
 		if err != nil {
 			return err
 		}
 		opts.Think = &think
 	} else {
 		opts.Think = nil
 	}
 	hidethinking, err := cmd.Flags().GetBool("hidethinking")
 	if err != nil {
 		return err
 	}
 	opts.HideThinking = hidethinking
 	keepAlive, err := cmd.Flags().GetString("keepalive")
 	if err != nil {
 		return err
@ -362,6 +399,11 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 		return err
 	}
 	opts.Think, err = inferThinkingOption(&info.Capabilities, &opts, thinkFlag.Changed)
 	if err != nil {
 		return err
 	}
 	opts.MultiModal = slices.Contains(info.Capabilities, model.CapabilityVision)
 	// TODO: remove the projector info and vision info checks below,
@ -747,11 +789,38 @@ func showInfo(resp *api.ShowResponse, verbose bool, w io.Writer) error {
 				case float64:
 					v = fmt.Sprintf("%g", vData)
 				case []any:
-					n := 3
+					targetWidth := 10 // Small width where we are displaying the data in a column
-					if len(vData) < n {
+
-						n = len(vData)
+					var itemsToShow int
 					totalWidth := 1 // Start with 1 for opening bracket
 					// Find how many we can fit
 					for i := range vData {
 						itemStr := fmt.Sprintf("%v", vData[i])
 						width := runewidth.StringWidth(itemStr)
 						// Add separator width (", ") for all items except the first
 						if i > 0 {
 							width += 2
 						}
 						// Check if adding this item would exceed our width limit
 						if totalWidth+width > targetWidth && i > 0 {
 							break
 						}
 						totalWidth += width
 						itemsToShow++
 					}
 					// Format the output
 					if itemsToShow < len(vData) {
 						v = fmt.Sprintf("%v", vData[:itemsToShow])
 						v = strings.TrimSuffix(v, "]")
 						v += fmt.Sprintf(" ...+%d more]", len(vData)-itemsToShow)
 					} else {
 						v = fmt.Sprintf("%v", vData)
 					}
 					v = fmt.Sprintf("%v", vData[:n])
 				default:
 					v = fmt.Sprintf("%T", vData)
 				}
@ -772,10 +841,19 @@ func showInfo(resp *api.ShowResponse, verbose bool, w io.Writer) error {
 	head := func(s string, n int) (rows [][]string) {
 		scanner := bufio.NewScanner(strings.NewReader(s))
-		for scanner.Scan() && (len(rows) < n || n < 0) {
+		count := 0
-			if text := scanner.Text(); text != "" {
+		for scanner.Scan() {
-				rows = append(rows, []string{"", strings.TrimSpace(text)})
+			text := strings.TrimSpace(scanner.Text())
 			if text == "" {
 				continue
 			}
 			count++
 			if n < 0 || count <= n {
 				rows = append(rows, []string{"", text})
 			}
 		}
 		if n >= 0 && count > n {
 			rows = append(rows, []string{"", "..."})
 		}
 		return
 	}
@ -887,17 +965,19 @@ func PullHandler(cmd *cobra.Command, args []string) error {
 type generateContextKey string
 type runOptions struct {
-	Model       string
+	Model        string
-	ParentModel string
+	ParentModel  string
-	Prompt      string
+	Prompt       string
-	Messages    []api.Message
+	Messages     []api.Message
-	WordWrap    bool
+	WordWrap     bool
-	Format      string
+	Format       string
-	System      string
+	System       string
-	Images      []api.ImageData
+	Images       []api.ImageData
-	Options     map[string]any
+	Options      map[string]any
-	MultiModal  bool
+	MultiModal   bool
-	KeepAlive   *api.Duration
+	KeepAlive    *api.Duration
 	Think        *bool
 	HideThinking bool
 }
 type displayResponseState struct {
@ -953,6 +1033,26 @@ func displayResponse(content string, wordWrap bool, state *displayResponseState)
 	}
 }
 // thinkingOutputOpeningText returns the banner printed before thinking
 // output. With plainText set the bare banner is returned; otherwise it is
 // wrapped in readline color sequences and ends with ColorGrey.
 func thinkingOutputOpeningText(plainText bool) string {
 	const banner = "Thinking...\n"
 	if !plainText {
 		return readline.ColorGrey + readline.ColorBold + banner + readline.ColorDefault + readline.ColorGrey
 	}
 	return banner
 }
 // thinkingOutputClosingText returns the banner printed after thinking output
 // has finished. With plainText set the bare banner is returned; otherwise it
 // is wrapped in readline color sequences and ends with ColorDefault.
 func thinkingOutputClosingText(plainText bool) string {
 	const banner = "...done thinking.\n\n"
 	if !plainText {
 		return readline.ColorGrey + readline.ColorBold + banner + readline.ColorDefault
 	}
 	return banner
 }
 func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
 	client, err := api.ClientFromEnvironment()
 	if err != nil {
@ -980,14 +1080,34 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
 	var latest api.ChatResponse
 	var fullResponse strings.Builder
 	var role string
 	var thinkTagOpened bool = false
 	var thinkTagClosed bool = false
 	fn := func(response api.ChatResponse) error {
-		p.StopAndClear()
+		if response.Message.Content != "" || !opts.HideThinking {
 			p.StopAndClear()
 		}
 		latest = response
 		role = response.Message.Role
 		if response.Message.Thinking != "" && !opts.HideThinking {
 			if !thinkTagOpened {
 				fmt.Print(thinkingOutputOpeningText(false))
 				thinkTagOpened = true
 			}
 			displayResponse(response.Message.Thinking, opts.WordWrap, state)
 		}
 		content := response.Message.Content
 		if thinkTagOpened && !thinkTagClosed && content != "" {
 			fmt.Print(thinkingOutputClosingText(false))
 			thinkTagClosed = true
 		}
 		// purposefully not putting thinking blocks in the response, which would
 		// only be needed if we later added tool calling to the cli (they get
 		// filtered out anyway since current models don't expect them unless you're
 		// about to finish some tool calls)
 		fullResponse.WriteString(content)
 		displayResponse(content, opts.WordWrap, state)
@ -1004,6 +1124,7 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
 		Messages: opts.Messages,
 		Format:   json.RawMessage(opts.Format),
 		Options:  opts.Options,
 		Think:    opts.Think,
 	}
 	if opts.KeepAlive != nil {
@ -1065,13 +1186,32 @@ func generate(cmd *cobra.Command, opts runOptions) error {
 	}()
 	var state *displayResponseState = &displayResponseState{}
 	var thinkTagOpened bool = false
 	var thinkTagClosed bool = false
 	plainText := !term.IsTerminal(int(os.Stdout.Fd()))
 	fn := func(response api.GenerateResponse) error {
 		p.StopAndClear()
 		latest = response
 		content := response.Response
 		if response.Response != "" || !opts.HideThinking {
 			p.StopAndClear()
 		}
 		if response.Thinking != "" && !opts.HideThinking {
 			if !thinkTagOpened {
 				fmt.Print(thinkingOutputOpeningText(plainText))
 				thinkTagOpened = true
 			}
 			displayResponse(response.Thinking, opts.WordWrap, state)
 		}
 		if thinkTagOpened && !thinkTagClosed && content != "" {
 			fmt.Print(thinkingOutputClosingText(plainText))
 			thinkTagClosed = true
 		}
 		displayResponse(content, opts.WordWrap, state)
 		return nil
@ -1097,6 +1237,7 @@ func generate(cmd *cobra.Command, opts runOptions) error {
 		System:    opts.System,
 		Options:   opts.Options,
 		KeepAlive: opts.KeepAlive,
 		Think:     opts.Think,
 	}
 	if err := client.Generate(ctx, &request, fn); err != nil {
@ -1200,11 +1341,11 @@ func checkServerHeartbeat(cmd *cobra.Command, _ []string) error {
 		return err
 	}
 	if err := client.Heartbeat(cmd.Context()); err != nil {
-		if !strings.Contains(err.Error(), " refused") {
+		if !(strings.Contains(err.Error(), " refused") || strings.Contains(err.Error(), "could not connect")) {
 			return err
 		}
 		if err := startApp(cmd.Context(), client); err != nil {
-			return errors.New("could not connect to ollama app, is it running?")
+			return fmt.Errorf("ollama server not responding - %w", err)
 		}
 	}
 	return nil
@ -1282,7 +1423,7 @@ func NewCLI() *cobra.Command {
 	}
 	createCmd.Flags().StringP("file", "f", "", "Name of the Modelfile (default \"Modelfile\"")
-	createCmd.Flags().StringP("quantize", "q", "", "Quantize model to this level (e.g. q4_0)")
+	createCmd.Flags().StringP("quantize", "q", "", "Quantize model to this level (e.g. q4_K_M)")
 	showCmd := &cobra.Command{
 		Use:     "show MODEL",
@ -1312,6 +1453,8 @@ func NewCLI() *cobra.Command {
 	runCmd.Flags().Bool("insecure", false, "Use an insecure registry")
 	runCmd.Flags().Bool("nowordwrap", false, "Don't wrap words to the next line automatically")
 	runCmd.Flags().String("format", "", "Response format (e.g. json)")
 	runCmd.Flags().Bool("think", false, "Whether to use thinking mode for supported models")
 	runCmd.Flags().Bool("hidethinking", false, "Hide thinking output (if provided)")
 	stopCmd := &cobra.Command{
 		Use:     "stop MODEL",
@ -1363,7 +1506,6 @@ func NewCLI() *cobra.Command {
 		PreRunE: checkServerHeartbeat,
 		RunE:    ListRunningHandler,
 	}
 	copyCmd := &cobra.Command{
 		Use:     "cp SOURCE DESTINATION",
 		Short:   "Copy a model",
@ -1452,3 +1594,45 @@ func NewCLI() *cobra.Command {
 	return rootCmd
 }
 // inferThinkingOption decides which value to use for the Think option.
 //
 // If the user has explicitly set thinking options, either through the CLI or
 // through the `/set think` or `/set nothink` interactive options, then we
 // respect them and return runOpts.Think unchanged. Otherwise, we check model
 // capabilities to see if the model supports thinking. If the model does
 // support thinking, we enable it (a pointer to true is returned). Otherwise,
 // we unset the thinking option by returning nil (which is different than
 // setting it to false).
 //
 // If capabilities are not provided (caps is nil), we fetch them from the
 // server with a Show request for runOpts.Model; any error from creating the
 // client or from that request is returned to the caller.
 func inferThinkingOption(caps *[]model.Capability, runOpts *runOptions, explicitlySetByUser bool) (*bool, error) {
 	if explicitlySetByUser {
 		return runOpts.Think, nil
 	}
 	if caps == nil {
 		client, err := api.ClientFromEnvironment()
 		if err != nil {
 			return nil, err
 		}
 		ret, err := client.Show(context.Background(), &api.ShowRequest{
 			Model: runOpts.Model,
 		})
 		if err != nil {
 			return nil, err
 		}
 		caps = &ret.Capabilities
 	}
 	// Stop at the first match; the loop variable is deliberately not named
 	// `cap` so the builtin of that name is not shadowed.
 	for _, capability := range *caps {
 		if capability == model.CapabilityThinking {
 			thinking := true
 			return &thinking, nil
 		}
 	}
 	return nil, nil
 }
--- a/cmd/cmd_test.go
+++ b/cmd/cmd_test.go
@ -225,6 +225,7 @@ Weigh anchor!
  System
    You are a pirate!    
    Ahoy, matey!         
    ...                  
 `
 		if diff := cmp.Diff(expect, b.String()); diff != "" {
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@ -44,7 +44,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 		fmt.Fprintln(os.Stderr, "Use \"\"\" to begin a multi-line message.")
 		if opts.MultiModal {
-			fmt.Fprintf(os.Stderr, "Use %s to include .jpg or .png images.\n", filepath.FromSlash("/path/to/file"))
+			fmt.Fprintf(os.Stderr, "Use %s to include .jpg, .png, or .webp images.\n", filepath.FromSlash("/path/to/file"))
 		}
 		fmt.Fprintln(os.Stderr, "")
@ -62,6 +62,8 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 		fmt.Fprintln(os.Stderr, "  /set noformat          Disable formatting")
 		fmt.Fprintln(os.Stderr, "  /set verbose           Show LLM stats")
 		fmt.Fprintln(os.Stderr, "  /set quiet             Disable LLM stats")
 		fmt.Fprintln(os.Stderr, "  /set think             Enable thinking")
 		fmt.Fprintln(os.Stderr, "  /set nothink           Disable thinking")
 		fmt.Fprintln(os.Stderr, "")
 	}
@ -128,6 +130,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 	var sb strings.Builder
 	var multiline MultilineState
 	var thinkExplicitlySet bool = opts.Think != nil
 	for {
 		line, err := scanner.Readline()
@ -195,11 +198,19 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 			opts.Model = args[1]
 			opts.Messages = []api.Message{}
 			fmt.Printf("Loading model '%s'\n", opts.Model)
 			opts.Think, err = inferThinkingOption(nil, &opts, thinkExplicitlySet)
 			if err != nil {
 				return err
 			}
 			if err := loadOrUnloadModel(cmd, &opts); err != nil {
 				if strings.Contains(err.Error(), "not found") {
 					fmt.Printf("error: %v\n", err)
 					continue
 				}
 				if strings.Contains(err.Error(), "does not support thinking") {
 					fmt.Printf("error: %v\n", err)
 					continue
 				}
 				return err
 			}
 			continue
@ -260,6 +271,22 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 						return err
 					}
 					fmt.Println("Set 'quiet' mode.")
 				case "think":
 					think := true
 					opts.Think = &think
 					thinkExplicitlySet = true
 					if client, err := api.ClientFromEnvironment(); err == nil {
 						ensureThinkingSupport(cmd.Context(), client, opts.Model)
 					}
 					fmt.Println("Set 'think' mode.")
 				case "nothink":
 					think := false
 					opts.Think = &think
 					thinkExplicitlySet = true
 					if client, err := api.ClientFromEnvironment(); err == nil {
 						ensureThinkingSupport(cmd.Context(), client, opts.Model)
 					}
 					fmt.Println("Set 'nothink' mode.")
 				case "format":
 					if len(args) < 3 || args[2] != "json" {
 						fmt.Println("Invalid or missing format. For 'json' mode use '/set format json'")
@ -448,6 +475,11 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 			assistant, err := chat(cmd, opts)
 			if err != nil {
 				if strings.Contains(err.Error(), "does not support thinking") {
 					fmt.Printf("error: %v\n", err)
 					sb.Reset()
 					continue
 				}
 				return err
 			}
 			if assistant != nil {
@ -511,7 +543,7 @@ func extractFileNames(input string) []string {
 	// Regex to match file paths starting with optional drive letter, / ./ \ or .\ and include escaped or unescaped spaces (\ or %20)
 	// and followed by more characters and a file extension
 	// This will capture non filename strings, but we'll check for file existence to remove mismatches
-	regexPattern := `(?:[a-zA-Z]:)?(?:\./|/|\\)[\S\\ ]+?\.(?i:jpg|jpeg|png)\b`
+	regexPattern := `(?:[a-zA-Z]:)?(?:\./|/|\\)[\S\\ ]+?\.(?i:jpg|jpeg|png|webp)\b`
 	re := regexp.MustCompile(regexPattern)
 	return re.FindAllString(input, -1)
@ -531,6 +563,8 @@ func extractFileData(input string) (string, []api.ImageData, error) {
 			return "", imgs, err
 		}
 		fmt.Fprintf(os.Stderr, "Added image '%s'\n", nfp)
 		input = strings.ReplaceAll(input, "'"+nfp+"'", "")
 		input = strings.ReplaceAll(input, "'"+fp+"'", "")
 		input = strings.ReplaceAll(input, fp, "")
 		imgs = append(imgs, data)
 	}
@ -551,7 +585,7 @@ func getImageData(filePath string) ([]byte, error) {
 	}
 	contentType := http.DetectContentType(buf)
-	allowedTypes := []string{"image/jpeg", "image/jpg", "image/png"}
+	allowedTypes := []string{"image/jpeg", "image/jpg", "image/png", "image/webp"}
 	if !slices.Contains(allowedTypes, contentType) {
 		return nil, fmt.Errorf("invalid image type: %s", contentType)
 	}
--- a/cmd/interactive_test.go
+++ b/cmd/interactive_test.go
@ -1,6 +1,8 @@
 package cmd
 import (
 	"os"
 	"path/filepath"
 	"testing"
 	"github.com/stretchr/testify/assert"
@ -10,14 +12,17 @@ func TestExtractFilenames(t *testing.T) {
 	// Unix style paths
 	input := ` some preamble 
 ./relative\ path/one.png inbetween1 ./not a valid two.jpg inbetween2 ./1.svg
-/unescaped space /three.jpeg inbetween3 /valid\ path/dir/four.png "./quoted with spaces/five.JPG`
+/unescaped space /three.jpeg inbetween3 /valid\ path/dir/four.png "./quoted with spaces/five.JPG
 /unescaped space /six.webp inbetween6 /valid\ path/dir/seven.WEBP`
 	res := extractFileNames(input)
-	assert.Len(t, res, 5)
+	assert.Len(t, res, 7)
 	assert.Contains(t, res[0], "one.png")
 	assert.Contains(t, res[1], "two.jpg")
 	assert.Contains(t, res[2], "three.jpeg")
 	assert.Contains(t, res[3], "four.png")
 	assert.Contains(t, res[4], "five.JPG")
 	assert.Contains(t, res[5], "six.webp")
 	assert.Contains(t, res[6], "seven.WEBP")
 	assert.NotContains(t, res[4], '"')
 	assert.NotContains(t, res, "inbetween1")
 	assert.NotContains(t, res, "./1.svg")
@ -28,10 +33,12 @@ func TestExtractFilenames(t *testing.T) {
 /absolute/nospace/three.jpeg inbetween3 /absolute/with space/four.png inbetween4
 ./relative\ path/five.JPG inbetween5 "./relative with/spaces/six.png inbetween6
 d:\path with\spaces\seven.JPEG inbetween7 c:\users\jdoe\eight.png inbetween8 
- d:\program files\someplace\nine.png inbetween9 "E:\program files\someplace\ten.PNG some ending
+ d:\program files\someplace\nine.png inbetween9 "E:\program files\someplace\ten.PNG
 c:/users/jdoe/eleven.webp inbetween11 c:/program files/someplace/twelve.WebP inbetween12
 d:\path with\spaces\thirteen.WEBP some ending
 `
 	res = extractFileNames(input)
-	assert.Len(t, res, 10)
+	assert.Len(t, res, 13)
 	assert.NotContains(t, res, "inbetween2")
 	assert.Contains(t, res[0], "one.png")
 	assert.Contains(t, res[0], "c:")
@ -49,4 +56,31 @@ d:\path with\spaces\seven.JPEG inbetween7 c:\users\jdoe\eight.png inbetween8
 	assert.Contains(t, res[8], "d:")
 	assert.Contains(t, res[9], "ten.PNG")
 	assert.Contains(t, res[9], "E:")
 	assert.Contains(t, res[10], "eleven.webp")
 	assert.Contains(t, res[10], "c:")
 	assert.Contains(t, res[11], "twelve.WebP")
 	assert.Contains(t, res[11], "c:")
 	assert.Contains(t, res[12], "thirteen.WEBP")
 	assert.Contains(t, res[12], "d:")
 }
 // Ensure that file paths wrapped in single quotes are removed with the quotes.
 func TestExtractFileDataRemovesQuotedFilepath(t *testing.T) {
 	dir := t.TempDir()
 	fp := filepath.Join(dir, "img.jpg")
 	// Zero-padded buffer that starts with a JPEG/JFIF header (FF D8 FF E0 ...
 	// "JFIF"), presumably so content sniffing in getImageData accepts the file
 	// as image/jpeg.
 	data := make([]byte, 600)
 	copy(data, []byte{
 		0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 'J', 'F', 'I', 'F',
 		0x00, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
 		0xff, 0xd9,
 	})
 	if err := os.WriteFile(fp, data, 0o600); err != nil {
 		t.Fatalf("failed to write test image: %v", err)
 	}
 	input := "before '" + fp + "' after"
 	cleaned, imgs, err := extractFileData(input)
 	assert.NoError(t, err)
 	assert.Len(t, imgs, 1)
 	// testify's Equal takes (t, expected, actual); keeping that order makes the
 	// failure message label the two values correctly.
 	assert.Equal(t, "before  after", cleaned)
 }
--- a/cmd/start_darwin.go
+++ b/cmd/start_darwin.go
@ -5,7 +5,7 @@ import (
 	"errors"
 	"os"
 	"os/exec"
-	"strings"
+	"regexp"
 	"github.com/ollama/ollama/api"
 )
@ -19,11 +19,12 @@ func startApp(ctx context.Context, client *api.Client) error {
 	if err != nil {
 		return err
 	}
-	if !strings.Contains(link, "Ollama.app") {
+	r := regexp.MustCompile(`^.*/Ollama\s?\d*.app`)
 	m := r.FindStringSubmatch(link)
 	if len(m) != 1 {
 		return errors.New("could not find ollama app")
 	}
-	path := strings.Split(link, "Ollama.app")
+	if err := exec.Command("/usr/bin/open", "-j", "-a", m[0], "--args", "--fast-startup").Run(); err != nil {
 	if err := exec.Command("/usr/bin/open", "-a", path[0]+"Ollama.app").Run(); err != nil {
 		return err
 	}
 	return waitForServer(ctx, client)
--- a/cmd/start_windows.go
+++ b/cmd/start_windows.go
@ -4,17 +4,27 @@ import (
 	"context"
 	"errors"
 	"fmt"
 	"log/slog"
 	"os"
 	"os/exec"
 	"path"
 	"path/filepath"
 	"strings"
 	"syscall"
 	"unsafe"
 	"github.com/ollama/ollama/api"
 	"golang.org/x/sys/windows"
 )
 const (
 	// Installer is the executable name of the Ollama installer. startApp
 	// checks for a running process with this name and reports an upgrade in
 	// progress instead of launching the app.
 	Installer = "OllamaSetup.exe"
 )
 func startApp(ctx context.Context, client *api.Client) error {
-	// log.Printf("XXX Attempting to find and start ollama app")
+	if len(isProcRunning(Installer)) > 0 {
 		return fmt.Errorf("upgrade in progress...")
 	}
 	AppName := "ollama app.exe"
 	exe, err := os.Executable()
 	if err != nil {
@ -35,14 +45,11 @@ func startApp(ctx context.Context, client *api.Client) error {
 			}
 		}
 	}
 	// log.Printf("XXX attempting to start app %s", appExe)
 	cmd_path := "c:\\Windows\\system32\\cmd.exe"
-	cmd := exec.Command(cmd_path, "/c", appExe)
+	cmd := exec.Command(cmd_path, "/c", appExe, "--hide", "--fast-startup")
 	// TODO - these hide flags aren't working - still pops up a command window for some reason
 	cmd.SysProcAttr = &syscall.SysProcAttr{CreationFlags: 0x08000000, HideWindow: true}
 	// TODO this didn't help either...
 	cmd.Stdin = strings.NewReader("")
 	cmd.Stdout = os.Stdout
 	cmd.Stderr = os.Stderr
@ -56,3 +63,50 @@ func startApp(ctx context.Context, client *api.Client) error {
 	}
 	return waitForServer(ctx, client)
 }
 // isProcRunning returns the PIDs of all running processes whose main module
 // base name equals procName (compared case-insensitively). It returns nil if
 // the process list cannot be enumerated. Processes that cannot be opened or
 // queried are silently skipped.
 func isProcRunning(procName string) []uint32 {
 	const pidSize = uint32(unsafe.Sizeof(uint32(0)))
 	pids := make([]uint32, 2048)
 	var ret uint32
 	for {
 		if err := windows.EnumProcesses(pids, &ret); err != nil || ret == 0 {
 			slog.Debug("failed to check for running installers", "error", err)
 			return nil
 		}
 		// EnumProcesses reports the number of BYTES written, not the number of
 		// PIDs. A completely filled buffer means the list may have been
 		// truncated, so grow the buffer and enumerate again.
 		if ret/pidSize < uint32(len(pids)) {
 			break
 		}
 		pids = make([]uint32, 2*len(pids))
 	}
 	pids = pids[:ret/pidSize]
 	var matches []uint32
 	for _, pid := range pids {
 		if pid == 0 {
 			continue
 		}
 		exeFile, ok := processBaseName(pid)
 		if !ok {
 			continue
 		}
 		if strings.EqualFold(exeFile, procName) {
 			matches = append(matches, pid)
 		}
 	}
 	return matches
 }
 // processBaseName returns the lower-cased base name of the first module of the
 // process identified by pid. ok is false when the process cannot be opened or
 // queried. The process handle is closed before returning, so scanning many
 // processes does not accumulate open handles.
 func processBaseName(pid uint32) (name string, ok bool) {
 	hProcess, err := windows.OpenProcess(windows.PROCESS_QUERY_INFORMATION|windows.PROCESS_VM_READ, false, pid)
 	if err != nil {
 		return "", false
 	}
 	defer windows.CloseHandle(hProcess)
 	// Only the first module handle is requested; cbNeeded is required by the
 	// API but not used here.
 	var module windows.Handle
 	var cbNeeded uint32
 	cb := uint32(unsafe.Sizeof(module))
 	if err := windows.EnumProcessModules(hProcess, &module, cb, &cbNeeded); err != nil {
 		return "", false
 	}
 	moduleName := make([]uint16, 1024*8)
 	// The size argument is a count of UTF-16 characters, not bytes.
 	if err := windows.GetModuleBaseName(hProcess, module, &moduleName[0], uint32(len(moduleName))); err != nil && err != syscall.ERROR_INSUFFICIENT_BUFFER {
 		return "", false
 	}
 	return path.Base(strings.ToLower(syscall.UTF16ToString(moduleName))), true
 }
--- a/cmd/warn_thinking_test.go
+++ b/cmd/warn_thinking_test.go
@ -0,0 +1,63 @@
 package cmd
 import (
 	"encoding/json"
 	"io"
 	"net/http"
 	"net/http/httptest"
 	"os"
 	"strings"
 	"testing"
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/types/model"
 )
 // Test that a warning is printed when thinking is requested but not supported.
 //
 // Each case runs as a subtest against its own stub server that answers
 // POST /api/show with the case's capabilities, while os.Stderr is redirected
 // into a pipe so the output of ensureThinkingSupport can be inspected.
 func TestWarnMissingThinking(t *testing.T) {
 	cases := []struct {
 		name         string
 		capabilities []model.Capability
 		expectWarn   bool
 	}{
 		{name: "thinking supported", capabilities: []model.Capability{model.CapabilityThinking}, expectWarn: false},
 		{name: "thinking unsupported", capabilities: []model.Capability{}, expectWarn: true},
 	}
 	for _, tc := range cases {
 		t.Run(tc.name, func(t *testing.T) {
 			srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 				// The handler runs on the server's goroutine, where FailNow-style
 				// calls (t.Fatalf) are not allowed; record the failure with Errorf
 				// and answer with an HTTP error instead.
 				if r.URL.Path != "/api/show" || r.Method != http.MethodPost {
 					t.Errorf("unexpected request to %s %s", r.URL.Path, r.Method)
 					http.Error(w, "unexpected request", http.StatusNotFound)
 					return
 				}
 				var req api.ShowRequest
 				if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
 					t.Errorf("decode request: %v", err)
 					http.Error(w, "bad request", http.StatusBadRequest)
 					return
 				}
 				resp := api.ShowResponse{Capabilities: tc.capabilities}
 				if err := json.NewEncoder(w).Encode(resp); err != nil {
 					t.Errorf("encode response: %v", err)
 				}
 			}))
 			// Scoped to the subtest so each server is shut down before the next
 			// case starts.
 			defer srv.Close()
 			t.Setenv("OLLAMA_HOST", srv.URL)
 			client, err := api.ClientFromEnvironment()
 			if err != nil {
 				t.Fatal(err)
 			}
 			r, w, err := os.Pipe()
 			if err != nil {
 				t.Fatalf("create pipe: %v", err)
 			}
 			defer r.Close()
 			oldStderr := os.Stderr
 			os.Stderr = w
 			ensureThinkingSupport(t.Context(), client, "m")
 			os.Stderr = oldStderr
 			// Close the write end so ReadAll sees EOF.
 			w.Close()
 			out, err := io.ReadAll(r)
 			if err != nil {
 				t.Fatalf("read captured stderr: %v", err)
 			}
 			warned := strings.Contains(string(out), "warning:")
 			if tc.expectWarn && !warned {
 				t.Errorf("expected warning, got none")
 			}
 			if !tc.expectWarn && warned {
 				t.Errorf("did not expect warning, got: %s", string(out))
 			}
 		})
 	}
 }
--- a/convert/convert.go
+++ b/convert/convert.go
@ -1,6 +1,7 @@
 package convert
 import (
 	"cmp"
 	"encoding/json"
 	"errors"
 	"fmt"
@ -14,13 +15,12 @@ import (
 )
 type ModelParameters struct {
-	Architectures []string       `json:"architectures"`
+	Architectures []string `json:"architectures"`
-	VocabSize     uint32         `json:"vocab_size"`
+	VocabSize     uint32   `json:"vocab_size"`
 	TextModel     TextParameters `json:"text_config"`
 }
-type TextParameters struct {
+	TextModel struct {
-	VocabSize uint32 `json:"vocab_size"`
+		VocabSize uint32 `json:"vocab_size"`
 	} `json:"text_config"`
 }
 type AdapterParameters struct {
@ -53,8 +53,11 @@ func (ModelParameters) KV(t *Tokenizer) ggml.KV {
 	}
 	for _, sv := range t.SpecialVocabulary {
 		kv[fmt.Sprintf("tokenizer.ggml.%s_token_id", sv.Key())] = uint32(sv.ID)
 		kv[fmt.Sprintf("tokenizer.ggml.add_%s_token", sv.Key())] = sv.AddToken
 		kv[fmt.Sprintf("tokenizer.ggml.%s_token_id", sv.Key())] = uint32(sv.ID)
 		if len(sv.IDs) > 0 {
 			kv[fmt.Sprintf("tokenizer.ggml.%s_token_ids", sv.Key())] = sv.IDs
 		}
 	}
 	return kv
@ -173,6 +176,8 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
 	switch p.Architectures[0] {
 	case "LlamaForCausalLM":
 		conv = &llamaModel{}
 	case "MllamaForConditionalGeneration":
 		conv = &mllamaModel{}
 	case "Llama4ForConditionalGeneration":
 		conv = &llama4Model{}
 	case "Mistral3ForConditionalGeneration":
@ -189,6 +194,8 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
 		conv = &phi3Model{}
 	case "Qwen2ForCausalLM":
 		conv = &qwen2Model{}
 	case "Qwen2_5_VLForConditionalGeneration":
 		conv = &qwen25VLModel{}
 	case "BertModel":
 		conv = &bertModel{}
 	case "CohereForCausalLM":
@ -212,24 +219,22 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
 		return err
 	}
-	vocabSize := int(p.VocabSize)
+	vocabSize := int(cmp.Or(p.VocabSize, p.TextModel.VocabSize))
 	if vocabSize == 0 {
 		tVocabSize := int(p.TextModel.VocabSize)
 		vocabSize = tVocabSize
 	}
 	switch {
 	case vocabSize == 0:
-		slog.Warn("vocabulary size was not explicitly set by the model", "default size", len(t.Vocabulary.Tokens))
+		slog.Debug("vocabulary size was not explicitly set by the model", "default size", len(t.Vocabulary.Tokens))
 	case vocabSize > len(t.Vocabulary.Tokens):
-		slog.Warn("vocabulary is smaller than expected, padding with dummy tokens", "expect", vocabSize, "actual", len(t.Vocabulary.Tokens))
+		slog.Debug("vocabulary is smaller than expected, padding with dummy tokens", "expect", vocabSize, "actual", len(t.Vocabulary.Tokens))
 		for i := range vocabSize - len(t.Vocabulary.Tokens) {
 			t.Vocabulary.Tokens = append(t.Vocabulary.Tokens, fmt.Sprintf("[PAD%d]", i))
 			t.Vocabulary.Scores = append(t.Vocabulary.Scores, -1)
 			t.Vocabulary.Types = append(t.Vocabulary.Types, tokenTypeUserDefined)
 		}
 	case vocabSize < len(t.Vocabulary.Tokens):
-		return fmt.Errorf("vocabulary is larger than expected '%d' instead of '%d'", len(t.Vocabulary.Tokens), vocabSize)
+		slog.Debug("vocabulary is larger than expected", "want", vocabSize, "got", len(t.Vocabulary.Tokens))
 		p.VocabSize = uint32(len(t.Vocabulary.Tokens))
 		p.TextModel.VocabSize = uint32(len(t.Vocabulary.Tokens))
 	default:
 		slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens))
 	}
--- a/convert/convert_llama.go
+++ b/convert/convert_llama.go
@ -139,7 +139,8 @@ func (p *llamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
 	}
 	for _, t := range ts {
-		if strings.HasSuffix(t.Name(), "attn_q.weight") || strings.HasSuffix(t.Name(), "attn_k.weight") {
+		if strings.HasSuffix(t.Name(), "attn_q.weight") || strings.HasSuffix(t.Name(), "attn_k.weight") ||
 			strings.HasSuffix(t.Name(), "attn_q_proj.weight") || strings.HasSuffix(t.Name(), "attn_k_proj.weight") {
 			if !p.skipRepack {
 				t.SetRepacker(p.repack)
 			}
@ -181,9 +182,9 @@ func (p *llamaModel) repack(name string, data []float32, shape []uint64) ([]floa
 	}
 	var heads uint32
-	if strings.HasSuffix(name, "attn_q.weight") {
+	if strings.HasSuffix(name, "attn_q.weight") || strings.HasSuffix(name, "attn_q_proj.weight") {
 		heads = p.NumAttentionHeads
-	} else if strings.HasSuffix(name, "attn_k.weight") {
+	} else if strings.HasSuffix(name, "attn_k.weight") || strings.HasSuffix(name, "attn_k_proj.weight") {
 		heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
 	} else {
 		return nil, fmt.Errorf("unknown tensor for repack: %s", name)
--- a/convert/convert_mixtral.go
+++ b/convert/convert_mixtral.go
@ -2,9 +2,6 @@ package convert
 import (
 	"fmt"
 	"io"
 	"slices"
 	"strings"
 	"github.com/ollama/ollama/fs/ggml"
 )
@ -30,65 +27,38 @@ func (p *mixtralModel) KV(t *Tokenizer) ggml.KV {
 }
 func (p *mixtralModel) Tensors(ts []Tensor) []*ggml.Tensor {
-	oldnew := []string{
+	merges := make([]merge, 0, p.NumHiddenLayers*6)
-		"model.layers", "blk",
+	for i := range p.NumHiddenLayers {
-		"w1", "ffn_gate_exps",
+		merges = append(merges, merge{
-		"w2", "ffn_down_exps",
+			fmt.Sprintf("blk.%d.*.w1.weight", i),
-		"w3", "ffn_up_exps",
+			fmt.Sprintf("blk.%d.ffn_gate_exps.weight", i),
-	}
+		}, merge{
-
+			fmt.Sprintf("blk.%d.*.w1.bias", i),
-	for i := range p.NumLocalExperts {
+			fmt.Sprintf("blk.%d.ffn_gate_exps.bias", i),
-		oldnew = append(oldnew, fmt.Sprintf(".block_sparse_moe.experts.%d.", i), ".")
+		}, merge{
-	}
+			fmt.Sprintf("blk.%d.*.w2.weight", i),
-
+			fmt.Sprintf("blk.%d.ffn_up_exps.weight", i),
-	// group experts of the same layer (model.layers.%d) and type (w[123]) into a single tensor
+		}, merge{
-	namer := strings.NewReplacer(oldnew...)
+			fmt.Sprintf("blk.%d.*.w2.bias", i),
-	experts := make(map[string]experts)
+			fmt.Sprintf("blk.%d.ffn_up_exps.bias", i),
-
+		}, merge{
-	// merge experts into a single tensor while removing them from ts
+			fmt.Sprintf("blk.%d.*.w3.weight", i),
-	ts = slices.DeleteFunc(ts, func(t Tensor) bool {
+			fmt.Sprintf("blk.%d.ffn_down_exps.weight", i),
-		if !strings.Contains(t.Name(), ".block_sparse_moe.experts.") {
+		}, merge{
-			return false
+			fmt.Sprintf("blk.%d.*.w3.bias", i),
-		}
+			fmt.Sprintf("blk.%d.ffn_down_exps.bias", i),
 		name := namer.Replace(t.Name())
 		experts[name] = append(experts[name], t)
 		return true
 	})
 	var out []*ggml.Tensor
 	for n, e := range experts {
 		// TODO(mxyng): sanity check experts
 		out = append(out, &ggml.Tensor{
 			Name:     n,
 			Kind:     e[0].Kind(),
 			Shape:    append([]uint64{uint64(len(e))}, e[0].Shape()...),
 			WriterTo: e,
 		})
 	}
 	out, ts := mergeTensors(ts, merges...)
 	return append(out, p.llamaModel.Tensors(ts)...)
 }
 // Replacements returns the embedded llamaModel's replacement pairs followed by
 // the mixtral-specific ones. Entries are flat old/new string pairs.
 func (p *mixtralModel) Replacements() []string {
 	pairs := p.llamaModel.Replacements()
 	pairs = append(pairs, "model.layers", "blk")
 	pairs = append(pairs, "block_sparse_moe.gate", "ffn_gate_inp")
 	pairs = append(pairs, "block_sparse_moe.experts.", ".")
 	return pairs
 }
 // experts is a group of expert tensors that is written out as one merged
 // tensor by emitting each member in sequence.
 type experts []Tensor
 // WriteTo writes every expert tensor to w in slice order and returns the total
 // number of bytes written. On the first failing write it stops and returns the
 // bytes written so far together with that error.
 func (e experts) WriteTo(w io.Writer) (int64, error) {
 	// TODO(mxyng): experts _should_ be numerically sorted by expert but this should check
 	var total int64
 	for _, t := range e {
 		// the canonical merged experts tensor stacks all experts along a new, 0 axis,
 		// e.g. `tensor.Stack(0, e[0], e[1:]...)`, which requires allocating temporary buffers
 		// this accomplishes the same thing by writing each expert tensor in sequence
 		n, err := t.WriteTo(w)
 		total += n
 		if err != nil {
 			return total, err
 		}
 	}
 	return total, nil
 }
--- a/convert/convert_mllama.go
+++ b/convert/convert_mllama.go
@ -0,0 +1,179 @@
 package convert
 import (
 	"strings"
 	"github.com/ollama/ollama/fs/ggml"
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"
 )
 // mllamaModel holds the configuration used to convert an mllama model. The
 // struct tags map the text settings to the "text_config" JSON object and the
 // vision settings to "vision_config".
 type mllamaModel struct {
 	ModelParameters
 	// TextModel is the text half: an embedded llamaModel plus the list of
 	// cross-attention layers.
 	TextModel struct {
 		llamaModel
 		CrossAttentionLayers []int32 `json:"cross_attention_layers"`
 	} `json:"text_config"`
 	// VisionModel is the vision half; KV writes most of these fields under
 	// "mllama.vision.*" keys.
 	VisionModel struct {
 		NumHiddenLayers           uint32  `json:"num_hidden_layers"`
 		NumGlobalLayers           uint32  `json:"num_global_layers"`
 		IntermediateLayersIndices []int32 `json:"intermediate_layers_indices"`
 		HiddenSize       uint32 `json:"hidden_size"`
 		IntermediateSize uint32 `json:"intermediate_size"`
 		// AttentionHeads is also used by repack to split attn_q/attn_k weights.
 		AttentionHeads uint32 `json:"attention_heads"`
 		ImageSize   uint32  `json:"image_size"`
 		PatchSize   uint32  `json:"patch_size"`
 		NumChannels uint32  `json:"num_channels"`
 		MaxNumTiles uint32  `json:"max_num_tiles"`
 		NormEpsilon float32 `json:"norm_eps"`
 		// NOTE(review): RopeTheta is not written by KV in this file, and its tag
 		// looks like an output key rather than a config field name - confirm.
 		RopeTheta   float32 `json:"rope.freq_base"`
 	} `json:"vision_config"`
 }
 // KV builds the key/value metadata for an mllama model. It starts from the
 // shared ModelParameters values, sets the architecture to "mllama", copies
 // every "llama."-prefixed key produced by the text model with "llama."
 // rewritten to "mllama.", and then adds the cross-attention and vision
 // settings.
 func (m *mllamaModel) KV(t *Tokenizer) ggml.KV {
 	kv := m.ModelParameters.KV(t)
 	kv["general.architecture"] = "mllama"
 	textKV := m.TextModel.KV(t)
 	for key, value := range textKV {
 		if !strings.HasPrefix(key, "llama.") {
 			continue
 		}
 		renamed := strings.ReplaceAll(key, "llama.", "mllama.")
 		kv[renamed] = value
 	}
 	kv["mllama.attention.cross_attention_layers"] = m.TextModel.CrossAttentionLayers
 	vision := &m.VisionModel
 	// layer layout
 	kv["mllama.vision.block_count"] = vision.NumHiddenLayers
 	kv["mllama.vision.global.block_count"] = vision.NumGlobalLayers
 	kv["mllama.vision.intermediate_layers_indices"] = vision.IntermediateLayersIndices
 	// layer dimensions
 	kv["mllama.vision.embedding_length"] = vision.HiddenSize
 	kv["mllama.vision.feed_forward_length"] = vision.IntermediateSize
 	// attention
 	kv["mllama.vision.attention.head_count"] = vision.AttentionHeads
 	kv["mllama.vision.attention.layer_norm_epsilon"] = vision.NormEpsilon
 	// image input
 	kv["mllama.vision.image_size"] = vision.ImageSize
 	kv["mllama.vision.patch_size"] = vision.PatchSize
 	kv["mllama.vision.max_num_tiles"] = vision.MaxNumTiles
 	kv["mllama.vision.num_channels"] = vision.NumChannels
 	return kv
 }
 // Replacements returns the text model's replacement pairs followed by the
 // mllama-specific ones. Entries are flat old/new string pairs and their order
 // is kept exactly as listed.
 func (m *mllamaModel) Replacements() []string {
 	extra := []string{
 		"language_model.", "",
 		"gate_attn", "attn_gate",
 		"gate_ffn", "ffn_gate",
 		"cross_attn.", "cross_attn_",
 		"vision_model", "v",
 		"class_embedding", "class_embd",
 		"patch_embedding", "patch_embd",
 		"gated_positional_embedding.tile_embedding", "tile_position_embd",
 		"gated_positional_embedding.embedding", "position_embd.weight",
 		"gated_positional_embedding", "position_embd",
 		"embedding.weight", "weight",
 		"pre_tile_positional_embedding", "pre_tile_position_embd",
 		"post_tile_positional_embedding", "post_tile_position_embd",
 		"layernorm_pre", "pre_ln",
 		"layernorm_post", "post_ln",
 		"global_transformer.layers", "global.blk",
 		"transformer.layers", "blk",
 		"mlp.fc1", "ffn_up",
 		"mlp.fc2", "ffn_down",
 		"multi_modal_projector", "mm.0",
 	}
 	return append(m.TextModel.Replacements(), extra...)
 }
 // Tensors converts the input tensors into ggml tensors. Tensors whose names do
 // not start with "v." or "mm." are collected and handed to the text model,
 // whose output is appended after the tensors handled here. The tensor named
 // "v.position_embd.gate" is emitted twice from clones, once under its own name
 // and once as "v.tile_position_embd.gate", each with its own repacker. Of the
 // remaining tensors, the tile position gates, attn_q/attn_k weights and
 // attn_gate/ffn_gate tensors get a repacker; everything else passes through.
 func (m *mllamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
 	var out []*ggml.Tensor
 	var text []Tensor
 	for _, t := range ts {
 		tname := t.Name()
 		switch {
 		case !strings.HasPrefix(tname, "v.") && !strings.HasPrefix(tname, "mm."):
 			text = append(text, t)
 		case tname == "v.position_embd.gate":
 			for _, gate := range []string{"v.position_embd.gate", "v.tile_position_embd.gate"} {
 				clone := t.Clone()
 				clone.SetRepacker(m.repack(gate))
 				out = append(out, &ggml.Tensor{
 					Name:     gate,
 					Kind:     t.Kind(),
 					Shape:    t.Shape(),
 					WriterTo: clone,
 				})
 			}
 		default:
 			tileGate := tname == "v.pre_tile_position_embd.gate" || tname == "v.post_tile_position_embd.gate"
 			attnQK := strings.HasSuffix(tname, "attn_q.weight") || strings.HasSuffix(tname, "attn_k.weight")
 			layerGate := strings.HasSuffix(tname, "attn_gate") || strings.HasSuffix(tname, "ffn_gate")
 			if tileGate || attnQK || layerGate {
 				t.SetRepacker(m.repack(tname))
 			}
 			out = append(out, &ggml.Tensor{
 				Name:     t.Name(),
 				Kind:     t.Kind(),
 				Shape:    t.Shape(),
 				WriterTo: t,
 			})
 		}
 	}
 	return append(out, m.TextModel.Tensors(text)...)
 }
 // repack returns a Repacker for the tensor called name. The returned function
 // ignores its own name argument and uses the captured name instead.
 //
 // For attn_q/attn_k weights the data is reshaped using the vision attention
 // head count, its axes are permuted, it is reshaped back to the original
 // dimensions and then transposed. Every other tensor has tanh applied; for
 // "v.position_embd.gate" the result is additionally replaced by 1 - tanh(x).
 // The result is always returned as a flat []float32.
 func (m *mllamaModel) repack(name string) Repacker {
 	isAttnQK := strings.HasSuffix(name, "attn_q.weight") || strings.HasSuffix(name, "attn_k.weight")
 	return func(_ string, data []float32, shape []uint64) ([]float32, error) {
 		dims := make([]int, 0, len(shape))
 		for _, dim := range shape {
 			dims = append(dims, int(dim))
 		}
 		var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
 		if isAttnQK {
 			heads := int(m.VisionModel.AttentionHeads)
 			// split the leading dimension into (heads, 2, dims[0]/heads/2)
 			split := append([]int{heads, 2, dims[0] / heads / 2}, dims[1:]...)
 			if err := t.Reshape(split...); err != nil {
 				return nil, err
 			}
 			if err := t.T(0, 2, 1, 3); err != nil {
 				return nil, err
 			}
 			if err := t.Reshape(dims...); err != nil {
 				return nil, err
 			}
 			if err := t.Transpose(); err != nil {
 				return nil, err
 			}
 		} else {
 			activated, err := tensor.Tanh(t)
 			if err != nil {
 				return nil, err
 			}
 			t = activated
 			if name == "v.position_embd.gate" {
 				inverted, err := tensor.Sub(float32(1), t)
 				if err != nil {
 					return nil, err
 				}
 				t = inverted
 			}
 		}
 		t = tensor.Materialize(t)
 		// flatten the tensor so it can be returned as a vector
 		if err := t.Reshape(t.Shape().TotalSize()); err != nil {
 			return nil, err
 		}
 		return native.VectorF32(t.(*tensor.Dense))
 	}
 }
--- a/convert/convert_qwen2.go
+++ b/convert/convert_qwen2.go
@ -15,6 +15,7 @@ type qwen2Model struct {
 		Type                          string     `json:"type"`
 		Factor                        ropeFactor `json:"factor"`
 		OriginalMaxPositionEmbeddings uint32     `json:"original_max_position_embeddings"`
 		MropeSection                  []int32    `json:"mrope_section"`
 	} `json:"rope_scaling"`
 	RMSNormEPS float32 `json:"rms_norm_eps"`
 }
@ -39,6 +40,8 @@ func (q *qwen2Model) KV(t *Tokenizer) ggml.KV {
 	case "yarn":
 		kv["qwen2.rope.scaling.type"] = q.RopeScaling.Type
 		kv["qwen2.rope.scaling.factor"] = q.RopeScaling.Factor
 	case "mrope", "default":
 		kv["qwen2.rope.mrope_section"] = q.RopeScaling.MropeSection
 	default:
 		panic("unknown rope scaling type")
 	}
--- a/convert/convert_qwen25vl.go
+++ b/convert/convert_qwen25vl.go
@ -0,0 +1,102 @@
 package convert
 import (
 	"cmp"
 	"slices"
 	"strings"
 	"github.com/ollama/ollama/fs/ggml"
 )
 // qwen25VLModel holds the configuration used to convert a Qwen2.5-VL
 // checkpoint. It embeds qwen2Model and adds the vision settings decoded from
 // the "vision_config" JSON object.
 type qwen25VLModel struct {
 	qwen2Model
 	VisionModel struct {
 		Depth               uint32  `json:"depth"`
 		HiddenSize          uint32  `json:"hidden_size"`
 		NumHeads            uint32  `json:"num_heads"`
 		InChannels          uint32  `json:"in_chans"`
 		PatchSize           uint32  `json:"patch_size"`
 		SpatialMergeSize    uint32  `json:"spatial_merge_size"`
 		SpatialPatchSize    uint32  `json:"spatial_patch_size"`
 		WindowSize          uint32  `json:"window_size"`
 		RMSNormEps          float32 `json:"layer_norm_epsilon"`
 		RopeTheta           float32 `json:"rope_theta"`
 		FullAttentionBlocks []int32 `json:"fullatt_block_indexes"`
 		TemporalPatchSize   uint32  `json:"temporal_patch_size"`
 	} `json:"vision_config"`
 }
 // Compile-time check that *qwen25VLModel satisfies ModelConverter.
 var _ ModelConverter = (*qwen25VLModel)(nil)
 // KV builds the GGUF metadata for a Qwen2.5-VL model. It starts from the
 // generic model parameters, re-keys every "qwen2." entry produced by the
 // embedded qwen2Model under the "qwen25vl." prefix, and then adds the vision
 // settings, substituting a default for each value the config leaves unset
 // (where a default is defined).
 func (q *qwen25VLModel) KV(t *Tokenizer) ggml.KV {
 	kv := q.ModelParameters.KV(t)
 	kv["general.architecture"] = "qwen25vl"
 	for k, v := range q.qwen2Model.KV(t) {
 		if strings.HasPrefix(k, "qwen2.") {
 			kv[strings.Replace(k, "qwen2.", "qwen25vl.", 1)] = v
 		}
 	}
 	// Resolve the full-attention block indexes once so the default is not
 	// overwritten by the (nil) configured value further down.
 	fullAttentionBlocks := q.VisionModel.FullAttentionBlocks
 	if fullAttentionBlocks == nil {
 		fullAttentionBlocks = []int32{7, 15, 23, 31}
 	}
 	kv["qwen25vl.vision.block_count"] = cmp.Or(q.VisionModel.Depth, 32)
 	kv["qwen25vl.vision.embedding_length"] = q.VisionModel.HiddenSize
 	kv["qwen25vl.vision.attention.head_count"] = cmp.Or(q.VisionModel.NumHeads, 16)
 	kv["qwen25vl.vision.num_channels"] = q.VisionModel.InChannels
 	kv["qwen25vl.vision.patch_size"] = cmp.Or(q.VisionModel.PatchSize, 14)
 	kv["qwen25vl.vision.spatial_merge_size"] = cmp.Or(q.VisionModel.SpatialMergeSize, 2)
 	kv["qwen25vl.vision.spatial_patch_size"] = q.VisionModel.SpatialPatchSize
 	kv["qwen25vl.vision.window_size"] = cmp.Or(q.VisionModel.WindowSize, 112)
 	kv["qwen25vl.vision.attention.layer_norm_epsilon"] = cmp.Or(q.VisionModel.RMSNormEps, 1e-6)
 	kv["qwen25vl.vision.rope.freq_base"] = cmp.Or(q.VisionModel.RopeTheta, 1e4)
 	kv["qwen25vl.vision.fullatt_block_indexes"] = fullAttentionBlocks
 	kv["qwen25vl.vision.temporal_patch_size"] = cmp.Or(q.VisionModel.TemporalPatchSize, 2)
 	return kv
 }
 // Tensors converts the source tensors into their GGUF form. Tensors whose
 // name contains "patch_embed.proj" are split in two along dimension 2 and have
 // every size-1 dimension removed from their shape; tensors whose name contains
 // "attn.qkv" are split in three along dimension 0 into q, k and v; everything
 // else is passed through unchanged.
 func (q *qwen25VLModel) Tensors(ts []Tensor) []*ggml.Tensor {
 	var out []*ggml.Tensor
 	for _, t := range ts {
 		name := t.Name()
 		switch {
 		case strings.Contains(name, "patch_embed.proj"):
 			halves := splitDim(t, 2,
 				split{Replacer: strings.NewReplacer("patch_embed.proj", "patch_embd_0")},
 				split{Replacer: strings.NewReplacer("patch_embed.proj", "patch_embd_1")},
 			)
 			for half := range halves {
 				// drop the singleton dimensions left over after the split
 				half.Shape = slices.DeleteFunc(half.Shape, func(d uint64) bool { return d == 1 })
 				out = append(out, half)
 			}
 		case strings.Contains(name, "attn.qkv"):
 			qkv := slices.Collect(splitDim(t, 0,
 				split{Replacer: strings.NewReplacer("attn.qkv", "attn_q")},
 				split{Replacer: strings.NewReplacer("attn.qkv", "attn_k")},
 				split{Replacer: strings.NewReplacer("attn.qkv", "attn_v")},
 			))
 			out = append(out, qkv...)
 		default:
 			passthrough := &ggml.Tensor{
 				Name:     name,
 				Kind:     t.Kind(),
 				Shape:    t.Shape(),
 				WriterTo: t,
 			}
 			out = append(out, passthrough)
 		}
 	}
 	return out
 }
 // Replacements returns the old/new name pairs used to rename tensors: the
 // pairs from the embedded qwen2Model followed by the vision-specific ones.
 func (q *qwen25VLModel) Replacements() []string {
 	visionPairs := []string{
 		"visual", "v",
 		"blocks", "blk",
 		"attn.proj", "attn_out",
 		"norm1", "ln1",
 		"norm2", "ln2",
 	}
 	return append(q.qwen2Model.Replacements(), visionPairs...)
 }
--- a/convert/convert_test.go
+++ b/convert/convert_test.go
@ -47,7 +47,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, ggml.KV, ggml.Tensors) {
 	}
 	t.Cleanup(func() { r.Close() })
-	m, _, err := ggml.Decode(r, -1)
+	m, err := ggml.Decode(r, -1)
 	if err != nil {
 		t.Fatal(err)
 	}
@ -332,7 +332,7 @@ func TestConvertAdapter(t *testing.T) {
 			}
 			defer r.Close()
-			m, _, err := ggml.Decode(r, -1)
+			m, err := ggml.Decode(r, -1)
 			if err != nil {
 				t.Fatal(err)
 			}
--- a/convert/reader.go
+++ b/convert/reader.go
@ -38,7 +38,10 @@ const (
 func (t tensorBase) Kind() uint32 {
 	if strings.HasSuffix(t.name, ".ffn_gate_inp.weight") ||
 		t.name == "token_types.weight" ||
-		t.name == "v.positional_embedding_vlm" {
+		t.name == "v.positional_embedding_vlm" ||
 		t.name == "v.tile_position_embd.weight" ||
 		t.name == "v.pre_tile_position_embd.weight" ||
 		t.name == "v.post_tile_position_embd.weight" {
 		// these tensors are always F32
 		return 0
 	}
--- a/convert/tensor.go
+++ b/convert/tensor.go
@ -0,0 +1,129 @@
 package convert
 import (
 	"cmp"
 	"io"
 	"iter"
 	"path"
 	"slices"
 	"strings"
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"
 	"github.com/ollama/ollama/fs/ggml"
 )
 // split describes one output tensor produced by splitDim.
 type split struct {
 	// Replacer rewrites the source tensor's name into the output tensor's name.
 	*strings.Replacer
 	// dim is the size of this piece along the split dimension; when zero,
 	// splitDim uses the dimension's size divided by the number of splits.
 	dim int
 	// fn is an optional function to apply to the tensor after slicing
 	fn func(tensor.Tensor) (tensor.Tensor, error)
 }
 // splitDim yields one ggml.Tensor per entry in splits, each covering a
 // consecutive range of t along dimension dim. A split with a non-zero dim field
 // takes exactly that many elements; otherwise it takes the dimension's size
 // divided by the number of splits. The actual slicing (and the split's optional
 // fn) runs inside the repacker installed on each clone of t.
 func splitDim(t Tensor, dim int, splits ...split) iter.Seq[*ggml.Tensor] {
 	return func(yield func(*ggml.Tensor) bool) {
 		start := 0
 		for _, sp := range splits {
 			part := t.Clone()
 			partShape := slices.Clone(part.Shape())
 			partShape[dim] = cmp.Or(uint64(sp.dim), partShape[dim]/uint64(len(splits)))
 			// window selects [start, end) along dim and everything on the other axes
 			end := start + int(partShape[dim])
 			window := slices.Repeat([]tensor.Slice{nil}, len(partShape))
 			window[dim] = tensor.S(start, end)
 			start = end
 			part.SetRepacker(func(_ string, data []float32, full []uint64) ([]float32, error) {
 				dims := make([]int, 0, len(full))
 				for _, d := range full {
 					dims = append(dims, int(d))
 				}
 				var view tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
 				view, err := view.Slice(window...)
 				if err != nil {
 					return nil, err
 				}
 				view = tensor.Materialize(view)
 				if sp.fn != nil {
 					if view, err = sp.fn(view); err != nil {
 						return nil, err
 					}
 				}
 				// collapse to one dimension so the result can be written as a vector
 				if err := view.Reshape(view.Shape().TotalSize()); err != nil {
 					return nil, err
 				}
 				return native.VectorF32(view.(*tensor.Dense))
 			})
 			piece := &ggml.Tensor{
 				Name:     sp.Replace(part.Name()),
 				Kind:     part.Kind(),
 				Shape:    partShape,
 				WriterTo: part,
 			}
 			if !yield(piece) {
 				return
 			}
 		}
 	}
 }
 // merge names a group of tensors to combine: every tensor whose name matches
 // pattern (path.Match syntax) is merged into a single tensor called name.
 type merge struct {
 	pattern, name string
 }
 // mergeTensors applies each merge in order. For every merge, the tensors whose
 // name matches its pattern are removed from the remaining set and, if there is
 // at least one, combined into a single tensor whose shape is the first match's
 // shape prefixed with the number of matches. It returns the merged tensors and
 // the tensors that matched no pattern.
 func mergeTensors(unmatched []Tensor, merges ...merge) (out []*ggml.Tensor, _ []Tensor) {
 	for _, m := range merges {
 		var group []Tensor
 		group, unmatched = slicesSplitFunc(unmatched, func(t Tensor) bool {
 			// a match error is ignored and treated as "no match"
 			ok, _ := path.Match(m.pattern, t.Name())
 			return ok
 		})
 		if len(group) == 0 {
 			continue
 		}
 		first := group[0]
 		merged := &ggml.Tensor{
 			Name:     m.name,
 			Kind:     first.Kind(),
 			Shape:    append([]uint64{uint64(len(group))}, first.Shape()...),
 			WriterTo: mergeGroup(group),
 		}
 		out = append(out, merged)
 	}
 	return out, unmatched
 }
 // slicesSplitFunc splits a slice into two slices based on a predicate function.
 func slicesSplitFunc[S ~[]E, E comparable](s S, fn func(e E) bool) (matched, unmatched S) {
 	for _, e := range s {
 		if fn(e) {
 			matched = append(matched, e)
 		} else {
 			unmatched = append(unmatched, e)
 		}
 	}
 	return matched, unmatched
 }
 // mergeGroup is a list of tensors that are written back to back as one tensor.
 type mergeGroup []Tensor
 // WriteTo writes every tensor in the group to w, in order, stopping at the
 // first error. It returns the sum of the byte counts reported by the tensors
 // written so far, as the io.WriterTo contract requires, together with that
 // error (if any).
 func (g mergeGroup) WriteTo(w io.Writer) (int64, error) {
 	var total int64
 	for _, t := range g {
 		n, err := t.WriteTo(w)
 		total += n
 		if err != nil {
 			return total, err
 		}
 	}
 	return total, nil
 }
--- a/convert/tensor_test.go
+++ b/convert/tensor_test.go
@ -0,0 +1,402 @@
 package convert
 import (
 	"bytes"
 	"encoding/binary"
 	"io"
 	"iter"
 	"slices"
 	"strings"
 	"testing"
 	"github.com/google/go-cmp/cmp"
 	"github.com/ollama/ollama/fs/ggml"
 	"github.com/pdevine/tensor"
 )
 // fakeTensor is an in-memory stand-in for a Tensor, backed by a float32 slice.
 type fakeTensor struct {
 	name     string
 	shape    []uint64
 	data     []float32
 	repacker Repacker
 }
 // Name returns the tensor's name.
 func (f fakeTensor) Name() string { return f.name }
 // Shape returns the tensor's dimensions.
 func (f fakeTensor) Shape() []uint64 { return f.shape }
 // Kind always reports kind 0.
 func (f fakeTensor) Kind() uint32 { return 0 }
 // SetRepacker installs the function WriteTo applies to the data before writing.
 func (f *fakeTensor) SetRepacker(fn Repacker) {
 	f.repacker = fn
 }
 // Clone returns a pointer to a copy of f whose shape and data slices are
 // independent of the original's.
 func (f fakeTensor) Clone() Tensor {
 	dup := f
 	dup.shape = slices.Clone(f.shape)
 	dup.data = slices.Clone(f.data)
 	return &dup
 }
 // WriteTo runs the repacker (if one is set) over the data and writes the
 // result to w as little-endian float32 values. On success it returns the
 // number of bytes written (4 per value); on failure it returns 0 and the error.
 func (f fakeTensor) WriteTo(w io.Writer) (n int64, err error) {
 	payload := f.data
 	if f.repacker != nil {
 		if payload, err = f.repacker(f.name, payload, f.shape); err != nil {
 			return 0, err
 		}
 	}
 	if err = binary.Write(w, binary.LittleEndian, payload); err != nil {
 		return 0, err
 	}
 	return int64(4 * len(payload)), nil
 }
 // mul returns the product of all dimensions in shape, i.e. the number of
 // elements a tensor of that shape holds. An empty shape yields 1.
 func mul(shape []uint64) int {
 	product := 1
 	for i := 0; i < len(shape); i++ {
 		product *= int(shape[i])
 	}
 	return product
 }
 // TestSplitDim checks the name, shape and data of every piece splitDim
 // produces from a 3x4 tensor holding the values 0..11 in row-major order.
 func TestSplitDim(t *testing.T) {
 	r := fakeTensor{
 		name:  "a.b",
 		shape: []uint64{3, 4},
 		data:  []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
 	}
 	// piece is the expected outcome for one yielded tensor.
 	type piece struct {
 		name  string
 		shape []uint64
 		data  []float32
 	}
 	cases := []struct {
 		name   string
 		dim    int
 		splits []split
 		want   []piece
 	}{
 		{
 			name:   "no split",
 			dim:    0,
 			splits: []split{{Replacer: strings.NewReplacer("a", "x")}},
 			want: []piece{
 				{"x.b", []uint64{3, 4}, []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}},
 			},
 		},
 		{
 			name: "even split",
 			dim:  1,
 			splits: []split{
 				{Replacer: strings.NewReplacer("a", "x")},
 				{Replacer: strings.NewReplacer("b", "y")},
 			},
 			want: []piece{
 				{"x.b", []uint64{3, 2}, []float32{0, 1, 4, 5, 8, 9}},
 				{"a.y", []uint64{3, 2}, []float32{2, 3, 6, 7, 10, 11}},
 			},
 		},
 		{
 			name: "uneven split",
 			dim:  0,
 			splits: []split{
 				{Replacer: strings.NewReplacer("a", "x"), dim: 2},
 				{Replacer: strings.NewReplacer("b", "y"), dim: 1},
 			},
 			want: []piece{
 				{"x.b", []uint64{2, 4}, []float32{0, 1, 2, 3, 4, 5, 6, 7}},
 				{"a.y", []uint64{1, 4}, []float32{8, 9, 10, 11}},
 			},
 		},
 		{
 			name: "split with transpose",
 			dim:  1,
 			splits: []split{
 				{Replacer: strings.NewReplacer("a", "x")},
 				{Replacer: strings.NewReplacer("b", "y"), fn: func(tt tensor.Tensor) (tensor.Tensor, error) {
 					return tensor.Transpose(tt, 1, 0)
 				}},
 			},
 			want: []piece{
 				{"x.b", []uint64{3, 2}, []float32{0, 1, 4, 5, 8, 9}},
 				// the reported shape is unchanged; only the data order is transposed
 				{"a.y", []uint64{3, 2}, []float32{2, 6, 10, 3, 7, 11}},
 			},
 		},
 	}
 	for _, tc := range cases {
 		t.Run(tc.name, func(t *testing.T) {
 			next, stop := iter.Pull(splitDim(&r, tc.dim, tc.splits...))
 			defer stop()
 			for i, want := range tc.want {
 				tt, ok := next()
 				if !ok {
 					t.Fatalf("expected %d splits, got %d", len(tc.want), i)
 				}
 				if tt.Name != want.name {
 					t.Fatalf("split %d: expected name %q, got %q", i, want.name, tt.Name)
 				}
 				if !slices.Equal(tt.Shape, want.shape) {
 					t.Fatalf("split %d: expected shape %v, got %v", i, want.shape, tt.Shape)
 				}
 				var b bytes.Buffer
 				if _, err := tt.WriteTo(&b); err != nil {
 					t.Fatal(err)
 				}
 				f32s := make([]float32, mul(tt.Shape))
 				if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
 					t.Fatal(err)
 				}
 				if !slices.Equal(f32s, want.data) {
 					t.Fatalf("split %d: expected data %v, got %v", i, want.data, f32s)
 				}
 			}
 			// splitDim must not yield more pieces than were requested
 			if _, ok := next(); ok {
 				t.Fatalf("expected exactly %d splits, got more", len(tc.want))
 			}
 		})
 	}
 }
 // TestMerge checks that mergeTensors groups tensors by name pattern, reports
 // the merged shape as [count, ...original shape], writes the members' data
 // back to back, and leaves non-matching tensors in the unmatched result.
 func TestMerge(t *testing.T) {
 	unmatched := []Tensor{
 		&fakeTensor{
 			name:  "a.0.b",
 			shape: []uint64{5, 2},
 			data:  []float32{10, 11, 12, 13, 14, 15, 16, 17, 18, 19},
 		},
 		&fakeTensor{
 			name:  "a.1.b",
 			shape: []uint64{5, 2},
 			data:  []float32{20, 21, 22, 23, 24, 25, 26, 27, 28, 29},
 		},
 		&fakeTensor{
 			name:  "c.0.d",
 			shape: []uint64{5, 2},
 			data:  []float32{30, 31, 32, 33, 34, 35, 36, 37, 38, 39},
 		},
 		&fakeTensor{
 			name:  "c.1.d",
 			shape: []uint64{5, 2},
 			data:  []float32{40, 41, 42, 43, 44, 45, 46, 47, 48, 49},
 		},
 		&fakeTensor{
 			name:  "e.0.f",
 			shape: []uint64{5, 2},
 			data:  []float32{50, 51, 52, 53, 54, 55, 56, 57, 58, 59},
 		},
 	}
 	// checkMatched verifies the first n merged tensors: each must have shape
 	// [2, 5, 2] and contain 20 consecutive values starting at 10 + 20*i.
 	checkMatched := func(t *testing.T, n int, matched []*ggml.Tensor) {
 		t.Helper()
 		for i := range n {
 			got := matched[i]
 			if diff := cmp.Diff([]uint64{2, 5, 2}, got.Shape); diff != "" {
 				t.Errorf("unexpected (-want +got):\n%s", diff)
 			}
 			var b bytes.Buffer
 			if _, err := got.WriteTo(&b); err != nil {
 				t.Fatal(err)
 			}
 			f32s := make([]float32, 20)
 			if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
 				t.Fatal(err)
 			}
 			offset := 10 + (i * 20)
 			want := make([]float32, 20)
 			for j := range 20 {
 				want[j] = float32(offset + j)
 			}
 			if diff := cmp.Diff(want, f32s); diff != "" {
 				t.Errorf("unexpected data (-want +got):\n%s", diff)
 			}
 		}
 	}
 	t.Run("single merge", func(t *testing.T) {
 		matched, unmatched := mergeTensors(unmatched, merge{"a.*.b", "a.b"})
 		if len(unmatched) != 3 {
 			t.Error("expected 3 remaining tensors, got", len(unmatched))
 		}
 		// stop here on a count mismatch: checkMatched indexes into matched
 		if len(matched) != 1 {
 			t.Fatal("expected 1 merged tensor, got", len(matched))
 		}
 		checkMatched(t, 1, matched)
 	})
 	t.Run("multiple merges", func(t *testing.T) {
 		matched, unmatched := mergeTensors(unmatched, merge{"a.*.b", "a.b"}, merge{"c.*.d", "c.d"})
 		if len(unmatched) != 1 {
 			t.Error("expected 1 remaining tensor, got", len(unmatched))
 		}
 		// stop here on a count mismatch: checkMatched indexes into matched
 		if len(matched) != 2 {
 			t.Fatal("expected 2 merged tensors, got", len(matched))
 		}
 		checkMatched(t, 2, matched)
 	})
 	t.Run("no match", func(t *testing.T) {
 		matched, unmatched := mergeTensors(unmatched, merge{"x.*.y", "x.y"})
 		if len(unmatched) != 5 {
 			t.Error("expected 5 remaining tensors, got", len(unmatched))
 		}
 		if len(matched) != 0 {
 			t.Error("expected no merged tensors, got", len(matched))
 		}
 	})
 }
--- a/convert/tokenizer.go
+++ b/convert/tokenizer.go
@ -110,6 +110,7 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
 	}
 	if f, err := fsys.Open("tokenizer_config.json"); errors.Is(err, os.ErrNotExist) {
 		// noop
 	} else if err != nil {
 		return nil, err
 	} else {
@ -171,6 +172,34 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
 		}
 	}
 	if f, err := fsys.Open("generation_config.json"); errors.Is(err, os.ErrNotExist) {
 	} else if err != nil {
 		return nil, err
 	} else {
 		defer f.Close()
 		var p map[string]json.RawMessage
 		if err := json.NewDecoder(f).Decode(&p); err != nil {
 			return nil, err
 		}
 		for _, st := range specialTokenTypes {
 			if bts, ok := p[fmt.Sprintf("%s_token_id", st)]; ok {
 				var ids []int32
 				if err := json.Unmarshal(bts, &ids); err != nil {
 					// value is not a list so the existing ID is used
 					continue
 				}
 				if i := slices.IndexFunc(t.SpecialVocabulary, func(sv *SpecialVocabulary) bool {
 					return sv.Type == st
 				}); i >= 0 {
 					t.SpecialVocabulary[i].IDs = ids
 				}
 			}
 		}
 	}
 	return t, nil
 }
@ -280,6 +309,9 @@ type SpecialVocabulary struct {
 	ID       int
 	Content  string
 	AddToken bool
 	// IDs is populated by generation_config.json
 	IDs []int32
 }
 func (sv SpecialVocabulary) Key() string {
--- a/convert/tokenizer_test.go
+++ b/convert/tokenizer_test.go
@ -247,6 +247,67 @@ func TestParseTokenizer(t *testing.T) {
 				Pre: "default",
 			},
 		},
 		{
 			name: "generation config eos token ids",
 			fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
 				"tokenizer.json": strings.NewReader(`{
 					"added_tokens": [
 						{
 							"id": 0,
 							"content": "<bos>",
 							"special": true
 						},
 						{
 							"id": 1,
 							"content": "<eos>",
 							"special": true
 						},
 						{
 							"id": 2,
 							"content": "<eot>",
 							"special": true
 						},
 						{
 							"id": 3,
 							"content": "<eom>",
 							"special": true
 						}
 					],
 					"model": {
 						"vocab": {
 							"<bos>": 0,
 							"<eos>": 1,
 							"<eot>": 2,
 							"<eom>": 3
 						}
 					}
 				}`),
 				"tokenizer_config.json": strings.NewReader(`{
 					"add_bos_token": true,
 					"add_eos_token": false,
 					"bos_token": "<bos>",
 					"eos_token": "<eos>"
 				}`),
 				"generation_config.json": strings.NewReader(`{
 					"bos_token_id": 0,
 					"eos_token_id": [1, 2, 3]
 				}`),
 			}),
 			specialTokenTypes: []string{"pad", "eos", "bos", "unk"},
 			want: &Tokenizer{
 				Vocabulary: &Vocabulary{
 					Model:  "gpt2",
 					Tokens: []string{"<bos>", "<eos>", "<eot>", "<eom>"},
 					Scores: []float32{0, 1, 2, 3},
 					Types:  []int32{3, 3, 3, 3},
 				},
 				SpecialVocabulary: []*SpecialVocabulary{
 					{Type: "eos", Content: "<eos>", ID: 1, IDs: []int32{1, 2, 3}, AddToken: false},
 					{Type: "bos", Content: "<bos>", ID: 0, AddToken: true},
 				},
 				Pre: "default",
 			},
 		},
 	}
 	for _, tt := range cases {
--- a/discover/cuda_common.go
+++ b/discover/cuda_common.go
@ -3,7 +3,6 @@
 package discover
 import (
 	"fmt"
 	"log/slog"
 	"os"
 	"regexp"
@ -60,8 +59,6 @@ func cudaVariant(gpuInfo CudaGPUInfo) string {
 	// driver 12.0 has problems with the cuda v12 library, so run v11 on those older drivers
 	if gpuInfo.DriverMajor < 12 || (gpuInfo.DriverMajor == 12 && gpuInfo.DriverMinor == 0) {
 		// The detected driver is older than Feb 2023
 		slog.Warn("old CUDA driver detected - please upgrade to a newer driver", "version", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor))
 		return "v11"
 	}
 	return "v12"
--- a/discover/gpu.go
+++ b/discover/gpu.go
@ -670,7 +670,7 @@ func loadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string, e
 }
 func getVerboseState() C.uint16_t {
-	if envconfig.Debug() {
+	if envconfig.LogLevel() < slog.LevelInfo {
 		return C.uint16_t(1)
 	}
 	return C.uint16_t(0)
--- a/discover/path.go
+++ b/discover/path.go
@ -12,7 +12,7 @@ import (
 // '../lib/ollama' on Linux and the executable's directory on macOS
 // note: distribution builds, additional GPU-specific libraries are
 // found in subdirectories of the returned path, such as
-// 'cuda_v12', 'rocm', etc.
+// 'cuda_v11', 'cuda_v12', 'rocm', etc.
 var LibOllamaPath string = func() string {
 	exe, err := os.Executable()
 	if err != nil {
--- a/docs/api.md
+++ b/docs/api.md
@ -19,7 +19,7 @@
 ### Model names
-Model names follow a `model:tag` format, where `model` can have an optional namespace such as `example/model`. Some examples are `orca-mini:3b-q4_1` and `llama3:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.
+Model names follow a `model:tag` format, where `model` can have an optional namespace such as `example/model`. Some examples are `orca-mini:3b-q8_0` and `llama3:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.
 ### Durations
@ -43,6 +43,7 @@ Generate a response for a given prompt with a provided model. This is a streamin
 - `prompt`: the prompt to generate a response for
 - `suffix`: the text after the model response
 - `images`: (optional) a list of base64-encoded images (for multimodal models such as `llava`)
 - `think`: (for thinking models) should the model think before responding?
 Advanced parameters (optional):
@ -490,11 +491,13 @@ Generate the next message in a chat with a provided model. This is a streaming e
 - `model`: (required) the [model name](#model-names)
 - `messages`: the messages of the chat, this can be used to keep a chat memory
 - `tools`: list of tools in JSON for the model to use if supported
 - `think`: (for thinking models) should the model think before responding?
 The `message` object has the following fields:
 - `role`: the role of the message, either `system`, `user`, `assistant`, or `tool`
 - `content`: the content of the message
 - `thinking`: (for thinking models) the model's thinking process
 - `images` (optional): a list of images to include in the message (for multimodal models such as `llava`)
 - `tool_calls` (optional): a list of tools in JSON that the model wants to use
@ -952,19 +955,8 @@ If you are creating a model from a safetensors directory or from a GGUF file, yo
 | Type | Recommended |
 | --- | :-: |
 | q2_K | |
 | q3_K_L | |
 | q3_K_M | |
 | q3_K_S | |
 | q4_0 | |
 | q4_1 | |
 | q4_K_M | * |
 | q4_K_S | |
 | q5_0 | |
 | q5_1 | |
 | q5_K_M | |
 | q5_K_S | |
 | q6_K | |
 | q8_0 | * |
 ### Examples
@ -1009,8 +1001,8 @@ Quantize a non-quantized model.
 ```shell
 curl http://localhost:11434/api/create -d '{
-  "model": "llama3.1:quantized",
+  "model": "llama3.2:quantized",
-  "from": "llama3.1:8b-instruct-fp16",
+  "from": "llama3.2:3b-instruct-fp16",
  "quantize": "q4_K_M"
 }'
 ```
@ -1020,12 +1012,14 @@ curl http://localhost:11434/api/create -d '{
 A stream of JSON objects is returned:
 ```json
-{"status":"quantizing F16 model to Q4_K_M"}
+{"status":"quantizing F16 model to Q4_K_M","digest":"0","total":6433687776,"completed":12302}
-{"status":"creating new layer sha256:667b0c1932bc6ffc593ed1d03f895bf2dc8dc6df21db3042284a6f4416b06a29"}
+{"status":"quantizing F16 model to Q4_K_M","digest":"0","total":6433687776,"completed":6433687552}
-{"status":"using existing layer sha256:11ce4ee3e170f6adebac9a991c22e22ab3f8530e154ee669954c4bc73061c258"}
+{"status":"verifying conversion"}
-{"status":"using existing layer sha256:0ba8f0e314b4264dfd19df045cde9d4c394a52474bf92ed6a3de22a4ca31a177"}
+{"status":"creating new layer sha256:fb7f4f211b89c6c4928ff4ddb73db9f9c0cfca3e000c3e40d6cf27ddc6ca72eb"}
 {"status":"using existing layer sha256:966de95ca8a62200913e3f8bfbf84c8494536f1b94b49166851e76644e966396"}
 {"status":"using existing layer sha256:fcc5a6bec9daf9b561a68827b67ab6088e1dba9d1fa2a50d7bbcc8384e0a265d"}
 {"status":"using existing layer sha256:a70ff7e570d97baaf4e62ac6e6ad9975e04caa6d900d3742d37698494479e0cd"}
 {"status":"using existing layer sha256:56bb8bd477a519ffa694fc449c2413c6f0e1d3b1c88fa7e3c9d88d3ae49d4dcb"}
 {"status":"creating new layer sha256:455f34728c9b5dd3376378bfb809ee166c145b0b4c1f1a6feca069055066ef9a"}
 {"status":"writing manifest"}
 {"status":"success"}
 ```
@ -1163,29 +1157,37 @@ A single JSON object will be returned.
 {
  "models": [
    {
-      "name": "codellama:13b",
+      "name": "deepseek-r1:latest",
-      "modified_at": "2023-11-04T14:56:49.277302595-07:00",
+      "model": "deepseek-r1:latest",
-      "size": 7365960935,
+      "modified_at": "2025-05-10T08:06:48.639712648-07:00",
-      "digest": "9f438cb9cd581fc025612d27f7c1a6669ff83a8bb0ed86c94fcf4c5440555697",
+      "size": 4683075271,
      "digest": "0a8c266910232fd3291e71e5ba1e058cc5af9d411192cf88b6d30e92b6e73163",
      "details": {
        "parent_model": "",
        "format": "gguf",
-        "family": "llama",
+        "family": "qwen2",
-        "families": null,
+        "families": [
-        "parameter_size": "13B",
+          "qwen2"
-        "quantization_level": "Q4_0"
+        ],
        "parameter_size": "7.6B",
        "quantization_level": "Q4_K_M"
      }
    },
    {
-      "name": "llama3:latest",
+      "name": "llama3.2:latest",
-      "modified_at": "2023-12-07T09:32:18.757212583-08:00",
+      "model": "llama3.2:latest",
-      "size": 3825819519,
+      "modified_at": "2025-05-04T17:37:44.706015396-07:00",
-      "digest": "fe938a131f40e6f6d40083c9f0f430a515233eb2edaa6d72eb85c50d64f2300e",
+      "size": 2019393189,
      "digest": "a80c4f17acd55265feec403c7aef86be0c25983ab279d83f3bcd3abbcb5b8b72",
      "details": {
        "parent_model": "",
        "format": "gguf",
        "family": "llama",
-        "families": null,
+        "families": [
-        "parameter_size": "7B",
+          "llama"
-        "quantization_level": "Q4_0"
+        ],
        "parameter_size": "3.2B",
        "quantization_level": "Q4_K_M"
      }
    }
  ]
--- a/docs/benchmark.md
+++ b/docs/benchmark.md
@ -1,59 +0,0 @@
 # Benchmark
 Go benchmark tests that measure end-to-end performance of a running Ollama server. Run these tests to evaluate model inference performance on your hardware and measure the impact of code changes.
 ## When to use
 Run these benchmarks when:
 - Making changes to the model inference engine
 - Modifying model loading/unloading logic
 - Changing prompt processing or token generation code
 - Implementing a new model architecture
 - Testing performance across different hardware setups
 ## Prerequisites
 - Ollama server running locally with `ollama serve` on `127.0.0.1:11434`
 ## Usage and Examples
 >[!NOTE]
 >All commands must be run from the root directory of the Ollama project.
 Basic syntax:
 ```bash
 go test -bench=. ./benchmark/... -m $MODEL_NAME
 ```
 Required flags:
 - `-bench=.`: Run all benchmarks
 - `-m`: Model name to benchmark
 Optional flags:
 - `-count N`: Number of times to run the benchmark (useful for statistical analysis)
 - `-timeout T`: Maximum time for the benchmark to run (e.g. "10m" for 10 minutes)
 Common usage patterns:
 Single benchmark run with a model specified:
 ```bash
 go test -bench=. ./benchmark/... -m llama3.3
 ```
 ## Output metrics
 The benchmark reports several key metrics:
 - `gen_tok/s`: Generated tokens per second
 - `prompt_tok/s`: Prompt processing tokens per second
 - `ttft_ms`: Time to first token in milliseconds
 - `load_ms`: Model load time in milliseconds
 - `gen_tokens`: Total tokens generated
 - `prompt_tokens`: Total prompt tokens processed
 Each benchmark runs two scenarios:
 - Cold start: Model is loaded from disk for each test
 - Warm start: Model is pre-loaded in memory
 Three prompt lengths are tested for each scenario:
 - Short prompt (100 tokens)
 - Medium prompt (500 tokens)
 - Long prompt (1000 tokens)
--- a/docs/development.md
+++ b/docs/development.md
@ -118,7 +118,7 @@ To run tests, use `go test`:
 go test ./...
 ```
-> NOTE: In rare cirumstances, you may nedd to change a package using the new
+> NOTE: In rare circumstances, you may need to change a package using the new
 > "synctest" package in go1.24.
 >
 > If you do not have the "synctest" package enabled, you will not see build or
--- a/docs/gpu.md
+++ b/docs/gpu.md
@ -1,6 +1,6 @@
 # GPU
 ## Nvidia
-Ollama supports Nvidia GPUs with compute capability 5.0+ and driver version 531 and newer.
+Ollama supports Nvidia GPUs with compute capability 5.0+.
 Check your compute compatibility to see if your card is supported:
 [https://developer.nvidia.com/cuda-gpus](https://developer.nvidia.com/cuda-gpus)
--- a/docs/import.md
+++ b/docs/import.md
@ -132,22 +132,12 @@ success
 ### Supported Quantizations
 - `q4_0`
 - `q4_1`
 - `q5_0`
 - `q5_1`
 - `q8_0`
 #### K-means Quantizations
 - `q3_K_S`
 - `q3_K_M`
 - `q3_K_L`
 - `q4_K_S`
 - `q4_K_M`
 - `q5_K_S`
 - `q5_K_M`
 - `q6_K`
 ## Sharing your model on ollama.com
--- a/docs/linux.md
+++ b/docs/linux.md
@ -112,8 +112,8 @@ sudo systemctl status ollama
 > While AMD has contributed the `amdgpu` driver upstream to the official linux
 > kernel source, the version is older and may not support all ROCm features. We
 > recommend you install the latest driver from
-> https://www.amd.com/en/support/linux-drivers for best support of your Radeon
+> [AMD](https://www.amd.com/en/support/download/linux-drivers.html) for best support
-> GPU.
+> of your Radeon GPU.
 ## Customizing
--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@ -43,7 +43,7 @@ Ollama includes multiple LLM libraries compiled for different GPUs and CPU vecto
 In the server log, you will see a message that looks something like this (varies from release to release):
 ```
-Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v12 rocm_v5]
+Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v11 rocm_v5]
 ```
 **Experimental LLM Library Override**
--- a/envconfig/config.go
+++ b/envconfig/config.go
@ -149,9 +149,22 @@ func Bool(k string) func() bool {
 	}
 }
 // LogLevel returns the log level selected by the OLLAMA_DEBUG variable.
 // An unset or empty value, or one that parses as false or 0, gives INFO. A
 // value that parses as a true boolean (including "1") gives DEBUG. Any other
 // non-zero integer n gives slog.Level(-4 * n), so 2 is TRACE and negative
 // values reduce verbosity.
 func LogLevel() slog.Level {
 	s := Var("OLLAMA_DEBUG")
 	if s == "" {
 		return slog.LevelInfo
 	}
 	// parse errors are deliberately ignored: an unparsable value falls through
 	if enabled, _ := strconv.ParseBool(s); enabled {
 		return slog.LevelDebug
 	}
 	if n, _ := strconv.ParseInt(s, 10, 64); n != 0 {
 		return slog.Level(n * -4)
 	}
 	return slog.LevelInfo
 }
 var (
 	// Debug enables additional debug information.
 	Debug = Bool("OLLAMA_DEBUG")
 	// FlashAttention enables the experimental flash attention feature.
 	FlashAttention = Bool("OLLAMA_FLASH_ATTENTION")
 	// KvCacheType is the quantization type for the K/V cache.
@ -170,6 +183,8 @@ var (
 	NewEngine = Bool("OLLAMA_NEW_ENGINE")
 	// ContextLength sets the default context length
 	ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 4096)
 	// UseAuth enables authentication between the Ollama client and server
 	UseAuth = Bool("OLLAMA_AUTH")
 )
 func String(s string) func() string {
@ -209,8 +224,6 @@ var (
 	MaxRunners = Uint("OLLAMA_MAX_LOADED_MODELS", 0)
 	// MaxQueue sets the maximum number of queued requests. MaxQueue can be configured via the OLLAMA_MAX_QUEUE environment variable.
 	MaxQueue = Uint("OLLAMA_MAX_QUEUE", 512)
 	// MaxVRAM sets a maximum VRAM override in bytes. MaxVRAM can be configured via the OLLAMA_MAX_VRAM environment variable.
 	MaxVRAM = Uint("OLLAMA_MAX_VRAM", 0)
 )
 func Uint64(key string, defaultValue uint64) func() uint64 {
@ -238,7 +251,7 @@ type EnvVar struct {
 func AsMap() map[string]EnvVar {
 	ret := map[string]EnvVar{
-		"OLLAMA_DEBUG":             {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
+		"OLLAMA_DEBUG":             {"OLLAMA_DEBUG", LogLevel(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
 		"OLLAMA_FLASH_ATTENTION":   {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"},
 		"OLLAMA_KV_CACHE_TYPE":     {"OLLAMA_KV_CACHE_TYPE", KvCacheType(), "Quantization type for the K/V cache (default: f16)"},
 		"OLLAMA_GPU_OVERHEAD":      {"OLLAMA_GPU_OVERHEAD", GpuOverhead(), "Reserve a portion of VRAM per GPU (bytes)"},
--- a/envconfig/config_test.go
+++ b/envconfig/config_test.go
@ -1,11 +1,13 @@
 package envconfig
 import (
 	"log/slog"
 	"math"
 	"testing"
 	"time"
 	"github.com/google/go-cmp/cmp"
 	"github.com/ollama/ollama/logutil"
 )
 func TestHost(t *testing.T) {
@ -292,3 +294,34 @@ func TestContextLength(t *testing.T) {
 		})
 	}
 }
 // TestLogLevel checks the level LogLevel derives from each OLLAMA_DEBUG value.
 func TestLogLevel(t *testing.T) {
 	tests := []struct {
 		value string
 		want  slog.Level
 	}{
 		// Default to INFO
 		{"", slog.LevelInfo},
 		{"false", slog.LevelInfo},
 		{"f", slog.LevelInfo},
 		{"0", slog.LevelInfo},
 		// True values enable Debug
 		{"true", slog.LevelDebug},
 		{"t", slog.LevelDebug},
 		// Positive values increase verbosity
 		{"1", slog.LevelDebug},
 		{"2", logutil.LevelTrace},
 		// Negative values decrease verbosity
 		{"-1", slog.LevelWarn},
 		{"-2", slog.LevelError},
 	}
 	for _, tt := range tests {
 		t.Run(tt.value, func(t *testing.T) {
 			t.Setenv("OLLAMA_DEBUG", tt.value)
 			got := LogLevel()
 			if got != tt.want {
 				t.Errorf("%s: expected %d, got %d", tt.value, tt.want, got)
 			}
 		})
 	}
 }
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@ -15,6 +15,7 @@ import (
 // GGML pairs the container a model file was read from with the model decoded
 // out of that container.
 type GGML struct {
 	container
 	model
 	// Length is the reader position reported right after decoding finished (set by Decode).
 	Length int64
 }
 type model interface {
@ -170,6 +171,8 @@ func (kv KV) OllamaEngineRequired() bool {
 		"gemma3",
 		"mistral3",
 		"llama4",
 		"mllama",
 		"qwen25vl",
 	}, kv.Architecture())
 }
@ -429,12 +432,12 @@ func DetectContentType(b []byte) string {
 //
 // It collects array values for arrays with a size less than or equal to
 // maxArraySize. If the maxArraySize is negative, all arrays are collected.
-func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
+func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {
 	rs = bufioutil.NewBufferedSeeker(rs, 32<<10)
 	var magic uint32
 	if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
-		return nil, 0, err
+		return nil, err
 	}
 	var c container
@ -444,24 +447,25 @@ func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
 	case FILE_MAGIC_GGUF_BE:
 		c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
 	default:
-		return nil, 0, errors.New("invalid file magic")
+		return nil, errors.New("invalid file magic")
 	}
 	model, err := c.Decode(rs)
 	if err != nil {
-		return nil, 0, err
+		return nil, err
 	}
 	offset, err := rs.Seek(0, io.SeekCurrent)
 	if err != nil {
-		return nil, 0, err
+		return nil, err
 	}
 	// final model type
 	return &GGML{
 		container: c,
 		model:     model,
-	}, offset, nil
+		Length:    offset,
 	}, nil
 }
 func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) (kv []uint64, partialOffload, fullOffload uint64) {
@ -693,6 +697,20 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
 		graphSize = 4 * (imageSize*imageSize*numChannels +
 			embeddingLength*patchSize +
 			numPatches*numPatches*headCount)
 	case "qwen25vl":
 		maxPixels := uint64(llm.KV().Uint("vision.max_pixels", 28*28*1280))
 		numPatches := maxPixels / (patchSize * patchSize)
 		graphSize = 4 * (maxPixels*numChannels + // Original image storage
 			// Normalized pixels
 			maxPixels*numChannels +
 			// Patches storage (numPatches * channels * patchSize^2)
 			numPatches*numChannels*patchSize*patchSize +
 			// Self-attention calculations
 			numPatches*numPatches*headCount +
 			// Additional buffer for processing
 			embeddingLength*numPatches)
 	case "llama4":
 		// vision graph is computed independently in the same schedule
 		// and is negligible compared to the worst case text graph
--- a/fs/ggml/gguf.go
+++ b/fs/ggml/gguf.go
@ -527,23 +527,17 @@ func WriteGGUF(f *os.File, kv KV, ts []*Tensor) error {
 		return err
 	}
-	keys := slices.Collect(maps.Keys(kv))
+	for _, key := range slices.Sorted(maps.Keys(kv)) {
 	slices.Sort(keys)
 	for _, key := range keys {
 		if err := ggufWriteKV(f, key, kv[key]); err != nil {
 			return err
 		}
 	}
 	slices.SortStableFunc(ts, func(a, b *Tensor) int {
-		if i, j := a.block(), b.block(); i < 0 && j > 0 {
+		if i, j := a.block(), b.block(); i > 0 && j > 0 {
 			return 1
 		} else if i > 0 && j < 0 {
 			return -1
 		} else {
 			return cmp.Compare(i, j)
 		}
 		return cmp.Compare(a.Name, b.Name)
 	})
 	var s uint64
--- a/fs/ggml/gguf_test.go
+++ b/fs/ggml/gguf_test.go
@ -2,62 +2,82 @@ package ggml
 import (
 	"bytes"
 	"math/rand/v2"
 	"os"
-	"slices"
+	"strings"
 	"testing"
 	"github.com/google/go-cmp/cmp"
 )
 func TestWriteGGUF(t *testing.T) {
-	w, err := os.CreateTemp(t.TempDir(), "*.bin")
+	r := rand.New(rand.NewPCG(0, 0))
-	if err != nil {
+	for range 8 {
-		t.Fatal(err)
+		t.Run("shuffle", func(t *testing.T) {
-	}
+			t.Parallel()
 	defer w.Close()
-	if err := WriteGGUF(w, KV{
+			ts := []*Tensor{
-		"general.alignment": uint32(16),
+				{Name: "token_embd.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))},
-	}, []*Tensor{
+				{Name: "blk.0.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))},
-		{Name: "test.0", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
+				{Name: "blk.1.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))},
-		{Name: "test.1", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
+				{Name: "blk.2.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))},
-		{Name: "test.2", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
+				{Name: "blk.3.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))},
-		{Name: "test.3", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
+				{Name: "blk.4.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))},
-		{Name: "test.4", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
+				{Name: "blk.5.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))},
-		{Name: "test.5", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
+				{Name: "output_norm.weight", Shape: []uint64{3, 2}, WriterTo: bytes.NewBuffer(make([]byte, 3*2))},
-	}); err != nil {
+				{Name: "output.weight", Shape: []uint64{3, 2}, WriterTo: bytes.NewBuffer(make([]byte, 3*2))},
-		t.Fatal(err)
+			}
 	}
-	r, err := os.Open(w.Name())
+			r.Shuffle(len(ts), func(i, j int) {
-	if err != nil {
+				ts[i], ts[j] = ts[j], ts[i]
-		t.Fatal(err)
+			})
 	}
 	defer r.Close()
-	ff, _, err := Decode(r, 0)
+			w, err := os.CreateTemp(t.TempDir(), strings.ReplaceAll(t.Name(), "/", "_")+"*.bin")
-	if err != nil {
+			if err != nil {
-		t.Fatal(err)
+				t.Fatal(err)
-	}
+			}
 			defer w.Close()
-	if diff := cmp.Diff(ff.KV(), KV{
+			if err := WriteGGUF(w, KV{
-		"general.alignment":       uint32(16),
+				"general.alignment": uint32(16),
-		"general.parameter_count": uint64(36),
+			}, ts); err != nil {
-	}); diff != "" {
+				t.Fatal(err)
-		t.Errorf("Mismatch (-want +got):\n%s", diff)
+			}
 	}
-	if diff := cmp.Diff(ff.Tensors(), Tensors{
+			r, err := os.Open(w.Name())
-		Offset: 336,
+			if err != nil {
-		items: []*Tensor{
+				t.Fatal(err)
-			{Name: "test.0", Offset: 0, Shape: []uint64{2, 3}},
+			}
-			{Name: "test.1", Offset: 32, Shape: []uint64{2, 3}},
+			defer r.Close()
-			{Name: "test.2", Offset: 64, Shape: []uint64{2, 3}},
+
-			{Name: "test.3", Offset: 96, Shape: []uint64{2, 3}},
+			ff, err := Decode(r, 0)
-			{Name: "test.4", Offset: 128, Shape: []uint64{2, 3}},
+			if err != nil {
-			{Name: "test.5", Offset: 160, Shape: []uint64{2, 3}},
+				t.Fatal(err)
-		},
+			}
-	}, cmp.AllowUnexported(Tensors{})); diff != "" {
+
-		t.Errorf("Mismatch (-want +got):\n%s", diff)
+			if diff := cmp.Diff(KV{
 				"general.alignment":       uint32(16),
 				"general.parameter_count": uint64(54),
 			}, ff.KV()); diff != "" {
 				t.Errorf("Mismatch (-want +got):\n%s", diff)
 			}
 			if diff := cmp.Diff(Tensors{
 				Offset: 608,
 				items: []*Tensor{
 					{Name: "blk.0.attn_norm.weight", Offset: 0, Shape: []uint64{2, 3}},
 					{Name: "blk.1.attn_norm.weight", Offset: 32, Shape: []uint64{2, 3}},
 					{Name: "blk.2.attn_norm.weight", Offset: 64, Shape: []uint64{2, 3}},
 					{Name: "blk.3.attn_norm.weight", Offset: 96, Shape: []uint64{2, 3}},
 					{Name: "blk.4.attn_norm.weight", Offset: 128, Shape: []uint64{2, 3}},
 					{Name: "blk.5.attn_norm.weight", Offset: 160, Shape: []uint64{2, 3}},
 					{Name: "output.weight", Offset: 192, Shape: []uint64{3, 2}},
 					{Name: "output_norm.weight", Offset: 224, Shape: []uint64{3, 2}},
 					{Name: "token_embd.weight", Offset: 256, Shape: []uint64{2, 3}},
 				},
 			}, ff.Tensors(), cmp.AllowUnexported(Tensors{})); diff != "" {
 				t.Errorf("Mismatch (-want +got):\n%s", diff)
 			}
 		})
 	}
 }
--- a/fs/ggml/type.go
+++ b/fs/ggml/type.go
@ -12,42 +12,42 @@ type FileType uint32
 const (
 	FileTypeF32 FileType = iota
 	FileTypeF16
-	FileTypeQ4_0
+	fileTypeQ4_0
-	FileTypeQ4_1
+	fileTypeQ4_1
 	fileTypeQ4_1_F16 // unused by GGML
 	fileTypeQ4_2     // unused by GGML
 	fileTypeQ4_3     // unused by GGML
 	FileTypeQ8_0
-	FileTypeQ5_0
+	fileTypeQ5_0
-	FileTypeQ5_1
+	fileTypeQ5_1
-	FileTypeQ2_K
+	fileTypeQ2_K
-	FileTypeQ3_K_S
+	fileTypeQ3_K_S
-	FileTypeQ3_K_M
+	fileTypeQ3_K_M
-	FileTypeQ3_K_L
+	fileTypeQ3_K_L
 	FileTypeQ4_K_S
 	FileTypeQ4_K_M
-	FileTypeQ5_K_S
+	fileTypeQ5_K_S
-	FileTypeQ5_K_M
+	fileTypeQ5_K_M
-	FileTypeQ6_K
+	fileTypeQ6_K
-	fileTypeIQ2_XXS // not supported by ollama
+	fileTypeIQ2_XXS
-	fileTypeIQ2_XS  // not supported by ollama
+	fileTypeIQ2_XS
-	FileTypeQ2_K_S
+	fileTypeQ2_K_S
-	fileTypeIQ3_XS  // not supported by ollama
+	fileTypeIQ3_XS
-	fileTypeIQ3_XXS // not supported by ollama
+	fileTypeIQ3_XXS
-	fileTypeIQ1_S   // not supported by ollama
+	fileTypeIQ1_S
-	fileTypeIQ4_NL  // not supported by ollama
+	fileTypeIQ4_NL
-	fileTypeIQ3_S   // not supported by ollama
+	fileTypeIQ3_S
-	fileTypeIQ3_M   // not supported by ollama
+	fileTypeIQ3_M
-	fileTypeIQ2_S   // not supported by ollama
+	fileTypeIQ2_S
-	fileTypeIQ2_M   // not supported by ollama
+	fileTypeIQ2_M
-	fileTypeIQ4_XS  // not supported by ollama
+	fileTypeIQ4_XS
-	fileTypeIQ1_M   // not supported by ollama
+	fileTypeIQ1_M
 	FileTypeBF16
 	fileTypeQ4_0_4_4 // unused by GGML
 	fileTypeQ4_0_4_8 // unused by GGML
 	fileTypeQ4_0_8_8 // unused by GGML
-	fileTypeTQ1_0    // not supported by ollama
+	fileTypeTQ1_0
-	fileTypeTQ2_0    // not supported by ollama
+	fileTypeTQ2_0
 	FileTypeUnknown = 1024
 )
@ -60,36 +60,12 @@ func ParseFileType(s string) (FileType, error) {
 		return FileTypeF32, nil
 	case "F16":
 		return FileTypeF16, nil
 	case "Q4_0":
 		return FileTypeQ4_0, nil
 	case "Q4_1":
 		return FileTypeQ4_1, nil
 	case "Q8_0":
 		return FileTypeQ8_0, nil
 	case "Q5_0":
 		return FileTypeQ5_0, nil
 	case "Q5_1":
 		return FileTypeQ5_1, nil
 	case "Q2_K":
 		return FileTypeQ2_K, nil
 	case "Q3_K_S":
 		return FileTypeQ3_K_S, nil
 	case "Q3_K_M":
 		return FileTypeQ3_K_M, nil
 	case "Q3_K_L":
 		return FileTypeQ3_K_L, nil
 	case "Q4_K_S":
 		return FileTypeQ4_K_S, nil
 	case "Q4_K_M", "Q4_K":
 		return FileTypeQ4_K_M, nil
 	case "Q5_K_S":
 		return FileTypeQ5_K_S, nil
 	case "Q5_K_M", "Q5_K":
 		return FileTypeQ5_K_M, nil
 	case "Q6_K":
 		return FileTypeQ6_K, nil
 	case "Q2_K_S":
 		return FileTypeQ2_K_S, nil
 	case "BF16":
 		return FileTypeBF16, nil
 	default:
@ -111,40 +87,41 @@ func ParseFileType(s string) (FileType, error) {
 }
 func (t FileType) String() string {
 	// Note: this routine will return a broader set of file types for existing models
 	switch t {
 	case FileTypeF32:
 		return "F32"
 	case FileTypeF16:
 		return "F16"
-	case FileTypeQ4_0:
+	case fileTypeQ4_0:
 		return "Q4_0"
-	case FileTypeQ4_1:
+	case fileTypeQ4_1:
 		return "Q4_1"
 	case FileTypeQ8_0:
 		return "Q8_0"
-	case FileTypeQ5_0:
+	case fileTypeQ5_0:
 		return "Q5_0"
-	case FileTypeQ5_1:
+	case fileTypeQ5_1:
 		return "Q5_1"
-	case FileTypeQ2_K:
+	case fileTypeQ2_K:
 		return "Q2_K"
-	case FileTypeQ3_K_S:
+	case fileTypeQ3_K_S:
 		return "Q3_K_S"
-	case FileTypeQ3_K_M:
+	case fileTypeQ3_K_M:
 		return "Q3_K_M"
-	case FileTypeQ3_K_L:
+	case fileTypeQ3_K_L:
 		return "Q3_K_L"
 	case FileTypeQ4_K_S:
 		return "Q4_K_S"
 	case FileTypeQ4_K_M:
 		return "Q4_K_M"
-	case FileTypeQ5_K_S:
+	case fileTypeQ5_K_S:
 		return "Q5_K_S"
-	case FileTypeQ5_K_M:
+	case fileTypeQ5_K_M:
 		return "Q5_K_M"
-	case FileTypeQ6_K:
+	case fileTypeQ6_K:
 		return "Q6_K"
-	case FileTypeQ2_K_S:
+	case fileTypeQ2_K_S:
 		return "Q2_K_S"
 	case FileTypeBF16:
 		return "BF16"
@ -163,35 +140,35 @@ func (ftype FileType) ToTensorType() TensorType {
 		return TensorTypeF32
 	case FileTypeF16:
 		return TensorTypeF16
-	case FileTypeQ4_0:
+	case fileTypeQ4_0:
 		return TensorTypeQ4_0
-	case FileTypeQ4_1:
+	case fileTypeQ4_1:
 		return TensorTypeQ4_1
 	case FileTypeQ8_0:
 		return TensorTypeQ8_0
-	case FileTypeQ5_0:
+	case fileTypeQ5_0:
 		return TensorTypeQ5_0
-	case FileTypeQ5_1:
+	case fileTypeQ5_1:
 		return TensorTypeQ5_1
-	case FileTypeQ2_K:
+	case fileTypeQ2_K:
 		return TensorTypeQ2_K
-	case FileTypeQ3_K_S:
+	case fileTypeQ3_K_S:
 		return TensorTypeQ3_K
-	case FileTypeQ3_K_M:
+	case fileTypeQ3_K_M:
 		return TensorTypeQ3_K
-	case FileTypeQ3_K_L:
+	case fileTypeQ3_K_L:
 		return TensorTypeQ3_K
 	case FileTypeQ4_K_S:
 		return TensorTypeQ4_K
 	case FileTypeQ4_K_M:
 		return TensorTypeQ4_K
-	case FileTypeQ5_K_S:
+	case fileTypeQ5_K_S:
 		return TensorTypeQ5_K
-	case FileTypeQ5_K_M:
+	case fileTypeQ5_K_M:
 		return TensorTypeQ5_K
-	case FileTypeQ6_K:
+	case fileTypeQ6_K:
 		return TensorTypeQ6_K
-	case FileTypeQ2_K_S:
+	case fileTypeQ2_K_S:
 		return TensorTypeQ2_K
 	case FileTypeBF16:
 		return TensorTypeBF16
--- a/fs/gguf/gguf.go
+++ b/fs/gguf/gguf.go
@ -0,0 +1,347 @@
 package gguf
 import (
 	"bytes"
 	"cmp"
 	"encoding/binary"
 	"errors"
 	"fmt"
 	"io"
 	"iter"
 	"os"
 	"slices"
 	"strings"
 )
 // Value type identifiers. readKeyValue and readArray read one of these as a
 // uint32 and switch on it to pick the decoder. The numeric values come from
 // declaration order (iota), so entries must not be reordered.
 const (
 	typeUint8 uint32 = iota
 	typeInt8
 	typeUint16
 	typeInt16
 	typeUint32
 	typeInt32
 	typeFloat32
 	typeBool
 	typeString
 	typeArray
 	typeUint64
 	typeInt64
 	typeFloat64
 )
 // ErrUnsupported is wrapped by the errors returned for a file type, version or
 // value type that this package does not handle; match it with errors.Is.
 var ErrUnsupported = errors.New("unsupported")
 // File is an open GGUF file. Key-values and tensor infos are held in lazy
 // readers and are pulled from the file as lookups require them.
 type File struct {
 	// Magic holds the first four bytes read from the file.
 	Magic   [4]byte
 	// Version is the uint32 read directly after Magic.
 	Version uint32
 	keyValues *lazy[KeyValue]
 	tensors   *lazy[TensorInfo]
 	// offset is the aligned position computed once the tensor infos have been
 	// read; TensorReader adds a tensor's own Offset to it.
 	offset    int64
 	file   *os.File
 	reader *bufferedReader
 	// bts is a scratch buffer reused by readString.
 	bts    []byte
 }
 // Open opens the GGUF file at path, reads its magic and version, and sets up
 // the lazy readers for key-values and tensor infos.
 //
 // It returns an error wrapping ErrUnsupported when the magic or version is
 // rejected. On any error the returned *File is nil and the underlying file
 // has already been closed. On success the caller must call Close.
 func Open(path string) (f *File, err error) {
 	file, err := os.Open(path)
 	if err != nil {
 		return nil, err
 	}
 	// Every failure below returns a nil *File, so the caller has no handle to
 	// close. Release the descriptor here instead of leaking it.
 	defer func() {
 		if err != nil {
 			_ = file.Close()
 		}
 	}()
 	f = &File{bts: make([]byte, 4096), file: file}
 	f.reader = newBufferedReader(f.file, 32<<10)
 	if err := binary.Read(f.reader, binary.LittleEndian, &f.Magic); err != nil {
 		return nil, err
 	}
 	// NOTE(review): this rejects only the lowercase magic "gguf" and accepts
 	// everything else; confirm that this is the intended check.
 	if bytes.Equal(f.Magic[:], []byte("gguf")) {
 		return nil, fmt.Errorf("%w file type %v", ErrUnsupported, f.Magic)
 	}
 	if err := binary.Read(f.reader, binary.LittleEndian, &f.Version); err != nil {
 		return nil, err
 	}
 	if f.Version < 2 {
 		return nil, fmt.Errorf("%w version %v", ErrUnsupported, f.Version)
 	}
 	f.tensors, err = newLazy(f, f.readTensor)
 	if err != nil {
 		return nil, err
 	}
 	// Runs once the tensor infos are exhausted: round the reader position up
 	// to the next multiple of the alignment to find where tensor data starts.
 	f.tensors.successFunc = func() error {
 		offset := f.reader.offset
 		// Int() yields 0 for unsigned values and Uint() yields 0 for signed
 		// ones, so consult both before falling back to the default of 32.
 		// Reading only Int() would silently ignore an unsigned alignment.
 		kv := f.KeyValue("general.alignment")
 		alignment := cmp.Or(int64(kv.Uint()), kv.Int(), 32)
 		f.offset = offset + (alignment-offset%alignment)%alignment
 		return nil
 	}
 	f.keyValues, err = newLazy(f, f.readKeyValue)
 	if err != nil {
 		return nil, err
 	}
 	return f, nil
 }
 // readTensor decodes one tensor info record from the reader, in this order:
 // name string, uint32 dimension count, that many uint64 dimensions, uint32
 // type, uint64 offset. Any read error aborts and returns a zero TensorInfo.
 func (f *File) readTensor() (TensorInfo, error) {
 	var (
 		info TensorInfo
 		err  error
 	)
 	if info.Name, err = readString(f); err != nil {
 		return TensorInfo{}, err
 	}
 	var rank uint32
 	if rank, err = read[uint32](f); err != nil {
 		return TensorInfo{}, err
 	}
 	info.Shape = make([]uint64, rank)
 	for axis := range info.Shape {
 		if info.Shape[axis], err = read[uint64](f); err != nil {
 			return TensorInfo{}, err
 		}
 	}
 	var kind uint32
 	if kind, err = read[uint32](f); err != nil {
 		return TensorInfo{}, err
 	}
 	info.Type = TensorType(kind)
 	if info.Offset, err = read[uint64](f); err != nil {
 		return TensorInfo{}, err
 	}
 	return info, nil
 }
 // readKeyValue decodes one key-value record from the reader: a key string, a
 // uint32 value type, then the value itself. An unknown value type produces an
 // error wrapping ErrUnsupported. Any error returns a zero KeyValue.
 func (f *File) readKeyValue() (KeyValue, error) {
 	key, err := readString(f)
 	if err != nil {
 		return KeyValue{}, err
 	}
 	valueType, err := read[uint32](f)
 	if err != nil {
 		return KeyValue{}, err
 	}
 	var decoded any
 	switch valueType {
 	case typeUint8:
 		decoded, err = read[uint8](f)
 	case typeInt8:
 		decoded, err = read[int8](f)
 	case typeUint16:
 		decoded, err = read[uint16](f)
 	case typeInt16:
 		decoded, err = read[int16](f)
 	case typeUint32:
 		decoded, err = read[uint32](f)
 	case typeInt32:
 		decoded, err = read[int32](f)
 	case typeUint64:
 		decoded, err = read[uint64](f)
 	case typeInt64:
 		decoded, err = read[int64](f)
 	case typeFloat32:
 		decoded, err = read[float32](f)
 	case typeFloat64:
 		decoded, err = read[float64](f)
 	case typeBool:
 		decoded, err = read[bool](f)
 	case typeString:
 		decoded, err = readString(f)
 	case typeArray:
 		decoded, err = readArray(f)
 	default:
 		err = fmt.Errorf("%w type %d", ErrUnsupported, valueType)
 	}
 	if err != nil {
 		return KeyValue{}, err
 	}
 	return KeyValue{Key: key, Value: Value{decoded}}, nil
 }
 // read decodes a single little-endian value of type T from the file's reader.
 func read[T any](f *File) (T, error) {
 	var out T
 	if err := binary.Read(f.reader, binary.LittleEndian, &out); err != nil {
 		return out, err
 	}
 	return out, nil
 }
 // readString decodes a string stored as a uint64 byte length followed by that
 // many bytes. The bytes are staged in the File's scratch buffer, which is
 // replaced by a larger one when the string does not fit.
 func readString(f *File) (string, error) {
 	length, err := read[uint64](f)
 	if err != nil {
 		return "", err
 	}
 	if len(f.bts) < int(length) {
 		f.bts = make([]byte, length)
 	}
 	scratch := f.bts[:length]
 	if _, err := io.ReadFull(f.reader, scratch); err != nil {
 		return "", err
 	}
 	// The conversion below copies the bytes; the scratch region is then zeroed on return.
 	defer clear(scratch)
 	return string(scratch), nil
 }
 // readArray decodes an array value: a uint32 element type, a uint64 element
 // count, then the elements. The result is a slice of the matching Go type.
 // An unknown element type produces an error wrapping ErrUnsupported.
 func readArray(f *File) (any, error) {
 	elemType, err := read[uint32](f)
 	if err != nil {
 		return nil, err
 	}
 	count, err := read[uint64](f)
 	if err != nil {
 		return nil, err
 	}
 	switch elemType {
 	// unsigned integers
 	case typeUint8:
 		return readArrayData[uint8](f, count)
 	case typeUint16:
 		return readArrayData[uint16](f, count)
 	case typeUint32:
 		return readArrayData[uint32](f, count)
 	case typeUint64:
 		return readArrayData[uint64](f, count)
 	// signed integers
 	case typeInt8:
 		return readArrayData[int8](f, count)
 	case typeInt16:
 		return readArrayData[int16](f, count)
 	case typeInt32:
 		return readArrayData[int32](f, count)
 	case typeInt64:
 		return readArrayData[int64](f, count)
 	// floating point
 	case typeFloat32:
 		return readArrayData[float32](f, count)
 	case typeFloat64:
 		return readArrayData[float64](f, count)
 	case typeBool:
 		return readArrayData[bool](f, count)
 	case typeString:
 		return readArrayString(f, count)
 	}
 	return nil, fmt.Errorf("%w type %d", ErrUnsupported, elemType)
 }
 // readArrayData decodes n consecutive values of type T from the file. It
 // returns nil and the error as soon as any element fails to decode.
 func readArrayData[T any](f *File, n uint64) ([]T, error) {
 	out := make([]T, n)
 	for idx := range out {
 		elem, err := read[T](f)
 		if err != nil {
 			return nil, err
 		}
 		out[idx] = elem
 	}
 	return out, nil
 }
 // readArrayString decodes n consecutive strings from the file. It returns nil
 // and the error as soon as any string fails to decode.
 func readArrayString(f *File, n uint64) ([]string, error) {
 	out := make([]string, n)
 	for idx := range out {
 		str, err := readString(f)
 		if err != nil {
 			return nil, err
 		}
 		out[idx] = str
 	}
 	return out, nil
 }
 // Close stops the key-value and tensor lazy readers, then closes the
 // underlying file and returns the error from that close.
 func (f *File) Close() error {
 	f.keyValues.stop()
 	f.tensors.stop()
 	return f.file.Close()
 }
 // KeyValue looks up a key-value pair by key. Keys that do not start with
 // "general." or "tokenizer." are first prefixed with the value of
 // "general.architecture" and a dot. Already-read pairs are searched first;
 // after that, further pairs are pulled from the file until the key is found
 // or the pairs run out, in which case a zero KeyValue is returned.
 func (f *File) KeyValue(key string) KeyValue {
 	qualified := strings.HasPrefix(key, "general.") || strings.HasPrefix(key, "tokenizer.")
 	if !qualified {
 		key = f.KeyValue("general.architecture").String() + "." + key
 	}
 	for _, cached := range f.keyValues.values {
 		if cached.Key == key {
 			return cached
 		}
 	}
 	for {
 		kv, ok := f.keyValues.next()
 		if !ok {
 			return KeyValue{}
 		}
 		if kv.Key == key {
 			return kv
 		}
 	}
 }
 // NumKeyValues returns the key-value reader's count as an int.
 // NOTE(review): presumably the total number of pairs declared by the file, not the number read so far; confirm against lazy.
 func (f *File) NumKeyValues() int {
 	return int(f.keyValues.count)
 }
 // KeyValues returns an iterator of index and KeyValue pairs, as produced by the key-value reader's All method.
 func (f *File) KeyValues() iter.Seq2[int, KeyValue] {
 	return f.keyValues.All()
 }
 // TensorInfo looks up a tensor info by name. Already-read infos are searched
 // first; otherwise further infos are pulled from the file until the name is
 // found or the infos run out, in which case a zero TensorInfo is returned.
 func (f *File) TensorInfo(name string) TensorInfo {
 	for _, cached := range f.tensors.values {
 		if cached.Name == name {
 			return cached
 		}
 	}
 	// fast-forward through key values if we haven't already
 	_ = f.keyValues.rest()
 	for {
 		info, ok := f.tensors.next()
 		if !ok {
 			return TensorInfo{}
 		}
 		if info.Name == name {
 			return info
 		}
 	}
 }
 // NumTensors returns the tensor reader's count as an int.
 // NOTE(review): presumably the total number of tensors declared by the file, not the number read so far; confirm against lazy.
 func (f *File) NumTensors() int {
 	return int(f.tensors.count)
 }
 // TensorInfos returns an iterator of index and TensorInfo pairs, as produced by the tensor reader's All method.
 func (f *File) TensorInfos() iter.Seq2[int, TensorInfo] {
 	// fast forward through key values if we haven't already
 	f.keyValues.rest()
 	return f.tensors.All()
 }
 // TensorReader returns the info for the named tensor together with a reader
 // over the tensor's bytes in the file. A tensor whose NumBytes is zero is
 // reported as not found.
 func (f *File) TensorReader(name string) (TensorInfo, io.Reader, error) {
 	info := f.TensorInfo(name)
 	size := info.NumBytes()
 	if size == 0 {
 		return TensorInfo{}, nil, fmt.Errorf("tensor %s not found", name)
 	}
 	// fast forward through tensor info if we haven't already
 	_ = f.tensors.rest()
 	// f.offset must be read only after the fast-forward above.
 	start := f.offset + int64(info.Offset)
 	return info, io.NewSectionReader(f.file, start, size), nil
 }
--- a/fs/gguf/gguf_test.go
+++ b/fs/gguf/gguf_test.go
@ -0,0 +1,249 @@
 package gguf_test
 import (
 	"bytes"
 	"os"
 	"strconv"
 	"strings"
 	"testing"
 	"github.com/google/go-cmp/cmp"
 	"github.com/google/go-cmp/cmp/cmpopts"
 	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/fs/gguf"
 )
 // createBinFile writes a small GGUF fixture into the test's temp directory and
 // returns its path. The fixture has llama-style metadata, two top-level
 // tensors, and four attention tensors for each of eight blocks.
 func createBinFile(tb testing.TB) string {
 	tb.Helper()
 	f, err := os.CreateTemp(tb.TempDir(), "")
 	if err != nil {
 		tb.Fatal(err)
 	}
 	defer f.Close()
 	metadata := ggml.KV{
 		"general.architecture":                   "llama",
 		"llama.block_count":                      uint32(8),
 		"llama.embedding_length":                 uint32(3),
 		"llama.attention.head_count":             uint32(2),
 		"llama.attention.head_count_kv":          uint32(2),
 		"llama.attention.key_length":             uint32(3),
 		"llama.rope.dimension_count":             uint32(4),
 		"llama.rope.freq_base":                   float32(10000.0),
 		"llama.rope.freq_scale":                  float32(1.0),
 		"llama.attention.layer_norm_rms_epsilon": float32(1e-6),
 		"tokenizer.ggml.eos_token_id":            uint32(0),
 		"tokenizer.ggml.eos_token_ids":           []int32{1, 2, 3},
 		"tokenizer.ggml.tokens":                  []string{"hello", "world"},
 		"tokenizer.ggml.scores":                  []float32{0, 1},
 	}
 	tensors := []*ggml.Tensor{
 		{
 			Name:     "token_embd.weight",
 			Kind:     0,
 			Shape:    []uint64{2, 3},
 			WriterTo: bytes.NewBuffer(make([]byte, 4*2*3)),
 		},
 		{
 			Name:     "output.weight",
 			Kind:     0,
 			Shape:    []uint64{3, 2},
 			WriterTo: bytes.NewBuffer(make([]byte, 4*3*2)),
 		},
 	}
 	// Per-block tensors are appended in q, k, v, output order.
 	suffixes := []string{".attn_q.weight", ".attn_k.weight", ".attn_v.weight", ".attn_output.weight"}
 	for i := range 8 {
 		prefix := "blk." + strconv.Itoa(i)
 		for _, suffix := range suffixes {
 			tensors = append(tensors, &ggml.Tensor{
 				Name:     prefix + suffix,
 				Kind:     0,
 				Shape:    []uint64{3, 3},
 				WriterTo: bytes.NewBuffer(make([]byte, 4*3*3)),
 			})
 		}
 	}
 	if err := ggml.WriteGGUF(f, metadata, tensors); err != nil {
 		tb.Fatal(err)
 	}
 	return f.Name()
 }
 // TestRead opens the fixture written by createBinFile and checks key-value
 // lookup, tensor info lookup, both iterators, and reading one tensor's bytes.
 func TestRead(t *testing.T) {
 	f, err := gguf.Open(createBinFile(t))
 	if err != nil {
 		t.Fatal(err)
 	}
 	defer f.Close()
 	if got := f.KeyValue("does.not.exist").Valid(); got {
 		t.Errorf(`KeyValue("does.not.exist").Valid() = %v, want false`, got)
 	}
 	if got := f.KeyValue("general.architecture").String(); got != "llama" {
 		t.Errorf(`KeyValue("general.architecture").String() = %q, want %q`, got, "llama")
 	}
 	if got := f.TensorInfo("token_embd.weight"); got.Name != "token_embd.weight" {
 		t.Errorf(`TensorInfo("token_embd.weight").Name = %q, want %q`, got.Name, "token_embd.weight")
 	} else if diff := cmp.Diff(got.Shape, []uint64{2, 3}); diff != "" {
 		t.Errorf(`TensorInfo("token_embd.weight").Shape mismatch (-got +want):\n%s`, diff)
 	} else if got.Type != gguf.TensorTypeF32 {
 		t.Errorf(`TensorInfo("token_embd.weight").Type = %d, want %d`, got.Type, gguf.TensorTypeF32)
 	}
 	// An unprefixed key is resolved against the architecture ("llama.block_count").
 	if got := f.KeyValue("block_count").Uint(); got != 8 {
 		t.Errorf(`KeyValue("block_count").Uint() = %d, want %d`, got, 8)
 	}
 	if diff := cmp.Diff(f.KeyValue("tokenizer.ggml.tokens").Strings(), []string{"hello", "world"}); diff != "" {
 		t.Errorf("KeyValue(\"tokenizer.ggml.tokens\").Strings() mismatch (-got +want):\n%s", diff)
 	}
 	if diff := cmp.Diff(f.KeyValue("tokenizer.ggml.scores").Floats(), []float64{0, 1}); diff != "" {
 		t.Errorf("KeyValue(\"tokenizer.ggml.scores\").Floats() mismatch (-got +want):\n%s", diff)
 	}
 	var kvs []string
 	for _, kv := range f.KeyValues() {
 		if !kv.Valid() {
 			t.Error("found invalid key-value pair:", kv)
 		}
 		kvs = append(kvs, kv.Key)
 	}
 	if len(kvs) != f.NumKeyValues() {
 		t.Errorf("iterated key count = %d, want %d", len(kvs), f.NumKeyValues())
 	}
 	// Order is not asserted: both sides are sorted before comparison.
 	if diff := cmp.Diff(kvs, []string{
 		"general.architecture",
 		"llama.block_count",
 		"llama.embedding_length",
 		"llama.attention.head_count",
 		"llama.attention.head_count_kv",
 		"llama.attention.key_length",
 		"llama.rope.dimension_count",
 		"llama.rope.freq_base",
 		"llama.rope.freq_scale",
 		"llama.attention.layer_norm_rms_epsilon",
 		"tokenizer.ggml.eos_token_id",
 		"tokenizer.ggml.eos_token_ids",
 		"tokenizer.ggml.tokens",
 		"tokenizer.ggml.scores",
 	}, cmpopts.SortSlices(strings.Compare)); diff != "" {
 		t.Errorf("KeyValues() mismatch (-got +want):\n%s", diff)
 	}
 	var tis []string
 	for _, ti := range f.TensorInfos() {
 		if !ti.Valid() {
 			t.Error("found invalid tensor info:", ti)
 		}
 		tis = append(tis, ti.Name)
 	}
 	if len(tis) != f.NumTensors() {
 		t.Errorf("iterated tensor count = %d, want %d", len(tis), f.NumTensors())
 	}
 	if diff := cmp.Diff(tis, []string{
 		"token_embd.weight",
 		"output.weight",
 		"blk.0.attn_q.weight",
 		"blk.0.attn_k.weight",
 		"blk.0.attn_v.weight",
 		"blk.0.attn_output.weight",
 		"blk.1.attn_q.weight",
 		"blk.1.attn_k.weight",
 		"blk.1.attn_v.weight",
 		"blk.1.attn_output.weight",
 		"blk.2.attn_q.weight",
 		"blk.2.attn_k.weight",
 		"blk.2.attn_v.weight",
 		"blk.2.attn_output.weight",
 		"blk.3.attn_q.weight",
 		"blk.3.attn_k.weight",
 		"blk.3.attn_v.weight",
 		"blk.3.attn_output.weight",
 		"blk.4.attn_q.weight",
 		"blk.4.attn_k.weight",
 		"blk.4.attn_v.weight",
 		"blk.4.attn_output.weight",
 		"blk.5.attn_q.weight",
 		"blk.5.attn_k.weight",
 		"blk.5.attn_v.weight",
 		"blk.5.attn_output.weight",
 		"blk.6.attn_q.weight",
 		"blk.6.attn_k.weight",
 		"blk.6.attn_v.weight",
 		"blk.6.attn_output.weight",
 		"blk.7.attn_q.weight",
 		"blk.7.attn_k.weight",
 		"blk.7.attn_v.weight",
 		"blk.7.attn_output.weight",
 	}, cmpopts.SortSlices(strings.Compare)); diff != "" {
 		t.Errorf("TensorInfos() mismatch (-got +want):\n%s", diff)
 	}
 	ti, r, err := f.TensorReader("output.weight")
 	if err != nil {
 		t.Fatalf(`TensorReader("output.weight") error: %v`, err)
 	}
 	if ti.Name != "output.weight" {
 		t.Errorf(`TensorReader("output.weight").Name = %q, want %q`, ti.Name, "output.weight")
 	} else if diff := cmp.Diff(ti.Shape, []uint64{3, 2}); diff != "" {
 		t.Errorf(`TensorReader("output.weight").Shape mismatch (-got +want):\n%s`, diff)
 	} else if ti.Type != gguf.TensorTypeF32 {
 		t.Errorf(`TensorReader("output.weight").Type = %d, want %d`, ti.Type, gguf.TensorTypeF32)
 	}
 	var b bytes.Buffer
 	if _, err := b.ReadFrom(r); err != nil {
 		t.Fatalf(`ReadFrom TensorReader("output.weight") error: %v`, err)
 	}
 	if b.Len() != int(ti.NumBytes()) {
 		t.Errorf(`ReadFrom TensorReader("output.weight") length = %d, want %d`, b.Len(), ti.NumBytes())
 	}
 }
 // BenchmarkRead measures opening the fixture, looking up one key-value pair,
 // and iterating over all tensor infos.
 func BenchmarkRead(b *testing.B) {
 	b.ReportAllocs()
 	path := createBinFile(b)
 	for b.Loop() {
 		f, err := gguf.Open(path)
 		if err != nil {
 			b.Fatal(err)
 		}
 		arch := f.KeyValue("general.architecture").String()
 		if arch != "llama" {
 			b.Errorf("got = %q, want %q", arch, "llama")
 		}
 		// Drain the tensor info iterator so decoding the records is part of the measurement.
 		for range f.TensorInfos() {
 		}
 		f.Close()
 	}
 }
--- a/fs/gguf/keyvalue.go
+++ b/fs/gguf/keyvalue.go
@ -0,0 +1,90 @@
 package gguf
 import (
 	"reflect"
 	"slices"
 )
 // KeyValue is a metadata key together with its value. Value is embedded, so
 // its typed accessors can be called directly on a KeyValue.
 type KeyValue struct {
 	Key string
 	Value
 }
 // Valid reports whether kv has a non-empty key and a non-nil value. The zero
 // KeyValue is not valid.
 func (kv KeyValue) Valid() bool {
 	return kv.Key != "" && kv.Value.value != nil
 }
 // Value wraps a decoded value of any type. Use the typed accessor methods to
 // read it; an accessor returns its zero value when the stored kind does not match.
 type Value struct {
 	value any
 }
 // value converts the wrapped value to T when its reflect kind is one of
 // kinds. For any other kind, including a nil wrapped value, it returns the
 // zero T.
 func value[T any](v Value, kinds ...reflect.Kind) (t T) {
 	rv := reflect.ValueOf(v.value)
 	if !slices.Contains(kinds, rv.Kind()) {
 		return t
 	}
 	return rv.Convert(reflect.TypeOf(t)).Interface().(T)
 }
 // values converts the wrapped slice to []T when its element kind is one of
 // kinds. It returns nil when the wrapped value is not a slice or its element
 // kind does not match.
 func values[T any](v Value, kinds ...reflect.Kind) (ts []T) {
 	rv := reflect.ValueOf(v.value)
 	if rv.Kind() != reflect.Slice || !slices.Contains(kinds, rv.Type().Elem().Kind()) {
 		return nil
 	}
 	var zero T
 	target := reflect.TypeOf(zero)
 	ts = make([]T, rv.Len())
 	for i := range ts {
 		ts[i] = rv.Index(i).Convert(target).Interface().(T)
 	}
 	return ts
 }
 // Int returns Value as a signed integer. If it is not a signed integer
 // (unsigned integers included), it returns 0.
 func (v Value) Int() int64 {
 	return value[int64](v, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64)
 }
 // Ints returns Value as a signed integer slice. If it is not a signed integer slice, it returns nil.
 func (v Value) Ints() (i64s []int64) {
 	return values[int64](v, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64)
 }
 // Uint returns Value as an unsigned integer. If it is not an unsigned integer
 // (signed integers included), it returns 0.
 func (v Value) Uint() uint64 {
 	return value[uint64](v, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64)
 }
 // Uints returns Value as an unsigned integer slice. If it is not an unsigned integer slice, it returns nil.
 func (v Value) Uints() (u64s []uint64) {
 	return values[uint64](v, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64)
 }
 // Float returns Value as a float. If it is not a float, it returns 0.
 func (v Value) Float() float64 {
 	return value[float64](v, reflect.Float32, reflect.Float64)
 }
 // Floats returns Value as a float slice. If it is not a float slice, it returns nil.
 func (v Value) Floats() (f64s []float64) {
 	return values[float64](v, reflect.Float32, reflect.Float64)
 }
 // Bool returns Value as a boolean. If it is not a boolean, it returns false.
 func (v Value) Bool() bool {
 	return value[bool](v, reflect.Bool)
 }
 // Bools returns Value as a boolean slice. If it is not a boolean slice, it returns nil.
 func (v Value) Bools() (bools []bool) {
 	return values[bool](v, reflect.Bool)
 }
 // String returns Value as a string. If it is not a string, it returns an empty string.
 func (v Value) String() string {
 	return value[string](v, reflect.String)
 }
 // Strings returns Value as a string slice. If it is not a string slice, it returns nil.
 func (v Value) Strings() (strings []string) {
 	return values[string](v, reflect.String)
 }
--- a/fs/gguf/keyvalue_test.go
+++ b/fs/gguf/keyvalue_test.go
@ -0,0 +1,208 @@
 package gguf
 import (
 	"testing"
 	"github.com/google/go-cmp/cmp"
 )
 // split partitions a table of test inputs: the group stored under name is
 // returned as matched, and the members of every other group are concatenated
 // into unmatched. Because map iteration order is unspecified, the order of
 // unmatched is not stable between calls.
 func split(name string, values map[string][]any) (matched []any, unmatched []any) {
 	for k, group := range values {
 		if k != name {
 			unmatched = append(unmatched, group...)
 			continue
 		}
 		matched = group
 	}
 	return matched, unmatched
 }
 // TestValue checks the scalar accessors reached through KeyValue. For each
 // accessor, every input of its own family must come back as the stored value
 // and every input of any other family must come back as the zero value.
 // Failure messages name the value that was actually expected and, for
 // mismatched families, the Go type of the offending input.
 func TestValue(t *testing.T) {
 	values := map[string][]any{
 		"int64":   {int(42), int8(42), int16(42), int32(42), int64(42)},
 		"uint64":  {uint(42), uint8(42), uint16(42), uint32(42), uint64(42)},
 		"float64": {float32(42), float64(42)},
 		"string":  {"42", "hello"},
 		"bool":    {true, false},
 	}
 	t.Run("int64", func(t *testing.T) {
 		matched, unmatched := split("int64", values)
 		for _, v := range matched {
 			kv := KeyValue{"key", Value{v}}
 			if i64 := kv.Int(); i64 != 42 {
 				t.Errorf("expected 42, got %d", i64)
 			}
 		}
 		for _, v := range unmatched {
 			kv := KeyValue{"key", Value{v}}
 			if i64 := kv.Int(); i64 != 0 {
 				t.Errorf("expected 0 for %T, got %d", v, i64)
 			}
 		}
 	})
 	t.Run("uint64", func(t *testing.T) {
 		matched, unmatched := split("uint64", values)
 		for _, v := range matched {
 			kv := KeyValue{"key", Value{v}}
 			if u64 := kv.Uint(); u64 != 42 {
 				t.Errorf("expected 42, got %d", u64)
 			}
 		}
 		for _, v := range unmatched {
 			kv := KeyValue{"key", Value{v}}
 			if u64 := kv.Uint(); u64 != 0 {
 				t.Errorf("expected 0 for %T, got %d", v, u64)
 			}
 		}
 	})
 	t.Run("float64", func(t *testing.T) {
 		matched, unmatched := split("float64", values)
 		for _, v := range matched {
 			kv := KeyValue{"key", Value{v}}
 			if f64 := kv.Float(); f64 != 42 {
 				t.Errorf("expected 42, got %f", f64)
 			}
 		}
 		for _, v := range unmatched {
 			kv := KeyValue{"key", Value{v}}
 			if f64 := kv.Float(); f64 != 0 {
 				t.Errorf("expected 0 for %T, got %f", v, f64)
 			}
 		}
 	})
 	t.Run("string", func(t *testing.T) {
 		matched, unmatched := split("string", values)
 		for _, v := range matched {
 			kv := KeyValue{"key", Value{v}}
 			// The expected string differs per input, so report v itself.
 			if s := kv.String(); s != v {
 				t.Errorf("expected %v, got %s", v, s)
 			}
 		}
 		for _, v := range unmatched {
 			kv := KeyValue{"key", Value{v}}
 			if s := kv.String(); s != "" {
 				t.Errorf("expected empty string for %T, got %q", v, s)
 			}
 		}
 	})
 	t.Run("bool", func(t *testing.T) {
 		matched, unmatched := split("bool", values)
 		for _, v := range matched {
 			kv := KeyValue{"key", Value{v}}
 			// The inputs include both true and false, so report v itself.
 			if b := kv.Bool(); b != v {
 				t.Errorf("expected %v, got %v", v, b)
 			}
 		}
 		for _, v := range unmatched {
 			kv := KeyValue{"key", Value{v}}
 			if b := kv.Bool(); b != false {
 				t.Errorf("expected false for %T, got %v", v, b)
 			}
 		}
 	})
 }
 // TestValues checks the slice accessors reached through KeyValue. For each
 // accessor, every input slice of its own family must come back converted to
 // the accessor's element type, and every input of any other family must come
 // back as nil.
 func TestValues(t *testing.T) {
 	values := map[string][]any{
 		"int64s":   {[]int{42}, []int8{42}, []int16{42}, []int32{42}, []int64{42}},
 		"uint64s":  {[]uint{42}, []uint8{42}, []uint16{42}, []uint32{42}, []uint64{42}},
 		"float64s": {[]float32{42}, []float64{42}},
 		"strings":  {[]string{"42"}, []string{"hello"}},
 		"bools":    {[]bool{true}, []bool{false}},
 	}
 	t.Run("int64s", func(t *testing.T) {
 		same, others := split("int64s", values)
 		for _, in := range same {
 			entry := KeyValue{"key", Value{in}}
 			diff := cmp.Diff(entry.Ints(), []int64{42})
 			if diff == "" {
 				continue
 			}
 			t.Errorf("diff: %s", diff)
 		}
 		for _, in := range others {
 			entry := KeyValue{"key", Value{in}}
 			got := entry.Ints()
 			if got == nil {
 				continue
 			}
 			t.Errorf("expected nil, got %v", got)
 		}
 	})
 	t.Run("uint64s", func(t *testing.T) {
 		same, others := split("uint64s", values)
 		for _, in := range same {
 			entry := KeyValue{"key", Value{in}}
 			diff := cmp.Diff(entry.Uints(), []uint64{42})
 			if diff == "" {
 				continue
 			}
 			t.Errorf("diff: %s", diff)
 		}
 		for _, in := range others {
 			entry := KeyValue{"key", Value{in}}
 			got := entry.Uints()
 			if got == nil {
 				continue
 			}
 			t.Errorf("expected nil, got %v", got)
 		}
 	})
 	t.Run("float64s", func(t *testing.T) {
 		same, others := split("float64s", values)
 		for _, in := range same {
 			entry := KeyValue{"key", Value{in}}
 			diff := cmp.Diff(entry.Floats(), []float64{42})
 			if diff == "" {
 				continue
 			}
 			t.Errorf("diff: %s", diff)
 		}
 		for _, in := range others {
 			entry := KeyValue{"key", Value{in}}
 			got := entry.Floats()
 			if got == nil {
 				continue
 			}
 			t.Errorf("expected nil, got %v", got)
 		}
 	})
 	t.Run("strings", func(t *testing.T) {
 		same, others := split("strings", values)
 		for _, in := range same {
 			entry := KeyValue{"key", Value{in}}
 			// The input itself is the expected result for string slices.
 			diff := cmp.Diff(entry.Strings(), in)
 			if diff == "" {
 				continue
 			}
 			t.Errorf("diff: %s", diff)
 		}
 		for _, in := range others {
 			entry := KeyValue{"key", Value{in}}
 			got := entry.Strings()
 			if got == nil {
 				continue
 			}
 			t.Errorf("expected nil, got %v", got)
 		}
 	})
 	t.Run("bools", func(t *testing.T) {
 		same, others := split("bools", values)
 		for _, in := range same {
 			entry := KeyValue{"key", Value{in}}
 			// The input itself is the expected result for bool slices.
 			diff := cmp.Diff(entry.Bools(), in)
 			if diff == "" {
 				continue
 			}
 			t.Errorf("diff: %s", diff)
 		}
 		for _, in := range others {
 			entry := KeyValue{"key", Value{in}}
 			got := entry.Bools()
 			if got == nil {
 				continue
 			}
 			t.Errorf("expected nil, got %v", got)
 		}
 	})
 }
--- a/fs/gguf/lazy.go
+++ b/fs/gguf/lazy.go
@ -0,0 +1,89 @@
 package gguf
 import (
 	"encoding/binary"
 	"iter"
 	"log/slog"
 )
 // lazy is a sequence of values of type T that are produced on demand through
 // a pull-style iterator and cached as they are read, so earlier values can be
 // replayed without reading them again.
 type lazy[T any] struct {
 	// count is the number of values the sequence is expected to produce.
 	count  uint64
 	// next pulls one more value; the bool is false once no value is available.
 	next   func() (T, bool)
 	// stop ends the underlying pull iterator.
 	stop   func()
 	// values holds every value produced so far, in the order it was read.
 	values []T
 	// successFunc is called when all values have been successfully read.
 	successFunc func() error
 }
 // newLazy reads a little-endian uint64 element count from f.reader and returns
 // a lazy sequence that calls fn once per element, on demand, to produce it.
 // It returns an error only if the count itself cannot be read.
 //
 // Production happens inside an iter.Pull iterator: each pull calls fn once,
 // appends the result to values and hands it to the consumer. If fn fails, the
 // error is logged and the sequence ends early without calling successFunc.
 func newLazy[T any](f *File, fn func() (T, error)) (*lazy[T], error) {
 	it := lazy[T]{}
 	if err := binary.Read(f.reader, binary.LittleEndian, &it.count); err != nil {
 		return nil, err
 	}
 	it.values = make([]T, 0)
 	it.next, it.stop = iter.Pull(func(yield func(T) bool) {
 		for i := range it.count {
 			t, err := fn()
 			if err != nil {
 				// NOTE(review): the message says "tensor" although T is generic; confirm the wording suits every caller.
 				slog.Error("error reading tensor", "index", i, "error", err)
 				return
 			}
 			it.values = append(it.values, t)
 			if !yield(t) {
 				// yield reports false once the consumer has stopped. Leaving the
 				// loop with break (rather than return) means successFunc below
 				// still runs in that case.
 				// NOTE(review): confirm this is intended when fewer than count values were read.
 				break
 			}
 		}
 		// NOTE(review): the error returned by successFunc is discarded here.
 		if it.successFunc != nil {
 			it.successFunc()
 		}
 	})
 	return &it, nil
 }
 // Values returns an iterator over the values of the sequence without their
 // indices. It forwards to All and stops as soon as the consumer stops.
 func (g *lazy[T]) Values() iter.Seq[T] {
 	return func(yield func(T) bool) {
 		g.All()(func(_ int, item T) bool {
 			return yield(item)
 		})
 	}
 }
 // All returns an iterator over (index, value) pairs. Indices already present
 // in the cache are served from values; later indices are pulled with next.
 // Iteration ends after count items, when next reports that nothing more is
 // available, or when the consumer stops.
 func (g *lazy[T]) All() iter.Seq2[int, T] {
 	return func(yield func(int, T) bool) {
 		for i := range int(g.count) {
 			var item T
 			if i < len(g.values) {
 				item = g.values[i]
 			} else {
 				var ok bool
 				if item, ok = g.next(); !ok {
 					return
 				}
 			}
 			if !yield(i, item) {
 				return
 			}
 		}
 	}
 }
 // rest pulls values with next until it reports that none is left, discarding
 // the values themselves. It reports whether at least one value was pulled.
 func (g *lazy[T]) rest() (collected bool) {
 	for {
 		if _, more := g.next(); !more {
 			return collected
 		}
 		collected = true
 	}
 }
--- a/fs/gguf/reader.go
+++ b/fs/gguf/reader.go
@ -0,0 +1,23 @@
 package gguf
 import (
 	"bufio"
 	"io"
 )
 // bufferedReader wraps a *bufio.Reader and keeps, in offset, a running total
 // of the bytes returned through its own Read method.
 // NOTE(review): only Read is overridden in this file; other methods promoted
 // from *bufio.Reader would not update offset. Confirm callers read via Read.
 type bufferedReader struct {
 	offset int64
 	*bufio.Reader
 }
 // newBufferedReader returns a bufferedReader over rs with a buffer of the
 // given size. The offset starts at zero.
 func newBufferedReader(rs io.ReadSeeker, size int) *bufferedReader {
 	br := new(bufferedReader)
 	br.Reader = bufio.NewReaderSize(rs, size)
 	return br
 }
 // Read reads into p through the embedded bufio.Reader and advances offset by
 // the number of bytes returned, including on a call that also returns an error.
 func (rs *bufferedReader) Read(p []byte) (n int, err error) {
 	read, readErr := rs.Reader.Read(p)
 	rs.offset += int64(read)
 	return read, readErr
 }
--- a/fs/gguf/tensor.go
+++ b/fs/gguf/tensor.go
@ -0,0 +1,288 @@
 package gguf
 import (
 	"log/slog"
 	"strings"
 )
 // TensorInfo describes a single tensor: its name, where its data starts, its
 // dimensions and its element type.
 type TensorInfo struct {
 	Name   string
 	// Offset locates the tensor data.
 	// NOTE(review): presumably relative to the start of the tensor data section — confirm against the reader.
 	Offset uint64
 	// Shape holds the size of each dimension; NumValues multiplies them together.
 	Shape  []uint64
 	Type   TensorType
 }
 // Valid reports whether the tensor has a non-empty name and a positive size
 // in bytes.
 func (ti TensorInfo) Valid() bool {
 	if ti.Name == "" {
 		return false
 	}
 	return ti.NumBytes() > 0
 }
 // NumValues returns the product of all dimensions in Shape. An empty Shape
 // yields 1.
 func (ti TensorInfo) NumValues() int64 {
 	total := int64(1)
 	for i := range ti.Shape {
 		total *= int64(ti.Shape[i])
 	}
 	return total
 }
 // NumBytes returns the number of bytes in the tensor: the number of values
 // multiplied by the (possibly fractional) bytes per value of its type, with
 // the result truncated to an integer.
 func (ti TensorInfo) NumBytes() int64 {
 	count := float64(ti.NumValues())
 	perValue := ti.Type.NumBytes()
 	return int64(count * perValue)
 }
 // LogValue returns a grouped slog value describing the tensor: name, offset,
 // shape, number of values, number of bytes and type.
 func (ti TensorInfo) LogValue() slog.Value {
 	attrs := []slog.Attr{
 		slog.String("name", ti.Name),
 		slog.Int64("offset", int64(ti.Offset)),
 		slog.Any("shape", ti.Shape),
 		slog.Int64("num_values", ti.NumValues()),
 		slog.Int64("num_bytes", ti.NumBytes()),
 		slog.Any("type", ti.Type),
 	}
 	return slog.GroupValue(attrs...)
 }
 // TensorType identifies the element format of a tensor.
 type TensorType uint32
 // The numeric value of each TensorType is assigned by iota, so the order of
 // this list defines the IDs: entries must not be reordered or removed.
 // Unexported entries keep their slot in the numbering.
 const (
 	TensorTypeF32 TensorType = iota
 	TensorTypeF16
 	TensorTypeQ4_0
 	TensorTypeQ4_1
 	// unexported: unused in gguf
 	tensorTypeQ4_2
 	tensorTypeQ4_3
 	TensorTypeQ5_0
 	TensorTypeQ5_1
 	TensorTypeQ8_0
 	TensorTypeQ8_1
 	TensorTypeQ2_K
 	TensorTypeQ3_K
 	TensorTypeQ4_K
 	TensorTypeQ5_K
 	TensorTypeQ6_K
 	TensorTypeQ8_K
 	// unexported: unquantizable by ollama
 	tensorTypeIQ2_XXS
 	tensorTypeIQ2_XS
 	tensorTypeIQ3_XXS
 	tensorTypeIQ1_S
 	tensorTypeIQ4_NL
 	tensorTypeIQ3_S
 	tensorTypeIQ2_S
 	tensorTypeIQ4_XS
 	TensorTypeI8
 	TensorTypeI16
 	TensorTypeI32
 	TensorTypeI64
 	TensorTypeF64
 	// unexported: unquantizable by ollama
 	tensorTypeIQ1_M
 	TensorTypeBF16
 	// unexported: unused in gguf
 	tensorTypeQ4_0_4_4
 	tensorTypeQ4_0_4_8
 	tensorTypeQ4_0_8_8
 	// unexported: unquantizable by ollama
 	tensorTypeTQ1_0
 	tensorTypeTQ2_0
 	// unexported: unused in gguf
 	tensorTypeIQ4_NL_4_4
 	tensorTypeIQ4_NL_4_8
 	tensorTypeIQ4_NL_8_8
 )
 // NumBytes returns the average number of bytes per value for tt: the size of
 // one block divided by the number of values in a block. Types for which
 // typeSize returns 0 therefore yield 0.
 func (tt TensorType) NumBytes() float64 {
 	size, block := tt.typeSize(), tt.blockSize()
 	return float64(size) / float64(block)
 }
 // typeSize returns the size in bytes of one block of tt, expressed in terms
 // of the type's block size. Types not listed here return 0.
 func (tt TensorType) typeSize() int64 {
 	bs := tt.blockSize()
 	switch tt {
 	// Plain scalar types: one value per block.
 	case TensorTypeI8:
 		return 1
 	case TensorTypeF16, TensorTypeI16, TensorTypeBF16:
 		return 2
 	case TensorTypeF32, TensorTypeI32:
 		return 4
 	case TensorTypeI64, TensorTypeF64:
 		return 8
 	// Block types.
 	case TensorTypeQ4_0, tensorTypeIQ4_NL:
 		return 2 + bs/2
 	case TensorTypeQ4_1:
 		return 2 + 2 + bs/2
 	case TensorTypeQ5_0:
 		return 2 + 4 + bs/2
 	case TensorTypeQ5_1:
 		return 2 + 2 + 4 + bs/2
 	case TensorTypeQ8_0:
 		return 2 + bs
 	case TensorTypeQ8_1:
 		return 2 + 2 + bs
 	case TensorTypeQ2_K:
 		return bs/16 + bs/4 + 2 + 2
 	case TensorTypeQ3_K:
 		return bs/8 + bs/4 + 12 + 2
 	case TensorTypeQ4_K:
 		return 2 + 2 + 12 + bs/2
 	case TensorTypeQ5_K:
 		return 2 + 2 + 12 + bs/8 + bs/2
 	case TensorTypeQ6_K:
 		return bs/2 + bs/4 + bs/16 + 2
 	case TensorTypeQ8_K:
 		return 4 + bs + 2*bs/16
 	case tensorTypeIQ2_XXS:
 		return 2 + 2*bs/8
 	case tensorTypeIQ2_XS:
 		return 2 + 2*bs/8 + bs/32
 	case tensorTypeIQ3_XXS:
 		return 2 + bs/4 + bs/8
 	case tensorTypeIQ1_S:
 		return 2 + bs/8 + bs/16
 	case tensorTypeIQ3_S:
 		return 2 + bs/4 + bs/8 + bs/32 + 4
 	case tensorTypeIQ2_S:
 		return 2 + bs/4 + bs/16
 	case tensorTypeIQ4_XS:
 		return 2 + 2 + bs/2 + bs/64
 	case tensorTypeIQ1_M:
 		return bs/8 + bs/16 + bs/32
 	}
 	return 0
 }
 // blockSize returns the number of values stored in one block of tt: 1 for the
 // plain scalar types, 32 for the listed small-block types, and 256 for every
 // other type, including values that are not declared constants.
 func (tt TensorType) blockSize() int64 {
 	switch tt {
 	case TensorTypeQ4_0, TensorTypeQ4_1,
 		TensorTypeQ5_0, TensorTypeQ5_1,
 		TensorTypeQ8_0, TensorTypeQ8_1,
 		tensorTypeIQ4_NL:
 		return 32
 	case TensorTypeF32, TensorTypeF16, TensorTypeBF16, TensorTypeF64,
 		TensorTypeI8, TensorTypeI16, TensorTypeI32, TensorTypeI64:
 		return 1
 	}
 	return 256
 }
 func (tt TensorType) String() string {
 	switch tt {
 	case TensorTypeF32:
 		return "f32"
 	case TensorTypeF16:
 		return "f16"
 	case TensorTypeQ4_0:
 		return "q4_0"
 	case TensorTypeQ4_1:
 		return "q4_1"
 	case tensorTypeQ4_2:
 		return "q4_2"
 	case tensorTypeQ4_3:
 		return "q4_3"
 	case TensorTypeQ5_0:
 		return "q5_0"
 	case TensorTypeQ5_1:
 		return "q5_1"
 	case TensorTypeQ8_0:
 		return "q8_0"
 	case TensorTypeQ8_1:
 		return "q8_1"
 	case TensorTypeQ2_K:
 		return "q2_k"
 	case TensorTypeQ3_K:
 		return "q3_k"
 	case TensorTypeQ4_K:
 		return "q4_k"
 	case TensorTypeQ5_K:
 		return "q5_k"
 	case TensorTypeQ6_K:
 		return "q6_k"
 	case TensorTypeQ8_K:
 		return "q8_k"
 	case tensorTypeIQ2_XXS:
 		return "iq2_xxs"
 	case tensorTypeIQ2_XS:
 		return "iq2_xs"
 	case tensorTypeIQ3_XXS:
 		return "iq3_xxs"
 	case tensorTypeIQ1_S:
 		return "iq1_s"
 	case tensorTypeIQ4_NL:
 		return "iq4_nl"
 	case tensorTypeIQ3_S:
 		return "iq3_s"
 	case tensorTypeIQ2_S:
 		return "iq2_s"
 	case tensorTypeIQ4_XS:
 		return "iq4_xs"
 	case TensorTypeI8:
 		return "i8"
 	case TensorTypeI16:
 		return "i16"
 	case TensorTypeI32:
 		return "i32"
 	case TensorTypeI64:
 		return "i64"
 	case TensorTypeF64:
 		return "f64"
 	case tensorTypeIQ1_M:
 		return "iq1_m"
 	case TensorTypeBF16:
 		return "bf16"
 	case tensorTypeQ4_0_4_4:
 		return "q4_0_4_4"
 	case tensorTypeQ4_0_4_8:
 		return "q4_0_4_8"
 	case tensorTypeQ4_0_8_8:
 		return "q4_0_8_8"
 	case tensorTypeTQ1_0:
 		return "tq1_0"
 	case tensorTypeTQ2_0:
 		return "tq2_0"
 	case tensorTypeIQ4_NL_4_4:
 		return "iq4_nl_4_4"
 	case tensorTypeIQ4_NL_4_8:
 		return "iq4_nl_4_8"
 	case tensorTypeIQ4_NL_8_8:
 		return "iq4_nl_8_8"
 	default:
 		return "unknown"
 	}
 }
 // LogValue returns a grouped slog value describing the type: numeric value,
 // upper-cased name, block size in bytes, values per block and bytes per value.
 func (tt TensorType) LogValue() slog.Value {
 	attrs := []slog.Attr{
 		slog.Uint64("value", uint64(tt)),
 		slog.String("name", strings.ToUpper(tt.String())),
 		slog.Int64("size", tt.typeSize()),
 		slog.Int64("block_size", tt.blockSize()),
 		slog.Float64("num_bytes", tt.NumBytes()),
 	}
 	return slog.GroupValue(attrs...)
 }
--- a/go.mod
+++ b/go.mod
@ -19,7 +19,7 @@ require (
 	github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1
 	github.com/dlclark/regexp2 v1.11.4
 	github.com/emirpasic/gods/v2 v2.0.0-alpha
-	github.com/google/go-cmp v0.6.0
+	github.com/google/go-cmp v0.7.0
 	github.com/mattn/go-runewidth v0.0.14
 	github.com/nlpodyssey/gopickle v0.3.0
 	github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c
--- a/go.sum
+++ b/go.sum
@ -112,8 +112,8 @@ github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/
 github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
 github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
 github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
-github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
+github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
-github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
+github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
 github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
 github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
 github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
--- a/integration/llm_image_test.go
+++ b/integration/llm_image_test.go
@ -19,7 +19,7 @@ func TestVisionModels(t *testing.T) {
 	}
 	testCases := []testCase{
 		{
-			model: "llava:7b",
+			model: "qwen2.5vl",
 		},
 		{
 			model: "llama3.2-vision",
@ -60,6 +60,7 @@ func TestVisionModels(t *testing.T) {
 }
 func TestIntegrationSplitBatch(t *testing.T) {
 	skipUnderMinVRAM(t, 6)
 	image, err := base64.StdEncoding.DecodeString(imageEncoding)
 	require.NoError(t, err)
 	req := api.GenerateRequest{
--- a/integration/model_arch_test.go
+++ b/integration/model_arch_test.go
@ -45,6 +45,8 @@ var (
 		"qwen2.5-coder:latest",
 		"qwen:latest",
 		"solar-pro:latest",
 		"codellama:latest",
 		"nous-hermes:latest",
 	}
 )
--- a/integration/testdata/embed.json
+++ b/integration/testdata/embed.json
--- a/kvcache/causal.go
+++ b/kvcache/causal.go
@ -30,6 +30,11 @@ type Causal struct {
 	// ** current forward pass **
 	// curReserve indicates that this forward pass is only for
 	// memory reservation and we should not update our metadata
 	// based on it.
 	curReserve bool
 	// the active layer for Get and Put
 	curLayer int
@ -159,12 +164,13 @@ func (c *Causal) Close() {
 }
 func (c *Causal) StartForward(ctx ml.Context, batch input.Batch, reserve bool) error {
 	c.curReserve = reserve
 	c.curBatchSize = len(batch.Positions)
 	c.curSequences = batch.Sequences
 	c.curPositions = batch.Positions
 	c.opts.Except = nil
-	if !reserve {
+	if !c.curReserve {
 		c.updateSlidingWindow()
 		var err error
@ -211,10 +217,9 @@ func (c *Causal) StartForward(ctx ml.Context, batch input.Batch, reserve bool) e
 		c.curCellRange.max = len(c.cells) - 1
 	}
-	var err error
+	c.curMask = c.buildMask(ctx)
 	c.curMask, err = c.buildMask(ctx)
-	return err
+	return nil
 }
 func newRange() cellRange {
@ -297,7 +302,7 @@ func roundUp(length, pad int) int {
 // Builds a mask of history x batch indicating whether for each token in the batch the
 // token in the history should apply. This is based on both the sequence and causality (the
 // position of the history is not ahead of the token in the batch).
-func (c *Causal) buildMask(ctx ml.Context) (ml.Tensor, error) {
+func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
 	// Align and pad the two dimensions as required by the backend
 	batchSize := roundUp(c.curBatchSize, c.config.MaskBatchPadding)
@ -305,6 +310,11 @@ func (c *Causal) buildMask(ctx ml.Context) (ml.Tensor, error) {
 	c.curCellRange.max = roundUp(c.curCellRange.max+1, c.config.CachePadding) - 1
 	length := c.curCellRange.max - c.curCellRange.min + 1
 	if c.curReserve {
 		return ctx.Input().Empty(c.config.MaskDType, length, batchSize)
 	}
 	mask := make([]float32, batchSize*length)
 	for i := range c.curBatchSize {
@ -325,10 +335,7 @@ func (c *Causal) buildMask(ctx ml.Context) (ml.Tensor, error) {
 		mask[i] = float32(math.Inf(-1))
 	}
-	maskTensor, err := ctx.Input().FromFloatSlice(mask, length, batchSize)
+	maskTensor := ctx.Input().FromFloatSlice(mask, length, batchSize)
 	if err != nil {
 		return nil, err
 	}
 	if c.config.MaskDType != ml.DTypeF32 {
 		out := ctx.Input().Empty(c.config.MaskDType, maskTensor.Shape()...)
@ -336,7 +343,7 @@ func (c *Causal) buildMask(ctx ml.Context) (ml.Tensor, error) {
 		maskTensor = out
 	}
-	return maskTensor, nil
+	return maskTensor
 }
 func (c *Causal) moveCells(ctx ml.Context, src, dst, length int) {
@ -491,12 +498,7 @@ func (c *Causal) SetCausal(ctx ml.Context, opts CausalOptions) {
 	if !slices.Equal(c.opts.Except, opts.Except) {
 		c.opts = opts
 		if ctx != nil {
-			var err error
+			c.curMask = c.buildMask(ctx)
 			c.curMask, err = c.buildMask(ctx)
 			if err != nil {
 				// This error should never occur because we have previously built a mask with the same shape
 				panic(fmt.Errorf("SetCausal: %w", err))
 			}
 		}
 	}
 }
@ -652,10 +654,7 @@ func (c *Causal) shift(seq int, beginIndex, offset int32) error {
 		}
 	}
-	kShift, err := ctx.Input().FromIntSlice(offsets, len(offsets))
+	kShift := ctx.Input().FromIntSlice(offsets, len(offsets))
 	if err != nil {
 		return err
 	}
 	for i, key := range c.keys {
 		if key == nil {
--- a/kvcache/causal_test.go
+++ b/kvcache/causal_test.go
@ -344,7 +344,7 @@ func testCache(t *testing.T, backend ml.Backend, cache Cache, tests []testCase)
 			}
 			cache.SetLayer(0)
-			tensor, _ := context.FromFloatSlice(test.in, test.inShape...)
+			tensor := context.FromFloatSlice(test.in, test.inShape...)
 			cache.Put(context, tensor, tensor)
 			out, _, mask := cache.Get(context)
@ -386,7 +386,7 @@ func TestCanResume(t *testing.T) {
 	}
 	cache.SetLayer(0)
-	tensor, _ := context.FromFloatSlice([]float32{1, 2, 3, 4}, 1, 1, 4)
+	tensor := context.FromFloatSlice([]float32{1, 2, 3, 4}, 1, 1, 4)
 	cache.Put(context, tensor, tensor)
 	// with window size 4, nothing has slid out of the window yet
@ -413,7 +413,7 @@ func TestCanResume(t *testing.T) {
 	}
 	cache.SetLayer(0)
-	tensor, _ = context.FromFloatSlice([]float32{5, 6}, 1, 1, 2)
+	tensor = context.FromFloatSlice([]float32{5, 6}, 1, 1, 2)
 	cache.Put(context, tensor, tensor)
 	// only the latest position has overlapping windows
@ -470,24 +470,24 @@ func (c *testContext) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
 	return c.Empty(dtype, shape...)
 }
-func (c *testContext) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) {
+func (c *testContext) FromFloatSlice(s []float32, shape ...int) ml.Tensor {
 	t := c.Empty(ml.DTypeF32, shape...).(*testTensor)
 	copy(t.data, s)
-	return t, nil
+	return t
 }
-func (c *testContext) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
+func (c *testContext) FromIntSlice(s []int32, shape ...int) ml.Tensor {
 	f := make([]float32, len(s))
 	for i := range f {
 		f[i] = float32(s[i])
 	}
-	out, _ := c.FromFloatSlice(f, shape...)
+	out := c.FromFloatSlice(f, shape...)
 	out.(*testTensor).dtype = ml.DTypeI32
-	return out, nil
+	return out
 }
 func (c *testContext) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
@ -496,7 +496,7 @@ func (c *testContext) Arange(start, stop, step float32, dtype ml.DType) ml.Tenso
 		s = append(s, i)
 	}
-	out, _ := c.FromFloatSlice(s, len(s))
+	out := c.FromFloatSlice(s, len(s))
 	out.(*testTensor).dtype = dtype
 	return out
 }
@ -508,7 +508,7 @@ func (c *testContext) Forward(...ml.Tensor) ml.Context { return c }
 func (c *testContext) Compute(...ml.Tensor) {}
-func (c *testContext) Reserve() error { return nil }
+func (c *testContext) Reserve() {}
 func (c *testContext) MaxGraphNodes() int {
 	return 10
--- a/llama/build-info.cpp
+++ b/llama/build-info.cpp
@ -1,4 +1,4 @@
 int LLAMA_BUILD_NUMBER = 0;
-char const *LLAMA_COMMIT = "e1e8e0991ffd9e99a445c6812bb519d5bac9f4b5";
+char const *LLAMA_COMMIT = "de4c07f93783a1a96456a44dc16b9db538ee1618";
 char const *LLAMA_COMPILER = "";
 char const *LLAMA_BUILD_TARGET = "";
--- a/llama/llama.cpp/.rsync-filter
+++ b/llama/llama.cpp/.rsync-filter
@ -10,11 +10,11 @@ include common/stb_image.*
 include include/
 include include/llama.*
 include include/llama-*.*
-include examples/
+include tools/
-include examples/llava/
+include tools/mtmd/
-include examples/llava/clip.*
+include tools/mtmd/clip.*
-include examples/llava/clip-impl.*
+include tools/mtmd/clip-impl.*
-include examples/llava/llava.*
+include tools/mtmd/llava.*
 include src/
 include src/llama.*
 include src/llama-*.*
--- a/llama/llama.cpp/common/common.cpp
+++ b/llama/llama.cpp/common/common.cpp
@ -1096,7 +1096,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
    cparams.n_threads         = params.cpuparams.n_threads;
    cparams.n_threads_batch   = params.cpuparams_batch.n_threads == -1 ?
                                params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
    cparams.logits_all        = params.logits_all;
    cparams.embeddings        = params.embedding;
    cparams.rope_scaling_type = params.rope_scaling_type;
    cparams.rope_freq_base    = params.rope_freq_base;
@ -1114,6 +1113,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
    cparams.offload_kqv       = !params.no_kv_offload;
    cparams.flash_attn        = params.flash_attn;
    cparams.no_perf           = params.no_perf;
    cparams.op_offload        = !params.no_op_offload;
    if (params.reranking) {
        cparams.embeddings    = true;
@ -1565,3 +1565,20 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
    return result;
 }
 ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride) {
    const int64_t ne_datapoint = llama_n_ctx(ctx);
    const int64_t ndata        = (tokens.size() - ne_datapoint - 1) / stride;
    ggml_opt_dataset_t result = ggml_opt_dataset_init(
        GGML_TYPE_I32, GGML_TYPE_I32, ne_datapoint, ne_datapoint, ndata, /*ndata_shard =*/ 1);
    llama_token * data   = (llama_token *) ggml_opt_dataset_data(result)->data;
    llama_token * labels = (llama_token *) ggml_opt_dataset_labels(result)->data;
    for (int64_t idata = 0; idata < ndata; ++idata) {
        memcpy(data   + idata*ne_datapoint, tokens.data() + idata*stride + 0, ne_datapoint*sizeof(llama_token));
        memcpy(labels + idata*ne_datapoint, tokens.data() + idata*stride + 1, ne_datapoint*sizeof(llama_token));
    }
    return result;
 }
--- a/llama/llama.cpp/common/common.h
+++ b/llama/llama.cpp/common/common.h
@ -66,7 +66,6 @@ enum llama_example {
    LLAMA_EXAMPLE_COMMON,
    LLAMA_EXAMPLE_SPECULATIVE,
    LLAMA_EXAMPLE_MAIN,
    LLAMA_EXAMPLE_INFILL,
    LLAMA_EXAMPLE_EMBEDDING,
    LLAMA_EXAMPLE_PERPLEXITY,
    LLAMA_EXAMPLE_RETRIEVAL,
@ -96,6 +95,7 @@ enum common_sampler_type {
    COMMON_SAMPLER_TYPE_XTC         = 8,
    COMMON_SAMPLER_TYPE_INFILL      = 9,
    COMMON_SAMPLER_TYPE_PENALTIES   = 10,
    COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
 };
 // dimensionality reduction methods, used by cvector-generator
@ -161,6 +161,7 @@ struct common_params_sampling {
    std::vector<enum common_sampler_type> samplers = {
        COMMON_SAMPLER_TYPE_PENALTIES,
        COMMON_SAMPLER_TYPE_DRY,
        COMMON_SAMPLER_TYPE_TOP_N_SIGMA,
        COMMON_SAMPLER_TYPE_TOP_K,
        COMMON_SAMPLER_TYPE_TYPICAL_P,
        COMMON_SAMPLER_TYPE_TOP_P,
@ -323,7 +324,6 @@ struct common_params {
    bool ctx_shift         = true;  // context shift on inifinite text generation
    bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
    bool logits_all        = false; // return logits for all tokens in the batch
    bool use_mmap          = true;  // use mmap for faster loads
    bool use_mlock         = false; // use mlock to keep model in memory
    bool verbose_prompt    = false; // print prompt tokens before generation
@ -332,6 +332,7 @@ struct common_params {
    bool no_kv_offload     = false; // disable KV offloading
    bool warmup            = true;  // warmup run
    bool check_tensors     = false; // validate tensor data
    bool no_op_offload     = false; // globally disable offload host tensor operations to device
    bool single_turn       = false; // single turn chat conversation
@ -340,7 +341,7 @@ struct common_params {
    common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
-    // multimodal models (see examples/llava)
+    // multimodal models (see tools/mtmd)
    struct common_params_model mmproj;
    bool mmproj_use_gpu = true;     // use GPU for multimodal model
    bool no_mmproj = false;         // explicitly disable multimodal model
@ -409,13 +410,14 @@ struct common_params {
    bool process_output = false; // collect data for the output tensor
    bool compute_ppl    = true;  // whether to compute perplexity
    bool parse_special  = false; // whether to parse special tokens during imatrix tokenization
    // cvector-generator params
    int n_pca_batch = 100;
    int n_pca_iterations = 1000;
    dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
-    std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
+    std::string cvector_positive_file = "tools/cvector-generator/positive.txt";
-    std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
+    std::string cvector_negative_file = "tools/cvector-generator/negative.txt";
    bool spm_infill = false; // suffix/prefix/middle pattern for infill
@ -664,3 +666,9 @@ const char * const LLM_KV_SPLIT_COUNT         = "split.count";
 const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
 }
 //
 // training utils
 //
 ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);
--- a/llama/llama.cpp/common/sampling.cpp
+++ b/llama/llama.cpp/common/sampling.cpp
@ -1,6 +1,7 @@
 #include "sampling.h"
 #include "common.h"
 #include "log.h"
 #include <cmath>
 #include <unordered_map>
@ -229,51 +230,48 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                params.logit_bias.data()));
    if (params.mirostat == 0) {
-        if (params.top_n_sigma >= 0) {
+        for (const auto & cnstr : params.samplers) {
-            llama_sampler_chain_add(result->chain, llama_sampler_init_top_k        (params.top_k));
+            switch (cnstr) {
-            llama_sampler_chain_add(result->chain, llama_sampler_init_temp         (params.temp));
+                case COMMON_SAMPLER_TYPE_DRY:
-            llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma  (params.top_n_sigma));
+                    {
-        } else {
+                        std::vector<const char *> c_breakers;
-            for (const auto & cnstr : params.samplers) {
+                        c_breakers.reserve(params.dry_sequence_breakers.size());
-                switch (cnstr) {
+                        for (const auto & str : params.dry_sequence_breakers) {
-                    case COMMON_SAMPLER_TYPE_DRY:
+                            c_breakers.push_back(str.c_str());
                        {
                            std::vector<const char *> c_breakers;
                            c_breakers.reserve(params.dry_sequence_breakers.size());
                            for (const auto & str : params.dry_sequence_breakers) {
                                c_breakers.push_back(str.c_str());
                            }
                            llama_sampler_chain_add(result->chain, llama_sampler_init_dry      (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
                        }
-                        break;
+
-                    case COMMON_SAMPLER_TYPE_TOP_K:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_dry      (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_k    (params.top_k));
+                    }
-                        break;
+                    break;
-                    case COMMON_SAMPLER_TYPE_TOP_P:
+                case COMMON_SAMPLER_TYPE_TOP_K:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_p    (params.top_p, params.min_keep));
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_k       (params.top_k));
-                        break;
+                    break;
-                    case COMMON_SAMPLER_TYPE_MIN_P:
+                case COMMON_SAMPLER_TYPE_TOP_P:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_min_p    (params.min_p, params.min_keep));
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_p       (params.top_p, params.min_keep));
-                        break;
+                    break;
-                    case COMMON_SAMPLER_TYPE_XTC:
+                case COMMON_SAMPLER_TYPE_TOP_N_SIGMA:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_xtc      (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
-                        break;
+                    break;
-                    case COMMON_SAMPLER_TYPE_TYPICAL_P:
+                case COMMON_SAMPLER_TYPE_MIN_P:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_typical  (params.typ_p, params.min_keep));
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_min_p       (params.min_p, params.min_keep));
-                        break;
+                    break;
-                    case COMMON_SAMPLER_TYPE_TEMPERATURE:
+                case COMMON_SAMPLER_TYPE_XTC:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_xtc         (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
-                        break;
+                    break;
-                    case COMMON_SAMPLER_TYPE_INFILL:
+                case COMMON_SAMPLER_TYPE_TYPICAL_P:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_infill   (vocab));
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_typical     (params.typ_p, params.min_keep));
-                        break;
+                    break;
-                    case COMMON_SAMPLER_TYPE_PENALTIES:
+                case COMMON_SAMPLER_TYPE_TEMPERATURE:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext    (params.temp, params.dynatemp_range, params.dynatemp_exponent));
-                        break;
+                    break;
-                    default:
+                case COMMON_SAMPLER_TYPE_INFILL:
-                        GGML_ASSERT(false && "unknown sampler type");
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_infill      (vocab));
-                }
+                    break;
                case COMMON_SAMPLER_TYPE_PENALTIES:
                    llama_sampler_chain_add(result->chain, llama_sampler_init_penalties   (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
                    break;
                default:
                    GGML_ASSERT(false && "unknown sampler type");
            }
        }
        llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
@ -475,6 +473,7 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
        case COMMON_SAMPLER_TYPE_TOP_K:       return 'k';
        case COMMON_SAMPLER_TYPE_TYPICAL_P:   return 'y';
        case COMMON_SAMPLER_TYPE_TOP_P:       return 'p';
        case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return 's';
        case COMMON_SAMPLER_TYPE_MIN_P:       return 'm';
        case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
        case COMMON_SAMPLER_TYPE_XTC:         return 'x';
@ -490,6 +489,7 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
        case COMMON_SAMPLER_TYPE_TOP_K:       return "top_k";
        case COMMON_SAMPLER_TYPE_TYPICAL_P:   return "typ_p";
        case COMMON_SAMPLER_TYPE_TOP_P:       return "top_p";
        case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return "top_n_sigma";
        case COMMON_SAMPLER_TYPE_MIN_P:       return "min_p";
        case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
        case COMMON_SAMPLER_TYPE_XTC:         return "xtc";
@ -504,6 +504,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
        { "dry",         COMMON_SAMPLER_TYPE_DRY },
        { "top_k",       COMMON_SAMPLER_TYPE_TOP_K },
        { "top_p",       COMMON_SAMPLER_TYPE_TOP_P },
        { "top_n_sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
        { "typ_p",       COMMON_SAMPLER_TYPE_TYPICAL_P },
        { "min_p",       COMMON_SAMPLER_TYPE_MIN_P },
        { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
@ -517,6 +518,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
    std::unordered_map<std::string, common_sampler_type> sampler_alt_name_map {
        { "top-k",       COMMON_SAMPLER_TYPE_TOP_K },
        { "top-p",       COMMON_SAMPLER_TYPE_TOP_P },
        { "top-n-sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
        { "nucleus",     COMMON_SAMPLER_TYPE_TOP_P },
        { "typical-p",   COMMON_SAMPLER_TYPE_TYPICAL_P },
        { "typical",     COMMON_SAMPLER_TYPE_TYPICAL_P },
@ -533,14 +535,16 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
        auto sampler = sampler_canonical_name_map.find(name);
        if (sampler != sampler_canonical_name_map.end()) {
            samplers.push_back(sampler->second);
-        } else {
+            continue;
-            if (allow_alt_names) {
+        }
-                sampler = sampler_alt_name_map.find(name);
+        if (allow_alt_names) {
-                if (sampler != sampler_alt_name_map.end()) {
+            sampler = sampler_alt_name_map.find(name);
-                    samplers.push_back(sampler->second);
+            if (sampler != sampler_alt_name_map.end()) {
-                }
+                samplers.push_back(sampler->second);
                continue;
            }
        }
        LOG_WRN("%s: unable to match sampler by name '%s'\n", __func__, name.c_str());
    }
    return samplers;
@ -552,6 +556,7 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K),       COMMON_SAMPLER_TYPE_TOP_K },
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P),   COMMON_SAMPLER_TYPE_TYPICAL_P },
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P),       COMMON_SAMPLER_TYPE_TOP_P },
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_N_SIGMA), COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P),       COMMON_SAMPLER_TYPE_MIN_P },
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC),         COMMON_SAMPLER_TYPE_XTC },
@ -566,6 +571,8 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
        const auto sampler = sampler_name_map.find(c);
        if (sampler != sampler_name_map.end()) {
            samplers.push_back(sampler->second);
        } else {
            LOG_WRN("%s: unable to match sampler by char '%c'\n", __func__, c);
        }
    }
--- a/llama/llama.cpp/include/llama.h
+++ b/llama/llama.cpp/include/llama.h
@ -4,6 +4,7 @@
 #include "ggml.h"
 #include "ggml-cpu.h"
 #include "ggml-backend.h"
 #include "ggml-opt.h"
 #include <stddef.h>
 #include <stdint.h>
@ -112,6 +113,7 @@ extern "C" {
        LLAMA_VOCAB_PRE_TYPE_BAILINGMOE     = 32,
        LLAMA_VOCAB_PRE_TYPE_LLAMA4         = 33,
        LLAMA_VOCAB_PRE_TYPE_PIXTRAL        = 34,
        LLAMA_VOCAB_PRE_TYPE_SEED_CODER     = 35,
    };
    enum llama_rope_type {
@ -256,7 +258,6 @@ extern "C" {
        llama_token  *  token;
        float        *  embd;
        int32_t         n_embd;
        llama_pos    *  pos;
        int32_t      *  n_seq_id;
        llama_seq_id ** seq_id;
@ -352,20 +353,18 @@ extern "C" {
        enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
        enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]
        // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
        // TODO: move at the end of the struct
        bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
        bool embeddings;  // if true, extract embeddings (together with logits)
        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
        bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
        bool no_perf;     // whether to measure performance timings
        bool cross_attn;  // whether to use cross attention
        // Abort callback
        // if it returns true, execution of llama_decode() will be aborted
        // currently works only with CPU execution
        ggml_abort_callback abort_callback;
        void *              abort_callback_data;
        // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
        bool embeddings;  // if true, extract embeddings (together with logits)
        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
        bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
        bool no_perf;     // whether to measure performance timings
        bool op_offload;  // whether to offload host tensor operations to device
    };
    // model quantization parameters
@ -447,6 +446,10 @@ extern "C" {
                                 size_t    n_paths,
              struct llama_model_params    params);
    LLAMA_API void llama_model_save_to_file(
            const struct llama_model * model,
                        const char * path_model);
    DEPRECATED(LLAMA_API void llama_free_model(struct llama_model * model),
            "use llama_model_free instead");
@ -461,10 +464,6 @@ extern "C" {
            struct llama_context_params   params),
            "use llama_init_from_model instead");
    // TODO (jmorganca): this should most likely be passed in as part of a batch
    // and not set on the context for all batches.
    LLAMA_API void llama_set_cross_attention(struct llama_context * ctx, bool cross_attn_state);
    // Frees all allocated memory
    LLAMA_API void llama_free(struct llama_context * ctx);
@ -930,14 +929,19 @@ extern "C" {
    // Frees a batch of tokens allocated with llama_batch_init()
    LLAMA_API void llama_batch_free(struct llama_batch batch);
-    // Processes a batch of tokens with the ecoder part of the encoder-decoder model.
+    // Process a batch of tokens.
-    // Stores the encoder output internally for later use by the decoder cross-attention layers.
+    // In contrast to llama_decode() - this call does not use KV cache.
    // For encoder-decoder contexts, processes the batch using the encoder.
    // Can store the encoder output internally for later use by the decoder's cross-attention layers.
    //   0 - success
    // < 0 - error. the KV cache state is restored to the state before this call
    LLAMA_API int32_t llama_encode(
            struct llama_context * ctx,
              struct llama_batch   batch);
    // Process a batch of tokens.
    // Requires KV cache.
    // For encoder-decoder contexts, processes the batch using the decoder.
    // A positive return value does not mean a fatal error, but rather a warning.
    //   0 - success
    //   1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
@ -1434,6 +1438,37 @@ extern "C" {
    LLAMA_API void                           llama_perf_sampler_print(const struct llama_sampler * chain);
    LLAMA_API void                           llama_perf_sampler_reset(      struct llama_sampler * chain);
    //
    // training
    //
    // function that returns whether or not a given tensor contains trainable parameters
    typedef bool (*llama_opt_param_filter)(const struct ggml_tensor * tensor, void * userdata);
    // always returns true
    LLAMA_API bool llama_opt_param_filter_all(const struct ggml_tensor * tensor, void * userdata);
    // Options bundle for training; passed by value to llama_opt_init() as lopt_params.
    // Each callback is paired with an opaque userdata pointer declared right after it.
    struct llama_opt_params {
        uint32_t n_ctx_train; // assumed context size post training, use context size specified in llama_context if 0
        llama_opt_param_filter param_filter; // callback for determining which tensors contain trainable parameters
        void * param_filter_ud;              // userdata for determining which tensors contain trainable parameters
        ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
        void * get_opt_pars_ud;                     // userdata for calculating optimizer parameters
    };
    LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params);
    LLAMA_API void llama_opt_epoch(
            struct llama_context    * lctx,
            ggml_opt_dataset_t        dataset,
            ggml_opt_result_t         result_train,
            ggml_opt_result_t         result_eval,
            int64_t                   idata_split,
            ggml_opt_epoch_callback   callback_train,
            ggml_opt_epoch_callback   callback_eval);
 #ifdef __cplusplus
 }
 #endif
--- a/llama/llama.cpp/src/llama-adapter.cpp
+++ b/llama/llama.cpp/src/llama-adapter.cpp
@ -253,6 +253,9 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
    std::vector<ggml_backend_buffer_type_t> buft_extra;
    {
        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
        if (!cpu_dev) {
            throw std::runtime_error(format("%s: no CPU backend found", __func__));
        }
        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
@ -291,6 +294,9 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
                LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));
                auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
                if (!cpu_dev) {
                    throw std::runtime_error(format("%s: no CPU backend found", __func__));
                }
                buft = ggml_backend_dev_buffer_type(cpu_dev);
                break;
--- a/llama/llama.cpp/src/llama-arch.cpp
+++ b/llama/llama.cpp/src/llama-arch.cpp
@ -6,7 +6,6 @@
 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_LLAMA,            "llama"            },
    { LLM_ARCH_MLLAMA,           "mllama"           },
    { LLM_ARCH_LLAMA4,           "llama4"           },
    { LLM_ARCH_DECI,             "deci"             },
    { LLM_ARCH_FALCON,           "falcon"           },
@ -145,7 +144,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_ATTENTION_SLIDING_WINDOW,               "%s.attention.sliding_window"               },
    { LLM_KV_ATTENTION_SCALE,                        "%s.attention.scale"                        },
    { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,        "%s.attention.block_skip_connection"        },
    { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,       "%s.attention.cross_attention_layers"       },
    { LLM_KV_ATTENTION_KEY_LENGTH_MLA,               "%s.attention.key_length_mla"               },
    { LLM_KV_ATTENTION_VALUE_LENGTH_MLA,             "%s.attention.value_length_mla"             },
@ -275,40 +273,6 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_FFN_UP_SHEXP,    "blk.%d.ffn_up_shexp" },
        },
    },
    {
        LLM_ARCH_MLLAMA,
        {
            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
            { LLM_TENSOR_OUTPUT,          "output" },
            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
            { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
            { LLM_TENSOR_FFN_GATE_INP,    "blk.%d.ffn_gate_inp" },
            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
            { LLM_TENSOR_FFN_GATE_EXP,    "blk.%d.ffn_gate.%d" },
            { LLM_TENSOR_FFN_DOWN_EXP,    "blk.%d.ffn_down.%d" },
            { LLM_TENSOR_FFN_UP_EXP,      "blk.%d.ffn_up.%d" },
            { LLM_TENSOR_FFN_GATE_EXPS,   "blk.%d.ffn_gate_exps" },
            { LLM_TENSOR_FFN_DOWN_EXPS,   "blk.%d.ffn_down_exps" },
            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
            { LLM_TENSOR_CROSS_ATTN_K_NORM,    "blk.%d.cross_attn_k_norm" },
            { LLM_TENSOR_CROSS_ATTN_K_PROJ,    "blk.%d.cross_attn_k_proj" },
            { LLM_TENSOR_CROSS_ATTN_O_PROJ,    "blk.%d.cross_attn_o_proj" },
            { LLM_TENSOR_CROSS_ATTN_Q_NORM,    "blk.%d.cross_attn_q_norm" },
            { LLM_TENSOR_CROSS_ATTN_Q_PROJ,    "blk.%d.cross_attn_q_proj" },
            { LLM_TENSOR_CROSS_ATTN_V_PROJ,    "blk.%d.cross_attn_v_proj" },
            { LLM_TENSOR_CROSS_ATTN_ATTN_GATE, "blk.%d.cross_attn_attn_gate" },
            { LLM_TENSOR_CROSS_ATTN_MLP_GATE,  "blk.%d.cross_attn_mlp_gate" },
        },
    },
    {
        LLM_ARCH_DECI,
        {
@ -1737,14 +1701,6 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
    // this tensor is loaded for T5, but never used
    {LLM_TENSOR_DEC_CROSS_ATTN_REL_B,       {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
    {LLM_TENSOR_BSKCN_TV,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
    {LLM_TENSOR_CROSS_ATTN_K_NORM,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
    {LLM_TENSOR_CROSS_ATTN_K_PROJ,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_CROSS_ATTN_O_PROJ,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_CROSS_ATTN_Q_NORM,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
    {LLM_TENSOR_CROSS_ATTN_Q_PROJ,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_CROSS_ATTN_V_PROJ,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_CROSS_ATTN_ATTN_GATE,       {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
    {LLM_TENSOR_CROSS_ATTN_MLP_GATE,        {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
    {LLM_TENSOR_CONV1D,                     {LLM_TENSOR_LAYER_INPUT,     GGML_OP_IM2COL}},
    {LLM_TENSOR_POS_NET_NORM,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
    {LLM_TENSOR_POS_NET_NORM1,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
--- a/llama/llama.cpp/src/llama-arch.h
+++ b/llama/llama.cpp/src/llama-arch.h
@ -11,7 +11,6 @@
 enum llm_arch {
    LLM_ARCH_LLAMA,
    LLM_ARCH_LLAMA4,
    LLM_ARCH_MLLAMA,
    LLM_ARCH_DECI,
    LLM_ARCH_FALCON,
    LLM_ARCH_BAICHUAN,
@ -149,7 +148,6 @@ enum llm_kv {
    LLM_KV_ATTENTION_SLIDING_WINDOW,
    LLM_KV_ATTENTION_SCALE,
    LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
    LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,
    LLM_KV_ATTENTION_KEY_LENGTH_MLA,
    LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
@ -351,14 +349,6 @@ enum llm_tensor {
    LLM_TENSOR_CLS,
    LLM_TENSOR_CLS_OUT,
    LLM_TENSOR_BSKCN_TV,
    LLM_TENSOR_CROSS_ATTN_K_NORM,
    LLM_TENSOR_CROSS_ATTN_K_PROJ,
    LLM_TENSOR_CROSS_ATTN_O_PROJ,
    LLM_TENSOR_CROSS_ATTN_Q_NORM,
    LLM_TENSOR_CROSS_ATTN_Q_PROJ,
    LLM_TENSOR_CROSS_ATTN_V_PROJ,
    LLM_TENSOR_CROSS_ATTN_ATTN_GATE,
    LLM_TENSOR_CROSS_ATTN_MLP_GATE,
    LLM_TENSOR_CONV1D,
    LLM_TENSOR_CONVNEXT_DW,
    LLM_TENSOR_CONVNEXT_NORM,
--- a/llama/llama.cpp/src/llama-batch.cpp
+++ b/llama/llama.cpp/src/llama-batch.cpp
@ -189,7 +189,7 @@ llama_ubatch llama_sbatch::split_seq(size_t n_ubatch) {
    return ubatch;
 }
-void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
+llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
    GGML_ASSERT(batch.n_tokens >= 0);
    this->batch = &batch;
    this->n_embd = n_embd;
@ -203,6 +203,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
    for (size_t i = 0; i < n_tokens; ++i) {
        ids[i] = i;
    }
    if (simple_split) {
        seq.resize(1);
        llama_sbatch_seq & s = seq[0];
@ -212,6 +213,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
        s.length = n_tokens;
        return;
    }
    std::sort(ids.begin(), ids.end(),
            [&batch](size_t a, size_t b) {
                int32_t n_seq_a = batch.n_seq_id ? batch.n_seq_id[a] : 1;
@ -239,6 +241,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
                return n_seq_a > n_seq_b;
            }
    );
    // init seq
    llama_sbatch_seq * last_seq = nullptr;
@ -262,6 +265,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
        seq.push_back(new_seq);
        last_seq = &seq.back();
    }
    // keep shared prompts first at the end, then sort by length descending.
    std::sort(seq.begin(), seq.end(),
            [](llama_sbatch_seq & a, llama_sbatch_seq & b) {
@ -316,7 +320,6 @@ struct llama_batch llama_batch_get_one(
        /*n_tokens       =*/ n_tokens,
        /*tokens         =*/ tokens,
        /*embd           =*/ nullptr,
        /*n_embd         =*/ 0,
        /*pos            =*/ nullptr,
        /*n_seq_id       =*/ nullptr,
        /*seq_id         =*/ nullptr,
@ -329,7 +332,6 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
        /*n_tokens       =*/ 0,
        /*tokens         =*/ nullptr,
        /*embd           =*/ nullptr,
        /*n_embd         =*/ 0,
        /*pos            =*/ nullptr,
        /*n_seq_id       =*/ nullptr,
        /*seq_id         =*/ nullptr,
@ -338,7 +340,6 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
    if (embd) {
        batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
        batch.n_embd = embd;
    } else {
        batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
    }
--- a/llama/llama.cpp/src/llama-batch.h
+++ b/llama/llama.cpp/src/llama-batch.h
@ -70,7 +70,8 @@ struct llama_sbatch {
    // sequence-wise split
    llama_ubatch split_seq(size_t n_ubatch);
-    void from_batch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
+    llama_sbatch() = default;
    llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
 };
 // temporary allocate memory for the input batch if needed
--- a/llama/llama.cpp/src/llama-chat.cpp
+++ b/llama/llama.cpp/src/llama-chat.cpp
@ -35,6 +35,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
    { "mistral-v3",        LLM_CHAT_TEMPLATE_MISTRAL_V3        },
    { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
    { "mistral-v7",        LLM_CHAT_TEMPLATE_MISTRAL_V7        },
    { "mistral-v7-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN },
    { "phi3",              LLM_CHAT_TEMPLATE_PHI_3             },
    { "phi4",              LLM_CHAT_TEMPLATE_PHI_4             },
    { "falcon3",           LLM_CHAT_TEMPLATE_FALCON_3          },
@ -202,19 +203,20 @@ int32_t llm_chat_apply_template(
        if (add_ass) {
            ss << "<|im_start|>assistant\n";
        }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7 || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN) {
        // Official mistral 'v7' template
        // See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7
        //      https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503#basic-instruct-template-v7-tekken
        const char * trailing_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7 ? " " : "";
        for (auto message : chat) {
            std::string role(message->role);
            std::string content(message->content);
            if (role == "system") {
-                ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]";
+                ss << "[SYSTEM_PROMPT]" << trailing_space << content << "[/SYSTEM_PROMPT]";
            } else if (role == "user") {
-                ss << "[INST] " << content << "[/INST]";
+                ss << "[INST]" << trailing_space << content << "[/INST]";
-            }
+            } else {
-            else {
+                ss << trailing_space << content << "</s>";
                ss << " " << content << "</s>";
            }
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1
@ -447,8 +449,16 @@ int32_t llm_chat_apply_template(
        if (add_ass) {
            ss << "<|assistant|>";
        }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4 || tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4) {
        ss << "[gMASK]" << "<sop>";
        for (auto message : chat) {
            std::string role(message->role);
            ss << "<|" << role << "|>" << "\n" << message->content;
        }
        if (add_ass) {
            ss << "<|assistant|>\n";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
        for (auto message : chat) {
            std::string role(message->role);
            ss << "<|" << role << "|>" << "\n" << message->content;
--- a/llama/llama.cpp/src/llama-chat.h
+++ b/llama/llama.cpp/src/llama-chat.h
@ -14,6 +14,7 @@ enum llm_chat_template {
    LLM_CHAT_TEMPLATE_MISTRAL_V3,
    LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
    LLM_CHAT_TEMPLATE_MISTRAL_V7,
    LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN,
    LLM_CHAT_TEMPLATE_PHI_3,
    LLM_CHAT_TEMPLATE_PHI_4,
    LLM_CHAT_TEMPLATE_FALCON_3,
--- a/llama/llama.cpp/src/llama-context.cpp
+++ b/llama/llama.cpp/src/llama-context.cpp
--- a/llama/llama.cpp/src/llama-context.h
+++ b/llama/llama.cpp/src/llama-context.h
@ -8,6 +8,7 @@
 #include "llama-kv-cache.h"
 #include "ggml-cpp.h"
 #include "ggml-opt.h"
 #include <map>
 #include <vector>
@ -28,7 +29,12 @@ struct llama_context {
    void synchronize();
-    const llama_model & get_model() const;
+    const llama_model   & get_model()   const;
    const llama_cparams & get_cparams() const;
    ggml_backend_sched_t get_sched() const;
    ggml_context * get_ctx_compute() const;
    uint32_t n_ctx()         const;
    uint32_t n_ctx_per_seq() const;
@ -66,7 +72,6 @@ struct llama_context {
    void set_embeddings (bool value);
    void set_causal_attn(bool value);
    void set_warmup(bool value);
    void set_cross_attn(bool value);
    void set_adapter_lora(
            llama_adapter_lora * adapter,
@ -130,6 +135,32 @@ struct llama_context {
    llama_perf_context_data perf_get_data() const;
    void perf_reset();
    //
    // training
    //
    void opt_init(struct llama_model * model, struct llama_opt_params lopt_params);
    void opt_epoch(
            ggml_opt_dataset_t      dataset,
            ggml_opt_result_t       result_train,
            ggml_opt_result_t       result_eval,
            int64_t                 idata_split,
            ggml_opt_epoch_callback callback_train,
            ggml_opt_epoch_callback callback_eval);
    void opt_epoch_iter(
            ggml_opt_dataset_t               dataset,
            ggml_opt_result_t                result,
            const std::vector<llama_token> & tokens,
            const std::vector<llama_token> & labels_sparse,
            llama_batch                    & batch,
            ggml_opt_epoch_callback          callback,
            bool                             train,
            int64_t                          idata_in_loop,
            int64_t                          ndata_in_loop,
            int64_t                          t_loop_start);
 private:
    //
    // output
@ -139,50 +170,30 @@ private:
    // Returns max number of outputs for which space was reserved.
    int32_t output_reserve(int32_t n_outputs);
    // make the outputs have the same order they had in the user-provided batch
    // TODO: maybe remove this
    void output_reorder();
    //
    // graph
    //
 public:
    int32_t graph_max_nodes() const;
    // zero-out inputs and create the ctx_compute for the compute graph
    ggml_cgraph * graph_init();
    llm_graph_result_ptr graph_build(
            ggml_context * ctx,
             ggml_cgraph * gf,
      const llama_ubatch & ubatch,
          llm_graph_type   gtype);
    // returns the result of ggml_backend_sched_graph_compute_async execution
    ggml_status graph_compute(
            ggml_cgraph * gf,
                   bool   batched);
 private:
    llm_graph_result_ptr graph_build(
            ggml_context * ctx,
             ggml_cgraph * gf,
      const llama_ubatch & ubatch,
          llm_graph_type   gtype);
    llm_graph_cb graph_get_cb() const;
    // used by kv_self_update()
    ggml_tensor * build_rope_shift(
        ggml_context * ctx0,
        ggml_tensor * cur,
        ggml_tensor * shift,
        ggml_tensor * factors,
              float   freq_base,
              float   freq_scale) const;
    llm_graph_result_ptr build_kv_self_shift(
            ggml_context * ctx0,
            ggml_cgraph * gf) const;
    llm_graph_result_ptr build_kv_self_defrag(
            ggml_context * ctx0,
            ggml_cgraph * gf,
            const std::vector<struct llama_kv_defrag_move> & moves) const;
    // TODO: read/write lora adapters and cvec
    size_t state_write_data(llama_io_write_i & io);
    size_t state_read_data (llama_io_read_i  & io);
@ -199,14 +210,10 @@ private:
    llama_cparams       cparams;
    llama_adapter_cvec  cvec;
    llama_adapter_loras loras;
    llama_sbatch        sbatch;
    llama_cross cross; // TODO: tmp for handling cross-attention - need something better probably
-    std::unique_ptr<llama_kv_cache_unified> kv_self;
+    std::unique_ptr<llama_memory_i> memory;
    // TODO: remove
    bool logits_all = false;
    // decode output (2-dimensional array: [n_outputs][n_vocab])
    size_t  logits_size = 0; // capacity (of floats) for logits
@ -233,6 +240,9 @@ private:
    ggml_context_ptr ctx_compute;
    // training
    ggml_opt_context_t opt_ctx = nullptr;
    ggml_threadpool_t threadpool       = nullptr;
    ggml_threadpool_t threadpool_batch = nullptr;
--- a/llama/llama.cpp/src/llama-cparams.h
+++ b/llama/llama.cpp/src/llama-cparams.h
@ -29,8 +29,8 @@ struct llama_cparams {
    bool offload_kqv;
    bool flash_attn;
    bool no_perf;
    bool cross_attn;
    bool warmup;
    bool op_offload;
    enum llama_pooling_type pooling_type;
--- a/llama/llama.cpp/src/llama-graph.cpp
+++ b/llama/llama.cpp/src/llama-graph.cpp
@ -284,24 +284,7 @@ void llm_graph_input_s_copy::set_input(const llama_ubatch * ubatch) {
        // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
        for (uint32_t i = 0; i < n_kv; ++i) {
-            const uint32_t  cell_id = i + kv_self->head;
+            data[i] = kv_self->s_copy(i);
            //////////////////////////////////////////////
            // TODO: this should not mutate the KV cache !
            llama_kv_cell & kv_cell = const_cast<class llama_kv_cache_unified *>(kv_self)->cells[i];
            // prevent out-of-bound sources
            if (kv_cell.src < 0 || (uint32_t) kv_cell.src >= kv_self->size) {
                kv_cell.src = cell_id;
            }
            data[i] = kv_cell.src;
            // TODO: do not mutate the KV cache
            // ensure copy only happens once
            if (kv_cell.src != (int32_t) cell_id) {
                kv_cell.src = cell_id;
            }
        }
    }
 }
@ -317,18 +300,7 @@ void llm_graph_input_s_mask::set_input(const llama_ubatch * ubatch) {
        // clear unused states
        for (int i = 0; i < n_kv; ++i) {
-            const uint32_t  cell_id = i + kv_self->head;
+            data[i] = kv_self->s_mask(i);
            //////////////////////////////////////////////
            // TODO: this should not mutate the KV cache !
            llama_kv_cell & kv_cell = const_cast<class llama_kv_cache_unified *>(kv_self)->cells[i];
            data[i] = (float) (kv_cell.src >= 0);
            // only clear once
            if (kv_cell.src < 0) {
                kv_cell.src = cell_id;
            }
        }
    }
 }
@ -560,12 +532,6 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
    }
 }
 // Writes the batch embeddings (ubatch->embd) into the cross_attn_state tensor.
 // Does nothing when ubatch->embd is null.
 void llm_graph_input_cross_attn_state::set_input(const llama_ubatch * ubatch) {
    if (ubatch->embd) {
        // offset 0, size = full byte size of the destination tensor
        // NOTE(review): presumably ubatch->embd must hold at least ggml_nbytes(cross_attn_state) bytes - confirm against callers
        ggml_backend_tensor_set(cross_attn_state, ubatch->embd, 0, ggml_nbytes(cross_attn_state));
    }
 }
 //
 // llm_graph_context
 //
@ -816,7 +782,7 @@ ggml_tensor * llm_graph_context::build_ffn(
            } break;
    }
-    if (type_gate == LLM_FFN_PAR) {
+    if (gate && type_gate == LLM_FFN_PAR) {
        cur = ggml_mul(ctx0, cur, tmp);
        cb(cur, "ffn_gate_par", il);
    }
@ -1005,6 +971,7 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
        inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
        //cb(inp->tokens, "inp_tokens", -1);
        ggml_set_input(inp->tokens);
        res->t_tokens = inp->tokens;
        cur = ggml_get_rows(ctx0, tok_embd, inp->tokens);
@ -1111,7 +1078,7 @@ ggml_tensor * llm_graph_context::build_inp_cls() const {
 }
 ggml_tensor * llm_graph_context::build_inp_s_copy() const {
-    const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+    const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
    auto inp = std::make_unique<llm_graph_input_s_copy>(kv_self);
@ -1128,7 +1095,7 @@ ggml_tensor * llm_graph_context::build_inp_s_copy() const {
 }
 ggml_tensor * llm_graph_context::build_inp_s_mask() const {
-    const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+    const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
    auto inp = std::make_unique<llm_graph_input_s_mask>(kv_self);
@ -1261,8 +1228,19 @@ ggml_tensor * llm_graph_context::build_attn_mha(
        ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
        if (v_mla) {
 #if 0
            // v_mla can be applied as a matrix-vector multiplication with broadcasting across dimension 3 == n_tokens.
            // However, the code is optimized for dimensions 0 and 1 being large, so this is inefficient.
            cur = ggml_reshape_4d(ctx0, cur, v_mla->ne[0], 1, n_head, n_tokens);
            cur = ggml_mul_mat(ctx0, v_mla, cur);
 #else
            // It's preferable to do the calculation as a matrix-matrix multiplication with n_tokens in dimension 1.
            // The permutations are noops and only change how the tensor data is interpreted.
            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
            cur = ggml_mul_mat(ctx0, v_mla, cur);
            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
            cur = ggml_cont(ctx0, cur); // Needed because ggml_reshape_2d expects contiguous inputs.
 #endif
        }
        cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens);
@ -1442,8 +1420,6 @@ ggml_tensor * llm_graph_context::build_attn(
    // store to KV cache
    {
        GGML_ASSERT(!kv_self->recurrent);
        const auto kv_head = kv_self->head;
        GGML_ASSERT(kv_self->size == n_ctx);
@ -1538,25 +1514,6 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
    return (llm_graph_input_attn_cross *) res->add_input(std::move(inp));
 }
 // Builds the graph input holding the cross-attention state and registers it with `res`.
 // The tensor is F32 with ne = [n_embd, 1601, 4]; it is marked as a graph input and
 // returned to the caller.
 ggml_tensor * llm_graph_context::build_inp_cross_attn_state() const {
    auto input = std::make_unique<llm_graph_input_cross_attn_state>();
    const int64_t embd_dim = hparams.n_embd;
    // keep a raw pointer to the tensor: ownership of `input` is moved into `res` below
    ggml_tensor * state = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, embd_dim, 1601, 4);
    input->cross_attn_state = state;
    ggml_set_input(state);
    cb(state, "inp_cross_attn_state", -1);
    res->add_input(std::move(input));
    return state;
 }
 ggml_tensor * llm_graph_context::build_attn(
        llm_graph_input_attn_cross * inp,
        ggml_cgraph * gf,
@ -1612,7 +1569,7 @@ ggml_tensor * llm_graph_context::build_copy_mask_state(
         ggml_tensor * state_mask,
             int32_t   n_state,
             int32_t   n_seqs) const {
-    const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+    const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
    const auto n_kv    = kv_self->n;
    const auto kv_head = kv_self->head;
@ -1644,7 +1601,7 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(
         ggml_tensor * state_mask,
  const llama_ubatch & ubatch,
                 int   il) const {
-    const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+    const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
    const auto token_shift_count = hparams.token_shift_count;
@ -1665,7 +1622,7 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_store(
         ggml_tensor * token_shift,
  const llama_ubatch & ubatch,
                 int   il) const {
-    const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+    const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
    const auto token_shift_count = hparams.token_shift_count;
    const auto n_embd = hparams.n_embd;
--- a/llama/llama.cpp/src/llama-graph.h
+++ b/llama/llama.cpp/src/llama-graph.h
@ -19,6 +19,7 @@ struct llama_cparams;
 class llama_memory_i;
 class llama_kv_cache_unified;
 class llama_kv_cache_recurrent;
 // certain models (typically multi-modal) can produce different types of graphs
 enum llm_graph_type {
@ -86,7 +87,6 @@ public:
    ggml_tensor * tokens = nullptr; // I32 [n_batch]
    ggml_tensor * embd   = nullptr; // F32 [n_embd, n_batch]
    ggml_tensor * cross_attn_state; // F32 [4, n_embd, 1061]
 };
 class llm_graph_input_pos : public llm_graph_input_i {
@ -187,26 +187,26 @@ public:
 class llm_graph_input_s_copy : public llm_graph_input_i {
 public:
-    llm_graph_input_s_copy(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {}
+    llm_graph_input_s_copy(const llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {}
    virtual ~llm_graph_input_s_copy() = default;
    void set_input(const llama_ubatch * ubatch) override;
    ggml_tensor * s_copy; // I32 [kv_size]
-    const llama_kv_cache_unified * kv_self;
+    const llama_kv_cache_recurrent * kv_self;
 };
 class llm_graph_input_s_mask : public llm_graph_input_i {
 public:
-    llm_graph_input_s_mask(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {}
+    llm_graph_input_s_mask(const llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {}
    virtual ~llm_graph_input_s_mask() = default;
    void set_input(const llama_ubatch * ubatch) override;
    ggml_tensor * s_mask; // F32 [1, n_kv]
-    const llama_kv_cache_unified * kv_self;
+    const llama_kv_cache_recurrent * kv_self;
 };
 class llm_graph_input_cross_embd : public llm_graph_input_i {
@ -284,16 +284,6 @@ public:
    const llama_cross * cross = nullptr;
 };
 // Graph input that carries the cross-attention state tensor.
 // set_input() copies ubatch->embd into cross_attn_state when the ubatch has embeddings.
 class llm_graph_input_cross_attn_state : public llm_graph_input_i {
 public:
    llm_graph_input_cross_attn_state()          = default;
    virtual ~llm_graph_input_cross_attn_state() = default;
    void set_input(const llama_ubatch * ubatch) override;
    // not initialized here; assigned in llm_graph_context::build_inp_cross_attn_state(),
    // which allocates it with ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd, 1601, 4)
    ggml_tensor * cross_attn_state; // F32 [n_embd, 1601, 4]
 };
 //
 // llm_graph_result
 //
@ -308,6 +298,7 @@ class llm_graph_result_i {
 public:
    virtual ~llm_graph_result_i() = default;
    virtual ggml_tensor * get_tokens()      = 0;
    virtual ggml_tensor * get_logits()      = 0;
    virtual ggml_tensor * get_embd()        = 0;
    virtual ggml_tensor * get_embd_pooled() = 0;
@ -322,6 +313,7 @@ class llm_graph_result : public llm_graph_result_i {
 public:
    virtual ~llm_graph_result() = default;
    ggml_tensor * get_tokens()      override { return t_tokens; }
    ggml_tensor * get_logits()      override { return t_logits; }
    ggml_tensor * get_embd()        override { return t_embd; }
    ggml_tensor * get_embd_pooled() override { return t_embd_pooled; }
@ -338,6 +330,7 @@ public:
    }
    // important graph nodes
    ggml_tensor * t_tokens      = nullptr;
    ggml_tensor * t_logits      = nullptr;
    ggml_tensor * t_embd        = nullptr;
    ggml_tensor * t_embd_pooled = nullptr;
@ -361,8 +354,8 @@ struct llm_graph_params {
    const llama_cparams & cparams;
    const llama_ubatch  & ubatch;
-    ggml_backend_sched * sched;
+    ggml_backend_sched_t sched;
-    ggml_backend * backend_cpu;
+    ggml_backend_t backend_cpu;
    const llama_adapter_cvec  * cvec;
    const llama_adapter_loras * loras;
@ -413,9 +406,9 @@ struct llm_graph_context {
    ggml_context * ctx0 = nullptr;
-    ggml_backend_sched * sched;
+    ggml_backend_sched_t sched;
-    ggml_backend * backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?
+    ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?
    const llama_adapter_cvec  * cvec;
    const llama_adapter_loras * loras;
@ -502,7 +495,6 @@ struct llm_graph_context {
    ggml_tensor * build_inp_cls() const;
    ggml_tensor * build_inp_s_copy() const;
    ggml_tensor * build_inp_s_mask() const;
    ggml_tensor * build_inp_cross_attn_state() const;
    ggml_tensor * build_inp_cross_embd() const;
    ggml_tensor * build_inp_pos_bucket_enc() const;
--- a/llama/llama.cpp/src/llama-hparams.cpp
+++ b/llama/llama.cpp/src/llama-hparams.cpp
@ -85,7 +85,3 @@ bool llama_hparams::is_swa(uint32_t il) const {
    GGML_ABORT("fatal error");
 }
 // Returns true when layer index `il` is listed in cross_attn_layers.
 // Every slot of the array is compared against `il`.
 bool llama_hparams::cross_attention_layers(uint32_t il) const {
    for (const auto & layer : cross_attn_layers) {
        if (layer == il) {
            return true;
        }
    }
    return false;
 }
--- a/llama/llama.cpp/src/llama-hparams.h
+++ b/llama/llama.cpp/src/llama-hparams.h
@ -2,8 +2,6 @@
 #include "llama.h"
 #include <algorithm>
 #include <array>
 // bump if necessary
@ -44,7 +42,6 @@ struct llama_hparams {
    uint32_t n_expert = 0;
    uint32_t n_expert_used = 0;
    uint32_t n_rel_attn_bkts = 0;
    uint32_t n_vocab = 0;
    // note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
    uint32_t n_embd_head_k_mla = 0;
@ -59,7 +56,6 @@ struct llama_hparams {
    std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
    std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr = {};
    std::array<uint32_t, LLAMA_MAX_LAYERS> cross_attn_layers;
    uint32_t n_layer_dense_lead = 0;
    uint32_t n_lora_q           = 0;
@ -163,9 +159,6 @@ struct llama_hparams {
    // Block skip connection
    bool n_bskcn(uint32_t n, uint32_t il) const;
    // cross attention layers
    bool cross_attention_layers(uint32_t il) const;
    bool is_swa(uint32_t il) const;
 };
--- a/llama/llama.cpp/src/llama-kv-cache.cpp
+++ b/llama/llama.cpp/src/llama-kv-cache.cpp
--- a/llama/llama.cpp/src/llama-kv-cache.h
+++ b/llama/llama.cpp/src/llama-kv-cache.h
@ -2,32 +2,72 @@
 #include "llama.h"
 #include "llama-io.h"
 #include "llama-graph.h"
 #include "llama-memory.h"
 #include "ggml-cpp.h"
 #include <functional>
 #include <set>
 #include <vector>
 struct llama_cparams;
 struct llama_hparams;
 struct llama_ubatch;
 struct llama_sbatch;
 struct llama_model;
 struct llama_context;
 struct llama_kv_cache : public llama_memory_i {
-    using llama_memory_i::llama_memory_i;
+    virtual ~llama_kv_cache() = default;
-    virtual void restore() = 0; // call if batch processing fails - restores the cache state
+    // call if batch processing fails - restores the cache state
-    virtual void commit() = 0;  // call after successful batch processing - clears any pending state
+    virtual void restore() = 0;
-    virtual int32_t get_n_tokens()   const = 0;
+    // call after successful batch processing - clears any pending state
-    virtual int32_t get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache
+    virtual void commit()  = 0;
-    virtual bool get_can_shift() const = 0;
+    // process any pending defrag/shift/etc. operations
    // optionally call once before processing a new batch
    virtual bool update(llama_context & lctx) = 0;
    // schedule a defrag if the fragmentation threshold is exceeded. otherwise, do nothing
    virtual void defrag_sched(float thold) = 0;
    // simulate full cache, used for allocating worst-case compute buffers
    virtual void set_full() = 0;
    //
    // batch processing
    //
    virtual llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) = 0;
    // different KV caches require different batch splitting strategies
    virtual llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const = 0;
    // find an empty slot of size "n_tokens" in the cache
    virtual bool find_slot(const llama_ubatch & batch) = 0;
    // getters
    virtual int32_t   get_n_tokens()   const = 0;
    virtual int32_t   get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache
    virtual llama_pos get_pos_max()    const = 0;
    virtual bool      get_can_shift()  const = 0;
    bool get_can_edit() const override { return get_can_shift(); }
    //
    // state write/read
    //
    virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const = 0;
    virtual void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1) = 0;
 };
 //
 // llama_kv_cache_guard
 //
 struct llama_kv_cache_guard {
    llama_kv_cache_guard(llama_kv_cache * kv) : kv(kv) {}
@ -42,7 +82,7 @@ struct llama_kv_cache_guard {
 private:
    llama_kv_cache * kv;
 };
-
+ 
 // block of KV slots to move when defragging
 struct llama_kv_defrag_move {
    uint32_t src;
@ -50,65 +90,50 @@ struct llama_kv_defrag_move {
    uint32_t len;
 };
-struct llama_kv_cell {
+//
-    llama_pos pos   = -1;
+// llama_kv_cache_unified
-    llama_pos delta =  0;
+//
    int32_t   src   = -1; // used by recurrent state models to copy states
    int32_t   tail  = -1;
    std::set<llama_seq_id> seq_id;
    bool has_seq_id(const llama_seq_id & id) const {
        return seq_id.find(id) != seq_id.end();
    }
    bool is_empty() const {
        return seq_id.empty();
    }
    bool is_same_seq(const llama_kv_cell & other) const {
        return seq_id == other.seq_id;
    }
 };
 // ring-buffer of cached KV data
 // TODO: pimpl
 // TODO: add notion of max sequences
 class llama_kv_cache_unified : public llama_kv_cache {
 public:
-    // can be used to query data from the model if needed
+    struct kv_cell {
-    struct callbacks {
+        llama_pos pos   = -1;
-        std::function<ggml_tensor * (uint32_t n_ctx_per_seq, int il)> get_rope_factors;
+        llama_pos delta =  0;
        std::set<llama_seq_id> seq_id;
        bool has_seq_id(const llama_seq_id & id) const {
            return seq_id.find(id) != seq_id.end();
        }
        bool is_empty() const {
            return seq_id.empty();
        }
        bool is_same_seq(const kv_cell & other) const {
            return seq_id == other.seq_id;
        }
    };
    static uint32_t get_padding(const llama_cparams & cparams);
    llama_kv_cache_unified(
-            const llama_hparams & hparams,
+            const llama_model & model,
            callbacks             cbs);
    virtual ~llama_kv_cache_unified() = default;
    // TODO: become constructor
    bool init(
            const llama_model & model,   // TODO: do not reference the model
          const llama_cparams & cparams,
                    ggml_type   type_k,
                    ggml_type   type_v,
                         bool   v_trans,
                         bool   offload,
                     uint32_t   kv_size,
-                         bool   offload);
+                     uint32_t   padding);
-    int32_t get_n_tokens()   const override;
+    ~llama_kv_cache_unified() = default;
    int32_t get_used_cells() const override;
-    size_t total_size() const;
+    //
-
+    // llama_memory_i
-    // TODO: better data structures to reduce the cost of this operation
+    //
    llama_pos pos_max() const;
    void clear() override;
    void defrag() override;
    virtual void restore() override;
    virtual void commit() override;
    bool seq_rm  (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1) override;
    void seq_cp  (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
@ -118,63 +143,40 @@ public:
    llama_pos seq_pos_max(llama_seq_id seq_id) const override;
-    bool get_can_shift() const override;
+    //
    // llama_kv_cache
    //
    void restore() override;
    void commit()  override;
    bool update(llama_context & ctx) override;
    void defrag_sched(float thold) override;
    void set_full() override;
    llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override;
    llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override;
    // find an empty slot of size "n_tokens" in the cache
    // updates the cache head
    // Note: On success, it's important that cache.head points
    // to the first cell of the slot.
-    bool find_slot(const llama_ubatch & batch);
+    bool find_slot(const llama_ubatch & batch) override;
-    // TODO: maybe not needed
+    int32_t get_n_tokens()   const override;
-    uint32_t get_padding(const llama_cparams & cparams) const;
+    int32_t get_used_cells() const override;
-    // find how many cells are currently in use
+    // TODO: better data structures to reduce the cost of this operation
-    uint32_t cell_max() const;
+    llama_pos get_pos_max() const override;
-    size_t size_k_bytes() const;
+    bool get_can_shift() const override;
    size_t size_v_bytes() const;
    // defrag
    struct {
        std::vector<llama_kv_defrag_move> moves;
    } defrag_info;
    // return true if cells have been moved
    bool defrag_prepare(int32_t n_max_nodes);
    // commit/restore cache
    struct slot_range {
        uint32_t c0 = 0; // note: these are cell indices, not sequence positions
        uint32_t c1 = 0;
    };
    // pending cell updates that are not yet committed
    struct {
        std::vector<slot_range> ranges;
    } pending;
    // state write/load
-    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const;
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
-    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1);
+    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1) override;
    // members
    const llama_hparams & hparams;
    callbacks cbs;
    bool has_shift = false;
    bool do_defrag = false;
    // TODO: remove this and implement llama_kv_cache_recurrent instead
    bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token
    bool v_trans   = true;  // the value tensor is transposed
    bool can_shift = false;
    // Note: The value of head isn't only used to optimize searching
    // for a free KV slot. llama_decode_impl also uses it, so it
@ -186,18 +188,214 @@ public:
    // computed before each graph build
    uint32_t n = 0;
-    std::vector<llama_kv_cell> cells;
+    std::vector<kv_cell> cells;
    std::vector<ggml_tensor *> k_l; // per layer
    std::vector<ggml_tensor *> v_l;
 private:
    const llama_model & model;
    const llama_hparams & hparams;
    bool has_shift = false;
    bool do_defrag = false;
    bool v_trans   = true;  // the value tensor is transposed
    bool can_shift = false;
    // required padding
    uint32_t padding = 1;
    ggml_type type_k = GGML_TYPE_F16;
    ggml_type type_v = GGML_TYPE_F16;
    std::vector<ggml_context_ptr>        ctxs;
    std::vector<ggml_backend_buffer_ptr> bufs;
    // defrag
    struct {
        std::vector<llama_kv_defrag_move> moves;
    } defrag_info;
    // return true if cells have been moved
    bool defrag_prepare(int32_t n_max_nodes);
    // commit/restore cache
    struct slot_range {
        uint32_t c0 = 0; // note: these are cell indices, not sequence positions
        uint32_t c1 = 0;
    };
    // pending cell updates that are not yet committed
    struct {
        std::vector<slot_range> ranges;
    } pending;
    // find how many cells are currently in use
    uint32_t cell_max() const;
    size_t total_size() const;
    size_t size_k_bytes() const;
    size_t size_v_bytes() const;
    ggml_tensor * build_rope_shift(
            const llama_cparams & cparams,
                   ggml_context * ctx,
                    ggml_tensor * cur,
                    ggml_tensor * shift,
                    ggml_tensor * factors,
                          float   freq_base,
                          float   freq_scale) const;
    llm_graph_result_ptr build_graph_shift(
            const llama_cparams & cparams,
                   ggml_context * ctx,
                    ggml_cgraph * gf) const;
    llm_graph_result_ptr build_graph_defrag(
            const llama_cparams & cparams,
                   ggml_context * ctx,
                    ggml_cgraph * gf,
                    const std::vector<llama_kv_defrag_move> & moves) const;
    void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
    void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
    bool state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
    bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
 };
 //
 // llama_kv_cache_recurrent
 //
 class llama_kv_cache_recurrent : public llama_kv_cache {
 public:
    struct kv_cell {
        llama_pos pos  = -1;
        int32_t   src  = -1; // used to copy states
        int32_t   tail = -1;
        std::set<llama_seq_id> seq_id;
        bool has_seq_id(const llama_seq_id & id) const {
            return seq_id.find(id) != seq_id.end();
        }
        bool is_empty() const {
            return seq_id.empty();
        }
        bool is_same_seq(const kv_cell & other) const {
            return seq_id == other.seq_id;
        }
    };
    llama_kv_cache_recurrent(
            const llama_model & model,
                    ggml_type   type_k,
                    ggml_type   type_v,
                         bool   offload,
                     uint32_t   kv_size);
    ~llama_kv_cache_recurrent() = default;
    //
    // llama_memory_i
    //
    void clear() override;
    bool seq_rm  (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1) override;
    void seq_cp  (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
    void seq_keep(llama_seq_id seq_id) override;
    void seq_add (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, llama_pos delta) override;
    void seq_div (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, int d) override;
    llama_pos seq_pos_max(llama_seq_id seq_id) const override;
    //
    // llama_kv_cache
    //
    void restore() override;
    void commit()  override;
    bool update(llama_context & lctx) override;
    void defrag_sched(float thold) override;
    void set_full() override;
    llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override;
    llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override;
    bool find_slot(const llama_ubatch & batch) override;
    int32_t get_n_tokens()   const override;
    int32_t get_used_cells() const override;
    // TODO: better data structures to reduce the cost of this operation
    llama_pos get_pos_max() const override;
    bool get_can_shift() const override;
    // TODO: temporary methods - they are not really const as they do const_cast<>, fix this
    int32_t s_copy(int i) const;
    float   s_mask(int i) const;
    // state write/load
    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1) override;
    // Note: The value of head isn't only used to optimize searching
    // for a free KV slot. llama_decode_impl also uses it, so it
    // cannot be freely changed after a slot has been allocated.
    uint32_t head = 0;
    uint32_t size = 0;
    uint32_t used = 0; // used cells (i.e. at least one seq_id)
    // computed before each graph build
    uint32_t n = 0;
    std::vector<kv_cell> cells;
    std::vector<ggml_tensor *> k_l; // per layer
    std::vector<ggml_tensor *> v_l;
 private:
    //const llama_model & model;
    const llama_hparams & hparams;
    // commit/restore cache
    // TODO: rework for recurrent cache
    struct slot_range {
        uint32_t c0 = 0; // note: these are cell indices, not sequence positions
        uint32_t c1 = 0;
    };
    // pending cell updates that are not yet committed
    struct {
        std::vector<slot_range> ranges;
    } pending;
    ggml_type type_k = GGML_TYPE_F16;
    ggml_type type_v = GGML_TYPE_F16;
    std::vector<ggml_context_ptr>        ctxs;
    std::vector<ggml_backend_buffer_ptr> bufs;
    // find how many cells are currently in use
    uint32_t cell_max() const;
    size_t total_size() const;
    size_t size_k_bytes() const;
    size_t size_v_bytes() const;
    void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
    void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
@ -205,11 +403,6 @@ private:
    bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
 };
 // TODO: temporary reusing llama_kv_cache_unified -- implement recurrent cache and simplify llama_kv_cache_unified
 //class llama_kv_cache_recurrent : public llama_kv_cache_unified {
 //public:
 //    using llama_kv_cache_unified::llama_kv_cache_unified;
 //};
 //
 // kv cache view
--- a/llama/llama.cpp/src/llama-memory.h
+++ b/llama/llama.cpp/src/llama-memory.h
@ -2,12 +2,22 @@
 #include "llama.h"
 // Bundle of settings describing an LLM memory object.
 // NOTE(review): presumably passed when the memory (e.g. a KV cache) is created - confirm against callers.
 struct llama_memory_params {
    // kv cache
    ggml_type type_k; // NOTE(review): presumably the ggml data type used for cached K tensors - confirm
    ggml_type type_v; // NOTE(review): presumably the ggml data type used for cached V tensors - confirm
    // parameters for other types of memory
    // ...
 };
 // general concept of LLM memory
 // the KV cache is a type of LLM memory, but there can be other types
 class llama_memory_i {
 public:
    virtual ~llama_memory_i() = default;
    virtual void clear() = 0;
    virtual void defrag() = 0;
    virtual bool seq_rm  (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1) = 0;
    virtual void seq_cp  (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) = 0;
--- a/llama/llama.cpp/src/llama-model-loader.cpp
+++ b/llama/llama.cpp/src/llama-model-loader.cpp
@ -301,12 +301,12 @@ namespace GGUFMeta {
            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
        switch (arr_info.gt) {
-            case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
+            case GGUF_TYPE_UINT32:
-            case GGUF_TYPE_INT32:   GGML_ASSERT(
+            case GGUF_TYPE_INT32:   GGML_ASSERT((std::is_same<T,  int32_t>::value) ||
-                                            (std::is_same<T,  int32_t>::value) ||
+                                                (std::is_same<T, uint32_t>::value)); break;
-                                            (std::is_same<T, uint32_t>::value));  break;
+            case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T,    float>::value)); break;
            default:
-                throw std::runtime_error(format("%s is not a float32, int32 array", key.c_str()));
+                throw std::runtime_error(format("%s is not a float32/uint32/int32 array", key.c_str()));
        }
        result.resize(arr_info.length);
@ -315,8 +315,6 @@ namespace GGUFMeta {
        return true;
    }
    template bool llama_model_loader::get_arr<std::array<unsigned int, 512>>(enum llm_kv kid, std::array<unsigned int, 512>& result, bool required);
    template<typename T, size_t N_MAX>
    bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
        const int kid = gguf_find_key(meta.get(), key.c_str());
@ -332,12 +330,12 @@ namespace GGUFMeta {
            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
        switch (arr_info.gt) {
-            case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
+            case GGUF_TYPE_UINT32:
-            case GGUF_TYPE_INT32:   GGML_ASSERT(
+            case GGUF_TYPE_INT32:   GGML_ASSERT((std::is_same<T,  int32_t>::value) ||
-                                            (std::is_same<T,  int32_t>::value) ||
+                                                (std::is_same<T, uint32_t>::value)); break;
-                                            (std::is_same<T, uint32_t>::value));  break;
+            case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T,    float>::value)); break;
            default:
-                throw std::runtime_error(format("%s is not a float32, int32 array", key.c_str()));
+                throw std::runtime_error(format("%s is not a float32/uint32/int32 array", key.c_str()));
        }
        if (arr_info.length > N_MAX) {
@ -826,6 +824,10 @@ void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps
        mmaps_used.reserve(files.size());
        for (const auto & file : files) {
            auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
            if (!reg) {
                throw std::runtime_error(format("%s: no CPU backend found", __func__));
            }
            auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
            std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa_fn());
            mmaps_used.emplace_back(mapping->size(), 0);
--- a/llama/llama.cpp/src/llama-model-saver.cpp
+++ b/llama/llama.cpp/src/llama-model-saver.cpp
@ -0,0 +1,281 @@
 #include "llama-model-saver.h"
 #include "gguf.h"
 #include "llama.h"
 #include "llama-hparams.h"
 #include "llama-model.h"
 #include "llama-vocab.h"
 #include <string>
 // Builds a saver bound to `model`: allocates an empty GGUF context that the
 // add_* methods fill in, and sets up the key-name lookup for the model's architecture.
 llama_model_saver::llama_model_saver(const struct llama_model & model)
    : gguf_ctx(gguf_init_empty()), model(model), llm_kv(model.arch) {
 }
 // Releases the GGUF context created by the constructor.
 llama_model_saver::~llama_model_saver() {
    gguf_free(this->gguf_ctx);
 }
 // Stores an unsigned 32-bit metadata value under the GGUF key name for `key`.
 void llama_model_saver::add_kv(const enum llm_kv key, const uint32_t value) {
    const auto key_name = llm_kv(key);
    gguf_set_val_u32(gguf_ctx, key_name.c_str(), value);
 }
 // Stores a signed 32-bit metadata value under the GGUF key name for `key`.
 void llama_model_saver::add_kv(const enum llm_kv key, const int32_t value) {
    const auto key_name = llm_kv(key);
    gguf_set_val_i32(gguf_ctx, key_name.c_str(), value);
 }
 // Stores a 32-bit float metadata value under the GGUF key name for `key`.
 void llama_model_saver::add_kv(const enum llm_kv key, const float value) {
    const auto key_name = llm_kv(key);
    gguf_set_val_f32(gguf_ctx, key_name.c_str(), value);
 }
 // Stores a boolean metadata value under the GGUF key name for `key`.
 void llama_model_saver::add_kv(const enum llm_kv key, const bool value) {
    const auto key_name = llm_kv(key);
    gguf_set_val_bool(gguf_ctx, key_name.c_str(), value);
 }
 // Stores a NUL-terminated string metadata value under the GGUF key name for `key`.
 void llama_model_saver::add_kv(const enum llm_kv key, const char * value) {
    const auto key_name = llm_kv(key);
    gguf_set_val_str(gguf_ctx, key_name.c_str(), value);
 }
 // Placeholder overload for a single `char`. It exists only so that the container
 // template below compiles when instantiated with std::string (whose element type
 // is char); it must never actually be reached at runtime.
 [[noreturn]]
 void llama_model_saver::add_kv(const enum llm_kv key, const char value) {
    GGML_UNUSED(value);
    GGML_UNUSED(key);
    GGML_ABORT("fatal error"); // this should never be called, only needed to make the template below compile
 }
 template <typename Container>
 void llama_model_saver::add_kv(const enum llm_kv key, const Container & value, const bool per_layer) {
    const size_t n_values = per_layer ? size_t(model.hparams.n_layer) : value.size();
    GGML_ASSERT(n_values <= value.size());
    if (n_values == 0) {
        return;
    }
    if (per_layer) {
        bool all_values_the_same = true;
        for (size_t i = 1; i < n_values; ++i) {
            if (value[i] != value[0]) {
                all_values_the_same = false;
                break;
            }
        }
        if (all_values_the_same) {
            add_kv(key, value[0]);
            return;
        }
    }
    if (std::is_same<typename Container::value_type, uint8_t>::value) {
        gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_UINT8, value.data(), n_values);
    } else if (std::is_same<typename Container::value_type, int8_t>::value) {
        gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_INT8, value.data(), n_values);
    } else if (std::is_same<typename Container::value_type, uint32_t>::value) {
        gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_UINT32, value.data(), n_values);
    } else if (std::is_same<typename Container::value_type, int32_t>::value) {
        gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_INT32, value.data(), n_values);
    } else if (std::is_same<typename Container::value_type, float>::value) {
        gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_FLOAT32, value.data(), n_values);
    } else if (std::is_same<Container, std::string>::value) {
        gguf_set_val_str(gguf_ctx, llm_kv(key).c_str(), reinterpret_cast<const char *>(value.data()));
    } else {
        GGML_ABORT("fatal error");
    }
 }
 // Stores a list of strings as a GGUF string array under the key name for `key`.
 void llama_model_saver::add_kv(const enum llm_kv key, const std::vector<std::string> & value) {
    // gguf_set_arr_str takes an array of C strings; the pointers stay valid because
    // `value` outlives this call
    std::vector<const char *> c_strs;
    c_strs.reserve(value.size());
    for (const std::string & s : value) {
        c_strs.push_back(s.c_str());
    }
    const auto key_name = llm_kv(key);
    gguf_set_arr_str(gguf_ctx, key_name.c_str(), c_strs.data(), c_strs.size());
 }
 // Registers `tensor` in the GGUF context. Null tensors are ignored. A tensor whose
 // name is already registered is skipped; the only name expected to repeat is
 // "rope_freqs.weight", anything else trips the assertion.
 void llama_model_saver::add_tensor(const struct ggml_tensor * tensor) {
    if (tensor == nullptr) {
        return;
    }
    const bool already_added = gguf_find_tensor(gguf_ctx, tensor->name) >= 0;
    if (!already_added) {
        gguf_add_tensor(gguf_ctx, tensor);
        return;
    }
    GGML_ASSERT(std::string(tensor->name) == "rope_freqs.weight"); // FIXME
 }
 void llama_model_saver::add_kv_from_model() {
    const llama_hparams & hparams = model.hparams;
    const llama_vocab   & vocab   = model.vocab;
    const int32_t n_vocab = vocab.n_tokens();
    std::vector<std::string> tokens(n_vocab);
    std::vector<float>       scores(n_vocab);
    std::vector<int32_t>     token_types(n_vocab);
    for (int32_t id = 0; id < n_vocab; ++id) {
        const llama_vocab::token_data & token_data = vocab.get_token_data(id);
        tokens[id] = token_data.text;
        scores[id] = token_data.score;
        switch(token_data.attr) {
            case LLAMA_TOKEN_ATTR_UNKNOWN:      token_types[id] = LLAMA_TOKEN_TYPE_UNKNOWN;      break;
            case LLAMA_TOKEN_ATTR_UNUSED:       token_types[id] = LLAMA_TOKEN_TYPE_UNUSED;       break;
            case LLAMA_TOKEN_ATTR_NORMAL:       token_types[id] = LLAMA_TOKEN_TYPE_NORMAL;       break;
            case LLAMA_TOKEN_ATTR_CONTROL:      token_types[id] = LLAMA_TOKEN_TYPE_CONTROL;      break;
            case LLAMA_TOKEN_ATTR_USER_DEFINED: token_types[id] = LLAMA_TOKEN_TYPE_USER_DEFINED; break;
            case LLAMA_TOKEN_ATTR_BYTE:         token_types[id] = LLAMA_TOKEN_TYPE_BYTE;         break;
            case LLAMA_TOKEN_ATTR_UNDEFINED:
            default:                            token_types[id] = LLAMA_TOKEN_TYPE_UNDEFINED;    break;
        }
    }
    // add_kv(LLM_KV_GENERAL_TYPE,                      ???);
    add_kv(LLM_KV_GENERAL_ARCHITECTURE,              model.arch_name());
    // add_kv(LLM_KV_GENERAL_QUANTIZATION_VERSION,      ???);
    // add_kv(LLM_KV_GENERAL_ALIGNMENT,                 ???);
    add_kv(LLM_KV_GENERAL_NAME,                      model.name);
    // add_kv(LLM_KV_GENERAL_AUTHOR,                    ???);
    // add_kv(LLM_KV_GENERAL_VERSION,                   ???);
    // add_kv(LLM_KV_GENERAL_URL,                       ???);
    // add_kv(LLM_KV_GENERAL_DESCRIPTION,               ???);
    // add_kv(LLM_KV_GENERAL_LICENSE,                   ???);
    // add_kv(LLM_KV_GENERAL_SOURCE_URL,                ???);
    // add_kv(LLM_KV_GENERAL_SOURCE_HF_REPO,            ???);
    add_kv(LLM_KV_VOCAB_SIZE,                        vocab.n_tokens());
    add_kv(LLM_KV_CONTEXT_LENGTH,                    hparams.n_ctx_train);
    add_kv(LLM_KV_EMBEDDING_LENGTH,                  hparams.n_embd);
    add_kv(LLM_KV_BLOCK_COUNT,                       hparams.n_layer);
    add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT,         hparams.n_layer_dense_lead);
    add_kv(LLM_KV_FEED_FORWARD_LENGTH,               hparams.n_ff_arr, true);
    add_kv(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
    add_kv(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
    add_kv(LLM_KV_USE_PARALLEL_RESIDUAL,             hparams.use_par_res);
    // add_kv(LLM_KV_TENSOR_DATA_LAYOUT,                ???);
    add_kv(LLM_KV_EXPERT_COUNT,                      hparams.n_expert);
    add_kv(LLM_KV_EXPERT_USED_COUNT,                 hparams.n_expert_used);
    add_kv(LLM_KV_EXPERT_SHARED_COUNT,               hparams.n_expert_shared);
    add_kv(LLM_KV_EXPERT_WEIGHTS_SCALE,              hparams.expert_weights_scale);
    add_kv(LLM_KV_POOLING_TYPE,                      uint32_t(hparams.pooling_type));
    add_kv(LLM_KV_LOGIT_SCALE,                       hparams.f_logit_scale);
    add_kv(LLM_KV_DECODER_START_TOKEN_ID,            hparams.dec_start_token_id);
    add_kv(LLM_KV_ATTN_LOGIT_SOFTCAPPING,            hparams.f_attn_logit_softcapping);
    add_kv(LLM_KV_FINAL_LOGIT_SOFTCAPPING,           hparams.f_final_logit_softcapping);
    add_kv(LLM_KV_SWIN_NORM,                         hparams.swin_norm);
    add_kv(LLM_KV_RESCALE_EVERY_N_LAYERS,            hparams.rescale_every_n_layers);
    add_kv(LLM_KV_TIME_MIX_EXTRA_DIM,                hparams.time_mix_extra_dim);
    add_kv(LLM_KV_TIME_DECAY_EXTRA_DIM,              hparams.time_decay_extra_dim);
    add_kv(LLM_KV_RESIDUAL_SCALE,                    hparams.f_residual_scale);
    add_kv(LLM_KV_EMBEDDING_SCALE,                   hparams.f_embedding_scale);
    add_kv(LLM_KV_ATTENTION_HEAD_COUNT,              hparams.n_head_arr, true);
    add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV,           hparams.n_head_kv_arr, true);
    add_kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS,          hparams.f_max_alibi_bias);
    add_kv(LLM_KV_ATTENTION_CLAMP_KQV,               hparams.f_clamp_kqv);
    add_kv(LLM_KV_ATTENTION_KEY_LENGTH,              hparams.n_embd_head_k);
    add_kv(LLM_KV_ATTENTION_VALUE_LENGTH,            hparams.n_embd_head_v);
    add_kv(LLM_KV_ATTENTION_LAYERNORM_EPS,           hparams.f_norm_eps);
    add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
    add_kv(LLM_KV_ATTENTION_CAUSAL,                  hparams.causal_attn);
    add_kv(LLM_KV_ATTENTION_Q_LORA_RANK,             hparams.n_lora_q);
    add_kv(LLM_KV_ATTENTION_KV_LORA_RANK,            hparams.n_lora_kv);
    add_kv(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,  hparams.n_rel_attn_bkts);
    add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW,          hparams.n_swa);
    add_kv(LLM_KV_ATTENTION_SCALE,                   hparams.f_attention_scale);
    const float rope_scaling_factor = hparams.rope_freq_scale_train == 1.0f ? 0.0f : 1.0f/hparams.rope_freq_scale_train;
    add_kv(LLM_KV_ROPE_DIMENSION_COUNT,              hparams.n_rot);
    add_kv(LLM_KV_ROPE_FREQ_BASE,                    hparams.rope_freq_base_train);
    // add_kv(LLM_KV_ROPE_SCALE_LINEAR,                 rope_scaling_factor); // old name
    add_kv(LLM_KV_ROPE_SCALING_TYPE,                 llama_rope_scaling_type_name(hparams.rope_scaling_type_train));
    add_kv(LLM_KV_ROPE_SCALING_FACTOR,               rope_scaling_factor);
    add_kv(LLM_KV_ROPE_SCALING_ATTN_FACTOR,          hparams.rope_attn_factor);
    add_kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,         hparams.n_ctx_orig_yarn);
    add_kv(LLM_KV_ROPE_SCALING_FINETUNED,            hparams.rope_finetuned);
    add_kv(LLM_KV_ROPE_SCALING_YARN_LOG_MUL,         hparams.rope_yarn_log_mul);
    // TODO: implement split file support
    // add_kv(LLM_KV_SPLIT_NO,                          ???);
    // add_kv(LLM_KV_SPLIT_COUNT,                       ???);
    // add_kv(LLM_KV_SPLIT_TENSORS_COUNT,               ???);
    add_kv(LLM_KV_SSM_INNER_SIZE,                    hparams.ssm_d_inner);
    add_kv(LLM_KV_SSM_CONV_KERNEL,                   hparams.ssm_d_conv);
    add_kv(LLM_KV_SSM_STATE_SIZE,                    hparams.ssm_d_state);
    add_kv(LLM_KV_SSM_TIME_STEP_RANK,                hparams.ssm_dt_rank);
    add_kv(LLM_KV_SSM_DT_B_C_RMS,                    hparams.ssm_dt_b_c_rms);
    add_kv(LLM_KV_WKV_HEAD_SIZE,                     hparams.wkv_head_size);
    add_kv(LLM_KV_TOKENIZER_MODEL,                   vocab.get_tokenizer_model());
    add_kv(LLM_KV_TOKENIZER_PRE,                     vocab.get_tokenizer_pre());
    add_kv(LLM_KV_TOKENIZER_LIST,                    tokens);
    add_kv(LLM_KV_TOKENIZER_TOKEN_TYPE,              token_types);
    add_kv(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,        vocab.n_token_types());
    add_kv(LLM_KV_TOKENIZER_SCORES,                  scores);
    add_kv(LLM_KV_TOKENIZER_MERGES,                  vocab.get_bpe_merges());
    // FIXME llama_token is type i32 but when reading in a GGUF file u32 is expected, not an issue for writing though
    add_kv(LLM_KV_TOKENIZER_BOS_ID,                  uint32_t(vocab.token_bos()));
    add_kv(LLM_KV_TOKENIZER_EOS_ID,                  uint32_t(vocab.token_eos()));
    add_kv(LLM_KV_TOKENIZER_EOT_ID,                  uint32_t(vocab.token_eot()));
    add_kv(LLM_KV_TOKENIZER_EOM_ID,                  uint32_t(vocab.token_eom()));
    add_kv(LLM_KV_TOKENIZER_UNK_ID,                  uint32_t(vocab.token_unk()));
    add_kv(LLM_KV_TOKENIZER_SEP_ID,                  uint32_t(vocab.token_sep()));
    add_kv(LLM_KV_TOKENIZER_PAD_ID,                  uint32_t(vocab.token_pad()));
    // add_kv(LLM_KV_TOKENIZER_CLS_ID,                  uint32_t(vocab.token_bos())); // deprecated
    // add_kv(LLM_KV_TOKENIZER_MASK_ID,                 ???);
    add_kv(LLM_KV_TOKENIZER_ADD_BOS,                 vocab.get_add_bos());
    add_kv(LLM_KV_TOKENIZER_ADD_EOS,                 vocab.get_add_eos());
    add_kv(LLM_KV_TOKENIZER_ADD_PREFIX,              vocab.get_add_space_prefix());
    add_kv(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS,         vocab.get_remove_extra_whitespaces());
    add_kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP,    vocab.get_precompiled_charsmap());
    // add_kv(LLM_KV_TOKENIZER_HF_JSON,                 ???);
    // add_kv(LLM_KV_TOKENIZER_RWKV,                    ???);
    add_kv(LLM_KV_TOKENIZER_FIM_PRE_ID,              uint32_t(vocab.token_fim_pre()));
    add_kv(LLM_KV_TOKENIZER_FIM_SUF_ID,              uint32_t(vocab.token_fim_suf()));
    add_kv(LLM_KV_TOKENIZER_FIM_MID_ID,              uint32_t(vocab.token_fim_mid()));
    add_kv(LLM_KV_TOKENIZER_FIM_PAD_ID,              uint32_t(vocab.token_fim_pad()));
    add_kv(LLM_KV_TOKENIZER_FIM_REP_ID,              uint32_t(vocab.token_fim_rep()));
    add_kv(LLM_KV_TOKENIZER_FIM_SEP_ID,              uint32_t(vocab.token_fim_sep()));
    // TODO: implement LoRA support
    // add_kv(LLM_KV_ADAPTER_TYPE,                      ???);
    // add_kv(LLM_KV_ADAPTER_LORA_ALPHA,                ???);
    // deprecated
    // add_kv(LLM_KV_TOKENIZER_PREFIX_ID,               ???);
    // add_kv(LLM_KV_TOKENIZER_SUFFIX_ID,               ???);
    // add_kv(LLM_KV_TOKENIZER_MIDDLE_ID,               ???);
 }
 // Registers every tensor of the bound model in the GGUF context: the model-level
 // tensors first, then all tensors of each layer. Null tensors are skipped by
 // add_tensor().
 void llama_model_saver::add_tensors_from_model() {
    // Some models use the same tensor for tok_embd and output; in that case tok_embd
    // is not added here and the tensor is written once via model.output below.
    // Either pointer may be unset, so the names are only compared when both tensors
    // exist (previously a missing tensor caused a null dereference).
    const bool output_aliases_tok_embd =
        model.output != nullptr && model.tok_embd != nullptr &&
        std::string(model.output->name) == std::string(model.tok_embd->name);
    if (!output_aliases_tok_embd) {
        add_tensor(model.tok_embd); // add_tensor() ignores nullptr
    }
    add_tensor(model.type_embd);
    add_tensor(model.pos_embd);
    add_tensor(model.tok_norm);
    add_tensor(model.tok_norm_b);
    add_tensor(model.output_norm);
    add_tensor(model.output_norm_b);
    add_tensor(model.output);
    add_tensor(model.output_b);
    add_tensor(model.output_norm_enc);
    add_tensor(model.cls);
    add_tensor(model.cls_b);
    add_tensor(model.cls_out);
    add_tensor(model.cls_out_b);
    for (const struct llama_layer & layer : model.layers) {
        // NOTE(review): walks the layer struct as a flat array of tensor pointers; this
        // assumes llama_layer contains nothing but `ggml_tensor *` members - confirm
        // whenever a non-pointer field is added to llama_layer
        for (size_t i = 0; i < sizeof(layer)/sizeof(struct ggml_tensor *); ++i) {
            add_tensor(reinterpret_cast<const struct ggml_tensor * const *>(&layer)[i]);
        }
    }
 }
 // Writes the accumulated GGUF context (metadata and tensor data) to `path_model`.
 // NOTE(review): the result of gguf_write_to_file is not inspected here - confirm
 // whether write failures should be reported to the caller.
 void llama_model_saver::save(const std::string & path_model) {
    const char * fname     = path_model.c_str();
    const bool   only_meta = false; // write tensor data too, not just the header
    gguf_write_to_file(gguf_ctx, fname, only_meta);
 }
--- a/llama/llama.cpp/src/llama-model-saver.h
+++ b/llama/llama.cpp/src/llama-model-saver.h
@ -0,0 +1,37 @@
 #pragma once
 #include "llama.h"
 #include "llama-arch.h"
 #include <vector>
 // Helper that serializes a llama_model into a GGUF file.
 // Typical use: construct with a model, call add_kv_from_model() and
 // add_tensors_from_model(), then save().
 // NOTE(review): the struct owns gguf_ctx through a raw pointer and frees it in the
 // destructor, but copying is not disabled - a copy would free the context twice.
 struct llama_model_saver {
    // GGUF context being filled; created in the constructor, freed in the destructor
    struct gguf_context * gguf_ctx = nullptr;
    // model to serialize; held by reference, so it must outlive the saver
    const struct llama_model & model;
    // maps llm_kv enum values to GGUF key names for the model's architecture
    const struct LLM_KV llm_kv;
    llama_model_saver(const struct llama_model & model);
    ~llama_model_saver();
    // scalar metadata values, one overload per GGUF value type
    void add_kv(enum llm_kv key, uint32_t     value);
    void add_kv(enum llm_kv key, int32_t      value);
    void add_kv(enum llm_kv key, float        value);
    void add_kv(enum llm_kv key, bool         value);
    void add_kv(enum llm_kv key, const char * value);
    [[noreturn]]
    void add_kv(enum llm_kv key, char value); // needed to make the template below compile
    // container metadata values (arrays or std::string); with per_layer set, only the first
    // n_layer elements are used and a uniform array is stored as a single scalar
    template <typename Container>
    void add_kv(enum llm_kv key, const Container & value, bool per_layer = false);
    // string array metadata value
    void add_kv(enum llm_kv key, const std::vector<std::string> & value);
    // registers one tensor; null tensors are ignored
    void add_tensor(const struct ggml_tensor * tensor);
    // fills in all metadata / all tensors from the bound model
    void add_kv_from_model();
    void add_tensors_from_model();
    // writes the GGUF context to the given file path
    void save(const std::string & path_model);
 };
--- a/llama/llama.cpp/src/llama-model.cpp
+++ b/llama/llama.cpp/src/llama-model.cpp
@ -40,6 +40,7 @@ const char * llm_type_name(llm_type type) {
        case LLM_TYPE_335M:          return "335M";
        case LLM_TYPE_410M:          return "410M";
        case LLM_TYPE_450M:          return "450M";
        case LLM_TYPE_475M:          return "475M";
        case LLM_TYPE_770M:          return "770M";
        case LLM_TYPE_780M:          return "780M";
        case LLM_TYPE_0_5B:          return "0.5B";
@ -79,6 +80,7 @@ const char * llm_type_name(llm_type type) {
        case LLM_TYPE_236B:          return "236B";
        case LLM_TYPE_290B:          return "290B";
        case LLM_TYPE_314B:          return "314B";
        case LLM_TYPE_405B:          return "405B";
        case LLM_TYPE_671B:          return "671B";
        case LLM_TYPE_SMALL:         return "0.1B";
        case LLM_TYPE_MEDIUM:        return "0.4B";
@ -115,6 +117,10 @@ static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_
    { LLAMA_ROPE_SCALING_TYPE_LONGROPE,   "longrope"   },
 };
 // Returns the textual name of a RoPE scaling type, looked up in
 // LLAMA_ROPE_SCALING_TYPES. The lookup uses map::at, so a type missing from the
 // table raises std::out_of_range.
 std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type) {
    const char * type_name = LLAMA_ROPE_SCALING_TYPES.at(rope_scaling_type);
    return std::string(type_name);
 }
 static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
    for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
        if (kv.second == name) {
@ -297,6 +303,10 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
    // add extra buffer types, only if no GPU device is present
    // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
    auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    if (cpu_dev == nullptr) {
        throw std::runtime_error(format("%s: no CPU backend found", __func__));
    }
    auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
    auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
        ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
@ -423,7 +433,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
    // get general kv
    ml.get_key(LLM_KV_GENERAL_NAME, name, false);
    ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false);
    // everything past this point is not vocab-related
    if (hparams.vocab_only) {
@ -435,7 +444,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_BLOCK_COUNT,       hparams.n_layer);
    ml.get_key(LLM_KV_EXPERT_COUNT,      hparams.n_expert,      false);
    ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
    ml.get_key(LLM_KV_VOCAB_SIZE,        hparams.n_vocab,       false);
    if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
        ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
@ -459,11 +467,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
    std::fill(hparams.n_head_arr.begin(),    hparams.n_head_arr.end(),    0);
    std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
    std::fill(hparams.n_ff_arr.begin(),      hparams.n_ff_arr.end(),      0);
    std::fill(hparams.cross_attn_layers.begin(), hparams.cross_attn_layers.end(), -1);
    ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH,  hparams.n_ff_arr,   hparams.n_layer, false);
    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
    ml.get_arr(LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, hparams.cross_attn_layers, false);
    // n_head_kv is optional, default to n_head
    hparams.n_head_kv_arr = hparams.n_head_arr;
@ -516,7 +522,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
        ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
-        if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_MLLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
+        if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
            if (hparams.n_rot != hparams.n_embd_head_k) {
                throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
            }
@ -579,22 +585,13 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                    hparams.use_kq_norm = false;
                }
            } break;
        case LLM_ARCH_MLLAMA:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 40: type = LLM_TYPE_11B; break;
                    case 100: type = LLM_TYPE_90B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_DECI:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 32: type = LLM_TYPE_7B; break;
                    case 80: type = LLM_TYPE_70B; break;
                    case 162: type = LLM_TYPE_405B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
@ -721,7 +718,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS,         hparams.moe_every_n_layers, 0);
                if (hparams.n_layer == 12 && hparams.n_embd == 768) {
-                    type = LLM_TYPE_137M;
+                    if (arch == LLM_ARCH_NOMIC_BERT) {
                        type = LLM_TYPE_137M;
                    } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) {
                        type = LLM_TYPE_475M;
                    }
                }
            } break;
        case LLM_ARCH_BLOOM:
@ -782,6 +783,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
            // fall through
        case LLM_ARCH_QWEN2:
            {
                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break;
@ -1505,6 +1507,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
    }
    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    if (cpu_dev == nullptr) {
        throw std::runtime_error(format("%s: no CPU backend found", __func__));
    }
    const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
    const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
    auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
@ -1576,7 +1581,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
        const int64_t n_embd_head_v = hparams.n_embd_head_v;
        const int64_t n_ff          = hparams.n_ff();
        const int64_t n_embd_gqa    = n_embd_v_gqa;
-        const int64_t n_vocab       = hparams.n_vocab;
+        const int64_t n_vocab       = vocab.n_tokens();
        const int64_t n_token_types = vocab.n_token_types();
        const int64_t n_rot         = hparams.n_rot;
        const int64_t n_expert      = hparams.n_expert;
@ -1672,8 +1677,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
                    std::regex pattern(overrides->pattern);
                    if (std::regex_search(tensor_name, pattern)) {
                        LLAMA_LOG_DEBUG("tensor %s buffer type overriden to %s\n", tensor_name.c_str(), ggml_backend_buft_name(overrides->buft));
                        buft = overrides->buft;
                        LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
                                tensor_name.c_str(),
                                ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
                                ggml_backend_buft_name(buft));
                        break;
                    }
                }
@ -1690,6 +1698,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
            auto * buft_dev = ggml_backend_buft_get_device(buft);
            if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
                auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
                if (!cpu_dev) {
                    throw std::runtime_error("no CPU backend found");
                }
                buft = ggml_backend_dev_buffer_type(cpu_dev);
            }
@ -1829,52 +1840,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        }
                    }
                } break;
            case LLM_ARCH_MLLAMA:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab+8}, 0);
                    // output
                    {
                        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                        output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
                        // if output is NULL, init from the input tok embed
                        if (output == NULL) {
                            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                        }
                    }
                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];
                        if (hparams.cross_attention_layers(i)) {
                            layer.cross_attn_k_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_NORM,   "weight", i), {128}, 0);
                            layer.cross_attn_k_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_PROJ,   "weight", i), {n_embd, 1024}, 0);
                            layer.cross_attn_o_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_O_PROJ,   "weight", i), {n_embd, n_embd}, 0);
                            layer.cross_attn_q_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_NORM, "weight", i), {128}, 0);
                            layer.cross_attn_q_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_PROJ, "weight", i), {n_embd, n_embd}, 0);
                            layer.cross_attn_v_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_V_PROJ, "weight", i), {n_embd, 1024}, 0);
                            layer.cross_attn_attn_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_ATTN_GATE, i), {1}, 0);
                            layer.cross_attn_mlp_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_MLP_GATE, i), {1}, 0);
                            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
                            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                        } else {
                            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
                            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
                        }
                    }
                } break;
            case LLM_ARCH_DECI:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@ -1917,7 +1882,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        if (n_ff > 0) {
                            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                        }
                        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
                            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
@ -1927,9 +1894,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                        }
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                        if (n_ff > 0) {
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
                        }
                        // optional MLP bias
                        layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
@ -3573,7 +3542,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                    // output
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
                    // if output is NULL, init from the input tok embed
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }
                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];
@ -4206,6 +4179,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
        if (!dev) {
            // FIXME: workaround for CPU backend buft having a NULL device
            dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
            if (!dev) {
                throw std::runtime_error(format("%s: no CPU backend found", __func__));
            }
        }
        ggml_backend_dev_props props;
        ggml_backend_dev_get_props(dev, &props);
@ -4335,7 +4311,7 @@ uint64_t llama_model::n_elements() const {
 }
 void llama_model::print_info() const {
-    const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
+    const std::string rope_scaling_type = llama_rope_scaling_type_name(hparams.rope_scaling_type_train);
    auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
        bool is_var = false;
@ -4396,7 +4372,7 @@ void llama_model::print_info() const {
        LLAMA_LOG_INFO("%s: causal attn      = %d\n",     __func__, hparams.causal_attn);
        LLAMA_LOG_INFO("%s: pooling type     = %d\n",     __func__, hparams.pooling_type);
        LLAMA_LOG_INFO("%s: rope type        = %d\n",     __func__, hparams.rope_type);
-        LLAMA_LOG_INFO("%s: rope scaling     = %s\n",     __func__, rope_scaling_type);
+        LLAMA_LOG_INFO("%s: rope scaling     = %s\n",     __func__, rope_scaling_type.c_str());
        LLAMA_LOG_INFO("%s: freq_base_train  = %.1f\n",   __func__, hparams.rope_freq_base_train);
        LLAMA_LOG_INFO("%s: freq_scale_train = %g\n",     __func__, hparams.rope_freq_scale_train);
        LLAMA_LOG_INFO("%s: n_ctx_orig_yarn  = %u\n",     __func__, hparams.n_ctx_orig_yarn);
@ -4543,6 +4519,19 @@ const ggml_tensor * llama_model::get_tensor(const char * name) const {
    return it->second;
 }
 // Selects the RoPE frequency-factor tensor for layer `il`.
 // A per-layer `rope_freqs` tensor wins when present; otherwise the long or short
 // variant is returned depending on whether `n_ctx_per_seq` exceeds
 // `hparams.n_ctx_orig_yarn`. The returned pointer is whatever the layer holds.
 ggml_tensor * llama_model::get_rope_factors(uint32_t n_ctx_per_seq, int il) const {
    const auto & layer = layers[il];
    // a dedicated tensor overrides the long/short pair
    if (layer.rope_freqs != nullptr) {
        return layer.rope_freqs;
    }
    // contexts larger than the recorded original size use the long factors
    const bool use_long = n_ctx_per_seq > hparams.n_ctx_orig_yarn;
    return use_long ? layer.rope_long : layer.rope_short;
 }
 struct llm_build_llama : public llm_graph_context {
    llm_build_llama(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
        const int64_t n_embd_head = hparams.n_embd_head_v;
@ -4583,7 +4572,7 @@ struct llm_build_llama : public llm_graph_context {
            // self-attention
            {
                // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
                // compute Q and K and RoPE them
                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@ -4767,246 +4756,6 @@ struct llm_build_llama : public llm_graph_context {
    }
 };
 // Graph builder for the mllama architecture.
 // For every layer the input is RMS-normalised and then routed either through a
 // cross-attention branch (when hparams.cross_attention_layers(il) is true) or a
 // RoPE self-attention branch. Both branches finish with a SiLU gated FFN and a
 // residual add. After the last layer the result is normalised (res->t_embd) and
 // projected through model.output (res->t_logits).
 struct llm_build_mllama: public llm_graph_context {
    llm_build_mllama(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
        // mutable variable, needed during the last layer of the computation to skip unused tokens
        int32_t n_tokens = this->n_tokens;
        const int64_t n_embd_head = hparams.n_embd_head_v;
        // this builder requires K heads, V heads and the rotary dimension to share one size
        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
        GGML_ASSERT(n_embd_head == hparams.n_rot);
        ggml_tensor * cur;
        ggml_tensor * inpL;
        ggml_tensor * inpCAS;
        inpL = build_inp_embd(model.tok_embd);
        // input tensor consumed only by the cross-attention K/V projections below
        inpCAS = build_inp_cross_attn_state();
          // inp_pos - contains the positions
        ggml_tensor * inp_pos = build_inp_pos();
        auto * inp_attn = build_attn_inp_kv_unified();
        // direct access to the per-layer K/V tensors (k_l / v_l) used by the cross-attention branch
        const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
        for (int il = 0; il < n_layer; ++il) {
            ggml_tensor * inpSA = inpL;
            // norm
            cur = build_norm(inpL,
                    model.layers[il].attn_norm, NULL,
                    LLM_NORM_RMS, il);
            cb(cur, "attn_norm", il);
            if (hparams.cross_attention_layers(il)) {
                // neither ubatch.embd nor cparams.cross_attn is set: skip this layer entirely,
                // leaving inpL untouched for the next iteration
                if (!ubatch.embd && !cparams.cross_attn) {
                    continue;
                }
                // cross attention layer
                ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_q_proj, cur);
                cb(Qcur, "Qcur", il);
                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
                cb(Qcur, "Qcur", il);
                Qcur = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 0, 2, 1, 3));
                cb(Qcur, "Qcur", il);
                Qcur = build_norm(Qcur, model.layers[il].cross_attn_q_norm, NULL, LLM_NORM_RMS, il);
                cb(Qcur, "Qcur", il);
                ggml_tensor * Kcur, * Vcur;
                if (ubatch.embd) {
                    // K/V are projected from inpCAS and copied into kv_self for this layer
                    // NOTE(review): 6404 is a hard-coded third dimension for the reshaped K/V;
                    // presumably the number of cross-attention state positions — confirm
                    Kcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_k_proj, inpCAS);
                    cb(Kcur, "Kcur", il);
                    Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, 6404);
                    cb(Kcur, "Kcur", il);
                    Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
                    cb(Kcur, "Kcur", il);
                    Kcur = build_norm(Kcur, model.layers[il].cross_attn_k_norm, NULL, LLM_NORM_RMS, il);
                    cb(Kcur, "Kcur", il);
                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, kv_self->k_l[il]));
                    Vcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_v_proj, inpCAS);
                    cb(Vcur, "Vcur", il);
                    Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, 6404);
                    cb(Vcur, "Vcur", il);
                    Vcur = ggml_permute(ctx0, Vcur, 0, 2, 1, 3);
                    cb(Vcur, "Vcur", il);
                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, kv_self->v_l[il]));
                } else {
                    // no ubatch.embd: use views of the K/V tensors held in kv_self for this layer
                    Kcur = ggml_view_tensor(ctx0, kv_self->k_l[il]);
                    cb(Kcur, "Kcur (view)", il);
                    Vcur = ggml_view_tensor(ctx0, kv_self->v_l[il]);
                    cb(Vcur, "Vcur (view)", il);
                }
                struct ggml_tensor * kq = ggml_mul_mat(ctx0, Kcur, Qcur);
                cb(kq, "kq", il);
                // TODO: apply causal masks
                // softmax is called without a mask (nullptr), scaled by 1/sqrt(n_embd_head)
                struct ggml_tensor * kq_soft_max = ggml_soft_max_ext(ctx0, kq, nullptr, 1.f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
                cb(kq_soft_max, "kq_soft_max", il);
                Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, Vcur));
                cb(Vcur, "Vcur", il);
                struct ggml_tensor * kqv = ggml_mul_mat(ctx0, Vcur, kq_soft_max);
                cb(kqv, "kqv", il);
                struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
                cb(kqv_merged, "kqv_merged", il);
                cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens);
                cb(cur, "kqv_merged_cont", il);
                cur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_o_proj, cur);
                cb(cur, "cur", il);
                // TODO: do this in place once?
                // attention output is multiplied by tanh(cross_attn_attn_gate) before the residual add
                cur = ggml_mul(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_attn_gate));
                struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
                cb(ffn_inp, "ffn_inp", il);
                // feed-forward network
                cur = build_norm(ffn_inp,
                        model.layers[il].ffn_norm, NULL,
                        LLM_NORM_RMS, il);
                cb(cur, "ffn_norm", il);
                cur = build_ffn(cur,
                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                        NULL,
                        LLM_FFN_SILU, LLM_FFN_PAR, il);
                cb(cur, "ffn_out", il);
                // TODO: do this inplace once?
                // FFN output is multiplied by tanh(cross_attn_mlp_gate), then added to ffn_inp
                cur = ggml_add_inplace(ctx0, ggml_mul_inplace(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_mlp_gate)), ffn_inp);
                cb(cur, "ffn_out", il);
                cur = build_cvec(cur, il);
                cb(cur, "l_out", il);
                // input for next layer
                inpL = cur;
            } else {
                // self attention layer
                // rope freq factors for llama3; may return nullptr for llama2 and other models
                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
                // compute Q and K and RoPE them
                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
                cb(Qcur, "Qcur", il);
                // projection biases are optional; each is added only when the tensor is non-null
                if (model.layers[il].bq) {
                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
                    cb(Qcur, "Qcur", il);
                }
                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
                cb(Kcur, "Kcur", il);
                if (model.layers[il].bk) {
                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
                    cb(Kcur, "Kcur", il);
                }
                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
                cb(Vcur, "Vcur", il);
                if (model.layers[il].bv) {
                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
                    cb(Vcur, "Vcur", il);
                }
                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
                // RoPE is applied to Q and K only; V is left as projected
                Qcur = ggml_rope_ext(
                        ctx0, Qcur, inp_pos, rope_factors,
                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow
                        );
                Kcur = ggml_rope_ext(
                        ctx0, Kcur, inp_pos, rope_factors,
                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow
                        );
                cb(Qcur, "Qcur", il);
                cb(Kcur, "Kcur", il);
                cb(Vcur, "Vcur", il);
                cur = build_attn(inp_attn, gf,
                    model.layers[il].wo, model.layers[il].bo,
                    Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
                // note: this row selection exists only in the self-attention branch
                if (il == n_layer - 1) {
                    // skip computing output for unused tokens
                    struct ggml_tensor * inp_out_ids = build_inp_out_ids();
                    n_tokens = n_outputs;
                    cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
                    inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
                }
                struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
                cb(ffn_inp, "ffn_inp", il);
                // feed-forward network
                cur = build_norm(ffn_inp,
                        model.layers[il].ffn_norm, NULL,
                        LLM_NORM_RMS, il);
                cb(cur, "ffn_norm", il);
                cur = build_ffn(cur,
                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                        NULL,
                        LLM_FFN_SILU, LLM_FFN_PAR, il);
                cb(cur, "ffn_out", il);
                cur = ggml_add(ctx0, cur, ffn_inp);
                cb(cur, "ffn_out", il);
                cur = build_cvec(cur, il);
                cb(cur, "l_out", il);
                // input for next layer
                inpL = cur;
            }
        }
        cur = inpL;
        // final RMS norm; its output is exposed as the embedding result
        cur = build_norm(cur,
                model.output_norm, NULL,
                LLM_NORM_RMS, -1);
        cb(cur, "result_norm", -1);
        res->t_embd = cur;
        // lm_head
        cur = build_lora_mm(model.output, cur);
        cb(cur, "result_output", -1);
        res->t_logits = cur;
        ggml_build_forward_expand(gf, cur);
    }
 };
 struct llm_build_deci : public llm_graph_context {
    llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
        const int64_t n_embd_head = hparams.n_embd_head_v;
@ -5029,6 +4778,7 @@ struct llm_build_deci : public llm_graph_context {
            ggml_tensor * inpSA = inpL;
            const int64_t n_head_kv = hparams.n_head_kv(il);
            const int64_t n_head    = hparams.n_head(il);
            const int64_t n_ff      = hparams.n_ff(il);
            if (n_head == 0) {
                // attention-free layer of Llama-3_1-Nemotron-51B
@ -5048,7 +4798,7 @@ struct llm_build_deci : public llm_graph_context {
            } else if (n_head > 0) {
                // self-attention
                // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
                // compute Q and K and RoPE them
                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@ -5104,6 +4854,11 @@ struct llm_build_deci : public llm_graph_context {
                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
            }
            // FFN-free layer of Llama-3_1-Nemotron-Ultra-253B
            if (n_ff == 0) {
                continue;
            }
            // For Granite architecture
            if (hparams.f_residual_scale) {
                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
@ -7530,7 +7285,7 @@ struct llm_build_phi3 : public llm_graph_context {
            // self-attention
            {
                // rope freq factors for 128k context
-                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
                ggml_tensor* attn_norm_output = build_norm(inpL,
                        model.layers[il].attn_norm,
@ -8282,7 +8037,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
        for (int il = 0; il < n_layer; ++il) {
            ggml_tensor * inpSA = inpL;
-            ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+            ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
            // norm
            cur = build_norm(inpL,
@ -9049,7 +8804,7 @@ struct llm_build_mamba : public llm_graph_context {
             ggml_tensor * state_mask,
      const llama_ubatch & ubatch,
                     int   il) const {
-        const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+        const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
        const auto kv_head = kv_self->head;
@ -9350,7 +9105,7 @@ struct llm_build_cohere2 : public llm_graph_context {
            // self-attention
            {
                // rope freq factors for 128k context
-                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
                // compute Q and K and RoPE them
                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@ -10288,7 +10043,7 @@ struct llm_build_deepseek : public llm_graph_context {
            // self-attention
            {
                // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
                // compute Q and K and RoPE them
                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@ -11652,7 +11407,7 @@ struct llm_build_exaone : public llm_graph_context {
            // self-attention
            {
                // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
                // compute Q and K and RoPE them
                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@ -11797,7 +11552,7 @@ struct llm_build_rwkv6_base : public llm_graph_context {
            ggml_tensor * state_mask,
            const llama_ubatch & ubatch,
            int   il) const {
-        const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+        const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
        const auto n_tokens = ubatch.n_tokens;
        const auto n_seqs = ubatch.n_seqs;
@ -12193,7 +11948,7 @@ struct llm_build_rwkv7_base : public llm_graph_context {
            ggml_tensor *& first_layer_value,
            const llama_ubatch & ubatch,
            int   il) const {
-        const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+        const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
        const auto n_tokens = ubatch.n_tokens;
        const auto n_seqs = ubatch.n_seqs;
@ -12741,7 +12496,7 @@ struct llm_build_solar : public llm_graph_context {
            // self-attention
            {
                // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
                // compute Q and K and RoPE them
                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@ -13192,7 +12947,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
            // self-attention
            {
                // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
                // compute Q and K and RoPE them
                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@ -13312,36 +13067,46 @@ struct llm_build_bailingmoe : public llm_graph_context {
    }
 };
-llama_memory_i * llama_model::create_memory() const {
+llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
    llama_memory_i * res;
    switch (arch) {
        case LLM_ARCH_BERT:
        case LLM_ARCH_JINA_BERT_V2:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_NOMIC_BERT_MOE:
            {
                res = nullptr;
            } break;
        case LLM_ARCH_MAMBA:
        case LLM_ARCH_RWKV6:
        case LLM_ARCH_RWKV6QWEN2:
        case LLM_ARCH_RWKV7:
        case LLM_ARCH_ARWKV7:
            {
-                res = new llama_kv_cache_unified(hparams, {
+                res = new llama_kv_cache_recurrent(
-                    /*.get_rope_factors =*/ nullptr
+                        *this,
-                });
+                        GGML_TYPE_F32,
                        GGML_TYPE_F32,
                        cparams.offload_kqv,
                        std::max((uint32_t) 1, cparams.n_seq_max));
            } break;
        default:
            {
-                res = new llama_kv_cache_unified(hparams, {
+                const auto padding = llama_kv_cache_unified::get_padding(cparams);
                    /*.get_rope_factors =*/ [this](uint32_t n_ctx_per_seq, int il) {
                        // choose long/short freq factors based on the context size
                        if (layers[il].rope_freqs != nullptr) {
                            return layers[il].rope_freqs;
                        }
-                        if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
+                cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
                            return layers[il].rope_long;
                        }
-                        return layers[il].rope_short;
+                LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
-                    }
+
-                });
+                res = new llama_kv_cache_unified(
                        *this,
                        params.type_k,
                        params.type_v,
                        !cparams.flash_attn,
                        cparams.offload_kqv,
                        cparams.n_ctx,
                        padding);
            }
    }
@ -13363,10 +13128,6 @@ llm_graph_result_ptr llama_model::build_graph(
            {
                llm = std::make_unique<llm_build_llama>(*this, params, gf);
            } break;
        case LLM_ARCH_MLLAMA:
            {
                llm = std::make_unique<llm_build_mllama>(*this, params, gf);
            } break;
        case LLM_ARCH_DECI:
            {
                llm = std::make_unique<llm_build_deci>(*this, params, gf);
@ -13728,12 +13489,9 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        // use what we call a normal RoPE, operating on pairs of consecutive head values
        case LLM_ARCH_LLAMA:
        case LLM_ARCH_LLAMA4:
        case LLM_ARCH_MLLAMA:
        case LLM_ARCH_DECI:
        case LLM_ARCH_BAICHUAN:
        case LLM_ARCH_STARCODER:
        case LLM_ARCH_PLAMO:
        case LLM_ARCH_ORION:
        case LLM_ARCH_INTERNLM2:
        case LLM_ARCH_MINICPM:
        case LLM_ARCH_XVERSE:
@ -13772,6 +13530,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_PHI2:
        case LLM_ARCH_PHI3:
        case LLM_ARCH_PHIMOE:
        case LLM_ARCH_PLAMO:
        case LLM_ARCH_GEMMA:
        case LLM_ARCH_GEMMA2:
        case LLM_ARCH_GEMMA3:
@ -13779,6 +13538,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_OPENELM:
        case LLM_ARCH_GPTNEOX:
        case LLM_ARCH_CODESHELL:
        case LLM_ARCH_ORION:
        case LLM_ARCH_NEMOTRON:
        case LLM_ARCH_EXAONE:
        case LLM_ARCH_MINICPM3:
@ -13851,6 +13611,14 @@ const char * llama_model_chat_template(const llama_model * model, const char * n
        : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
    const auto & it = model->gguf_kv.find(key);
    if (it == model->gguf_kv.end()) {
        // one-off fix for very popular models (so we are not flooded with issues)
        // do not extend this list unless absolutely necessary
        // Mistral-Small-2503 does not have built-in chat template
        llama_vocab_pre_type pre_type = model->vocab.get_pre_type();
        if (pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
            return "mistral-v7-tekken";
        }
        return nullptr;
    }
--- a/llama/llama.cpp/src/llama-model.h
+++ b/llama/llama.cpp/src/llama-model.h
@ -11,7 +11,6 @@
 #include <string>
 #include <unordered_map>
 #include <vector>
 #include <stdexcept>
 struct llama_cparams;
 struct llama_ubatch;
@ -37,6 +36,7 @@ enum llm_type {
    LLM_TYPE_335M,
    LLM_TYPE_410M,
    LLM_TYPE_450M,
    LLM_TYPE_475M,
    LLM_TYPE_770M,
    LLM_TYPE_780M,
    LLM_TYPE_0_5B,
@ -74,10 +74,10 @@ enum llm_type {
    LLM_TYPE_40B,
    LLM_TYPE_65B,
    LLM_TYPE_70B,
    LLM_TYPE_90B,
    LLM_TYPE_236B,
    LLM_TYPE_290B,
    LLM_TYPE_314B,
    LLM_TYPE_405B,
    LLM_TYPE_671B,
    LLM_TYPE_SMALL,
    LLM_TYPE_MEDIUM,
@ -97,6 +97,8 @@ enum llm_type {
    LLM_TYPE_235B_A22B,
 };
 std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type);
 struct llama_layer_posnet {
    // resnet
    struct ggml_tensor * norm1   = nullptr;
@ -316,16 +318,6 @@ struct llama_layer {
    struct ggml_tensor * bskcn_tv = nullptr;
    // cross attention
    struct ggml_tensor * cross_attn_k_norm = nullptr;
    struct ggml_tensor * cross_attn_k_proj = nullptr;
    struct ggml_tensor * cross_attn_o_proj = nullptr;
    struct ggml_tensor * cross_attn_q_norm = nullptr;
    struct ggml_tensor * cross_attn_q_proj = nullptr;
    struct ggml_tensor * cross_attn_v_proj = nullptr;
    struct ggml_tensor * cross_attn_attn_gate = nullptr;
    struct ggml_tensor * cross_attn_mlp_gate = nullptr;
    struct llama_layer_posnet posnet;
    struct llama_layer_convnext convnext;
@ -409,8 +401,11 @@ struct llama_model {
    const struct ggml_tensor * get_tensor(const char * name) const;
    ggml_tensor * get_rope_factors(uint32_t n_ctx_per_seq, int il) const;
    // note: can mutate `cparams`
    // TODO: move this to new llm_arch_model_i interface
-    llama_memory_i * create_memory() const; // TODO: params
+    llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const;
    // TODO: move this to new llm_arch_model_i interface
    llm_graph_result_ptr build_graph(
--- a/llama/llama.cpp/src/llama-quant.cpp
+++ b/llama/llama.cpp/src/llama-quant.cpp
@ -519,7 +519,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
        nthread = std::thread::hardware_concurrency();
    }
-    // mmap consistently increases speed Linux, and also increases speed on Windows with
+    // mmap consistently increases speed on Linux, and also increases speed on Windows with
    // hot cache. It may cause a slowdown on macOS, possibly related to free memory.
 #if defined(__linux__) || defined(_WIN32)
    constexpr bool use_mmap = true;
@ -529,7 +529,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
    llama_model_kv_override * kv_overrides = nullptr;
    if (params->kv_overrides) {
-        auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
+        auto * v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
        kv_overrides = v->data();
    }
@ -639,9 +639,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
        if (llama_model_has_encoder(&model)) {
            n_attn_layer *= 3;
        }
-        if (qs.n_attention_wv != n_attn_layer) {
+        GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
            LLAMA_LOG_WARN("%s: n_attention_wv is unexpected, expected: %d, found: %d\n", __func__, n_attn_layer, qs.n_attention_wv);
        }
    }
    size_t total_size_org = 0;
--- a/llama/llama.cpp/src/llama-sampling.cpp
+++ b/llama/llama.cpp/src/llama-sampling.cpp
@ -1750,23 +1750,35 @@ static const char * llama_sampler_top_n_sigma_name(const struct llama_sampler *
 static void llama_sampler_top_n_sigma_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
    const auto * ctx = (llama_sampler_top_n_sigma *) smpl->ctx;
    if (ctx->n <= 0.0f || cur_p->size <= 1) {
        return;
    }
    // find max logit and calculate mean
    float max = cur_p->data[0].logit;
    float logits_sum = 0;
    size_t valid_count = 0;
    for (size_t i = 0; i < cur_p->size; ++i) {
-        if (cur_p->data[i].logit > max) {
+        // Only count logits that are not -INFINITY
-            max = cur_p->data[i].logit;
+        if (cur_p->data[i].logit != -INFINITY) {
            if (cur_p->data[i].logit > max) {
                max = cur_p->data[i].logit;
            }
            logits_sum += cur_p->data[i].logit;
            valid_count++;
        }
        logits_sum += cur_p->data[i].logit;
    }
-    float mean = logits_sum/cur_p->size;
+    float mean = valid_count > 0 ? logits_sum/valid_count : 0;
    // calculate standard deviation
    float acc = 0;
    for (size_t i = 0; i < cur_p->size; ++i) {
-        acc += pow(cur_p->data[i].logit - mean, 2);
+        // Skip -infinity in std calculation
        if (cur_p->data[i].logit != -INFINITY) {
            acc += pow(cur_p->data[i].logit - mean, 2);
        }
    }
-    float std = sqrt(acc/cur_p->size);
+    float std = valid_count > 0 ? sqrt(acc/valid_count) : 0;
    //apply mask
    for (size_t i = 0; i < cur_p->size; ++i) {
--- a/llama/llama.cpp/src/llama-vocab.cpp
+++ b/llama/llama.cpp/src/llama-vocab.cpp
@ -1,5 +1,7 @@
 #include "llama-vocab.h"
 #include "ggml.h"
 #include "gguf.h"
 #include "llama-impl.h"
 #include "llama-model-loader.h"
@ -415,6 +417,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                    "'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_SEED_CODER:
                regex_exprs = {
                    // original regex from tokenizer.json
                    // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\r\n]+|\\s*[\r\n]+|\\s+(?!\\S)|\\s+"
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            default:
                // default regex for BPE tokenization pre-processing
                regex_exprs = {
@ -1227,6 +1236,9 @@ struct fragment_buffer_variant {
 struct llama_vocab::impl {
    uint32_t n_token_types = 0; // for BERT-style token types
    std::string tokenizer_model;
    std::string tokenizer_pre;
    enum llama_vocab_type     type     = LLAMA_VOCAB_TYPE_SPM;
    enum llama_vocab_pre_type pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
@ -1362,9 +1374,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
    // determine vocab type
    {
        std::string tokenizer_model;
        std::string tokenizer_pre;
        ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
        ml.get_key(LLM_KV_TOKENIZER_PRE,   tokenizer_pre, false);
@ -1459,7 +1468,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
            const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
            if (precompiled_charsmap_keyidx != -1) {
-                size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx);
+                const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);
                const size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx);
                const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
                precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
 #ifdef IS_BIG_ENDIAN
@ -1625,6 +1635,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                tokenizer_pre == "bailingmoe") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
                clean_spaces = false;
            } else if (
                tokenizer_pre == "seed-coder") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER;
                clean_spaces = false;
            } else {
                LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
                pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
@ -2770,6 +2784,14 @@ void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
    pimpl->load(ml, kv);
 }
 // Accessor for the tokenizer model name kept on the private implementation.
 // The string is returned by value, so the caller receives its own copy.
 std::string llama_vocab::get_tokenizer_model() const {
    const std::string & model_name = pimpl->tokenizer_model;
    return model_name;
 }
 // Accessor for the pre-tokenizer name kept on the private implementation.
 // The string is returned by value, so the caller receives its own copy.
 std::string llama_vocab::get_tokenizer_pre() const {
    const std::string & pre_name = pimpl->tokenizer_pre;
    return pre_name;
 }
 // Reports the vocabulary type currently stored in the implementation's `type` field.
 enum llama_vocab_type llama_vocab::get_type() const {
    const enum llama_vocab_type vocab_type = pimpl->type;
    return vocab_type;
 }
@ -2992,6 +3014,20 @@ int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string
    return it->second;
 }
 // Returns the BPE merges as "left right" strings, placed at the index given by
 // each merge's rank in `bpe_ranks`.
 //
 // The result starts out sized to the number of map entries. A rank is used
 // directly as an index, so a rank that is >= the entry count (e.g. if the ranks
 // are not a contiguous 0..n-1 range) would otherwise write past the end of the
 // vector. To keep the write in bounds the vector is grown on demand; slots that
 // no merge maps to stay as empty strings. Negative ranks cannot be an index and
 // are skipped.
 std::vector<std::string> llama_vocab::get_bpe_merges() const {
    std::vector<std::string> result(pimpl->bpe_ranks.size());
    for (const auto & pair : pimpl->bpe_ranks) {
        if (pair.second < 0) {
            continue; // not a valid position in the output
        }
        const size_t rank = static_cast<size_t>(pair.second);
        if (rank >= result.size()) {
            result.resize(rank + 1); // avoid out-of-bounds write for sparse ranks
        }
        result[rank] = pair.first.first + " " + pair.first.second;
    }
    return result;
 }
 // Accessor for the precompiled charsmap bytes held by the implementation.
 // Returned by value: the caller gets an independent copy of the buffer.
 std::vector<char> llama_vocab::get_precompiled_charsmap() const {
    const std::vector<char> & charsmap = pimpl->precompiled_charsmap;
    return charsmap;
 }
 int32_t llama_vocab::tokenize(
                  const char * text,
                     int32_t   text_len,
--- a/llama/llama.cpp/src/llama-vocab.h
+++ b/llama/llama.cpp/src/llama-vocab.h
@ -21,6 +21,9 @@ struct llama_vocab {
    void load(llama_model_loader & ml, const LLM_KV & kv);
    std::string get_tokenizer_model() const;
    std::string get_tokenizer_pre() const;
    enum llama_vocab_type     get_type()     const;
    enum llama_vocab_pre_type get_pre_type() const;
@ -80,6 +83,9 @@ struct llama_vocab {
    int max_token_len() const;
    int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
    std::vector<std::string> get_bpe_merges() const;
    std::vector<char> get_precompiled_charsmap() const;
    int32_t tokenize(
                   const char * text,
--- a/llama/llama.cpp/src/llama.cpp
+++ b/llama/llama.cpp/src/llama.cpp
@ -4,6 +4,7 @@
 #include "llama-mmap.h"
 #include "llama-vocab.h"
 #include "llama-model-loader.h"
 #include "llama-model-saver.h"
 #include "llama-model.h"
 #include "ggml.h"
@ -253,6 +254,13 @@ struct llama_model * llama_model_load_from_splits(
    return llama_model_load_from_file_impl(splits.front(), splits, params);
 }
 void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
    llama_model_saver ms(*model);
    ms.add_kv_from_model();
    ms.add_tensors_from_model();
    ms.save(path_model);
 }
 //
 // chat templates
 //
@ -338,3 +346,4 @@ const char * llama_print_system_info(void) {
    return s.c_str();
 }
--- a/llama/llama.cpp/examples/llava/clip-impl.h
+++ b/llama/llama.cpp/examples/llava/clip-impl.h
@ -31,9 +31,7 @@
 #define KEY_FEATURE_LAYER       "clip.vision.feature_layer"
 #define KEY_PROJ_SCALE_FACTOR   "clip.vision.projector.scale_factor"
 #define KEY_PROJ_TYPE           "clip.projector_type"
-
+#define KEY_SPATIAL_MERGE_SIZE  "clip.vision.spatial_merge_size"
 #define KEY_USE_GLU_MLP         "clip.use_glu_mlp"  // for qwen2.5vl
 #define KEY_USE_RMS_NORM        "clip.use_rms_norm" // for qwen2.5vl
 #define KEY_MM_PATCH_MERGE_TYPE   "clip.vision.mm_patch_merge_type"
 #define KEY_IMAGE_GRID_PINPOINTS  "clip.vision.image_grid_pinpoints"
@ -55,12 +53,16 @@
 #define TN_ATTN_Q          "%s.blk.%d.attn_q.%s"
 #define TN_ATTN_V          "%s.blk.%d.attn_v.%s"
 #define TN_ATTN_OUTPUT     "%s.blk.%d.attn_out.%s"
 #define TN_ATTN_K_NORM     "%s.blk.%d.attn_k_norm.%s"
 #define TN_ATTN_Q_NORM     "%s.blk.%d.attn_q_norm.%s"
 #define TN_FFN_DOWN        "%s.blk.%d.ffn_down.%s"
 #define TN_FFN_GATE        "%s.blk.%d.ffn_gate.%s"
 #define TN_FFN_UP          "%s.blk.%d.ffn_up.%s"
 #define TN_FFN_GATE        "%s.blk.%d.ffn_gate.%s"
-#define TN_LN_1            "%s.blk.%d.ln1.%s"
+#define TN_LN_1            "%s.blk.%d.ln1.%s" // layer norm
-#define TN_LN_2            "%s.blk.%d.ln2.%s"
+#define TN_LN_2            "%s.blk.%d.ln2.%s" // layer norm
 #define TN_LS_1            "%s.blk.%d.ls1.%s" // layer scale
 #define TN_LS_2            "%s.blk.%d.ls2.%s" // layer scale
 #define TN_LN_PRE          "%s.pre_ln.%s"
 #define TN_LN_POST         "%s.post_ln.%s"
 #define TN_LLAVA_PROJ      "mm.%d.%s"
@ -68,10 +70,14 @@
 #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
 #define TN_MVLM_PROJ_PEG   "mm.model.peg.%d.%s"
 #define TN_IMAGE_NEWLINE   "model.image_newline"
 #define TN_MM_INP_NORM     "mm.input_norm.weight"
 #define TN_MM_INP_PROJ     "mm.input_projection.weight" // gemma3
 #define TN_MM_SOFT_EMB_N   "mm.soft_emb_norm.weight"    // gemma3
 #define TN_MM_PROJECTOR    "mm.model.fc.weight"         // idefics3
 #define TN_MM_PATCH_MERGER "mm.patch_merger.weight"     // mistral small 3.1
 #define TN_TOK_IMG_BREAK   "v.token_embd.img_break"     // pixtral
 #define TN_TOK_GLM_BOI     "adapter.boi"                // glm-edge (these embeddings are not in text model)
 #define TN_TOK_GLM_EOI     "adapter.eoi"                // glm-edge (these embeddings are not in text model)
 // minicpmv
 #define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
@ -88,6 +94,9 @@
 #define TN_GLM_ADAPTER_GATE     "adapter.linear.gate.%s"
 #define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"
 // align x to upper multiple of n
 #define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
 enum projector_type {
    PROJECTOR_TYPE_MLP,
    PROJECTOR_TYPE_MLP_NORM,
@ -100,6 +109,7 @@ enum projector_type {
    PROJECTOR_TYPE_IDEFICS3,
    PROJECTOR_TYPE_PIXTRAL,
    PROJECTOR_TYPE_QWEN25VL,
    PROJECTOR_TYPE_INTERNVL,
    PROJECTOR_TYPE_UNKNOWN,
 };
@ -114,6 +124,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    { PROJECTOR_TYPE_GEMMA3,    "gemma3"},
    { PROJECTOR_TYPE_IDEFICS3,  "idefics3"},
    { PROJECTOR_TYPE_PIXTRAL,   "pixtral"},
    { PROJECTOR_TYPE_INTERNVL,  "internvl"},
 };
 static projector_type clip_projector_type_from_string(const std::string & str) {
@ -228,6 +239,15 @@ struct clip_image_u8_batch {
 struct clip_image_f32_batch {
    std::vector<clip_image_f32_ptr> entries;
    clip_image_f32_batch clone() const {
        clip_image_f32_batch new_batch;
        new_batch.entries.reserve(entries.size());
        for (const auto & entry : entries) {
            new_batch.entries.emplace_back(new clip_image_f32(*entry));
        }
        return new_batch;
    }
 };
 //
--- a/llama/llama.cpp/examples/llava/clip.cpp
+++ b/llama/llama.cpp/examples/llava/clip.cpp
--- a/llama/llama.cpp/examples/llava/clip.h
+++ b/llama/llama.cpp/examples/llava/clip.h
@ -78,10 +78,10 @@ CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
 CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
 CLIP_API struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip);
-CLIP_API struct clip_image_size      * clip_image_size_init();
+CLIP_API struct clip_image_size      * clip_image_size_init(void);
-CLIP_API struct clip_image_u8        * clip_image_u8_init ();
+CLIP_API struct clip_image_u8        * clip_image_u8_init (void);
-CLIP_API struct clip_image_f32       * clip_image_f32_init();
+CLIP_API struct clip_image_f32       * clip_image_f32_init(void);
-CLIP_API struct clip_image_f32_batch * clip_image_f32_batch_init(); // only used by libllava
+CLIP_API struct clip_image_f32_batch * clip_image_f32_batch_init(void); // only used by libllava
 // nx, ny are the output image dimensions
 CLIP_API unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny);
--- a/llama/llama.cpp/examples/llava/llava.cpp
+++ b/llama/llama.cpp/examples/llava/llava.cpp
@ -2,6 +2,7 @@
 #include "llava.h"
 #include "llama.h"
 #include "ggml-cpp.h"
 #include <algorithm>
 #include <cerrno>
@ -209,7 +210,11 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
    struct ggml_tensor *flatten = ggml_view_2d(model.ctx, permuted_cont, clip_n_mmproj_embd(ctx_clip), num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side,  size_ele * clip_n_mmproj_embd(ctx_clip), 0);
    // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
    ggml_build_forward_expand(gf, flatten);
-    ggml_graph_compute_with_ctx(model.ctx, gf, 1);
+
    ggml_backend_ptr backend { ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr) };
    GGML_ASSERT(backend != nullptr && "failed to initialize CPU backend");
    ggml_backend_graph_compute(backend.get(), gf);
    struct ggml_tensor* result = ggml_graph_node(gf, -1);
    memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
@ -457,7 +462,7 @@ struct llava_embd_batch {
    std::vector<llama_seq_id *> seq_ids;
    std::vector<int8_t>         logits;
    llama_batch batch;
-    llava_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+    llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
        pos     .resize(n_tokens);
        n_seq_id.resize(n_tokens);
        seq_ids .resize(n_tokens + 1);
@ -469,7 +474,6 @@ struct llava_embd_batch {
            /*n_tokens       =*/ n_tokens,
            /*tokens         =*/ nullptr,
            /*embd           =*/ embd,
            /*n_embd         =*/ n_embd,
            /*pos            =*/ pos.data(),
            /*n_seq_id       =*/ n_seq_id.data(),
            /*seq_id         =*/ seq_ids.data(),
@ -493,7 +497,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
            n_eval = n_batch;
        }
        float * embd = image_embed->embed+i*n_embd;
-        llava_embd_batch llava_batch = llava_embd_batch(embd, n_embd, n_eval, *n_past, 0);
+        llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0);
        if (llama_decode(ctx_llama, llava_batch.batch)) {
            LOG_ERR("%s : failed to eval\n", __func__);
            return false;
--- a/llama/llama.cpp/examples/llava/llava.h
+++ b/llama/llama.cpp/examples/llava/llava.h
--- a/llama/llama.cpp/examples/llava/llava.go
+++ b/llama/llama.cpp/examples/llava/llava.go
@ -1,4 +1,4 @@
-package llava
+package mtmd
 // #cgo CXXFLAGS: -std=c++11
 // #cgo CPPFLAGS: -I${SRCDIR}/../../include -I${SRCDIR}/../../common
--- a/Show More
+++ b/Show More