mirror of https://github.com/zebrajr/ollama.git (synced 2025-12-06 12:19:56 +01:00)

Merge branch 'main' into drifkin/array-head-count-simple
commit b2b270ad5d

.github/workflows/release.yaml (vendored, 6 changes)
@@ -103,6 +103,11 @@ jobs:
 arch: [amd64]
 preset: ['CPU']
 include:
+- os: windows
+arch: amd64
+preset: 'CUDA 11'
+install: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe
+cuda-version: '11.3'
 - os: windows
 arch: amd64
 preset: 'CUDA 12'
@@ -319,6 +324,7 @@ jobs:
 case "$COMPONENT" in
 bin/ollama) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
 lib/ollama/*.so) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
+lib/ollama/cuda_v11) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
 lib/ollama/cuda_v12) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
 lib/ollama/cuda_jetpack5) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;;
 lib/ollama/cuda_jetpack6) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;;
.github/workflows/test.yaml (vendored, 6 changes)

@@ -46,7 +46,7 @@ jobs:
 include:
 - preset: CPU
 - preset: CUDA
-container: nvidia/cuda:12.8.1-devel-ubuntu22.04
+container: nvidia/cuda:11.8.0-devel-ubuntu22.04
 flags: '-DCMAKE_CUDA_ARCHITECTURES=87'
 - preset: ROCm
 container: rocm/dev-ubuntu-22.04:6.1.2
@@ -78,7 +78,7 @@ jobs:
 include:
 - preset: CPU
 - preset: CUDA
-install: https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_571.96_windows.exe
+install: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe
 flags: '-DCMAKE_CUDA_ARCHITECTURES=80'
 - preset: ROCm
 install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe
@@ -102,7 +102,7 @@ jobs:
 $ErrorActionPreference = "Stop"
 if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {
 Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
-Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_12.8", "nvcc_12.8", "cublas_12.8", "cublas_dev_12.8")) -NoNewWindow -Wait
+Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_11.3", "nvcc_11.3", "cublas_11.3", "cublas_dev_11.3")) -NoNewWindow -Wait
 }
 
 $cudaPath = (Resolve-Path "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*").path
@@ -51,6 +51,8 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/include
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu/amx)
 
+add_compile_definitions(NDEBUG)
+
 set(GGML_CPU ON)
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src)
 set_property(TARGET ggml PROPERTY EXCLUDE_FROM_ALL TRUE)
@@ -17,12 +17,20 @@
 "name": "CUDA",
 "inherits": [ "Default" ]
 },
+{
+"name": "CUDA 11",
+"inherits": [ "CUDA" ],
+"cacheVariables": {
+"CMAKE_CUDA_ARCHITECTURES": "50;52;53;60;61;70;75;80;86",
+"CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets -t 2"
+}
+},
 {
 "name": "CUDA 12",
 "inherits": [ "CUDA" ],
 "cacheVariables": {
 "CMAKE_CUDA_ARCHITECTURES": "50;60;61;70;75;80;86;87;89;90;90a;120",
-"CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets"
+"CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets -t 2"
 }
 },
 {
@@ -50,6 +58,7 @@
 "name": "ROCm 6",
 "inherits": [ "ROCm" ],
 "cacheVariables": {
+"CMAKE_HIP_FLAGS": "-parallel-jobs=4",
 "AMDGPU_TARGETS": "gfx900;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-"
 }
 }
@@ -70,6 +79,11 @@
 "configurePreset": "CUDA",
 "targets": [ "ggml-cuda" ]
 },
+{
+"name": "CUDA 11",
+"inherits": [ "CUDA" ],
+"configurePreset": "CUDA 11"
+},
 {
 "name": "CUDA 12",
 "inherits": [ "CUDA" ],
Dockerfile (17 changes)

@@ -7,10 +7,14 @@ ARG JETPACK5VERSION=r35.4.1
 ARG JETPACK6VERSION=r36.4.0
 ARG CMAKEVERSION=3.31.2
 
+# CUDA v11 requires gcc v10. v10.3 has regressions, so the rockylinux 8.5 AppStream has the latest compatible version
 FROM --platform=linux/amd64 rocm/dev-almalinux-8:${ROCMVERSION}-complete AS base-amd64
 RUN yum install -y yum-utils \
-&& dnf install -y ccache \
+&& yum-config-manager --add-repo https://dl.rockylinux.org/vault/rocky/8.5/AppStream/\$basearch/os/ \
+&& rpm --import https://dl.rockylinux.org/pub/rocky/RPM-GPG-KEY-Rocky-8 \
+&& dnf install -y yum-utils ccache gcc-toolset-10-gcc-10.2.1-8.2.el8 gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 gcc-toolset-10-binutils-2.35-11.el8 \
 && yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
+ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH
 
 FROM --platform=linux/arm64 almalinux:8 AS base-arm64
 # install epel-release for ccache
@@ -34,6 +38,15 @@ RUN --mount=type=cache,target=/root/.ccache \
 && cmake --build --parallel --preset 'CPU' \
 && cmake --install build --component CPU --strip --parallel 8
 
+FROM base AS cuda-11
+ARG CUDA11VERSION=11.3
+RUN dnf install -y cuda-toolkit-${CUDA11VERSION//./-}
+ENV PATH=/usr/local/cuda-11/bin:$PATH
+RUN --mount=type=cache,target=/root/.ccache \
+cmake --preset 'CUDA 11' \
+&& cmake --build --parallel --preset 'CUDA 11' \
+&& cmake --install build --component CUDA --strip --parallel 8
+
 FROM base AS cuda-12
 ARG CUDA12VERSION=12.8
 RUN dnf install -y cuda-toolkit-${CUDA12VERSION//./-}
@@ -85,9 +98,11 @@ RUN --mount=type=cache,target=/root/.cache/go-build \
 go build -trimpath -buildmode=pie -o /bin/ollama .
 
 FROM --platform=linux/amd64 scratch AS amd64
+COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11
 COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12
 
 FROM --platform=linux/arm64 scratch AS arm64
+COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11
 COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12
 COPY --from=jetpack-5 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_jetpack5
 COPY --from=jetpack-6 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_jetpack6
@@ -1,6 +1,6 @@
 UPSTREAM=https://github.com/ggerganov/llama.cpp.git
 WORKDIR=llama/vendor
-FETCH_HEAD=e1e8e0991ffd9e99a445c6812bb519d5bac9f4b5
+FETCH_HEAD=de4c07f93783a1a96456a44dc16b9db538ee1618
 
 .PHONY: help
 help:
@@ -15,11 +15,13 @@ help:
 @echo " make -f $(lastword $(MAKEFILE_LIST)) clean sync"
 
 .PHONY: sync
-sync: llama/build-info.cpp llama/llama.cpp ml/backend/ggml/ggml
+sync: llama/build-info.cpp ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal
 
-.PHONY: llama/build-info.cpp
-llama/build-info.cpp: llama/build-info.cpp.in
-sed -e 's|@FETCH_HEAD@|$(FETCH_HEAD)|' $< > $@
+llama/build-info.cpp: llama/build-info.cpp.in llama/llama.cpp
+sed -e 's|@FETCH_HEAD@|$(FETCH_HEAD)|' <$< >$@
+
+ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal: ml/backend/ggml/ggml
+go generate ./$(@D)
 
 .PHONY: llama/llama.cpp
 llama/llama.cpp: llama/vendor/
@@ -30,12 +32,13 @@ ml/backend/ggml/ggml: llama/vendor/ggml/
 rsync -arvzc -f "merge $@/.rsync-filter" $< $@
 
 PATCHES=$(wildcard llama/patches/*.patch)
+PATCHED=$(join $(dir $(PATCHES)), $(addsuffix ed, $(addprefix ., $(notdir $(PATCHES)))))
 
 .PHONY: apply-patches
 .NOTPARALLEL:
-apply-patches: $(addsuffix ed, $(PATCHES))
+apply-patches: $(PATCHED)
 
-%.patched: %.patch
+llama/patches/.%.patched: llama/patches/%.patch
 @if git -c user.name=nobody -c 'user.email=<>' -C $(WORKDIR) am -3 $(realpath $<); then touch $@; else git -C $(WORKDIR) am --abort; exit 1; fi
 
 .PHONY: checkout
@@ -57,4 +60,4 @@ format-patches: llama/patches
 
 .PHONE: clean
 clean: checkout
-$(RM) $(addsuffix ed, $(PATCHES))
+$(RM) llama/patches/.*.patched
README.md (15 changes)

@@ -40,10 +40,10 @@ The official [Ollama Docker image](https://hub.docker.com/r/ollama/ollama) `olla
 
 ## Quickstart
 
-To run and chat with [Llama 3.2](https://ollama.com/library/llama3.2):
+To run and chat with [Gemma 3](https://ollama.com/library/gemma3):
 
 ```shell
-ollama run llama3.2
+ollama run gemma3
 ```
 
 ## Model library
@@ -315,6 +315,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Ollama-chats RPG](https://github.com/drazdra/ollama-chats)
 - [IntelliBar](https://intellibar.app/) (AI-powered assistant for macOS)
 - [Jirapt](https://github.com/AliAhmedNada/jirapt) (Jira Integration to generate issues, tasks, epics)
+- [ojira](https://github.com/AliAhmedNada/ojira) (Jira chrome plugin to easily generate descriptions for tasks)
 - [QA-Pilot](https://github.com/reid41/QA-Pilot) (Interactive chat tool that can leverage Ollama models for rapid understanding and navigation of GitHub code repositories)
 - [ChatOllama](https://github.com/sugarforever/chat-ollama) (Open Source Chatbot based on Ollama with Knowledge Bases)
 - [CRAG Ollama Chat](https://github.com/Nagi-ovo/CRAG-Ollama-Chat) (Simple Web Search with Corrective RAG)
@@ -404,6 +405,11 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Writeopia](https://github.com/Writeopia/Writeopia) (Text editor with integration with Ollama)
 - [AppFlowy](https://github.com/AppFlowy-IO/AppFlowy) (AI collaborative workspace with Ollama, cross-platform and self-hostable)
 - [Lumina](https://github.com/cushydigit/lumina.git) (A lightweight, minimal React.js frontend for interacting with Ollama servers)
+- [Tiny Notepad](https://pypi.org/project/tiny-notepad) (A lightweight, notepad-like interface to chat with ollama available on PyPI)
+- [macLlama (macOS native)](https://github.com/hellotunamayo/macLlama) (A native macOS GUI application for interacting with Ollama models, featuring a chat interface.)
+- [GPTranslate](https://github.com/philberndt/GPTranslate) (A fast and lightweight, AI powered desktop translation application written with Rust and Tauri. Features real-time translation with OpenAI/Azure/Ollama.)
+- [ollama launcher](https://github.com/NGC13009/ollama-launcher) (A launcher for Ollama, aiming to provide users with convenient functions such as ollama server launching, management, or configuration.)
+- [ai-hub](https://github.com/Aj-Seven/ai-hub) (AI Hub supports multiple models via API keys and Chat support via Ollama API.)
 
 ### Cloud
 
@@ -447,6 +453,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [orbiton](https://github.com/xyproto/orbiton) Configuration-free text editor and IDE with support for tab completion with Ollama.
 - [orca-cli](https://github.com/molbal/orca-cli) Ollama Registry CLI Application - Browse, pull, and download models from Ollama Registry in your terminal.
 - [GGUF-to-Ollama](https://github.com/jonathanhecl/gguf-to-ollama) - Importing GGUF to Ollama made easy (multiplatform)
+- [AWS-Strands-With-Ollama](https://github.com/rapidarchitect/ollama_strands) - AWS Strands Agents with Ollama Examples
+- [ollama-multirun](https://github.com/attogram/ollama-multirun) - A bash shell script to run a single prompt against any or all of your locally installed ollama models, saving the output and performance statistics as easily navigable web pages. ([Demo](https://attogram.github.io/ai_test_zone/))
 
 ### Apple Vision Pro
 
@@ -526,6 +534,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Abso](https://github.com/lunary-ai/abso) (OpenAI-compatible TypeScript SDK for any LLM provider)
 - [Nichey](https://github.com/goodreasonai/nichey) is a Python package for generating custom wikis for your research topic
 - [Ollama for D](https://github.com/kassane/ollama-d)
+- [OllamaPlusPlus](https://github.com/HardCodeDev777/OllamaPlusPlus) (Very simple C++ library for Ollama)
 
 ### Mobile
 
@@ -582,6 +591,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Simple-Discord-AI](https://github.com/zyphixor/simple-discord-ai)
 - [LLM Telegram Bot](https://github.com/innightwolfsleep/llm_telegram_bot) (telegram bot, primary for RP. Oobabooga-like buttons, [A1111](https://github.com/AUTOMATIC1111/stable-diffusion-webui) API integration e.t.c)
 - [mcp-llm](https://github.com/sammcj/mcp-llm) (MCP Server to allow LLMs to call other LLMs)
+- [SimpleOllamaUnity](https://github.com/HardCodeDev777/SimpleOllamaUnity) (Unity Engine extension for communicating with Ollama in a few lines of code. Also works at runtime)
+- [UnityCodeLama](https://github.com/HardCodeDev777/UnityCodeLama) (Unity Edtior tool to analyze scripts via Ollama)
 
 ### Supported backends
 
@@ -24,7 +24,10 @@ import (
 "net/http"
 "net/url"
 "runtime"
+"strconv"
+"time"
 
+"github.com/ollama/ollama/auth"
 "github.com/ollama/ollama/envconfig"
 "github.com/ollama/ollama/format"
 "github.com/ollama/ollama/version"
@@ -76,6 +79,14 @@ func NewClient(base *url.URL, http *http.Client) *Client {
 }
 }
 
+func getAuthorizationToken(ctx context.Context, challenge string) (string, error) {
+token, err := auth.Sign(ctx, []byte(challenge))
+if err != nil {
+return "", err
+}
+return token, nil
+}
+
 func (c *Client) do(ctx context.Context, method, path string, reqData, respData any) error {
 var reqBody io.Reader
 var data []byte
@@ -97,6 +108,21 @@ func (c *Client) do(ctx context.Context, method, path string, reqData, respData
 }
 
 requestURL := c.base.JoinPath(path)
+
+var token string
+if envconfig.UseAuth() || c.base.Hostname() == "ollama.com" {
+now := strconv.FormatInt(time.Now().Unix(), 10)
+chal := fmt.Sprintf("%s,%s?ts=%s", method, path, now)
+token, err = getAuthorizationToken(ctx, chal)
+if err != nil {
+return err
+}
+
+q := requestURL.Query()
+q.Set("ts", now)
+requestURL.RawQuery = q.Encode()
+}
+
 request, err := http.NewRequestWithContext(ctx, method, requestURL.String(), reqBody)
 if err != nil {
 return err
@@ -106,6 +132,10 @@ func (c *Client) do(ctx context.Context, method, path string, reqData, respData
 request.Header.Set("Accept", "application/json")
 request.Header.Set("User-Agent", fmt.Sprintf("ollama/%s (%s %s) Go/%s", version.Version, runtime.GOARCH, runtime.GOOS, runtime.Version()))
 
+if token != "" {
+request.Header.Set("Authorization", token)
+}
+
 respObj, err := c.http.Do(request)
 if err != nil {
 return err
@@ -143,6 +173,22 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
 }
 
 requestURL := c.base.JoinPath(path)
+
+var token string
+if envconfig.UseAuth() || c.base.Hostname() == "ollama.com" {
+var err error
+now := strconv.FormatInt(time.Now().Unix(), 10)
+chal := fmt.Sprintf("%s,%s?ts=%s", method, path, now)
+token, err = getAuthorizationToken(ctx, chal)
+if err != nil {
+return err
+}
+
+q := requestURL.Query()
+q.Set("ts", now)
+requestURL.RawQuery = q.Encode()
+}
+
 request, err := http.NewRequestWithContext(ctx, method, requestURL.String(), buf)
 if err != nil {
 return err
@@ -152,6 +198,10 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
 request.Header.Set("Accept", "application/x-ndjson")
 request.Header.Set("User-Agent", fmt.Sprintf("ollama/%s (%s %s) Go/%s", version.Version, runtime.GOARCH, runtime.GOOS, runtime.Version()))
 
+if token != "" {
+request.Header.Set("Authorization", token)
+}
+
 response, err := c.http.Do(request)
 if err != nil {
 return err
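The do and stream paths above sign a per-request challenge and attach the result as the Authorization header. A minimal sketch of the challenge layout, using the same "METHOD,PATH?ts=UNIX" format shown in the diff; the buildChallenge helper and the example method/path are illustrative, not part of ollama:

```go
package main

import (
	"fmt"
	"strconv"
	"time"
)

// buildChallenge mirrors the challenge string assembled above:
// "<METHOD>,<PATH>?ts=<unix-seconds>". The signed form of this string is
// what ends up in the Authorization header. Names and values here are
// illustrative only.
func buildChallenge(method, path string, now time.Time) string {
	ts := strconv.FormatInt(now.Unix(), 10)
	return fmt.Sprintf("%s,%s?ts=%s", method, path, ts)
}

func main() {
	// Prints something like: POST,/api/chat?ts=1716230000
	fmt.Println(buildChallenge("POST", "/api/chat", time.Now()))
}
```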
api/types.go (21 changes)

@@ -83,6 +83,12 @@ type GenerateRequest struct {
 // Options lists model-specific options. For example, temperature can be
 // set through this field, if the model supports it.
 Options map[string]any `json:"options"`
+
+// Think controls whether thinking/reasoning models will think before
+// responding. Needs to be a pointer so we can distinguish between false
+// (request that thinking _not_ be used) and unset (use the old behavior
+// before this option was introduced)
+Think *bool `json:"think,omitempty"`
 }
 
 // ChatRequest describes a request sent by [Client.Chat].
@@ -108,6 +114,10 @@ type ChatRequest struct {
 
 // Options lists model-specific options.
 Options map[string]any `json:"options"`
+
+// Think controls whether thinking/reasoning models will think before
+// responding
+Think *bool `json:"think,omitempty"`
 }
 
 type Tools []Tool
@@ -126,8 +136,11 @@ func (t Tool) String() string {
 // role ("system", "user", or "assistant"), the content and an optional list
 // of images.
 type Message struct {
 Role string `json:"role"`
 Content string `json:"content"`
+// Thinking contains the text that was inside thinking tags in the
+// original model output when ChatRequest.Think is enabled.
+Thinking string `json:"thinking,omitempty"`
 Images []ImageData `json:"images,omitempty"`
 ToolCalls []ToolCall `json:"tool_calls,omitempty"`
 }
@@ -478,6 +491,10 @@ type GenerateResponse struct {
 // Response is the textual response itself.
 Response string `json:"response"`
+
+// Thinking contains the text that was inside thinking tags in the
+// original model output when ChatRequest.Think is enabled.
+Thinking string `json:"thinking,omitempty"`
 
 // Done specifies if the response is complete.
 Done bool `json:"done"`
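A minimal sketch of how a Go program could exercise the new Think/Thinking fields through the api package, assuming a locally running server; the model name is only an example and any thinking-capable model would do:

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	// Pointer semantics per the field comment above: nil keeps the old
	// behavior, &false explicitly disables thinking, &true requests it.
	think := true
	req := &api.ChatRequest{
		Model:    "deepseek-r1", // example model name; assumption, not from the diff
		Messages: []api.Message{{Role: "user", Content: "Why is the sky blue?"}},
		Think:    &think,
	}

	err = client.Chat(context.Background(), req, func(resp api.ChatResponse) error {
		// With thinking enabled, reasoning streams in Message.Thinking
		// while the final answer streams in Message.Content.
		fmt.Print(resp.Message.Thinking)
		fmt.Print(resp.Message.Content)
		return nil
	})
	if err != nil {
		log.Fatal(err)
	}
}
```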
@@ -372,3 +372,50 @@ func TestPropertyType_MarshalJSON(t *testing.T) {
 })
 }
 }
+
+func TestThinking_UnmarshalJSON(t *testing.T) {
+trueVal := true
+falseVal := false
+
+tests := []struct {
+name string
+input string
+expectedThinking *bool
+expectedError bool
+}{
+{
+name: "true",
+input: `{ "think": true }`,
+expectedThinking: &trueVal,
+},
+{
+name: "false",
+input: `{ "think": false }`,
+expectedThinking: &falseVal,
+},
+{
+name: "unset",
+input: `{ }`,
+expectedThinking: nil,
+},
+{
+name: "invalid",
+input: `{ "think": "true" }`,
+expectedThinking: nil,
+expectedError: true,
+},
+}
+
+for _, test := range tests {
+t.Run(test.name, func(t *testing.T) {
+var req GenerateRequest
+err := json.Unmarshal([]byte(test.input), &req)
+if test.expectedError {
+require.Error(t, err)
+} else {
+require.NoError(t, err)
+assert.Equal(t, test.expectedThinking, req.Think)
+}
+})
+}
+}
@@ -4,20 +4,14 @@ import (
 "fmt"
 "log/slog"
 "os"
-"path/filepath"
 "strconv"
 "strings"
 
 "github.com/ollama/ollama/envconfig"
+"github.com/ollama/ollama/logutil"
 )
 
 func InitLogging() {
-level := slog.LevelInfo
-
-if envconfig.Debug() {
-level = slog.LevelDebug
-}
-
 var logFile *os.File
 var err error
 // Detect if we're a GUI app on windows, and if not, send logs to console
@@ -33,20 +27,8 @@ func InitLogging() {
 return
 }
 }
-handler := slog.NewTextHandler(logFile, &slog.HandlerOptions{
-Level: level,
-AddSource: true,
-ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr {
-if attr.Key == slog.SourceKey {
-source := attr.Value.Any().(*slog.Source)
-source.File = filepath.Base(source.File)
-}
-return attr
-},
-})
-
-slog.SetDefault(slog.New(handler))
-
+slog.SetDefault(logutil.NewLogger(logFile, envconfig.LogLevel()))
 slog.Info("ollama app started")
 }
 
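The hand-rolled handler above is folded into a single call to logutil.NewLogger. Ollama's actual logutil package is not shown in this diff; as a hedged sketch, a constructor with that call shape could reuse the source-trimming behavior from the deleted lines like this:

```go
package logutil

import (
	"io"
	"log/slog"
	"path/filepath"
)

// NewLogger builds a text slog.Logger at the given level with source
// locations trimmed to their base file name, matching what the removed
// inline handler did. Illustrative sketch only; the real package may differ.
func NewLogger(w io.Writer, level slog.Level) *slog.Logger {
	return slog.New(slog.NewTextHandler(w, &slog.HandlerOptions{
		Level:     level,
		AddSource: true,
		ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr {
			if attr.Key == slog.SourceKey {
				if source, ok := attr.Value.Any().(*slog.Source); ok {
					source.File = filepath.Base(source.File)
				}
			}
			return attr
		},
	}))
}
```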
@@ -1,178 +0,0 @@
-package benchmark
-
-import (
-"context"
-"flag"
-"fmt"
-"testing"
-"time"
-
-"github.com/ollama/ollama/api"
-)
-
-// Command line flags
-var modelFlag string
-
-func init() {
-flag.StringVar(&modelFlag, "m", "", "Name of the model to benchmark")
-flag.Lookup("m").DefValue = "model"
-}
-
-// modelName returns the model name from flags, failing the test if not set
-func modelName(b *testing.B) string {
-if modelFlag == "" {
-b.Fatal("Error: -m flag is required for benchmark tests")
-}
-return modelFlag
-}
-
-type TestCase struct {
-name string
-prompt string
-maxTokens int
-}
-
-// runGenerateBenchmark contains the common generate and metrics logic
-func runGenerateBenchmark(b *testing.B, ctx context.Context, client *api.Client, req *api.GenerateRequest) {
-start := time.Now()
-var ttft time.Duration
-var metrics api.Metrics
-
-err := client.Generate(ctx, req, func(resp api.GenerateResponse) error {
-if ttft == 0 && resp.Response != "" {
-ttft = time.Since(start)
-}
-if resp.Done {
-metrics = resp.Metrics
-}
-return nil
-})
-
-// Report custom metrics as part of the benchmark results
-b.ReportMetric(float64(ttft.Milliseconds()), "ttft_ms")
-b.ReportMetric(float64(metrics.LoadDuration.Milliseconds()), "load_ms")
-
-// Token throughput metrics
-promptThroughput := float64(metrics.PromptEvalCount) / metrics.PromptEvalDuration.Seconds()
-genThroughput := float64(metrics.EvalCount) / metrics.EvalDuration.Seconds()
-b.ReportMetric(promptThroughput, "prompt_tok/s")
-b.ReportMetric(genThroughput, "gen_tok/s")
-
-// Token counts
-b.ReportMetric(float64(metrics.PromptEvalCount), "prompt_tokens")
-b.ReportMetric(float64(metrics.EvalCount), "gen_tokens")
-if err != nil {
-b.Fatal(err)
-}
-}
-
-// BenchmarkColdStart runs benchmarks with model loading from cold state
-func BenchmarkColdStart(b *testing.B) {
-client := setup(b)
-tests := []TestCase{
-{"short_prompt", "Write a long story", 100},
-{"medium_prompt", "Write a detailed economic analysis", 500},
-{"long_prompt", "Write a comprehensive AI research paper", 1000},
-}
-m := modelName(b)
-
-for _, tt := range tests {
-b.Run(fmt.Sprintf("%s/cold/%s", m, tt.name), func(b *testing.B) {
-ctx := b.Context()
-
-// Set number of tokens as our throughput metric
-b.SetBytes(int64(tt.maxTokens))
-
-for b.Loop() {
-b.StopTimer()
-// Ensure model is unloaded before each iteration
-unload(client, m, b)
-b.StartTimer()
-
-req := &api.GenerateRequest{
-Model: m,
-Prompt: tt.prompt,
-Options: map[string]any{"num_predict": tt.maxTokens, "temperature": 0.1},
-}
-
-runGenerateBenchmark(b, ctx, client, req)
-}
-})
-}
-}
-
-// BenchmarkWarmStart runs benchmarks with pre-loaded model
-func BenchmarkWarmStart(b *testing.B) {
-client := setup(b)
-tests := []TestCase{
-{"short_prompt", "Write a long story", 100},
-{"medium_prompt", "Write a detailed economic analysis", 500},
-{"long_prompt", "Write a comprehensive AI research paper", 1000},
-}
-m := modelName(b)
-
-for _, tt := range tests {
-b.Run(fmt.Sprintf("%s/warm/%s", m, tt.name), func(b *testing.B) {
-ctx := b.Context()
-
-// Pre-warm the model
-warmup(client, m, tt.prompt, b)
-
-// Set number of tokens as our throughput metric
-b.SetBytes(int64(tt.maxTokens))
-
-for b.Loop() {
-req := &api.GenerateRequest{
-Model: m,
-Prompt: tt.prompt,
-Options: map[string]any{"num_predict": tt.maxTokens, "temperature": 0.1},
-}
-
-runGenerateBenchmark(b, ctx, client, req)
-}
-})
-}
-}
-
-// setup verifies server and model availability
-func setup(b *testing.B) *api.Client {
-client, err := api.ClientFromEnvironment()
-if err != nil {
-b.Fatal(err)
-}
-if _, err := client.Show(b.Context(), &api.ShowRequest{Model: modelName(b)}); err != nil {
-b.Fatalf("Model unavailable: %v", err)
-}
-
-return client
-}
-
-// warmup ensures the model is loaded and warmed up
-func warmup(client *api.Client, model string, prompt string, b *testing.B) {
-for range 3 {
-err := client.Generate(
-context.Background(),
-&api.GenerateRequest{
-Model: model,
-Prompt: prompt,
-Options: map[string]any{"num_predict": 50, "temperature": 0.1},
-},
-func(api.GenerateResponse) error { return nil },
-)
-if err != nil {
-b.Logf("Error during model warm-up: %v", err)
-}
-}
-}
-
-// unload forces model unloading using KeepAlive: 0 parameter
-func unload(client *api.Client, model string, b *testing.B) {
-req := &api.GenerateRequest{
-Model: model,
-KeepAlive: &api.Duration{Duration: 0},
-}
-if err := client.Generate(context.Background(), req, func(api.GenerateResponse) error { return nil }); err != nil {
-b.Logf("Unload error: %v", err)
-}
-time.Sleep(1 * time.Second)
-}
cmd/cmd.go (234 changes)

@@ -39,6 +39,7 @@ import (
 "github.com/ollama/ollama/format"
 "github.com/ollama/ollama/parser"
 "github.com/ollama/ollama/progress"
+"github.com/ollama/ollama/readline"
 "github.com/ollama/ollama/runner"
 "github.com/ollama/ollama/server"
 "github.com/ollama/ollama/types/model"
@@ -46,6 +47,23 @@ import (
 "github.com/ollama/ollama/version"
 )
 
+// ensureThinkingSupport emits a warning if the model does not advertise thinking support
+func ensureThinkingSupport(ctx context.Context, client *api.Client, name string) {
+if name == "" {
+return
+}
+resp, err := client.Show(ctx, &api.ShowRequest{Model: name})
+if err != nil {
+return
+}
+for _, cap := range resp.Capabilities {
+if cap == model.CapabilityThinking {
+return
+}
+}
+fmt.Fprintf(os.Stderr, "warning: model %q does not support thinking output\n", name)
+}
+
 var errModelfileNotFound = errors.New("specified Modelfile wasn't found")
 
 func getModelfileName(cmd *cobra.Command) (string, error) {
@@ -265,6 +283,9 @@ func loadOrUnloadModel(cmd *cobra.Command, opts *runOptions) error {
 req := &api.GenerateRequest{
 Model: opts.Model,
 KeepAlive: opts.KeepAlive,
+
+// pass Think here so we fail before getting to the chat prompt if the model doesn't support it
+Think: opts.Think,
 }
 
 return client.Generate(cmd.Context(), req, func(api.GenerateResponse) error { return nil })
@@ -299,6 +320,22 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 }
 opts.Format = format
 
+thinkFlag := cmd.Flags().Lookup("think")
+if thinkFlag.Changed {
+think, err := cmd.Flags().GetBool("think")
+if err != nil {
+return err
+}
+opts.Think = &think
+} else {
+opts.Think = nil
+}
+hidethinking, err := cmd.Flags().GetBool("hidethinking")
+if err != nil {
+return err
+}
+opts.HideThinking = hidethinking
+
 keepAlive, err := cmd.Flags().GetString("keepalive")
 if err != nil {
 return err
@@ -362,6 +399,11 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 return err
 }
 
+opts.Think, err = inferThinkingOption(&info.Capabilities, &opts, thinkFlag.Changed)
+if err != nil {
+return err
+}
+
 opts.MultiModal = slices.Contains(info.Capabilities, model.CapabilityVision)
 
 // TODO: remove the projector info and vision info checks below,
@@ -747,11 +789,38 @@ func showInfo(resp *api.ShowResponse, verbose bool, w io.Writer) error {
 case float64:
 v = fmt.Sprintf("%g", vData)
 case []any:
-n := 3
-if len(vData) < n {
-n = len(vData)
+targetWidth := 10 // Small width where we are displaying the data in a column
+var itemsToShow int
+totalWidth := 1 // Start with 1 for opening bracket
+
+// Find how many we can fit
+for i := range vData {
+itemStr := fmt.Sprintf("%v", vData[i])
+width := runewidth.StringWidth(itemStr)
+
+// Add separator width (", ") for all items except the first
+if i > 0 {
+width += 2
+}
+
+// Check if adding this item would exceed our width limit
+if totalWidth+width > targetWidth && i > 0 {
+break
+}
+
+totalWidth += width
+itemsToShow++
+}
+
+// Format the output
+if itemsToShow < len(vData) {
+v = fmt.Sprintf("%v", vData[:itemsToShow])
+v = strings.TrimSuffix(v, "]")
+v += fmt.Sprintf(" ...+%d more]", len(vData)-itemsToShow)
+} else {
+v = fmt.Sprintf("%v", vData)
 }
-v = fmt.Sprintf("%v", vData[:n])
 default:
 v = fmt.Sprintf("%T", vData)
 }
@@ -772,10 +841,19 @@ func showInfo(resp *api.ShowResponse, verbose bool, w io.Writer) error {
 
 head := func(s string, n int) (rows [][]string) {
 scanner := bufio.NewScanner(strings.NewReader(s))
-for scanner.Scan() && (len(rows) < n || n < 0) {
-if text := scanner.Text(); text != "" {
-rows = append(rows, []string{"", strings.TrimSpace(text)})
+count := 0
+for scanner.Scan() {
+text := strings.TrimSpace(scanner.Text())
+if text == "" {
+continue
 }
+count++
+if n < 0 || count <= n {
+rows = append(rows, []string{"", text})
+}
+}
+if n >= 0 && count > n {
+rows = append(rows, []string{"", "..."})
 }
 return
 }
@@ -887,17 +965,19 @@ func PullHandler(cmd *cobra.Command, args []string) error {
 type generateContextKey string
 
 type runOptions struct {
 Model string
 ParentModel string
 Prompt string
 Messages []api.Message
 WordWrap bool
 Format string
 System string
 Images []api.ImageData
 Options map[string]any
 MultiModal bool
 KeepAlive *api.Duration
+Think *bool
+HideThinking bool
 }
 
 type displayResponseState struct {
@@ -953,6 +1033,26 @@ func displayResponse(content string, wordWrap bool, state *displayResponseState)
 }
 }
 
+func thinkingOutputOpeningText(plainText bool) string {
+text := "Thinking...\n"
+
+if plainText {
+return text
+}
+
+return readline.ColorGrey + readline.ColorBold + text + readline.ColorDefault + readline.ColorGrey
+}
+
+func thinkingOutputClosingText(plainText bool) string {
+text := "...done thinking.\n\n"
+
+if plainText {
+return text
+}
+
+return readline.ColorGrey + readline.ColorBold + text + readline.ColorDefault
+}
+
 func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
 client, err := api.ClientFromEnvironment()
 if err != nil {
@@ -980,14 +1080,34 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
 var latest api.ChatResponse
 var fullResponse strings.Builder
 var role string
+var thinkTagOpened bool = false
+var thinkTagClosed bool = false
 
 fn := func(response api.ChatResponse) error {
-p.StopAndClear()
+if response.Message.Content != "" || !opts.HideThinking {
+p.StopAndClear()
+}
+
 latest = response
 
 role = response.Message.Role
+if response.Message.Thinking != "" && !opts.HideThinking {
+if !thinkTagOpened {
+fmt.Print(thinkingOutputOpeningText(false))
+thinkTagOpened = true
+}
+displayResponse(response.Message.Thinking, opts.WordWrap, state)
+}
+
 content := response.Message.Content
+if thinkTagOpened && !thinkTagClosed && content != "" {
+fmt.Print(thinkingOutputClosingText(false))
+thinkTagClosed = true
+}
+// purposefully not putting thinking blocks in the response, which would
+// only be needed if we later added tool calling to the cli (they get
+// filtered out anyway since current models don't expect them unless you're
+// about to finish some tool calls)
 fullResponse.WriteString(content)
 
 displayResponse(content, opts.WordWrap, state)
@@ -1004,6 +1124,7 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
 Messages: opts.Messages,
 Format: json.RawMessage(opts.Format),
 Options: opts.Options,
+Think: opts.Think,
 }
 
 if opts.KeepAlive != nil {
@@ -1065,13 +1186,32 @@ func generate(cmd *cobra.Command, opts runOptions) error {
 }()
 
 var state *displayResponseState = &displayResponseState{}
+var thinkTagOpened bool = false
+var thinkTagClosed bool = false
+
+plainText := !term.IsTerminal(int(os.Stdout.Fd()))
+
 fn := func(response api.GenerateResponse) error {
-p.StopAndClear()
-
 latest = response
 content := response.Response
+
+if response.Response != "" || !opts.HideThinking {
+p.StopAndClear()
+}
+
+if response.Thinking != "" && !opts.HideThinking {
+if !thinkTagOpened {
+fmt.Print(thinkingOutputOpeningText(plainText))
+thinkTagOpened = true
+}
+displayResponse(response.Thinking, opts.WordWrap, state)
+}
+
+if thinkTagOpened && !thinkTagClosed && content != "" {
+fmt.Print(thinkingOutputClosingText(plainText))
+thinkTagClosed = true
+}
 
 displayResponse(content, opts.WordWrap, state)
 
 return nil
@@ -1097,6 +1237,7 @@ func generate(cmd *cobra.Command, opts runOptions) error {
 System: opts.System,
 Options: opts.Options,
 KeepAlive: opts.KeepAlive,
+Think: opts.Think,
 }
 
 if err := client.Generate(ctx, &request, fn); err != nil {
@@ -1200,11 +1341,11 @@ func checkServerHeartbeat(cmd *cobra.Command, _ []string) error {
 return err
 }
 if err := client.Heartbeat(cmd.Context()); err != nil {
-if !strings.Contains(err.Error(), " refused") {
+if !(strings.Contains(err.Error(), " refused") || strings.Contains(err.Error(), "could not connect")) {
 return err
 }
 if err := startApp(cmd.Context(), client); err != nil {
-return errors.New("could not connect to ollama app, is it running?")
+return fmt.Errorf("ollama server not responding - %w", err)
 }
 }
 return nil
@@ -1282,7 +1423,7 @@ func NewCLI() *cobra.Command {
 }
 
 createCmd.Flags().StringP("file", "f", "", "Name of the Modelfile (default \"Modelfile\"")
-createCmd.Flags().StringP("quantize", "q", "", "Quantize model to this level (e.g. q4_0)")
+createCmd.Flags().StringP("quantize", "q", "", "Quantize model to this level (e.g. q4_K_M)")
 
 showCmd := &cobra.Command{
 Use: "show MODEL",
@@ -1312,6 +1453,8 @@ func NewCLI() *cobra.Command {
 runCmd.Flags().Bool("insecure", false, "Use an insecure registry")
 runCmd.Flags().Bool("nowordwrap", false, "Don't wrap words to the next line automatically")
 runCmd.Flags().String("format", "", "Response format (e.g. json)")
+runCmd.Flags().Bool("think", false, "Whether to use thinking mode for supported models")
+runCmd.Flags().Bool("hidethinking", false, "Hide thinking output (if provided)")
 
 stopCmd := &cobra.Command{
 Use: "stop MODEL",
@@ -1363,7 +1506,6 @@ func NewCLI() *cobra.Command {
 PreRunE: checkServerHeartbeat,
 RunE: ListRunningHandler,
 }
-
 copyCmd := &cobra.Command{
 Use: "cp SOURCE DESTINATION",
 Short: "Copy a model",
@@ -1452,3 +1594,45 @@ func NewCLI() *cobra.Command {
 
 return rootCmd
 }
+
+// If the user has explicitly set thinking options, either through the CLI or
+// through the `/set think` or `set nothink` interactive options, then we
+// respect them. Otherwise, we check model capabilities to see if the model
+// supports thinking. If the model does support thinking, we enable it.
+// Otherwise, we unset the thinking option (which is different than setting it
+// to false).
+//
+// If capabilities are not provided, we fetch them from the server.
+func inferThinkingOption(caps *[]model.Capability, runOpts *runOptions, explicitlySetByUser bool) (*bool, error) {
+if explicitlySetByUser {
+return runOpts.Think, nil
+}
+
+if caps == nil {
+client, err := api.ClientFromEnvironment()
+if err != nil {
+return nil, err
+}
+ret, err := client.Show(context.Background(), &api.ShowRequest{
+Model: runOpts.Model,
+})
+if err != nil {
+return nil, err
+}
+caps = &ret.Capabilities
+}
+
+thinkingSupported := false
+for _, cap := range *caps {
+if cap == model.CapabilityThinking {
+thinkingSupported = true
+}
+}
+
+if thinkingSupported {
+thinking := true
+return &thinking, nil
+}
+
+return nil, nil
+}
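inferThinkingOption above leans on the three states of a *bool: nil (let model capabilities decide), pointer to true, and pointer to false. A small illustrative helper, not part of ollama, showing that convention:

```go
package main

import "fmt"

// describeThink mirrors the tri-state convention used by the CLI above:
// nil means "infer from model capabilities", while &true and &false are
// explicit user choices. The helper itself is only an illustration.
func describeThink(think *bool) string {
	switch {
	case think == nil:
		return "unset: infer from model capabilities"
	case *think:
		return "explicitly enabled"
	default:
		return "explicitly disabled"
	}
}

func main() {
	var unset *bool
	on, off := true, false
	fmt.Println(describeThink(unset)) // unset: infer from model capabilities
	fmt.Println(describeThink(&on))   // explicitly enabled
	fmt.Println(describeThink(&off))  // explicitly disabled
}
```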
@@ -225,6 +225,7 @@ Weigh anchor!
 System
 You are a pirate!
 Ahoy, matey!
+...
 
 `
 if diff := cmp.Diff(expect, b.String()); diff != "" {
@@ -44,7 +44,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
	fmt.Fprintln(os.Stderr, "Use \"\"\" to begin a multi-line message.")

	if opts.MultiModal {
-		fmt.Fprintf(os.Stderr, "Use %s to include .jpg or .png images.\n", filepath.FromSlash("/path/to/file"))
+		fmt.Fprintf(os.Stderr, "Use %s to include .jpg, .png, or .webp images.\n", filepath.FromSlash("/path/to/file"))
	}

	fmt.Fprintln(os.Stderr, "")
@@ -62,6 +62,8 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
	fmt.Fprintln(os.Stderr, " /set noformat Disable formatting")
	fmt.Fprintln(os.Stderr, " /set verbose Show LLM stats")
	fmt.Fprintln(os.Stderr, " /set quiet Disable LLM stats")
+	fmt.Fprintln(os.Stderr, " /set think Enable thinking")
+	fmt.Fprintln(os.Stderr, " /set nothink Disable thinking")
	fmt.Fprintln(os.Stderr, "")
}
@@ -128,6 +130,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
	var sb strings.Builder
	var multiline MultilineState
+	var thinkExplicitlySet bool = opts.Think != nil

	for {
		line, err := scanner.Readline()
@@ -195,11 +198,19 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
			opts.Model = args[1]
			opts.Messages = []api.Message{}
			fmt.Printf("Loading model '%s'\n", opts.Model)
+			opts.Think, err = inferThinkingOption(nil, &opts, thinkExplicitlySet)
+			if err != nil {
+				return err
+			}
			if err := loadOrUnloadModel(cmd, &opts); err != nil {
				if strings.Contains(err.Error(), "not found") {
					fmt.Printf("error: %v\n", err)
					continue
				}
+				if strings.Contains(err.Error(), "does not support thinking") {
+					fmt.Printf("error: %v\n", err)
+					continue
+				}
				return err
			}
			continue
@@ -260,6 +271,22 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
				return err
			}
			fmt.Println("Set 'quiet' mode.")
+		case "think":
+			think := true
+			opts.Think = &think
+			thinkExplicitlySet = true
+			if client, err := api.ClientFromEnvironment(); err == nil {
+				ensureThinkingSupport(cmd.Context(), client, opts.Model)
+			}
+			fmt.Println("Set 'think' mode.")
+		case "nothink":
+			think := false
+			opts.Think = &think
+			thinkExplicitlySet = true
+			if client, err := api.ClientFromEnvironment(); err == nil {
+				ensureThinkingSupport(cmd.Context(), client, opts.Model)
+			}
+			fmt.Println("Set 'nothink' mode.")
		case "format":
			if len(args) < 3 || args[2] != "json" {
				fmt.Println("Invalid or missing format. For 'json' mode use '/set format json'")
@@ -448,6 +475,11 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
		assistant, err := chat(cmd, opts)
		if err != nil {
+			if strings.Contains(err.Error(), "does not support thinking") {
+				fmt.Printf("error: %v\n", err)
+				sb.Reset()
+				continue
+			}
			return err
		}
		if assistant != nil {
@@ -511,7 +543,7 @@ func extractFileNames(input string) []string {
	// Regex to match file paths starting with optional drive letter, / ./ \ or .\ and include escaped or unescaped spaces (\ or %20)
	// and followed by more characters and a file extension
	// This will capture non filename strings, but we'll check for file existence to remove mismatches
-	regexPattern := `(?:[a-zA-Z]:)?(?:\./|/|\\)[\S\\ ]+?\.(?i:jpg|jpeg|png)\b`
+	regexPattern := `(?:[a-zA-Z]:)?(?:\./|/|\\)[\S\\ ]+?\.(?i:jpg|jpeg|png|webp)\b`
	re := regexp.MustCompile(regexPattern)

	return re.FindAllString(input, -1)
@@ -531,6 +563,8 @@ func extractFileData(input string) (string, []api.ImageData, error) {
			return "", imgs, err
		}
		fmt.Fprintf(os.Stderr, "Added image '%s'\n", nfp)
+		input = strings.ReplaceAll(input, "'"+nfp+"'", "")
+		input = strings.ReplaceAll(input, "'"+fp+"'", "")
		input = strings.ReplaceAll(input, fp, "")
		imgs = append(imgs, data)
	}
@@ -551,7 +585,7 @@ func getImageData(filePath string) ([]byte, error) {
	}

	contentType := http.DetectContentType(buf)
-	allowedTypes := []string{"image/jpeg", "image/jpg", "image/png"}
+	allowedTypes := []string{"image/jpeg", "image/jpg", "image/png", "image/webp"}
	if !slices.Contains(allowedTypes, contentType) {
		return nil, fmt.Errorf("invalid image type: %s", contentType)
	}
@@ -1,6 +1,8 @@
 package cmd

 import (
+	"os"
+	"path/filepath"
 	"testing"

 	"github.com/stretchr/testify/assert"
@@ -10,14 +12,17 @@ func TestExtractFilenames(t *testing.T) {
	// Unix style paths
	input := ` some preamble
 ./relative\ path/one.png inbetween1 ./not a valid two.jpg inbetween2 ./1.svg
-/unescaped space /three.jpeg inbetween3 /valid\ path/dir/four.png "./quoted with spaces/five.JPG`
+/unescaped space /three.jpeg inbetween3 /valid\ path/dir/four.png "./quoted with spaces/five.JPG
+/unescaped space /six.webp inbetween6 /valid\ path/dir/seven.WEBP`
	res := extractFileNames(input)
-	assert.Len(t, res, 5)
+	assert.Len(t, res, 7)
	assert.Contains(t, res[0], "one.png")
	assert.Contains(t, res[1], "two.jpg")
	assert.Contains(t, res[2], "three.jpeg")
	assert.Contains(t, res[3], "four.png")
	assert.Contains(t, res[4], "five.JPG")
+	assert.Contains(t, res[5], "six.webp")
+	assert.Contains(t, res[6], "seven.WEBP")
	assert.NotContains(t, res[4], '"')
	assert.NotContains(t, res, "inbetween1")
	assert.NotContains(t, res, "./1.svg")
@@ -28,10 +33,12 @@ func TestExtractFilenames(t *testing.T) {
 /absolute/nospace/three.jpeg inbetween3 /absolute/with space/four.png inbetween4
 ./relative\ path/five.JPG inbetween5 "./relative with/spaces/six.png inbetween6
 d:\path with\spaces\seven.JPEG inbetween7 c:\users\jdoe\eight.png inbetween8
-d:\program files\someplace\nine.png inbetween9 "E:\program files\someplace\ten.PNG some ending
+d:\program files\someplace\nine.png inbetween9 "E:\program files\someplace\ten.PNG
+c:/users/jdoe/eleven.webp inbetween11 c:/program files/someplace/twelve.WebP inbetween12
+d:\path with\spaces\thirteen.WEBP some ending
 `
	res = extractFileNames(input)
-	assert.Len(t, res, 10)
+	assert.Len(t, res, 13)
	assert.NotContains(t, res, "inbetween2")
	assert.Contains(t, res[0], "one.png")
	assert.Contains(t, res[0], "c:")
@@ -49,4 +56,31 @@ d:\path with\spaces\seven.JPEG inbetween7 c:\users\jdoe\eight.png inbetween8
	assert.Contains(t, res[8], "d:")
	assert.Contains(t, res[9], "ten.PNG")
	assert.Contains(t, res[9], "E:")
+	assert.Contains(t, res[10], "eleven.webp")
+	assert.Contains(t, res[10], "c:")
+	assert.Contains(t, res[11], "twelve.WebP")
+	assert.Contains(t, res[11], "c:")
+	assert.Contains(t, res[12], "thirteen.WEBP")
+	assert.Contains(t, res[12], "d:")
+}
+
+// Ensure that file paths wrapped in single quotes are removed with the quotes.
+func TestExtractFileDataRemovesQuotedFilepath(t *testing.T) {
+	dir := t.TempDir()
+	fp := filepath.Join(dir, "img.jpg")
+	data := make([]byte, 600)
+	copy(data, []byte{
+		0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 'J', 'F', 'I', 'F',
+		0x00, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0xff, 0xd9,
+	})
+	if err := os.WriteFile(fp, data, 0o600); err != nil {
+		t.Fatalf("failed to write test image: %v", err)
+	}
+
+	input := "before '" + fp + "' after"
+	cleaned, imgs, err := extractFileData(input)
+	assert.NoError(t, err)
+	assert.Len(t, imgs, 1)
+	assert.Equal(t, cleaned, "before after")
 }
@@ -5,7 +5,7 @@ import (
 	"errors"
 	"os"
 	"os/exec"
-	"strings"
+	"regexp"

 	"github.com/ollama/ollama/api"
 )
@@ -19,11 +19,12 @@ func startApp(ctx context.Context, client *api.Client) error {
	if err != nil {
		return err
	}
-	if !strings.Contains(link, "Ollama.app") {
+	r := regexp.MustCompile(`^.*/Ollama\s?\d*.app`)
+	m := r.FindStringSubmatch(link)
+	if len(m) != 1 {
		return errors.New("could not find ollama app")
	}
-	path := strings.Split(link, "Ollama.app")
-	if err := exec.Command("/usr/bin/open", "-a", path[0]+"Ollama.app").Run(); err != nil {
+	if err := exec.Command("/usr/bin/open", "-j", "-a", m[0], "--args", "--fast-startup").Run(); err != nil {
		return err
	}
	return waitForServer(ctx, client)
@@ -4,17 +4,27 @@ import (
 	"context"
 	"errors"
 	"fmt"
+	"log/slog"
 	"os"
 	"os/exec"
+	"path"
 	"path/filepath"
 	"strings"
 	"syscall"
+	"unsafe"

 	"github.com/ollama/ollama/api"
+	"golang.org/x/sys/windows"
+)
+
+const (
+	Installer = "OllamaSetup.exe"
 )

 func startApp(ctx context.Context, client *api.Client) error {
-	// log.Printf("XXX Attempting to find and start ollama app")
+	if len(isProcRunning(Installer)) > 0 {
+		return fmt.Errorf("upgrade in progress...")
+	}
	AppName := "ollama app.exe"
	exe, err := os.Executable()
	if err != nil {
@@ -35,14 +45,11 @@ func startApp(ctx context.Context, client *api.Client) error {
			}
		}
	}
-	// log.Printf("XXX attempting to start app %s", appExe)

	cmd_path := "c:\\Windows\\system32\\cmd.exe"
-	cmd := exec.Command(cmd_path, "/c", appExe)
-	// TODO - these hide flags aren't working - still pops up a command window for some reason
+	cmd := exec.Command(cmd_path, "/c", appExe, "--hide", "--fast-startup")
	cmd.SysProcAttr = &syscall.SysProcAttr{CreationFlags: 0x08000000, HideWindow: true}
-	// TODO this didn't help either...
	cmd.Stdin = strings.NewReader("")
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr
@@ -56,3 +63,50 @@ func startApp(ctx context.Context, client *api.Client) error {
	}
	return waitForServer(ctx, client)
}
+
+func isProcRunning(procName string) []uint32 {
+	pids := make([]uint32, 2048)
+	var ret uint32
+	if err := windows.EnumProcesses(pids, &ret); err != nil || ret == 0 {
+		slog.Debug("failed to check for running installers", "error", err)
+		return nil
+	}
+	if ret > uint32(len(pids)) {
+		pids = make([]uint32, ret+10)
+		if err := windows.EnumProcesses(pids, &ret); err != nil || ret == 0 {
+			slog.Debug("failed to check for running installers", "error", err)
+			return nil
+		}
+	}
+	if ret < uint32(len(pids)) {
+		pids = pids[:ret]
+	}
+	var matches []uint32
+	for _, pid := range pids {
+		if pid == 0 {
+			continue
+		}
+		hProcess, err := windows.OpenProcess(windows.PROCESS_QUERY_INFORMATION|windows.PROCESS_VM_READ, false, pid)
+		if err != nil {
+			continue
+		}
+		defer windows.CloseHandle(hProcess)
+		var module windows.Handle
+		var cbNeeded uint32
+		cb := (uint32)(unsafe.Sizeof(module))
+		if err := windows.EnumProcessModules(hProcess, &module, cb, &cbNeeded); err != nil {
+			continue
+		}
+		var sz uint32 = 1024 * 8
+		moduleName := make([]uint16, sz)
+		cb = uint32(len(moduleName)) * (uint32)(unsafe.Sizeof(uint16(0)))
+		if err := windows.GetModuleBaseName(hProcess, module, &moduleName[0], cb); err != nil && err != syscall.ERROR_INSUFFICIENT_BUFFER {
+			continue
+		}
+		exeFile := path.Base(strings.ToLower(syscall.UTF16ToString(moduleName)))
+		if strings.EqualFold(exeFile, procName) {
+			matches = append(matches, pid)
+		}
+	}
+	return matches
+}
63 cmd/warn_thinking_test.go Normal file
@@ -0,0 +1,63 @@
package cmd

import (
	"encoding/json"
	"io"
	"net/http"
	"net/http/httptest"
	"os"
	"strings"
	"testing"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/types/model"
)

// Test that a warning is printed when thinking is requested but not supported.
func TestWarnMissingThinking(t *testing.T) {
	cases := []struct {
		capabilities []model.Capability
		expectWarn   bool
	}{
		{capabilities: []model.Capability{model.CapabilityThinking}, expectWarn: false},
		{capabilities: []model.Capability{}, expectWarn: true},
	}

	for _, tc := range cases {
		srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			if r.URL.Path != "/api/show" || r.Method != http.MethodPost {
				t.Fatalf("unexpected request to %s %s", r.URL.Path, r.Method)
			}
			var req api.ShowRequest
			if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
				t.Fatalf("decode request: %v", err)
			}
			resp := api.ShowResponse{Capabilities: tc.capabilities}
			if err := json.NewEncoder(w).Encode(resp); err != nil {
				t.Fatalf("encode response: %v", err)
			}
		}))
		defer srv.Close()

		t.Setenv("OLLAMA_HOST", srv.URL)
		client, err := api.ClientFromEnvironment()
		if err != nil {
			t.Fatal(err)
		}
		oldStderr := os.Stderr
		r, w, _ := os.Pipe()
		os.Stderr = w
		ensureThinkingSupport(t.Context(), client, "m")
		w.Close()
		os.Stderr = oldStderr
		out, _ := io.ReadAll(r)

		warned := strings.Contains(string(out), "warning:")
		if tc.expectWarn && !warned {
			t.Errorf("expected warning, got none")
		}
		if !tc.expectWarn && warned {
			t.Errorf("did not expect warning, got: %s", string(out))
		}
	}
}
@@ -1,6 +1,7 @@ package convert
 package convert

 import (
+	"cmp"
 	"encoding/json"
 	"errors"
 	"fmt"
@@ -14,13 +15,12 @@ import (
 )

 type ModelParameters struct {
	Architectures []string `json:"architectures"`
	VocabSize     uint32   `json:"vocab_size"`
-	TextModel     TextParameters `json:"text_config"`
-}
-
-type TextParameters struct {
+	TextModel struct {
		VocabSize uint32 `json:"vocab_size"`
+	} `json:"text_config"`
 }

 type AdapterParameters struct {
@@ -53,8 +53,11 @@ func (ModelParameters) KV(t *Tokenizer) ggml.KV {
	}

	for _, sv := range t.SpecialVocabulary {
-		kv[fmt.Sprintf("tokenizer.ggml.%s_token_id", sv.Key())] = uint32(sv.ID)
		kv[fmt.Sprintf("tokenizer.ggml.add_%s_token", sv.Key())] = sv.AddToken
+		kv[fmt.Sprintf("tokenizer.ggml.%s_token_id", sv.Key())] = uint32(sv.ID)
+		if len(sv.IDs) > 0 {
+			kv[fmt.Sprintf("tokenizer.ggml.%s_token_ids", sv.Key())] = sv.IDs
+		}
	}

	return kv
@@ -173,6 +176,8 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
	switch p.Architectures[0] {
	case "LlamaForCausalLM":
		conv = &llamaModel{}
+	case "MllamaForConditionalGeneration":
+		conv = &mllamaModel{}
	case "Llama4ForConditionalGeneration":
		conv = &llama4Model{}
	case "Mistral3ForConditionalGeneration":
@@ -189,6 +194,8 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
		conv = &phi3Model{}
	case "Qwen2ForCausalLM":
		conv = &qwen2Model{}
+	case "Qwen2_5_VLForConditionalGeneration":
+		conv = &qwen25VLModel{}
	case "BertModel":
		conv = &bertModel{}
	case "CohereForCausalLM":
@@ -212,24 +219,22 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
		return err
	}

-	vocabSize := int(p.VocabSize)
-	if vocabSize == 0 {
-		tVocabSize := int(p.TextModel.VocabSize)
-		vocabSize = tVocabSize
-	}
+	vocabSize := int(cmp.Or(p.VocabSize, p.TextModel.VocabSize))

	switch {
	case vocabSize == 0:
-		slog.Warn("vocabulary size was not explicitly set by the model", "default size", len(t.Vocabulary.Tokens))
+		slog.Debug("vocabulary size was not explicitly set by the model", "default size", len(t.Vocabulary.Tokens))
	case vocabSize > len(t.Vocabulary.Tokens):
-		slog.Warn("vocabulary is smaller than expected, padding with dummy tokens", "expect", vocabSize, "actual", len(t.Vocabulary.Tokens))
+		slog.Debug("vocabulary is smaller than expected, padding with dummy tokens", "expect", vocabSize, "actual", len(t.Vocabulary.Tokens))
		for i := range vocabSize - len(t.Vocabulary.Tokens) {
			t.Vocabulary.Tokens = append(t.Vocabulary.Tokens, fmt.Sprintf("[PAD%d]", i))
			t.Vocabulary.Scores = append(t.Vocabulary.Scores, -1)
			t.Vocabulary.Types = append(t.Vocabulary.Types, tokenTypeUserDefined)
		}
	case vocabSize < len(t.Vocabulary.Tokens):
-		return fmt.Errorf("vocabulary is larger than expected '%d' instead of '%d'", len(t.Vocabulary.Tokens), vocabSize)
+		slog.Debug("vocabulary is larger than expected", "want", vocabSize, "got", len(t.Vocabulary.Tokens))
+		p.VocabSize = uint32(len(t.Vocabulary.Tokens))
+		p.TextModel.VocabSize = uint32(len(t.Vocabulary.Tokens))
	default:
		slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens))
	}
@@ -139,7 +139,8 @@ func (p *llamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
	}

	for _, t := range ts {
-		if strings.HasSuffix(t.Name(), "attn_q.weight") || strings.HasSuffix(t.Name(), "attn_k.weight") {
+		if strings.HasSuffix(t.Name(), "attn_q.weight") || strings.HasSuffix(t.Name(), "attn_k.weight") ||
+			strings.HasSuffix(t.Name(), "attn_q_proj.weight") || strings.HasSuffix(t.Name(), "attn_k_proj.weight") {
			if !p.skipRepack {
				t.SetRepacker(p.repack)
			}
@@ -181,9 +182,9 @@ func (p *llamaModel) repack(name string, data []float32, shape []uint64) ([]float
	}

	var heads uint32
-	if strings.HasSuffix(name, "attn_q.weight") {
+	if strings.HasSuffix(name, "attn_q.weight") || strings.HasSuffix(name, "attn_q_proj.weight") {
		heads = p.NumAttentionHeads
-	} else if strings.HasSuffix(name, "attn_k.weight") {
+	} else if strings.HasSuffix(name, "attn_k.weight") || strings.HasSuffix(name, "attn_k_proj.weight") {
		heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
	} else {
		return nil, fmt.Errorf("unknown tensor for repack: %s", name)
@@ -2,9 +2,6 @@ package convert

 import (
 	"fmt"
-	"io"
-	"slices"
-	"strings"

 	"github.com/ollama/ollama/fs/ggml"
 )
@@ -30,65 +27,38 @@ func (p *mixtralModel) KV(t *Tokenizer) ggml.KV {
 }

 func (p *mixtralModel) Tensors(ts []Tensor) []*ggml.Tensor {
-	oldnew := []string{
-		"model.layers", "blk",
-		"w1", "ffn_gate_exps",
-		"w2", "ffn_down_exps",
-		"w3", "ffn_up_exps",
-	}
-
-	for i := range p.NumLocalExperts {
-		oldnew = append(oldnew, fmt.Sprintf(".block_sparse_moe.experts.%d.", i), ".")
-	}
-
-	// group experts of the same layer (model.layers.%d) and type (w[123]) into a single tensor
-	namer := strings.NewReplacer(oldnew...)
-	experts := make(map[string]experts)
-
-	// merge experts into a single tensor while removing them from ts
-	ts = slices.DeleteFunc(ts, func(t Tensor) bool {
-		if !strings.Contains(t.Name(), ".block_sparse_moe.experts.") {
-			return false
-		}
-
-		name := namer.Replace(t.Name())
-		experts[name] = append(experts[name], t)
-		return true
-	})
-
-	var out []*ggml.Tensor
-	for n, e := range experts {
-		// TODO(mxyng): sanity check experts
-		out = append(out, &ggml.Tensor{
-			Name: n,
-			Kind: e[0].Kind(),
-			Shape: append([]uint64{uint64(len(e))}, e[0].Shape()...),
-			WriterTo: e,
+	merges := make([]merge, 0, p.NumHiddenLayers*6)
+	for i := range p.NumHiddenLayers {
+		merges = append(merges, merge{
+			fmt.Sprintf("blk.%d.*.w1.weight", i),
+			fmt.Sprintf("blk.%d.ffn_gate_exps.weight", i),
+		}, merge{
+			fmt.Sprintf("blk.%d.*.w1.bias", i),
+			fmt.Sprintf("blk.%d.ffn_gate_exps.bias", i),
+		}, merge{
+			fmt.Sprintf("blk.%d.*.w2.weight", i),
+			fmt.Sprintf("blk.%d.ffn_up_exps.weight", i),
+		}, merge{
+			fmt.Sprintf("blk.%d.*.w2.bias", i),
+			fmt.Sprintf("blk.%d.ffn_up_exps.bias", i),
+		}, merge{
+			fmt.Sprintf("blk.%d.*.w3.weight", i),
+			fmt.Sprintf("blk.%d.ffn_down_exps.weight", i),
+		}, merge{
+			fmt.Sprintf("blk.%d.*.w3.bias", i),
+			fmt.Sprintf("blk.%d.ffn_down_exps.bias", i),
		})
	}

+	out, ts := mergeTensors(ts, merges...)
	return append(out, p.llamaModel.Tensors(ts)...)
 }

 func (p *mixtralModel) Replacements() []string {
	return append(
		p.llamaModel.Replacements(),
+		"model.layers", "blk",
		"block_sparse_moe.gate", "ffn_gate_inp",
+		"block_sparse_moe.experts.", ".",
	)
 }
-
-type experts []Tensor
-
-func (e experts) WriteTo(w io.Writer) (int64, error) {
-	// TODO(mxyng): experts _should_ be numerically sorted by expert but this should check
-	for _, t := range e {
-		// the canonical merged experts tensor stacks all experts along a new, 0 axis,
-		// e.g. `tensor.Stack(0, e[0], e[1:]...)`, which requires allocating temporary buffers
-		// this accomplishes the same thing by writing each expert tensor in sequence
-		if _, err := t.WriteTo(w); err != nil {
-			return 0, err
-		}
-	}
-
-	return 0, nil
-}
179 convert/convert_mllama.go Normal file
@@ -0,0 +1,179 @@
|
||||||
|
package convert
|
||||||
|
|
||||||
|
import (
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"github.com/ollama/ollama/fs/ggml"
|
||||||
|
"github.com/pdevine/tensor"
|
||||||
|
"github.com/pdevine/tensor/native"
|
||||||
|
)
|
||||||
|
|
||||||
|
type mllamaModel struct {
|
||||||
|
ModelParameters
|
||||||
|
TextModel struct {
|
||||||
|
llamaModel
|
||||||
|
|
||||||
|
CrossAttentionLayers []int32 `json:"cross_attention_layers"`
|
||||||
|
} `json:"text_config"`
|
||||||
|
VisionModel struct {
|
||||||
|
NumHiddenLayers uint32 `json:"num_hidden_layers"`
|
||||||
|
NumGlobalLayers uint32 `json:"num_global_layers"`
|
||||||
|
IntermediateLayersIndices []int32 `json:"intermediate_layers_indices"`
|
||||||
|
|
||||||
|
HiddenSize uint32 `json:"hidden_size"`
|
||||||
|
IntermediateSize uint32 `json:"intermediate_size"`
|
||||||
|
|
||||||
|
AttentionHeads uint32 `json:"attention_heads"`
|
||||||
|
|
||||||
|
ImageSize uint32 `json:"image_size"`
|
||||||
|
PatchSize uint32 `json:"patch_size"`
|
||||||
|
NumChannels uint32 `json:"num_channels"`
|
||||||
|
MaxNumTiles uint32 `json:"max_num_tiles"`
|
||||||
|
NormEpsilon float32 `json:"norm_eps"`
|
||||||
|
RopeTheta float32 `json:"rope.freq_base"`
|
||||||
|
} `json:"vision_config"`
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *mllamaModel) KV(t *Tokenizer) ggml.KV {
|
||||||
|
kv := m.ModelParameters.KV(t)
|
||||||
|
kv["general.architecture"] = "mllama"
|
||||||
|
|
||||||
|
for k, v := range m.TextModel.KV(t) {
|
||||||
|
if strings.HasPrefix(k, "llama.") {
|
||||||
|
kv[strings.ReplaceAll(k, "llama.", "mllama.")] = v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
kv["mllama.attention.cross_attention_layers"] = m.TextModel.CrossAttentionLayers
|
||||||
|
|
||||||
|
kv["mllama.vision.block_count"] = m.VisionModel.NumHiddenLayers
|
||||||
|
kv["mllama.vision.global.block_count"] = m.VisionModel.NumGlobalLayers
|
||||||
|
kv["mllama.vision.intermediate_layers_indices"] = m.VisionModel.IntermediateLayersIndices
|
||||||
|
|
||||||
|
kv["mllama.vision.embedding_length"] = m.VisionModel.HiddenSize
|
||||||
|
kv["mllama.vision.feed_forward_length"] = m.VisionModel.IntermediateSize
|
||||||
|
|
||||||
|
kv["mllama.vision.attention.head_count"] = m.VisionModel.AttentionHeads
|
||||||
|
kv["mllama.vision.attention.layer_norm_epsilon"] = m.VisionModel.NormEpsilon
|
||||||
|
|
||||||
|
kv["mllama.vision.image_size"] = m.VisionModel.ImageSize
|
||||||
|
kv["mllama.vision.patch_size"] = m.VisionModel.PatchSize
|
||||||
|
kv["mllama.vision.max_num_tiles"] = m.VisionModel.MaxNumTiles
|
||||||
|
kv["mllama.vision.num_channels"] = m.VisionModel.NumChannels
|
||||||
|
|
||||||
|
return kv
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *mllamaModel) Replacements() []string {
|
||||||
|
return append(
|
||||||
|
m.TextModel.Replacements(),
|
||||||
|
"language_model.", "",
|
||||||
|
"gate_attn", "attn_gate",
|
||||||
|
"gate_ffn", "ffn_gate",
|
||||||
|
"cross_attn.", "cross_attn_",
|
||||||
|
"vision_model", "v",
|
||||||
|
"class_embedding", "class_embd",
|
||||||
|
"patch_embedding", "patch_embd",
|
||||||
|
"gated_positional_embedding.tile_embedding", "tile_position_embd",
|
||||||
|
"gated_positional_embedding.embedding", "position_embd.weight",
|
||||||
|
"gated_positional_embedding", "position_embd",
|
||||||
|
"embedding.weight", "weight",
|
||||||
|
"pre_tile_positional_embedding", "pre_tile_position_embd",
|
||||||
|
"post_tile_positional_embedding", "post_tile_position_embd",
|
||||||
|
"layernorm_pre", "pre_ln",
|
||||||
|
"layernorm_post", "post_ln",
|
||||||
|
"global_transformer.layers", "global.blk",
|
||||||
|
"transformer.layers", "blk",
|
||||||
|
"mlp.fc1", "ffn_up",
|
||||||
|
"mlp.fc2", "ffn_down",
|
||||||
|
"multi_modal_projector", "mm.0",
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *mllamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
|
||||||
|
var out []*ggml.Tensor
|
||||||
|
var text []Tensor
|
||||||
|
for _, t := range ts {
|
||||||
|
if !strings.HasPrefix(t.Name(), "v.") && !strings.HasPrefix(t.Name(), "mm.") {
|
||||||
|
text = append(text, t)
|
||||||
|
} else if t.Name() == "v.position_embd.gate" {
|
||||||
|
for _, name := range []string{"v.position_embd.gate", "v.tile_position_embd.gate"} {
|
||||||
|
tt := t.Clone()
|
||||||
|
tt.SetRepacker(m.repack(name))
|
||||||
|
out = append(out, &ggml.Tensor{
|
||||||
|
Name: name,
|
||||||
|
Kind: t.Kind(),
|
||||||
|
Shape: t.Shape(),
|
||||||
|
WriterTo: tt,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if t.Name() == "v.pre_tile_position_embd.gate" || t.Name() == "v.post_tile_position_embd.gate" {
|
||||||
|
t.SetRepacker(m.repack(t.Name()))
|
||||||
|
} else if strings.HasSuffix(t.Name(), "attn_q.weight") || strings.HasSuffix(t.Name(), "attn_k.weight") {
|
||||||
|
t.SetRepacker(m.repack(t.Name()))
|
||||||
|
} else if strings.HasSuffix(t.Name(), "attn_gate") || strings.HasSuffix(t.Name(), "ffn_gate") {
|
||||||
|
t.SetRepacker(m.repack(t.Name()))
|
||||||
|
}
|
||||||
|
|
||||||
|
out = append(out, &ggml.Tensor{
|
||||||
|
Name: t.Name(),
|
||||||
|
Kind: t.Kind(),
|
||||||
|
Shape: t.Shape(),
|
||||||
|
WriterTo: t,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return append(out, m.TextModel.Tensors(text)...)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *mllamaModel) repack(name string) Repacker {
|
||||||
|
return func(_ string, data []float32, shape []uint64) (_ []float32, err error) {
|
||||||
|
dims := make([]int, len(shape))
|
||||||
|
for i, dim := range shape {
|
||||||
|
dims[i] = int(dim)
|
||||||
|
}
|
||||||
|
|
||||||
|
var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
|
||||||
|
|
||||||
|
if strings.HasSuffix(name, "attn_q.weight") || strings.HasSuffix(name, "attn_k.weight") {
|
||||||
|
heads := m.VisionModel.AttentionHeads
|
||||||
|
if err := t.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := t.T(0, 2, 1, 3); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := t.Reshape(dims...); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := t.Transpose(); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
t, err = tensor.Tanh(t)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
if name == "v.position_embd.gate" {
|
||||||
|
t, err = tensor.Sub(float32(1), t)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
t = tensor.Materialize(t)
|
||||||
|
// flatten tensor so it can be return as a vector
|
||||||
|
if err := t.Reshape(t.Shape().TotalSize()); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return native.VectorF32(t.(*tensor.Dense))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@@ -15,6 +15,7 @@ type qwen2Model struct {
		Type                          string     `json:"type"`
		Factor                        ropeFactor `json:"factor"`
		OriginalMaxPositionEmbeddings uint32     `json:"original_max_position_embeddings"`
+		MropeSection                  []int32    `json:"mrope_section"`
	} `json:"rope_scaling"`
	RMSNormEPS float32 `json:"rms_norm_eps"`
 }
@@ -39,6 +40,8 @@ func (q *qwen2Model) KV(t *Tokenizer) ggml.KV {
	case "yarn":
		kv["qwen2.rope.scaling.type"] = q.RopeScaling.Type
		kv["qwen2.rope.scaling.factor"] = q.RopeScaling.Factor
+	case "mrope", "default":
+		kv["qwen2.rope.mrope_section"] = q.RopeScaling.MropeSection
	default:
		panic("unknown rope scaling type")
	}
102 convert/convert_qwen25vl.go Normal file
@@ -0,0 +1,102 @@
package convert

import (
	"cmp"
	"slices"
	"strings"

	"github.com/ollama/ollama/fs/ggml"
)

type qwen25VLModel struct {
	qwen2Model

	VisionModel struct {
		Depth               uint32  `json:"depth"`
		HiddenSize          uint32  `json:"hidden_size"`
		NumHeads            uint32  `json:"num_heads"`
		InChannels          uint32  `json:"in_chans"`
		PatchSize           uint32  `json:"patch_size"`
		SpatialMergeSize    uint32  `json:"spatial_merge_size"`
		SpatialPatchSize    uint32  `json:"spatial_patch_size"`
		WindowSize          uint32  `json:"window_size"`
		RMSNormEps          float32 `json:"layer_norm_epsilon"`
		RopeTheta           float32 `json:"rope_theta"`
		FullAttentionBlocks []int32 `json:"fullatt_block_indexes"`
		TemporalPatchSize   uint32  `json:"temporal_patch_size"`
	} `json:"vision_config"`
}

var _ ModelConverter = (*qwen25VLModel)(nil)

func (q *qwen25VLModel) KV(t *Tokenizer) ggml.KV {
	kv := q.ModelParameters.KV(t)
	kv["general.architecture"] = "qwen25vl"

	for k, v := range q.qwen2Model.KV(t) {
		if strings.HasPrefix(k, "qwen2.") {
			kv[strings.Replace(k, "qwen2.", "qwen25vl.", 1)] = v
		}
	}

	if q.VisionModel.FullAttentionBlocks == nil {
		kv["qwen25vl.vision.fullatt_block_indexes"] = []int32{7, 15, 23, 31}
	}

	kv["qwen25vl.vision.block_count"] = cmp.Or(q.VisionModel.Depth, 32)
	kv["qwen25vl.vision.embedding_length"] = q.VisionModel.HiddenSize
	kv["qwen25vl.vision.attention.head_count"] = cmp.Or(q.VisionModel.NumHeads, 16)
	kv["qwen25vl.vision.num_channels"] = q.VisionModel.InChannels
	kv["qwen25vl.vision.patch_size"] = cmp.Or(q.VisionModel.PatchSize, 14)
	kv["qwen25vl.vision.spatial_merge_size"] = cmp.Or(q.VisionModel.SpatialMergeSize, 2)
	kv["qwen25vl.vision.spatial_patch_size"] = q.VisionModel.SpatialPatchSize
	kv["qwen25vl.vision.window_size"] = cmp.Or(q.VisionModel.WindowSize, 112)
	kv["qwen25vl.vision.attention.layer_norm_epsilon"] = cmp.Or(q.VisionModel.RMSNormEps, 1e-6)
	kv["qwen25vl.vision.rope.freq_base"] = cmp.Or(q.VisionModel.RopeTheta, 1e4)
	kv["qwen25vl.vision.fullatt_block_indexes"] = q.VisionModel.FullAttentionBlocks
	kv["qwen25vl.vision.temporal_patch_size"] = cmp.Or(q.VisionModel.TemporalPatchSize, 2)

	return kv
}

func (q *qwen25VLModel) Tensors(ts []Tensor) []*ggml.Tensor {
	var out []*ggml.Tensor

	for _, t := range ts {
		if strings.Contains(t.Name(), "patch_embed.proj") {
			for t := range splitDim(t, 2,
				split{Replacer: strings.NewReplacer("patch_embed.proj", "patch_embd_0")},
				split{Replacer: strings.NewReplacer("patch_embed.proj", "patch_embd_1")},
			) {
				t.Shape = slices.DeleteFunc(t.Shape, func(i uint64) bool { return i == 1 })
				out = append(out, t)
			}
		} else if strings.Contains(t.Name(), "attn.qkv") {
			out = append(out, slices.Collect(splitDim(t, 0,
				split{Replacer: strings.NewReplacer("attn.qkv", "attn_q")},
				split{Replacer: strings.NewReplacer("attn.qkv", "attn_k")},
				split{Replacer: strings.NewReplacer("attn.qkv", "attn_v")},
			))...)
		} else {
			out = append(out, &ggml.Tensor{
				Name:     t.Name(),
				Kind:     t.Kind(),
				Shape:    t.Shape(),
				WriterTo: t,
			})
		}
	}

	return out
}

func (p *qwen25VLModel) Replacements() []string {
	return append(
		p.qwen2Model.Replacements(),
		"visual", "v",
		"blocks", "blk",
		"attn.proj", "attn_out",
		"norm1", "ln1",
		"norm2", "ln2",
	)
}
|
||||||
}
|
}
|
||||||
t.Cleanup(func() { r.Close() })
|
t.Cleanup(func() { r.Close() })
|
||||||
|
|
||||||
m, _, err := ggml.Decode(r, -1)
|
m, err := ggml.Decode(r, -1)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
@ -332,7 +332,7 @@ func TestConvertAdapter(t *testing.T) {
|
||||||
}
|
}
|
||||||
defer r.Close()
|
defer r.Close()
|
||||||
|
|
||||||
m, _, err := ggml.Decode(r, -1)
|
m, err := ggml.Decode(r, -1)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@@ -38,7 +38,10 @@ const (
 func (t tensorBase) Kind() uint32 {
	if strings.HasSuffix(t.name, ".ffn_gate_inp.weight") ||
		t.name == "token_types.weight" ||
-		t.name == "v.positional_embedding_vlm" {
+		t.name == "v.positional_embedding_vlm" ||
+		t.name == "v.tile_position_embd.weight" ||
+		t.name == "v.pre_tile_position_embd.weight" ||
+		t.name == "v.post_tile_position_embd.weight" {
		// these tensors are always F32
		return 0
	}
129 convert/tensor.go Normal file
@@ -0,0 +1,129 @@
package convert

import (
	"cmp"
	"io"
	"iter"
	"path"
	"slices"
	"strings"

	"github.com/pdevine/tensor"
	"github.com/pdevine/tensor/native"

	"github.com/ollama/ollama/fs/ggml"
)

type split struct {
	*strings.Replacer
	dim int

	// fn is an optional function to apply to the tensor after slicing
	fn func(tensor.Tensor) (tensor.Tensor, error)
}

// splitDim splits a tensor along a specified dimension into multiple tensors. The dimension
// is split evenly based on the number of replacers provided unless a specific count is given.
func splitDim(t Tensor, dim int, splits ...split) iter.Seq[*ggml.Tensor] {
	return func(yield func(*ggml.Tensor) bool) {
		var offset int
		for _, split := range splits {
			t := t.Clone()
			shape := slices.Clone(t.Shape())
			shape[dim] = cmp.Or(uint64(split.dim), shape[dim]/uint64(len(splits)))

			slice := slices.Repeat([]tensor.Slice{nil}, len(shape))
			slice[dim] = tensor.S(offset, offset+int(shape[dim]))
			offset += int(shape[dim])

			t.SetRepacker(func(_ string, data []float32, shape []uint64) ([]float32, error) {
				dims := make([]int, len(shape))
				for i := range shape {
					dims[i] = int(shape[i])
				}

				var tt tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
				tt, err := tt.Slice(slice...)
				if err != nil {
					return nil, err
				}

				tt = tensor.Materialize(tt)

				if split.fn != nil {
					tt, err = split.fn(tt)
					if err != nil {
						return nil, err
					}
				}

				// flatten tensor so it can be written as a vector
				if err := tt.Reshape(tt.Shape().TotalSize()); err != nil {
					return nil, err
				}

				return native.VectorF32(tt.(*tensor.Dense))
			})

			if !yield(&ggml.Tensor{
				Name:     split.Replace(t.Name()),
				Kind:     t.Kind(),
				Shape:    shape,
				WriterTo: t,
			}) {
				break
			}
		}
	}
}

type merge struct {
	pattern, name string
}

// mergeTensors merges tensors that match a given pattern into a single tensor.
func mergeTensors(unmatched []Tensor, merges ...merge) (out []*ggml.Tensor, _ []Tensor) {
	var matched []Tensor
	for i := range merges {
		matched, unmatched = slicesSplitFunc(unmatched, func(t Tensor) bool {
			matched, _ := path.Match(merges[i].pattern, t.Name())
			return matched
		})

		if len(matched) > 0 {
			out = append(out, &ggml.Tensor{
				Name:     merges[i].name,
				Kind:     matched[0].Kind(),
				Shape:    append([]uint64{uint64(len(matched))}, matched[0].Shape()...),
				WriterTo: mergeGroup(matched),
			})
		}
	}

	return out, unmatched
}

// slicesSplitFunc splits a slice into two slices based on a predicate function.
func slicesSplitFunc[S ~[]E, E comparable](s S, fn func(e E) bool) (matched, unmatched S) {
	for _, e := range s {
		if fn(e) {
			matched = append(matched, e)
		} else {
			unmatched = append(unmatched, e)
		}
	}

	return matched, unmatched
}

type mergeGroup []Tensor

func (g mergeGroup) WriteTo(w io.Writer) (int64, error) {
	for _, t := range g {
		if _, err := t.WriteTo(w); err != nil {
			return 0, err
		}
	}

	return 0, nil
}
402 convert/tensor_test.go Normal file
@@ -0,0 +1,402 @@
|
||||||
|
package convert
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"encoding/binary"
|
||||||
|
"io"
|
||||||
|
"iter"
|
||||||
|
"slices"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/google/go-cmp/cmp"
|
||||||
|
"github.com/ollama/ollama/fs/ggml"
|
||||||
|
"github.com/pdevine/tensor"
|
||||||
|
)
|
||||||
|
|
||||||
|
type fakeTensor struct {
|
||||||
|
name string
|
||||||
|
shape []uint64
|
||||||
|
data []float32
|
||||||
|
|
||||||
|
repacker Repacker
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeTensor) Name() string {
|
||||||
|
return f.name
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeTensor) Shape() []uint64 {
|
||||||
|
return f.shape
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeTensor) Kind() uint32 {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f *fakeTensor) SetRepacker(fn Repacker) {
|
||||||
|
f.repacker = fn
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeTensor) Clone() Tensor {
|
||||||
|
return &fakeTensor{
|
||||||
|
name: f.name,
|
||||||
|
shape: slices.Clone(f.shape),
|
||||||
|
data: slices.Clone(f.data),
|
||||||
|
repacker: f.repacker,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeTensor) WriteTo(w io.Writer) (n int64, err error) {
|
||||||
|
data := f.data
|
||||||
|
if f.repacker != nil {
|
||||||
|
data, err = f.repacker(f.name, data, f.shape)
|
||||||
|
if err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := binary.Write(w, binary.LittleEndian, data); err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return int64(len(data) * 4), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func mul(shape []uint64) int {
|
||||||
|
n := 1
|
||||||
|
for _, dim := range shape {
|
||||||
|
n *= int(dim)
|
||||||
|
}
|
||||||
|
return n
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestSplitDim(t *testing.T) {
|
||||||
|
r := fakeTensor{
|
||||||
|
name: "a.b",
|
||||||
|
shape: []uint64{3, 4},
|
||||||
|
data: []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
|
||||||
|
}
|
||||||
|
|
||||||
|
t.Run("no split", func(t *testing.T) {
|
||||||
|
for tt := range splitDim(&r, 0, split{Replacer: strings.NewReplacer("a", "x")}) {
|
||||||
|
if tt.Name != "x.b" {
|
||||||
|
t.Fatalf("expected name 'x', got '%s'", tt.Name)
|
||||||
|
}
|
||||||
|
|
||||||
|
if !slices.Equal(tt.Shape, []uint64{3, 4}) {
|
||||||
|
t.Fatalf("expected shape [3, 4], got %v", tt.Shape)
|
||||||
|
}
|
||||||
|
|
||||||
|
var b bytes.Buffer
|
||||||
|
if _, err := tt.WriteTo(&b); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
f32s := make([]float32, mul(tt.Shape))
|
||||||
|
if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if !slices.Equal(f32s, []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}) {
|
||||||
|
t.Fatalf("expected data [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], got %v", f32s)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("even split", func(t *testing.T) {
|
||||||
|
next, stop := iter.Pull(splitDim(&r, 1,
|
||||||
|
split{Replacer: strings.NewReplacer("a", "x")},
|
||||||
|
split{Replacer: strings.NewReplacer("b", "y")},
|
||||||
|
))
|
||||||
|
defer stop()
|
||||||
|
|
||||||
|
{
|
||||||
|
tt, ok := next()
|
||||||
|
if !ok {
|
||||||
|
t.Fatal("expected at least one split")
|
||||||
|
}
|
||||||
|
|
||||||
|
if tt.Name != "x.b" {
|
||||||
|
t.Fatal("expected name 'x.b', got", tt.Name)
|
||||||
|
}
|
||||||
|
|
||||||
|
if !slices.Equal(tt.Shape, []uint64{3, 2}) {
|
||||||
|
t.Fatal("expected shape [3, 2], got", tt.Shape)
|
||||||
|
}
|
||||||
|
|
||||||
|
var b bytes.Buffer
|
||||||
|
if _, err := tt.WriteTo(&b); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
f32s := make([]float32, mul(tt.Shape))
|
||||||
|
if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if !slices.Equal(f32s, []float32{0, 1, 4, 5, 8, 9}) {
|
||||||
|
t.Fatal("expected data [0, 1, 4, 5, 8, 9], got", f32s)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
tt, ok := next()
|
||||||
|
if !ok {
|
||||||
|
t.Fatal("expected at least one split")
|
||||||
|
}
|
||||||
|
|
||||||
|
if tt.Name != "a.y" {
|
||||||
|
t.Fatal("expected name 'a.y', got", tt.Name)
|
||||||
|
}
|
||||||
|
|
||||||
|
if !slices.Equal(tt.Shape, []uint64{3, 2}) {
|
||||||
|
t.Fatal("expected shape [3, 2], got", tt.Shape)
|
||||||
|
}
|
||||||
|
|
||||||
|
var b bytes.Buffer
|
||||||
|
if _, err := tt.WriteTo(&b); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
f32s := make([]float32, mul(tt.Shape))
|
||||||
|
if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if !slices.Equal(f32s, []float32{2, 3, 6, 7, 10, 11}) {
|
||||||
|
t.Fatal("expected data [2, 3, 6, 7, 10, 11], got", f32s)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("uneven split", func(t *testing.T) {
|
||||||
|
next, stop := iter.Pull(splitDim(&r, 0,
|
||||||
|
split{Replacer: strings.NewReplacer("a", "x"), dim: 2},
|
||||||
|
split{Replacer: strings.NewReplacer("b", "y"), dim: 1},
|
||||||
|
))
|
||||||
|
defer stop()
|
||||||
|
|
||||||
|
{
|
||||||
|
tt, ok := next()
|
||||||
|
if !ok {
|
||||||
|
t.Fatal("expected at least one split")
|
||||||
|
}
|
||||||
|
|
||||||
|
if tt.Name != "x.b" {
|
||||||
|
t.Fatal("expected name 'x.b', got", tt.Name)
|
||||||
|
}
|
||||||
|
|
||||||
|
if !slices.Equal(tt.Shape, []uint64{2, 4}) {
|
||||||
|
t.Fatal("expected shape [2, 4], got", tt.Shape)
|
||||||
|
}
|
||||||
|
|
||||||
|
var b bytes.Buffer
|
||||||
|
if _, err := tt.WriteTo(&b); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
f32s := make([]float32, mul(tt.Shape))
|
||||||
|
if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if !slices.Equal(f32s, []float32{0, 1, 2, 3, 4, 5, 6, 7}) {
|
||||||
|
t.Fatal("expected data [0, 1, 2, 3, 4, 5, 6, 7], got", f32s)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
tt, ok := next()
|
||||||
|
if !ok {
|
||||||
|
t.Fatal("expected at least one split")
|
||||||
|
}
|
||||||
|
|
||||||
|
if tt.Name != "a.y" {
|
||||||
|
t.Fatal("expected name 'a.y', got", tt.Name)
|
||||||
|
}
|
||||||
|
|
||||||
|
if !slices.Equal(tt.Shape, []uint64{1, 4}) {
|
||||||
|
t.Fatal("expected shape [1, 4], got", tt.Shape)
|
||||||
|
}
|
||||||
|
|
||||||
|
var b bytes.Buffer
|
||||||
|
if _, err := tt.WriteTo(&b); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
f32s := make([]float32, mul(tt.Shape))
|
||||||
|
if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if !slices.Equal(f32s, []float32{8, 9, 10, 11}) {
|
||||||
|
t.Fatal("expected data [8, 9, 10, 11], got", f32s)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("split with transpose", func(t *testing.T) {
|
||||||
|
next, stop := iter.Pull(splitDim(&r, 1,
|
||||||
|
split{Replacer: strings.NewReplacer("a", "x")},
|
||||||
|
split{Replacer: strings.NewReplacer("b", "y"), fn: func(tt tensor.Tensor) (tensor.Tensor, error) {
|
||||||
|
return tensor.Transpose(tt, 1, 0)
|
||||||
|
}},
|
||||||
|
))
|
||||||
|
defer stop()
|
||||||
|
|
||||||
|
{
|
||||||
|
tt, ok := next()
|
||||||
|
if !ok {
|
||||||
|
t.Fatal("expected at least one split")
|
||||||
|
}
|
||||||
|
|
||||||
|
if tt.Name != "x.b" {
|
||||||
|
t.Fatal("expected name 'x.b', got", tt.Name)
|
||||||
|
}
|
||||||
|
|
||||||
|
if !slices.Equal(tt.Shape, []uint64{3, 2}) {
|
||||||
|
t.Fatal("expected shape [3, 2], got", tt.Shape)
|
||||||
|
}
|
||||||
|
|
||||||
|
var b bytes.Buffer
|
||||||
|
if _, err := tt.WriteTo(&b); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
f32s := make([]float32, mul(tt.Shape))
|
||||||
|
if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if !slices.Equal(f32s, []float32{0, 1, 4, 5, 8, 9}) {
|
||||||
|
t.Fatal("expected data [0, 1, 4, 5, 8, 9], got", f32s)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
tt, ok := next()
|
||||||
|
if !ok {
|
||||||
|
t.Fatal("expected at least one split")
|
||||||
|
}
|
||||||
|
|
||||||
|
if tt.Name != "a.y" {
|
||||||
|
t.Fatal("expected name 'a.y', got", tt.Name)
|
||||||
|
}
|
||||||
|
|
||||||
|
if !slices.Equal(tt.Shape, []uint64{3, 2}) {
|
||||||
|
t.Fatal("expected shape [3, 2], got", tt.Shape)
|
||||||
|
}
|
||||||
|
|
||||||
|
var b bytes.Buffer
|
||||||
|
if _, err := tt.WriteTo(&b); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
f32s := make([]float32, mul(tt.Shape))
|
||||||
|
if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if !slices.Equal(f32s, []float32{2, 6, 10, 3, 7, 11}) {
|
||||||
|
t.Fatal("expected data [2, 6, 10, 3, 7, 11], got", f32s)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestMerge(t *testing.T) {
|
||||||
|
unmatched := []Tensor{
|
||||||
|
&fakeTensor{
|
||||||
|
name: "a.0.b",
|
||||||
|
shape: []uint64{5, 2},
|
||||||
|
data: []float32{10, 11, 12, 13, 14, 15, 16, 17, 18, 19},
|
||||||
|
},
|
||||||
|
&fakeTensor{
|
||||||
|
name: "a.1.b",
|
||||||
|
shape: []uint64{5, 2},
|
||||||
|
data: []float32{20, 21, 22, 23, 24, 25, 26, 27, 28, 29},
|
||||||
|
},
|
||||||
|
&fakeTensor{
|
||||||
|
name: "c.0.d",
|
||||||
|
shape: []uint64{5, 2},
|
||||||
|
data: []float32{30, 31, 32, 33, 34, 35, 36, 37, 38, 39},
|
||||||
|
},
|
||||||
|
&fakeTensor{
|
||||||
|
name: "c.1.d",
|
||||||
|
shape: []uint64{5, 2},
|
||||||
|
data: []float32{40, 41, 42, 43, 44, 45, 46, 47, 48, 49},
|
||||||
|
},
|
||||||
|
&fakeTensor{
|
||||||
|
name: "e.0.f",
|
||||||
|
shape: []uint64{5, 2},
|
||||||
|
data: []float32{50, 51, 52, 53, 54, 55, 56, 57, 58, 59},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
checkMatched := func(t *testing.T, n int, matched []*ggml.Tensor) {
|
||||||
|
for i := range n {
|
||||||
|
got := matched[i]
|
||||||
|
if diff := cmp.Diff([]uint64{2, 5, 2}, got.Shape); diff != "" {
|
||||||
|
t.Errorf("unexpected (-want +got):\n%s", diff)
|
||||||
|
}
|
||||||
|
|
||||||
|
var b bytes.Buffer
|
||||||
|
if _, err := got.WriteTo(&b); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
f32s := make([]float32, 20)
|
||||||
|
if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
offset := 10 + (i * 20)
|
||||||
|
want := make([]float32, 20)
|
||||||
|
for j := range 20 {
|
||||||
|
want[j] = float32(offset + j)
|
||||||
|
}
|
||||||
|
|
||||||
|
if diff := cmp.Diff(want, f32s); diff != "" {
|
||||||
|
t.Errorf("unexpected data (-want +got):\n%s", diff)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
t.Run("single merge", func(t *testing.T) {
|
||||||
|
matched, unmatched := mergeTensors(unmatched, merge{"a.*.b", "a.b"})
|
||||||
|
if len(unmatched) != 3 {
|
||||||
|
t.Error("expected 3 remaining tensors, got", len(unmatched))
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(matched) != 1 {
|
||||||
|
t.Error("expected 1 merged tensor, got", len(matched))
|
||||||
|
}
|
||||||
|
|
||||||
|
checkMatched(t, 1, matched)
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("multiple merges", func(t *testing.T) {
|
||||||
|
matched, unmatched := mergeTensors(unmatched, merge{"a.*.b", "a.b"}, merge{"c.*.d", "c.d"})
|
||||||
|
if len(unmatched) != 1 {
|
||||||
|
t.Error("expected 1 remaining tensors, got", len(unmatched))
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(matched) != 2 {
|
||||||
|
t.Error("expected 2 merged tensor, got", len(matched))
|
||||||
|
}
|
||||||
|
|
||||||
|
checkMatched(t, 2, matched)
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("no match", func(t *testing.T) {
|
||||||
|
matched, unmatched := mergeTensors(unmatched, merge{"x.*.y", "x.y"})
|
||||||
|
if len(unmatched) != 5 {
|
||||||
|
t.Error("expected 5 remaining tensors, got", len(unmatched))
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(matched) != 0 {
|
||||||
|
t.Error("expected no merged tensors, got", len(matched))
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
@ -110,6 +110,7 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
if f, err := fsys.Open("tokenizer_config.json"); errors.Is(err, os.ErrNotExist) {
|
if f, err := fsys.Open("tokenizer_config.json"); errors.Is(err, os.ErrNotExist) {
|
||||||
|
// noop
|
||||||
} else if err != nil {
|
} else if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -171,6 +172,34 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if f, err := fsys.Open("generation_config.json"); errors.Is(err, os.ErrNotExist) {
|
||||||
|
} else if err != nil {
|
||||||
|
return nil, err
|
||||||
|
} else {
|
||||||
|
defer f.Close()
|
||||||
|
|
||||||
|
var p map[string]json.RawMessage
|
||||||
|
if err := json.NewDecoder(f).Decode(&p); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, st := range specialTokenTypes {
|
||||||
|
if bts, ok := p[fmt.Sprintf("%s_token_id", st)]; ok {
|
||||||
|
var ids []int32
|
||||||
|
if err := json.Unmarshal(bts, &ids); err != nil {
|
||||||
|
// value is not a list so the existing ID is used
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if i := slices.IndexFunc(t.SpecialVocabulary, func(sv *SpecialVocabulary) bool {
|
||||||
|
return sv.Type == st
|
||||||
|
}); i >= 0 {
|
||||||
|
t.SpecialVocabulary[i].IDs = ids
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return t, nil
|
return t, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -280,6 +309,9 @@ type SpecialVocabulary struct {
|
||||||
ID int
|
ID int
|
||||||
Content string
|
Content string
|
||||||
AddToken bool
|
AddToken bool
|
||||||
|
|
||||||
|
// IDs is populated by generation_config.json
|
||||||
|
IDs []int32
|
||||||
}
|
}
|
||||||
|
|
||||||
func (sv SpecialVocabulary) Key() string {
|
func (sv SpecialVocabulary) Key() string {
|
||||||
|
|
|
||||||
|
|
@ -247,6 +247,67 @@ func TestParseTokenizer(t *testing.T) {
|
||||||
Pre: "default",
|
Pre: "default",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
name: "generation config eos token ids",
|
||||||
|
fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
|
||||||
|
"tokenizer.json": strings.NewReader(`{
|
||||||
|
"added_tokens": [
|
||||||
|
{
|
||||||
|
"id": 0,
|
||||||
|
"content": "<bos>",
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"content": "<eos>",
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
"content": "<eot>",
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 3,
|
||||||
|
"content": "<eom>",
|
||||||
|
"special": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"model": {
|
||||||
|
"vocab": {
|
||||||
|
"<bos>": 0,
|
||||||
|
"<eos>": 1,
|
||||||
|
"<eot>": 2,
|
||||||
|
"<eom>": 3
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}`),
|
||||||
|
"tokenizer_config.json": strings.NewReader(`{
|
||||||
|
"add_bos_token": true,
|
||||||
|
"add_eos_token": false,
|
||||||
|
"bos_token": "<bos>",
|
||||||
|
"eos_token": "<eos>"
|
||||||
|
}`),
|
||||||
|
"generation_config.json": strings.NewReader(`{
|
||||||
|
"bos_token_id": 0,
|
||||||
|
"eos_token_id": [1, 2, 3]
|
||||||
|
}`),
|
||||||
|
}),
|
||||||
|
specialTokenTypes: []string{"pad", "eos", "bos", "unk"},
|
||||||
|
want: &Tokenizer{
|
||||||
|
Vocabulary: &Vocabulary{
|
||||||
|
Model: "gpt2",
|
||||||
|
Tokens: []string{"<bos>", "<eos>", "<eot>", "<eom>"},
|
||||||
|
Scores: []float32{0, 1, 2, 3},
|
||||||
|
Types: []int32{3, 3, 3, 3},
|
||||||
|
},
|
||||||
|
SpecialVocabulary: []*SpecialVocabulary{
|
||||||
|
{Type: "eos", Content: "<eos>", ID: 1, IDs: []int32{1, 2, 3}, AddToken: false},
|
||||||
|
{Type: "bos", Content: "<bos>", ID: 0, AddToken: true},
|
||||||
|
},
|
||||||
|
Pre: "default",
|
||||||
|
},
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, tt := range cases {
|
for _, tt := range cases {
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,6 @@
|
||||||
package discover
|
package discover
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"os"
|
"os"
|
||||||
"regexp"
|
"regexp"
|
||||||
|
|
@ -60,8 +59,6 @@ func cudaVariant(gpuInfo CudaGPUInfo) string {
|
||||||
|
|
||||||
// driver 12.0 has problems with the cuda v12 library, so run v11 on those older drivers
|
// driver 12.0 has problems with the cuda v12 library, so run v11 on those older drivers
|
||||||
if gpuInfo.DriverMajor < 12 || (gpuInfo.DriverMajor == 12 && gpuInfo.DriverMinor == 0) {
|
if gpuInfo.DriverMajor < 12 || (gpuInfo.DriverMajor == 12 && gpuInfo.DriverMinor == 0) {
|
||||||
// The detected driver is older than Feb 2023
|
|
||||||
slog.Warn("old CUDA driver detected - please upgrade to a newer driver", "version", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor))
|
|
||||||
return "v11"
|
return "v11"
|
||||||
}
|
}
|
||||||
return "v12"
|
return "v12"
|
||||||
|
|
|
||||||
|
|
@ -670,7 +670,7 @@ func loadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string, e
|
||||||
}
|
}
|
||||||
|
|
||||||
func getVerboseState() C.uint16_t {
|
func getVerboseState() C.uint16_t {
|
||||||
if envconfig.Debug() {
|
if envconfig.LogLevel() < slog.LevelInfo {
|
||||||
return C.uint16_t(1)
|
return C.uint16_t(1)
|
||||||
}
|
}
|
||||||
return C.uint16_t(0)
|
return C.uint16_t(0)
|
||||||
|
|
|
||||||
|
|
@ -12,7 +12,7 @@ import (
|
||||||
// '../lib/ollama' on Linux and the executable's directory on macOS
|
// '../lib/ollama' on Linux and the executable's directory on macOS
|
||||||
// note: distribution builds, additional GPU-specific libraries are
|
// note: distribution builds, additional GPU-specific libraries are
|
||||||
// found in subdirectories of the returned path, such as
|
// found in subdirectories of the returned path, such as
|
||||||
// 'cuda_v12', 'rocm', etc.
|
// 'cuda_v11', 'cuda_v12', 'rocm', etc.
|
||||||
var LibOllamaPath string = func() string {
|
var LibOllamaPath string = func() string {
|
||||||
exe, err := os.Executable()
|
exe, err := os.Executable()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|
|
||||||
70
docs/api.md
70
docs/api.md
|
|
@ -19,7 +19,7 @@
|
||||||
|
|
||||||
### Model names
|
### Model names
|
||||||
|
|
||||||
Model names follow a `model:tag` format, where `model` can have an optional namespace such as `example/model`. Some examples are `orca-mini:3b-q4_1` and `llama3:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.
|
Model names follow a `model:tag` format, where `model` can have an optional namespace such as `example/model`. Some examples are `orca-mini:3b-q8_0` and `llama3:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.
|
||||||
|
|
||||||
### Durations
|
### Durations
|
||||||
|
|
||||||
|
|
@ -43,6 +43,7 @@ Generate a response for a given prompt with a provided model. This is a streamin
|
||||||
- `prompt`: the prompt to generate a response for
|
- `prompt`: the prompt to generate a response for
|
||||||
- `suffix`: the text after the model response
|
- `suffix`: the text after the model response
|
||||||
- `images`: (optional) a list of base64-encoded images (for multimodal models such as `llava`)
|
- `images`: (optional) a list of base64-encoded images (for multimodal models such as `llava`)
|
||||||
|
- `think`: (for thinking models) should the model think before responding?
|
||||||
|
|
||||||
Advanced parameters (optional):
|
Advanced parameters (optional):
|
||||||
|
|
||||||
|
|
@ -490,11 +491,13 @@ Generate the next message in a chat with a provided model. This is a streaming e
|
||||||
- `model`: (required) the [model name](#model-names)
|
- `model`: (required) the [model name](#model-names)
|
||||||
- `messages`: the messages of the chat, this can be used to keep a chat memory
|
- `messages`: the messages of the chat, this can be used to keep a chat memory
|
||||||
- `tools`: list of tools in JSON for the model to use if supported
|
- `tools`: list of tools in JSON for the model to use if supported
|
||||||
|
- `think`: (for thinking models) should the model think before responding?
|
||||||
|
|
||||||
The `message` object has the following fields:
|
The `message` object has the following fields:
|
||||||
|
|
||||||
- `role`: the role of the message, either `system`, `user`, `assistant`, or `tool`
|
- `role`: the role of the message, either `system`, `user`, `assistant`, or `tool`
|
||||||
- `content`: the content of the message
|
- `content`: the content of the message
|
||||||
|
- `thinking`: (for thinking models) the model's thinking process
|
||||||
- `images` (optional): a list of images to include in the message (for multimodal models such as `llava`)
|
- `images` (optional): a list of images to include in the message (for multimodal models such as `llava`)
|
||||||
- `tool_calls` (optional): a list of tools in JSON that the model wants to use
|
- `tool_calls` (optional): a list of tools in JSON that the model wants to use
|
||||||
|
|
||||||
|
|
@ -952,19 +955,8 @@ If you are creating a model from a safetensors directory or from a GGUF file, yo
|
||||||
|
|
||||||
| Type | Recommended |
|
| Type | Recommended |
|
||||||
| --- | :-: |
|
| --- | :-: |
|
||||||
| q2_K | |
|
|
||||||
| q3_K_L | |
|
|
||||||
| q3_K_M | |
|
|
||||||
| q3_K_S | |
|
|
||||||
| q4_0 | |
|
|
||||||
| q4_1 | |
|
|
||||||
| q4_K_M | * |
|
| q4_K_M | * |
|
||||||
| q4_K_S | |
|
| q4_K_S | |
|
||||||
| q5_0 | |
|
|
||||||
| q5_1 | |
|
|
||||||
| q5_K_M | |
|
|
||||||
| q5_K_S | |
|
|
||||||
| q6_K | |
|
|
||||||
| q8_0 | * |
|
| q8_0 | * |
|
||||||
|
|
||||||
### Examples
|
### Examples
|
||||||
|
|
@ -1009,8 +1001,8 @@ Quantize a non-quantized model.
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
curl http://localhost:11434/api/create -d '{
|
curl http://localhost:11434/api/create -d '{
|
||||||
"model": "llama3.1:quantized",
|
"model": "llama3.2:quantized",
|
||||||
"from": "llama3.1:8b-instruct-fp16",
|
"from": "llama3.2:3b-instruct-fp16",
|
||||||
"quantize": "q4_K_M"
|
"quantize": "q4_K_M"
|
||||||
}'
|
}'
|
||||||
```
|
```
|
||||||
|
|
@ -1020,12 +1012,14 @@ curl http://localhost:11434/api/create -d '{
|
||||||
A stream of JSON objects is returned:
|
A stream of JSON objects is returned:
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{"status":"quantizing F16 model to Q4_K_M"}
|
{"status":"quantizing F16 model to Q4_K_M","digest":"0","total":6433687776,"completed":12302}
|
||||||
{"status":"creating new layer sha256:667b0c1932bc6ffc593ed1d03f895bf2dc8dc6df21db3042284a6f4416b06a29"}
|
{"status":"quantizing F16 model to Q4_K_M","digest":"0","total":6433687776,"completed":6433687552}
|
||||||
{"status":"using existing layer sha256:11ce4ee3e170f6adebac9a991c22e22ab3f8530e154ee669954c4bc73061c258"}
|
{"status":"verifying conversion"}
|
||||||
{"status":"using existing layer sha256:0ba8f0e314b4264dfd19df045cde9d4c394a52474bf92ed6a3de22a4ca31a177"}
|
{"status":"creating new layer sha256:fb7f4f211b89c6c4928ff4ddb73db9f9c0cfca3e000c3e40d6cf27ddc6ca72eb"}
|
||||||
|
{"status":"using existing layer sha256:966de95ca8a62200913e3f8bfbf84c8494536f1b94b49166851e76644e966396"}
|
||||||
|
{"status":"using existing layer sha256:fcc5a6bec9daf9b561a68827b67ab6088e1dba9d1fa2a50d7bbcc8384e0a265d"}
|
||||||
|
{"status":"using existing layer sha256:a70ff7e570d97baaf4e62ac6e6ad9975e04caa6d900d3742d37698494479e0cd"}
|
||||||
{"status":"using existing layer sha256:56bb8bd477a519ffa694fc449c2413c6f0e1d3b1c88fa7e3c9d88d3ae49d4dcb"}
|
{"status":"using existing layer sha256:56bb8bd477a519ffa694fc449c2413c6f0e1d3b1c88fa7e3c9d88d3ae49d4dcb"}
|
||||||
{"status":"creating new layer sha256:455f34728c9b5dd3376378bfb809ee166c145b0b4c1f1a6feca069055066ef9a"}
|
|
||||||
{"status":"writing manifest"}
|
{"status":"writing manifest"}
|
||||||
{"status":"success"}
|
{"status":"success"}
|
||||||
```
|
```
|
||||||
|
|
@ -1163,29 +1157,37 @@ A single JSON object will be returned.
|
||||||
{
|
{
|
||||||
"models": [
|
"models": [
|
||||||
{
|
{
|
||||||
"name": "codellama:13b",
|
"name": "deepseek-r1:latest",
|
||||||
"modified_at": "2023-11-04T14:56:49.277302595-07:00",
|
"model": "deepseek-r1:latest",
|
||||||
"size": 7365960935,
|
"modified_at": "2025-05-10T08:06:48.639712648-07:00",
|
||||||
"digest": "9f438cb9cd581fc025612d27f7c1a6669ff83a8bb0ed86c94fcf4c5440555697",
|
"size": 4683075271,
|
||||||
|
"digest": "0a8c266910232fd3291e71e5ba1e058cc5af9d411192cf88b6d30e92b6e73163",
|
||||||
"details": {
|
"details": {
|
||||||
|
"parent_model": "",
|
||||||
"format": "gguf",
|
"format": "gguf",
|
||||||
"family": "llama",
|
"family": "qwen2",
|
||||||
"families": null,
|
"families": [
|
||||||
"parameter_size": "13B",
|
"qwen2"
|
||||||
"quantization_level": "Q4_0"
|
],
|
||||||
|
"parameter_size": "7.6B",
|
||||||
|
"quantization_level": "Q4_K_M"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "llama3:latest",
|
"name": "llama3.2:latest",
|
||||||
"modified_at": "2023-12-07T09:32:18.757212583-08:00",
|
"model": "llama3.2:latest",
|
||||||
"size": 3825819519,
|
"modified_at": "2025-05-04T17:37:44.706015396-07:00",
|
||||||
"digest": "fe938a131f40e6f6d40083c9f0f430a515233eb2edaa6d72eb85c50d64f2300e",
|
"size": 2019393189,
|
||||||
|
"digest": "a80c4f17acd55265feec403c7aef86be0c25983ab279d83f3bcd3abbcb5b8b72",
|
||||||
"details": {
|
"details": {
|
||||||
|
"parent_model": "",
|
||||||
"format": "gguf",
|
"format": "gguf",
|
||||||
"family": "llama",
|
"family": "llama",
|
||||||
"families": null,
|
"families": [
|
||||||
"parameter_size": "7B",
|
"llama"
|
||||||
"quantization_level": "Q4_0"
|
],
|
||||||
|
"parameter_size": "3.2B",
|
||||||
|
"quantization_level": "Q4_K_M"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|
|
||||||
|
|
@ -1,59 +0,0 @@
|
||||||
# Benchmark
|
|
||||||
|
|
||||||
Go benchmark tests that measure end-to-end performance of a running Ollama server. Run these tests to evaluate model inference performance on your hardware and measure the impact of code changes.
|
|
||||||
|
|
||||||
## When to use
|
|
||||||
|
|
||||||
Run these benchmarks when:
|
|
||||||
- Making changes to the model inference engine
|
|
||||||
- Modifying model loading/unloading logic
|
|
||||||
- Changing prompt processing or token generation code
|
|
||||||
- Implementing a new model architecture
|
|
||||||
- Testing performance across different hardware setups
|
|
||||||
|
|
||||||
## Prerequisites
|
|
||||||
- Ollama server running locally with `ollama serve` on `127.0.0.1:11434`
|
|
||||||
## Usage and Examples
|
|
||||||
|
|
||||||
>[!NOTE]
|
|
||||||
>All commands must be run from the root directory of the Ollama project.
|
|
||||||
|
|
||||||
Basic syntax:
|
|
||||||
```bash
|
|
||||||
go test -bench=. ./benchmark/... -m $MODEL_NAME
|
|
||||||
```
|
|
||||||
|
|
||||||
Required flags:
|
|
||||||
- `-bench=.`: Run all benchmarks
|
|
||||||
- `-m`: Model name to benchmark
|
|
||||||
|
|
||||||
Optional flags:
|
|
||||||
- `-count N`: Number of times to run the benchmark (useful for statistical analysis)
|
|
||||||
- `-timeout T`: Maximum time for the benchmark to run (e.g. "10m" for 10 minutes)
|
|
||||||
|
|
||||||
Common usage patterns:
|
|
||||||
|
|
||||||
Single benchmark run with a model specified:
|
|
||||||
```bash
|
|
||||||
go test -bench=. ./benchmark/... -m llama3.3
|
|
||||||
```
|
|
||||||
|
|
||||||
## Output metrics
|
|
||||||
|
|
||||||
The benchmark reports several key metrics:
|
|
||||||
|
|
||||||
- `gen_tok/s`: Generated tokens per second
|
|
||||||
- `prompt_tok/s`: Prompt processing tokens per second
|
|
||||||
- `ttft_ms`: Time to first token in milliseconds
|
|
||||||
- `load_ms`: Model load time in milliseconds
|
|
||||||
- `gen_tokens`: Total tokens generated
|
|
||||||
- `prompt_tokens`: Total prompt tokens processed
|
|
||||||
|
|
||||||
Each benchmark runs two scenarios:
|
|
||||||
- Cold start: Model is loaded from disk for each test
|
|
||||||
- Warm start: Model is pre-loaded in memory
|
|
||||||
|
|
||||||
Three prompt lengths are tested for each scenario:
|
|
||||||
- Short prompt (100 tokens)
|
|
||||||
- Medium prompt (500 tokens)
|
|
||||||
- Long prompt (1000 tokens)
|
|
||||||
|
|
@ -118,7 +118,7 @@ To run tests, use `go test`:
|
||||||
go test ./...
|
go test ./...
|
||||||
```
|
```
|
||||||
|
|
||||||
> NOTE: In rare cirumstances, you may nedd to change a package using the new
|
> NOTE: In rare cirumstances, you may need to change a package using the new
|
||||||
> "synctest" package in go1.24.
|
> "synctest" package in go1.24.
|
||||||
>
|
>
|
||||||
> If you do not have the "synctest" package enabled, you will not see build or
|
> If you do not have the "synctest" package enabled, you will not see build or
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
# GPU
|
# GPU
|
||||||
## Nvidia
|
## Nvidia
|
||||||
Ollama supports Nvidia GPUs with compute capability 5.0+ and driver version 531 and newer.
|
Ollama supports Nvidia GPUs with compute capability 5.0+.
|
||||||
|
|
||||||
Check your compute compatibility to see if your card is supported:
|
Check your compute compatibility to see if your card is supported:
|
||||||
[https://developer.nvidia.com/cuda-gpus](https://developer.nvidia.com/cuda-gpus)
|
[https://developer.nvidia.com/cuda-gpus](https://developer.nvidia.com/cuda-gpus)
|
||||||
|
|
|
||||||
|
|
@ -132,22 +132,12 @@ success
|
||||||
|
|
||||||
### Supported Quantizations
|
### Supported Quantizations
|
||||||
|
|
||||||
- `q4_0`
|
|
||||||
- `q4_1`
|
|
||||||
- `q5_0`
|
|
||||||
- `q5_1`
|
|
||||||
- `q8_0`
|
- `q8_0`
|
||||||
|
|
||||||
#### K-means Quantizations
|
#### K-means Quantizations
|
||||||
|
|
||||||
- `q3_K_S`
|
|
||||||
- `q3_K_M`
|
|
||||||
- `q3_K_L`
|
|
||||||
- `q4_K_S`
|
- `q4_K_S`
|
||||||
- `q4_K_M`
|
- `q4_K_M`
|
||||||
- `q5_K_S`
|
|
||||||
- `q5_K_M`
|
|
||||||
- `q6_K`
|
|
||||||
|
|
||||||
|
|
||||||
## Sharing your model on ollama.com
|
## Sharing your model on ollama.com
|
||||||
|
|
|
||||||
|
|
@ -112,8 +112,8 @@ sudo systemctl status ollama
|
||||||
> While AMD has contributed the `amdgpu` driver upstream to the official linux
|
> While AMD has contributed the `amdgpu` driver upstream to the official linux
|
||||||
> kernel source, the version is older and may not support all ROCm features. We
|
> kernel source, the version is older and may not support all ROCm features. We
|
||||||
> recommend you install the latest driver from
|
> recommend you install the latest driver from
|
||||||
> https://www.amd.com/en/support/linux-drivers for best support of your Radeon
|
> [AMD](https://www.amd.com/en/support/download/linux-drivers.html) for best support
|
||||||
> GPU.
|
> of your Radeon GPU.
|
||||||
|
|
||||||
## Customizing
|
## Customizing
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -43,7 +43,7 @@ Ollama includes multiple LLM libraries compiled for different GPUs and CPU vecto
|
||||||
In the server log, you will see a message that looks something like this (varies from release to release):
|
In the server log, you will see a message that looks something like this (varies from release to release):
|
||||||
|
|
||||||
```
|
```
|
||||||
Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v12 rocm_v5]
|
Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v11 rocm_v5]
|
||||||
```
|
```
|
||||||
|
|
||||||
**Experimental LLM Library Override**
|
**Experimental LLM Library Override**
|
||||||
|
|
|
||||||
|
|
@ -149,9 +149,22 @@ func Bool(k string) func() bool {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// LogLevel returns the log level for the application.
|
||||||
|
// Values are 0 or false INFO (Default), 1 or true DEBUG, 2 TRACE
|
||||||
|
func LogLevel() slog.Level {
|
||||||
|
level := slog.LevelInfo
|
||||||
|
if s := Var("OLLAMA_DEBUG"); s != "" {
|
||||||
|
if b, _ := strconv.ParseBool(s); b {
|
||||||
|
level = slog.LevelDebug
|
||||||
|
} else if i, _ := strconv.ParseInt(s, 10, 64); i != 0 {
|
||||||
|
level = slog.Level(i * -4)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return level
|
||||||
|
}
|
||||||
|
|
||||||
var (
|
var (
|
||||||
// Debug enabled additional debug information.
|
|
||||||
Debug = Bool("OLLAMA_DEBUG")
|
|
||||||
// FlashAttention enables the experimental flash attention feature.
|
// FlashAttention enables the experimental flash attention feature.
|
||||||
FlashAttention = Bool("OLLAMA_FLASH_ATTENTION")
|
FlashAttention = Bool("OLLAMA_FLASH_ATTENTION")
|
||||||
// KvCacheType is the quantization type for the K/V cache.
|
// KvCacheType is the quantization type for the K/V cache.
|
||||||
|
|
@ -170,6 +183,8 @@ var (
|
||||||
NewEngine = Bool("OLLAMA_NEW_ENGINE")
|
NewEngine = Bool("OLLAMA_NEW_ENGINE")
|
||||||
// ContextLength sets the default context length
|
// ContextLength sets the default context length
|
||||||
ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 4096)
|
ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 4096)
|
||||||
|
// Auth enables authentication between the Ollama client and server
|
||||||
|
UseAuth = Bool("OLLAMA_AUTH")
|
||||||
)
|
)
|
||||||
|
|
||||||
func String(s string) func() string {
|
func String(s string) func() string {
|
||||||
|
|
@ -209,8 +224,6 @@ var (
|
||||||
MaxRunners = Uint("OLLAMA_MAX_LOADED_MODELS", 0)
|
MaxRunners = Uint("OLLAMA_MAX_LOADED_MODELS", 0)
|
||||||
// MaxQueue sets the maximum number of queued requests. MaxQueue can be configured via the OLLAMA_MAX_QUEUE environment variable.
|
// MaxQueue sets the maximum number of queued requests. MaxQueue can be configured via the OLLAMA_MAX_QUEUE environment variable.
|
||||||
MaxQueue = Uint("OLLAMA_MAX_QUEUE", 512)
|
MaxQueue = Uint("OLLAMA_MAX_QUEUE", 512)
|
||||||
// MaxVRAM sets a maximum VRAM override in bytes. MaxVRAM can be configured via the OLLAMA_MAX_VRAM environment variable.
|
|
||||||
MaxVRAM = Uint("OLLAMA_MAX_VRAM", 0)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
func Uint64(key string, defaultValue uint64) func() uint64 {
|
func Uint64(key string, defaultValue uint64) func() uint64 {
|
||||||
|
|
@ -238,7 +251,7 @@ type EnvVar struct {
|
||||||
|
|
||||||
func AsMap() map[string]EnvVar {
|
func AsMap() map[string]EnvVar {
|
||||||
ret := map[string]EnvVar{
|
ret := map[string]EnvVar{
|
||||||
"OLLAMA_DEBUG": {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
|
"OLLAMA_DEBUG": {"OLLAMA_DEBUG", LogLevel(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
|
||||||
"OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"},
|
"OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"},
|
||||||
"OLLAMA_KV_CACHE_TYPE": {"OLLAMA_KV_CACHE_TYPE", KvCacheType(), "Quantization type for the K/V cache (default: f16)"},
|
"OLLAMA_KV_CACHE_TYPE": {"OLLAMA_KV_CACHE_TYPE", KvCacheType(), "Quantization type for the K/V cache (default: f16)"},
|
||||||
"OLLAMA_GPU_OVERHEAD": {"OLLAMA_GPU_OVERHEAD", GpuOverhead(), "Reserve a portion of VRAM per GPU (bytes)"},
|
"OLLAMA_GPU_OVERHEAD": {"OLLAMA_GPU_OVERHEAD", GpuOverhead(), "Reserve a portion of VRAM per GPU (bytes)"},
|
||||||
|
|
|
||||||
|
|
@ -1,11 +1,13 @@
|
||||||
package envconfig
|
package envconfig
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"log/slog"
|
||||||
"math"
|
"math"
|
||||||
"testing"
|
"testing"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/google/go-cmp/cmp"
|
"github.com/google/go-cmp/cmp"
|
||||||
|
"github.com/ollama/ollama/logutil"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestHost(t *testing.T) {
|
func TestHost(t *testing.T) {
|
||||||
|
|
@ -292,3 +294,34 @@ func TestContextLength(t *testing.T) {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestLogLevel(t *testing.T) {
|
||||||
|
cases := map[string]slog.Level{
|
||||||
|
// Default to INFO
|
||||||
|
"": slog.LevelInfo,
|
||||||
|
"false": slog.LevelInfo,
|
||||||
|
"f": slog.LevelInfo,
|
||||||
|
"0": slog.LevelInfo,
|
||||||
|
|
||||||
|
// True values enable Debug
|
||||||
|
"true": slog.LevelDebug,
|
||||||
|
"t": slog.LevelDebug,
|
||||||
|
|
||||||
|
// Positive values increase verbosity
|
||||||
|
"1": slog.LevelDebug,
|
||||||
|
"2": logutil.LevelTrace,
|
||||||
|
|
||||||
|
// Negative values decrease verbosity
|
||||||
|
"-1": slog.LevelWarn,
|
||||||
|
"-2": slog.LevelError,
|
||||||
|
}
|
||||||
|
|
||||||
|
for k, v := range cases {
|
||||||
|
t.Run(k, func(t *testing.T) {
|
||||||
|
t.Setenv("OLLAMA_DEBUG", k)
|
||||||
|
if i := LogLevel(); i != v {
|
||||||
|
t.Errorf("%s: expected %d, got %d", k, v, i)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@ -15,6 +15,7 @@ import (
|
||||||
type GGML struct {
|
type GGML struct {
|
||||||
container
|
container
|
||||||
model
|
model
|
||||||
|
Length int64
|
||||||
}
|
}
|
||||||
|
|
||||||
type model interface {
|
type model interface {
|
||||||
|
|
@ -170,6 +171,8 @@ func (kv KV) OllamaEngineRequired() bool {
|
||||||
"gemma3",
|
"gemma3",
|
||||||
"mistral3",
|
"mistral3",
|
||||||
"llama4",
|
"llama4",
|
||||||
|
"mllama",
|
||||||
|
"qwen25vl",
|
||||||
}, kv.Architecture())
|
}, kv.Architecture())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -429,12 +432,12 @@ func DetectContentType(b []byte) string {
|
||||||
//
|
//
|
||||||
// It collects array values for arrays with a size less than or equal to
|
// It collects array values for arrays with a size less than or equal to
|
||||||
// maxArraySize. If the maxArraySize is negative, all arrays are collected.
|
// maxArraySize. If the maxArraySize is negative, all arrays are collected.
|
||||||
func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
|
func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {
|
||||||
rs = bufioutil.NewBufferedSeeker(rs, 32<<10)
|
rs = bufioutil.NewBufferedSeeker(rs, 32<<10)
|
||||||
|
|
||||||
var magic uint32
|
var magic uint32
|
||||||
if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
|
if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
|
||||||
return nil, 0, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
var c container
|
var c container
|
||||||
|
|
@ -444,24 +447,25 @@ func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
|
||||||
case FILE_MAGIC_GGUF_BE:
|
case FILE_MAGIC_GGUF_BE:
|
||||||
c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
|
c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
|
||||||
default:
|
default:
|
||||||
return nil, 0, errors.New("invalid file magic")
|
return nil, errors.New("invalid file magic")
|
||||||
}
|
}
|
||||||
|
|
||||||
model, err := c.Decode(rs)
|
model, err := c.Decode(rs)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, 0, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
offset, err := rs.Seek(0, io.SeekCurrent)
|
offset, err := rs.Seek(0, io.SeekCurrent)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, 0, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
// final model type
|
// final model type
|
||||||
return &GGML{
|
return &GGML{
|
||||||
container: c,
|
container: c,
|
||||||
model: model,
|
model: model,
|
||||||
}, offset, nil
|
Length: offset,
|
||||||
|
}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) (kv []uint64, partialOffload, fullOffload uint64) {
|
func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) (kv []uint64, partialOffload, fullOffload uint64) {
|
||||||
|
|
@ -693,6 +697,20 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
|
||||||
graphSize = 4 * (imageSize*imageSize*numChannels +
|
graphSize = 4 * (imageSize*imageSize*numChannels +
|
||||||
embeddingLength*patchSize +
|
embeddingLength*patchSize +
|
||||||
numPatches*numPatches*headCount)
|
numPatches*numPatches*headCount)
|
||||||
|
case "qwen25vl":
|
||||||
|
maxPixels := uint64(llm.KV().Uint("vision.max_pixels", 28*28*1280))
|
||||||
|
|
||||||
|
numPatches := maxPixels / (patchSize * patchSize)
|
||||||
|
|
||||||
|
graphSize = 4 * (maxPixels*numChannels + // Original image storage
|
||||||
|
// Normalized pixels
|
||||||
|
maxPixels*numChannels +
|
||||||
|
// Patches storage (numPatches * channels * patchSize^2)
|
||||||
|
numPatches*numChannels*patchSize*patchSize +
|
||||||
|
// Self-attention calculations
|
||||||
|
numPatches*numPatches*headCount +
|
||||||
|
// Additional buffer for processing
|
||||||
|
embeddingLength*numPatches)
|
||||||
case "llama4":
|
case "llama4":
|
||||||
// vision graph is computed independently in the same schedule
|
// vision graph is computed independently in the same schedule
|
||||||
// and is negligible compared to the worst case text graph
|
// and is negligible compared to the worst case text graph
|
||||||
|
|
|
||||||
|
|
@ -527,23 +527,17 @@ func WriteGGUF(f *os.File, kv KV, ts []*Tensor) error {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
keys := slices.Collect(maps.Keys(kv))
|
for _, key := range slices.Sorted(maps.Keys(kv)) {
|
||||||
slices.Sort(keys)
|
|
||||||
|
|
||||||
for _, key := range keys {
|
|
||||||
if err := ggufWriteKV(f, key, kv[key]); err != nil {
|
if err := ggufWriteKV(f, key, kv[key]); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
slices.SortStableFunc(ts, func(a, b *Tensor) int {
|
slices.SortStableFunc(ts, func(a, b *Tensor) int {
|
||||||
if i, j := a.block(), b.block(); i < 0 && j > 0 {
|
if i, j := a.block(), b.block(); i > 0 && j > 0 {
|
||||||
return 1
|
|
||||||
} else if i > 0 && j < 0 {
|
|
||||||
return -1
|
|
||||||
} else {
|
|
||||||
return cmp.Compare(i, j)
|
return cmp.Compare(i, j)
|
||||||
}
|
}
|
||||||
|
return cmp.Compare(a.Name, b.Name)
|
||||||
})
|
})
|
||||||
|
|
||||||
var s uint64
|
var s uint64
|
||||||
|
|
|
||||||
|
|
@ -2,62 +2,82 @@ package ggml
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
|
"math/rand/v2"
|
||||||
"os"
|
"os"
|
||||||
"slices"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
"github.com/google/go-cmp/cmp"
|
"github.com/google/go-cmp/cmp"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestWriteGGUF(t *testing.T) {
|
func TestWriteGGUF(t *testing.T) {
|
||||||
w, err := os.CreateTemp(t.TempDir(), "*.bin")
|
r := rand.New(rand.NewPCG(0, 0))
|
||||||
if err != nil {
|
for range 8 {
|
||||||
t.Fatal(err)
|
t.Run("shuffle", func(t *testing.T) {
|
||||||
}
|
t.Parallel()
|
||||||
defer w.Close()
|
|
||||||
|
|
||||||
if err := WriteGGUF(w, KV{
|
ts := []*Tensor{
|
||||||
"general.alignment": uint32(16),
|
{Name: "token_embd.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))},
|
||||||
}, []*Tensor{
|
{Name: "blk.0.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))},
|
||||||
{Name: "test.0", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
|
{Name: "blk.1.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))},
|
||||||
{Name: "test.1", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
|
{Name: "blk.2.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))},
|
||||||
{Name: "test.2", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
|
{Name: "blk.3.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))},
|
||||||
{Name: "test.3", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
|
{Name: "blk.4.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))},
|
||||||
{Name: "test.4", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
|
{Name: "blk.5.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(make([]byte, 2*3))},
|
||||||
{Name: "test.5", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
|
{Name: "output_norm.weight", Shape: []uint64{3, 2}, WriterTo: bytes.NewBuffer(make([]byte, 3*2))},
|
||||||
}); err != nil {
|
{Name: "output.weight", Shape: []uint64{3, 2}, WriterTo: bytes.NewBuffer(make([]byte, 3*2))},
|
||||||
t.Fatal(err)
|
}
|
||||||
}
|
|
||||||
|
|
||||||
r, err := os.Open(w.Name())
|
r.Shuffle(len(ts), func(i, j int) {
|
||||||
if err != nil {
|
ts[i], ts[j] = ts[j], ts[i]
|
||||||
t.Fatal(err)
|
})
|
||||||
}
|
|
||||||
defer r.Close()
|
|
||||||
|
|
||||||
ff, _, err := Decode(r, 0)
|
w, err := os.CreateTemp(t.TempDir(), strings.ReplaceAll(t.Name(), "/", "_")+"*.bin")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
|
defer w.Close()
|
||||||
|
|
||||||
if diff := cmp.Diff(ff.KV(), KV{
|
if err := WriteGGUF(w, KV{
|
||||||
"general.alignment": uint32(16),
|
"general.alignment": uint32(16),
|
||||||
"general.parameter_count": uint64(36),
|
}, ts); err != nil {
|
||||||
}); diff != "" {
|
t.Fatal(err)
|
||||||
t.Errorf("Mismatch (-want +got):\n%s", diff)
|
}
|
||||||
}
|
|
||||||
|
|
||||||
if diff := cmp.Diff(ff.Tensors(), Tensors{
|
r, err := os.Open(w.Name())
|
||||||
Offset: 336,
|
if err != nil {
|
||||||
items: []*Tensor{
|
t.Fatal(err)
|
||||||
{Name: "test.0", Offset: 0, Shape: []uint64{2, 3}},
|
}
|
||||||
{Name: "test.1", Offset: 32, Shape: []uint64{2, 3}},
|
defer r.Close()
|
||||||
{Name: "test.2", Offset: 64, Shape: []uint64{2, 3}},
|
|
||||||
{Name: "test.3", Offset: 96, Shape: []uint64{2, 3}},
|
ff, err := Decode(r, 0)
|
||||||
{Name: "test.4", Offset: 128, Shape: []uint64{2, 3}},
|
if err != nil {
|
||||||
{Name: "test.5", Offset: 160, Shape: []uint64{2, 3}},
|
t.Fatal(err)
|
||||||
},
|
}
|
||||||
}, cmp.AllowUnexported(Tensors{})); diff != "" {
|
|
||||||
t.Errorf("Mismatch (-want +got):\n%s", diff)
|
if diff := cmp.Diff(KV{
|
||||||
|
"general.alignment": uint32(16),
|
||||||
|
"general.parameter_count": uint64(54),
|
||||||
|
}, ff.KV()); diff != "" {
|
||||||
|
t.Errorf("Mismatch (-want +got):\n%s", diff)
|
||||||
|
}
|
||||||
|
|
||||||
|
if diff := cmp.Diff(Tensors{
|
||||||
|
Offset: 608,
|
||||||
|
items: []*Tensor{
|
||||||
|
{Name: "blk.0.attn_norm.weight", Offset: 0, Shape: []uint64{2, 3}},
|
||||||
|
{Name: "blk.1.attn_norm.weight", Offset: 32, Shape: []uint64{2, 3}},
|
||||||
|
{Name: "blk.2.attn_norm.weight", Offset: 64, Shape: []uint64{2, 3}},
|
||||||
|
{Name: "blk.3.attn_norm.weight", Offset: 96, Shape: []uint64{2, 3}},
|
||||||
|
{Name: "blk.4.attn_norm.weight", Offset: 128, Shape: []uint64{2, 3}},
|
||||||
|
{Name: "blk.5.attn_norm.weight", Offset: 160, Shape: []uint64{2, 3}},
|
||||||
|
{Name: "output.weight", Offset: 192, Shape: []uint64{3, 2}},
|
||||||
|
{Name: "output_norm.weight", Offset: 224, Shape: []uint64{3, 2}},
|
||||||
|
{Name: "token_embd.weight", Offset: 256, Shape: []uint64{2, 3}},
|
||||||
|
},
|
||||||
|
}, ff.Tensors(), cmp.AllowUnexported(Tensors{})); diff != "" {
|
||||||
|
t.Errorf("Mismatch (-want +got):\n%s", diff)
|
||||||
|
}
|
||||||
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
125
fs/ggml/type.go
125
fs/ggml/type.go
|
|
@ -12,42 +12,42 @@ type FileType uint32
|
||||||
const (
|
const (
|
||||||
FileTypeF32 FileType = iota
|
FileTypeF32 FileType = iota
|
||||||
FileTypeF16
|
FileTypeF16
|
||||||
FileTypeQ4_0
|
fileTypeQ4_0
|
||||||
FileTypeQ4_1
|
fileTypeQ4_1
|
||||||
fileTypeQ4_1_F16 // unused by GGML
|
fileTypeQ4_1_F16 // unused by GGML
|
||||||
fileTypeQ4_2 // unused by GGML
|
fileTypeQ4_2 // unused by GGML
|
||||||
fileTypeQ4_3 // unused by GGML
|
fileTypeQ4_3 // unused by GGML
|
||||||
FileTypeQ8_0
|
FileTypeQ8_0
|
||||||
FileTypeQ5_0
|
fileTypeQ5_0
|
||||||
FileTypeQ5_1
|
fileTypeQ5_1
|
||||||
FileTypeQ2_K
|
fileTypeQ2_K
|
||||||
FileTypeQ3_K_S
|
fileTypeQ3_K_S
|
||||||
FileTypeQ3_K_M
|
fileTypeQ3_K_M
|
||||||
FileTypeQ3_K_L
|
fileTypeQ3_K_L
|
||||||
FileTypeQ4_K_S
|
FileTypeQ4_K_S
|
||||||
FileTypeQ4_K_M
|
FileTypeQ4_K_M
|
||||||
FileTypeQ5_K_S
|
fileTypeQ5_K_S
|
||||||
FileTypeQ5_K_M
|
fileTypeQ5_K_M
|
||||||
FileTypeQ6_K
|
fileTypeQ6_K
|
||||||
fileTypeIQ2_XXS // not supported by ollama
|
fileTypeIQ2_XXS
|
||||||
fileTypeIQ2_XS // not supported by ollama
|
fileTypeIQ2_XS
|
||||||
FileTypeQ2_K_S
|
fileTypeQ2_K_S
|
||||||
fileTypeIQ3_XS // not supported by ollama
|
fileTypeIQ3_XS
|
||||||
fileTypeIQ3_XXS // not supported by ollama
|
fileTypeIQ3_XXS
|
||||||
fileTypeIQ1_S // not supported by ollama
|
fileTypeIQ1_S
|
||||||
fileTypeIQ4_NL // not supported by ollama
|
fileTypeIQ4_NL
|
||||||
fileTypeIQ3_S // not supported by ollama
|
fileTypeIQ3_S
|
||||||
fileTypeIQ3_M // not supported by ollama
|
fileTypeIQ3_M
|
||||||
fileTypeIQ2_S // not supported by ollama
|
fileTypeIQ2_S
|
||||||
fileTypeIQ2_M // not supported by ollama
|
fileTypeIQ2_M
|
||||||
fileTypeIQ4_XS // not supported by ollama
|
fileTypeIQ4_XS
|
||||||
fileTypeIQ1_M // not supported by ollama
|
fileTypeIQ1_M
|
||||||
FileTypeBF16
|
FileTypeBF16
|
||||||
fileTypeQ4_0_4_4 // unused by GGML
|
fileTypeQ4_0_4_4 // unused by GGML
|
||||||
fileTypeQ4_0_4_8 // unused by GGML
|
fileTypeQ4_0_4_8 // unused by GGML
|
||||||
fileTypeQ4_0_8_8 // unused by GGML
|
fileTypeQ4_0_8_8 // unused by GGML
|
||||||
fileTypeTQ1_0 // not supported by ollama
|
fileTypeTQ1_0
|
||||||
fileTypeTQ2_0 // not supported by ollama
|
fileTypeTQ2_0
|
||||||
|
|
||||||
FileTypeUnknown = 1024
|
FileTypeUnknown = 1024
|
||||||
)
|
)
|
||||||
|
|
@ -60,36 +60,12 @@ func ParseFileType(s string) (FileType, error) {
|
||||||
return FileTypeF32, nil
|
return FileTypeF32, nil
|
||||||
case "F16":
|
case "F16":
|
||||||
return FileTypeF16, nil
|
return FileTypeF16, nil
|
||||||
case "Q4_0":
|
|
||||||
return FileTypeQ4_0, nil
|
|
||||||
case "Q4_1":
|
|
||||||
return FileTypeQ4_1, nil
|
|
||||||
case "Q8_0":
|
case "Q8_0":
|
||||||
return FileTypeQ8_0, nil
|
return FileTypeQ8_0, nil
|
||||||
case "Q5_0":
|
|
||||||
return FileTypeQ5_0, nil
|
|
||||||
case "Q5_1":
|
|
||||||
return FileTypeQ5_1, nil
|
|
||||||
case "Q2_K":
|
|
||||||
return FileTypeQ2_K, nil
|
|
||||||
case "Q3_K_S":
|
|
||||||
return FileTypeQ3_K_S, nil
|
|
||||||
case "Q3_K_M":
|
|
||||||
return FileTypeQ3_K_M, nil
|
|
||||||
case "Q3_K_L":
|
|
||||||
return FileTypeQ3_K_L, nil
|
|
||||||
case "Q4_K_S":
|
case "Q4_K_S":
|
||||||
return FileTypeQ4_K_S, nil
|
return FileTypeQ4_K_S, nil
|
||||||
case "Q4_K_M", "Q4_K":
|
case "Q4_K_M", "Q4_K":
|
||||||
return FileTypeQ4_K_M, nil
|
return FileTypeQ4_K_M, nil
|
||||||
case "Q5_K_S":
|
|
||||||
return FileTypeQ5_K_S, nil
|
|
||||||
case "Q5_K_M", "Q5_K":
|
|
||||||
return FileTypeQ5_K_M, nil
|
|
||||||
case "Q6_K":
|
|
||||||
return FileTypeQ6_K, nil
|
|
||||||
case "Q2_K_S":
|
|
||||||
return FileTypeQ2_K_S, nil
|
|
||||||
case "BF16":
|
case "BF16":
|
||||||
return FileTypeBF16, nil
|
return FileTypeBF16, nil
|
||||||
default:
|
default:
|
||||||
|
|
@ -111,40 +87,41 @@ func ParseFileType(s string) (FileType, error) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (t FileType) String() string {
|
func (t FileType) String() string {
|
||||||
|
// Note: this routine will return a broader set of file types for existing models
|
||||||
switch t {
|
switch t {
|
||||||
case FileTypeF32:
|
case FileTypeF32:
|
||||||
return "F32"
|
return "F32"
|
||||||
case FileTypeF16:
|
case FileTypeF16:
|
||||||
return "F16"
|
return "F16"
|
||||||
case FileTypeQ4_0:
|
case fileTypeQ4_0:
|
||||||
return "Q4_0"
|
return "Q4_0"
|
||||||
case FileTypeQ4_1:
|
case fileTypeQ4_1:
|
||||||
return "Q4_1"
|
return "Q4_1"
|
||||||
case FileTypeQ8_0:
|
case FileTypeQ8_0:
|
||||||
return "Q8_0"
|
return "Q8_0"
|
||||||
case FileTypeQ5_0:
|
case fileTypeQ5_0:
|
||||||
return "Q5_0"
|
return "Q5_0"
|
||||||
case FileTypeQ5_1:
|
case fileTypeQ5_1:
|
||||||
return "Q5_1"
|
return "Q5_1"
|
||||||
case FileTypeQ2_K:
|
case fileTypeQ2_K:
|
||||||
return "Q2_K"
|
return "Q2_K"
|
||||||
case FileTypeQ3_K_S:
|
case fileTypeQ3_K_S:
|
||||||
return "Q3_K_S"
|
return "Q3_K_S"
|
||||||
case FileTypeQ3_K_M:
|
case fileTypeQ3_K_M:
|
||||||
return "Q3_K_M"
|
return "Q3_K_M"
|
||||||
case FileTypeQ3_K_L:
|
case fileTypeQ3_K_L:
|
||||||
return "Q3_K_L"
|
return "Q3_K_L"
|
||||||
case FileTypeQ4_K_S:
|
case FileTypeQ4_K_S:
|
||||||
return "Q4_K_S"
|
return "Q4_K_S"
|
||||||
case FileTypeQ4_K_M:
|
case FileTypeQ4_K_M:
|
||||||
return "Q4_K_M"
|
return "Q4_K_M"
|
||||||
case FileTypeQ5_K_S:
|
case fileTypeQ5_K_S:
|
||||||
return "Q5_K_S"
|
return "Q5_K_S"
|
||||||
case FileTypeQ5_K_M:
|
case fileTypeQ5_K_M:
|
||||||
return "Q5_K_M"
|
return "Q5_K_M"
|
||||||
case FileTypeQ6_K:
|
case fileTypeQ6_K:
|
||||||
return "Q6_K"
|
return "Q6_K"
|
||||||
case FileTypeQ2_K_S:
|
case fileTypeQ2_K_S:
|
||||||
return "Q2_K_S"
|
return "Q2_K_S"
|
||||||
case FileTypeBF16:
|
case FileTypeBF16:
|
||||||
return "BF16"
|
return "BF16"
|
||||||
|
|
@ -163,35 +140,35 @@ func (ftype FileType) ToTensorType() TensorType {
|
||||||
return TensorTypeF32
|
return TensorTypeF32
|
||||||
case FileTypeF16:
|
case FileTypeF16:
|
||||||
return TensorTypeF16
|
return TensorTypeF16
|
||||||
case FileTypeQ4_0:
|
case fileTypeQ4_0:
|
||||||
return TensorTypeQ4_0
|
return TensorTypeQ4_0
|
||||||
case FileTypeQ4_1:
|
case fileTypeQ4_1:
|
||||||
return TensorTypeQ4_1
|
return TensorTypeQ4_1
|
||||||
case FileTypeQ8_0:
|
case FileTypeQ8_0:
|
||||||
return TensorTypeQ8_0
|
return TensorTypeQ8_0
|
||||||
case FileTypeQ5_0:
|
case fileTypeQ5_0:
|
||||||
return TensorTypeQ5_0
|
return TensorTypeQ5_0
|
||||||
case FileTypeQ5_1:
|
case fileTypeQ5_1:
|
||||||
return TensorTypeQ5_1
|
return TensorTypeQ5_1
|
||||||
case FileTypeQ2_K:
|
case fileTypeQ2_K:
|
||||||
return TensorTypeQ2_K
|
return TensorTypeQ2_K
|
||||||
case FileTypeQ3_K_S:
|
case fileTypeQ3_K_S:
|
||||||
return TensorTypeQ3_K
|
return TensorTypeQ3_K
|
||||||
case FileTypeQ3_K_M:
|
case fileTypeQ3_K_M:
|
||||||
return TensorTypeQ3_K
|
return TensorTypeQ3_K
|
||||||
case FileTypeQ3_K_L:
|
case fileTypeQ3_K_L:
|
||||||
return TensorTypeQ3_K
|
return TensorTypeQ3_K
|
||||||
case FileTypeQ4_K_S:
|
case FileTypeQ4_K_S:
|
||||||
return TensorTypeQ4_K
|
return TensorTypeQ4_K
|
||||||
case FileTypeQ4_K_M:
|
case FileTypeQ4_K_M:
|
||||||
return TensorTypeQ4_K
|
return TensorTypeQ4_K
|
||||||
case FileTypeQ5_K_S:
|
case fileTypeQ5_K_S:
|
||||||
return TensorTypeQ5_K
|
return TensorTypeQ5_K
|
||||||
case FileTypeQ5_K_M:
|
case fileTypeQ5_K_M:
|
||||||
return TensorTypeQ5_K
|
return TensorTypeQ5_K
|
||||||
case FileTypeQ6_K:
|
case fileTypeQ6_K:
|
||||||
return TensorTypeQ6_K
|
return TensorTypeQ6_K
|
||||||
case FileTypeQ2_K_S:
|
case fileTypeQ2_K_S:
|
||||||
return TensorTypeQ2_K
|
return TensorTypeQ2_K
|
||||||
case FileTypeBF16:
|
case FileTypeBF16:
|
||||||
return TensorTypeBF16
|
return TensorTypeBF16
|
||||||
|
|
|
||||||
347
fs/gguf/gguf.go
Normal file
347
fs/gguf/gguf.go
Normal file
|
|
@ -0,0 +1,347 @@
|
||||||
|
package gguf
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"cmp"
|
||||||
|
"encoding/binary"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"iter"
|
||||||
|
"os"
|
||||||
|
"slices"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
typeUint8 uint32 = iota
|
||||||
|
typeInt8
|
||||||
|
typeUint16
|
||||||
|
typeInt16
|
||||||
|
typeUint32
|
||||||
|
typeInt32
|
||||||
|
typeFloat32
|
||||||
|
typeBool
|
||||||
|
typeString
|
||||||
|
typeArray
|
||||||
|
typeUint64
|
||||||
|
typeInt64
|
||||||
|
typeFloat64
|
||||||
|
)
|
||||||
|
|
||||||
|
var ErrUnsupported = errors.New("unsupported")
|
||||||
|
|
||||||
|
type File struct {
|
||||||
|
Magic [4]byte
|
||||||
|
Version uint32
|
||||||
|
|
||||||
|
keyValues *lazy[KeyValue]
|
||||||
|
tensors *lazy[TensorInfo]
|
||||||
|
offset int64
|
||||||
|
|
||||||
|
file *os.File
|
||||||
|
reader *bufferedReader
|
||||||
|
bts []byte
|
||||||
|
}
|
||||||
|
|
||||||
|
func Open(path string) (f *File, err error) {
|
||||||
|
f = &File{bts: make([]byte, 4096)}
|
||||||
|
f.file, err = os.Open(path)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
f.reader = newBufferedReader(f.file, 32<<10)
|
||||||
|
|
||||||
|
if err := binary.Read(f.reader, binary.LittleEndian, &f.Magic); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
if bytes.Equal(f.Magic[:], []byte("gguf")) {
|
||||||
|
return nil, fmt.Errorf("%w file type %v", ErrUnsupported, f.Magic)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := binary.Read(f.reader, binary.LittleEndian, &f.Version); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
if f.Version < 2 {
|
||||||
|
return nil, fmt.Errorf("%w version %v", ErrUnsupported, f.Version)
|
||||||
|
}
|
||||||
|
|
||||||
|
f.tensors, err = newLazy(f, f.readTensor)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
f.tensors.successFunc = func() error {
|
||||||
|
offset := f.reader.offset
|
||||||
|
|
||||||
|
alignment := cmp.Or(f.KeyValue("general.alignment").Int(), 32)
|
||||||
|
f.offset = offset + (alignment-offset%alignment)%alignment
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
f.keyValues, err = newLazy(f, f.readKeyValue)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return f, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f *File) readTensor() (TensorInfo, error) {
|
||||||
|
name, err := readString(f)
|
||||||
|
if err != nil {
|
||||||
|
return TensorInfo{}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
dims, err := read[uint32](f)
|
||||||
|
if err != nil {
|
||||||
|
return TensorInfo{}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
shape := make([]uint64, dims)
|
||||||
|
for i := range dims {
|
||||||
|
shape[i], err = read[uint64](f)
|
||||||
|
if err != nil {
|
||||||
|
return TensorInfo{}, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
type_, err := read[uint32](f)
|
||||||
|
if err != nil {
|
||||||
|
return TensorInfo{}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
offset, err := read[uint64](f)
|
||||||
|
if err != nil {
|
||||||
|
return TensorInfo{}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return TensorInfo{
|
||||||
|
Name: name,
|
||||||
|
Offset: offset,
|
||||||
|
Shape: shape,
|
||||||
|
Type: TensorType(type_),
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f *File) readKeyValue() (KeyValue, error) {
|
||||||
|
key, err := readString(f)
|
||||||
|
if err != nil {
|
||||||
|
return KeyValue{}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
t, err := read[uint32](f)
|
||||||
|
if err != nil {
|
||||||
|
return KeyValue{}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
value, err := func() (any, error) {
|
||||||
|
switch t {
|
||||||
|
case typeUint8:
|
||||||
|
return read[uint8](f)
|
||||||
|
case typeInt8:
|
||||||
|
return read[int8](f)
|
||||||
|
case typeUint16:
|
||||||
|
return read[uint16](f)
|
||||||
|
case typeInt16:
|
||||||
|
return read[int16](f)
|
||||||
|
case typeUint32:
|
||||||
|
return read[uint32](f)
|
||||||
|
case typeInt32:
|
||||||
|
return read[int32](f)
|
||||||
|
case typeUint64:
|
||||||
|
return read[uint64](f)
|
||||||
|
case typeInt64:
|
||||||
|
return read[int64](f)
|
||||||
|
case typeFloat32:
|
||||||
|
return read[float32](f)
|
||||||
|
case typeFloat64:
|
||||||
|
return read[float64](f)
|
||||||
|
case typeBool:
|
||||||
|
return read[bool](f)
|
||||||
|
case typeString:
|
||||||
|
return readString(f)
|
||||||
|
case typeArray:
|
||||||
|
return readArray(f)
|
||||||
|
default:
|
||||||
|
return nil, fmt.Errorf("%w type %d", ErrUnsupported, t)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
if err != nil {
|
||||||
|
return KeyValue{}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return KeyValue{
|
||||||
|
Key: key,
|
||||||
|
Value: Value{value},
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func read[T any](f *File) (t T, err error) {
|
||||||
|
err = binary.Read(f.reader, binary.LittleEndian, &t)
|
||||||
|
return t, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func readString(f *File) (string, error) {
|
||||||
|
n, err := read[uint64](f)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
|
||||||
|
if int(n) > len(f.bts) {
|
||||||
|
f.bts = make([]byte, n)
|
||||||
|
}
|
||||||
|
|
||||||
|
bts := f.bts[:n]
|
||||||
|
if _, err := io.ReadFull(f.reader, bts); err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
defer clear(bts)
|
||||||
|
|
||||||
|
return string(bts), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func readArray(f *File) (any, error) {
|
||||||
|
t, err := read[uint32](f)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
n, err := read[uint64](f)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
switch t {
|
||||||
|
case typeUint8:
|
||||||
|
return readArrayData[uint8](f, n)
|
||||||
|
case typeInt8:
|
||||||
|
return readArrayData[int8](f, n)
|
||||||
|
case typeUint16:
|
||||||
|
return readArrayData[uint16](f, n)
|
||||||
|
case typeInt16:
|
||||||
|
return readArrayData[int16](f, n)
|
||||||
|
case typeUint32:
|
||||||
|
return readArrayData[uint32](f, n)
|
||||||
|
case typeInt32:
|
||||||
|
return readArrayData[int32](f, n)
|
||||||
|
case typeUint64:
|
||||||
|
return readArrayData[uint64](f, n)
|
||||||
|
case typeInt64:
|
||||||
|
return readArrayData[int64](f, n)
|
||||||
|
case typeFloat32:
|
||||||
|
return readArrayData[float32](f, n)
|
||||||
|
case typeFloat64:
|
||||||
|
return readArrayData[float64](f, n)
|
||||||
|
case typeBool:
|
||||||
|
return readArrayData[bool](f, n)
|
||||||
|
case typeString:
|
||||||
|
return readArrayString(f, n)
|
||||||
|
default:
|
||||||
|
return nil, fmt.Errorf("%w type %d", ErrUnsupported, t)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func readArrayData[T any](f *File, n uint64) (s []T, err error) {
|
||||||
|
s = make([]T, n)
|
||||||
|
for i := range n {
|
||||||
|
e, err := read[T](f)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
s[i] = e
|
||||||
|
}
|
||||||
|
|
||||||
|
return s, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func readArrayString(f *File, n uint64) (s []string, err error) {
|
||||||
|
s = make([]string, n)
|
||||||
|
for i := range n {
|
||||||
|
e, err := readString(f)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
s[i] = e
|
||||||
|
}
|
||||||
|
|
||||||
|
return s, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f *File) Close() error {
|
||||||
|
f.keyValues.stop()
|
||||||
|
f.tensors.stop()
|
||||||
|
return f.file.Close()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f *File) KeyValue(key string) KeyValue {
|
||||||
|
if !strings.HasPrefix(key, "general.") && !strings.HasPrefix(key, "tokenizer.") {
|
||||||
|
key = f.KeyValue("general.architecture").String() + "." + key
|
||||||
|
}
|
||||||
|
|
||||||
|
if index := slices.IndexFunc(f.keyValues.values, func(kv KeyValue) bool {
|
||||||
|
return kv.Key == key
|
||||||
|
}); index >= 0 {
|
||||||
|
return f.keyValues.values[index]
|
||||||
|
}
|
||||||
|
|
||||||
|
for keyValue, ok := f.keyValues.next(); ok; keyValue, ok = f.keyValues.next() {
|
||||||
|
if keyValue.Key == key {
|
||||||
|
return keyValue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return KeyValue{}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f *File) NumKeyValues() int {
|
||||||
|
return int(f.keyValues.count)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f *File) KeyValues() iter.Seq2[int, KeyValue] {
|
||||||
|
return f.keyValues.All()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f *File) TensorInfo(name string) TensorInfo {
|
||||||
|
if index := slices.IndexFunc(f.tensors.values, func(t TensorInfo) bool {
|
||||||
|
return t.Name == name
|
||||||
|
}); index >= 0 {
|
||||||
|
return f.tensors.values[index]
|
||||||
|
}
|
||||||
|
|
||||||
|
// fast-forward through key values if we haven't already
|
||||||
|
_ = f.keyValues.rest()
|
||||||
|
for tensor, ok := f.tensors.next(); ok; tensor, ok = f.tensors.next() {
|
||||||
|
if tensor.Name == name {
|
||||||
|
return tensor
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return TensorInfo{}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f *File) NumTensors() int {
|
||||||
|
return int(f.tensors.count)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f *File) TensorInfos() iter.Seq2[int, TensorInfo] {
|
||||||
|
// fast forward through key values if we haven't already
|
||||||
|
f.keyValues.rest()
|
||||||
|
return f.tensors.All()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f *File) TensorReader(name string) (TensorInfo, io.Reader, error) {
|
||||||
|
t := f.TensorInfo(name)
|
||||||
|
if t.NumBytes() == 0 {
|
||||||
|
return TensorInfo{}, nil, fmt.Errorf("tensor %s not found", name)
|
||||||
|
}
|
||||||
|
|
||||||
|
// fast forward through tensor info if we haven't already
|
||||||
|
_ = f.tensors.rest()
|
||||||
|
return t, io.NewSectionReader(f.file, f.offset+int64(t.Offset), t.NumBytes()), nil
|
||||||
|
}
|
||||||
249
fs/gguf/gguf_test.go
Normal file
249
fs/gguf/gguf_test.go
Normal file
|
|
@ -0,0 +1,249 @@
|
||||||
|
package gguf_test
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"os"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/google/go-cmp/cmp"
|
||||||
|
"github.com/google/go-cmp/cmp/cmpopts"
|
||||||
|
"github.com/ollama/ollama/fs/ggml"
|
||||||
|
"github.com/ollama/ollama/fs/gguf"
|
||||||
|
)
|
||||||
|
|
||||||
|
func createBinFile(tb testing.TB) string {
|
||||||
|
tb.Helper()
|
||||||
|
f, err := os.CreateTemp(tb.TempDir(), "")
|
||||||
|
if err != nil {
|
||||||
|
tb.Fatal(err)
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
|
||||||
|
kv := ggml.KV{
|
||||||
|
"general.architecture": "llama",
|
||||||
|
"llama.block_count": uint32(8),
|
||||||
|
"llama.embedding_length": uint32(3),
|
||||||
|
"llama.attention.head_count": uint32(2),
|
||||||
|
"llama.attention.head_count_kv": uint32(2),
|
||||||
|
"llama.attention.key_length": uint32(3),
|
||||||
|
"llama.rope.dimension_count": uint32(4),
|
||||||
|
"llama.rope.freq_base": float32(10000.0),
|
||||||
|
"llama.rope.freq_scale": float32(1.0),
|
||||||
|
"llama.attention.layer_norm_rms_epsilon": float32(1e-6),
|
||||||
|
"tokenizer.ggml.eos_token_id": uint32(0),
|
||||||
|
"tokenizer.ggml.eos_token_ids": []int32{1, 2, 3},
|
||||||
|
"tokenizer.ggml.tokens": []string{"hello", "world"},
|
||||||
|
"tokenizer.ggml.scores": []float32{0, 1},
|
||||||
|
}
|
||||||
|
|
||||||
|
tensors := []*ggml.Tensor{
|
||||||
|
{
|
||||||
|
Name: "token_embd.weight",
|
||||||
|
Kind: 0,
|
||||||
|
Shape: []uint64{2, 3},
|
||||||
|
WriterTo: bytes.NewBuffer(make([]byte, 4*2*3)),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "output.weight",
|
||||||
|
Kind: 0,
|
||||||
|
Shape: []uint64{3, 2},
|
||||||
|
WriterTo: bytes.NewBuffer(make([]byte, 4*3*2)),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for i := range 8 {
|
||||||
|
tensors = append(tensors, &ggml.Tensor{
|
||||||
|
Name: "blk." + strconv.Itoa(i) + ".attn_q.weight",
|
||||||
|
Kind: 0,
|
||||||
|
Shape: []uint64{3, 3},
|
||||||
|
WriterTo: bytes.NewBuffer(make([]byte, 4*3*3)),
|
||||||
|
}, &ggml.Tensor{
|
||||||
|
Name: "blk." + strconv.Itoa(i) + ".attn_k.weight",
|
||||||
|
Kind: 0,
|
||||||
|
Shape: []uint64{3, 3},
|
||||||
|
WriterTo: bytes.NewBuffer(make([]byte, 4*3*3)),
|
||||||
|
}, &ggml.Tensor{
|
||||||
|
Name: "blk." + strconv.Itoa(i) + ".attn_v.weight",
|
||||||
|
Kind: 0,
|
||||||
|
Shape: []uint64{3, 3},
|
||||||
|
WriterTo: bytes.NewBuffer(make([]byte, 4*3*3)),
|
||||||
|
}, &ggml.Tensor{
|
||||||
|
Name: "blk." + strconv.Itoa(i) + ".attn_output.weight",
|
||||||
|
Kind: 0,
|
||||||
|
Shape: []uint64{3, 3},
|
||||||
|
WriterTo: bytes.NewBuffer(make([]byte, 4*3*3)),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := ggml.WriteGGUF(f, kv, tensors); err != nil {
|
||||||
|
tb.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return f.Name()
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRead(t *testing.T) {
|
||||||
|
f, err := gguf.Open(createBinFile(t))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
|
||||||
|
if got := f.KeyValue("does.not.exist").Valid(); got {
|
||||||
|
t.Errorf(`KeyValue("does.not.exist").Valid() = %v, want false`, got)
|
||||||
|
}
|
||||||
|
|
||||||
|
if got := f.KeyValue("general.architecture").String(); got != "llama" {
|
||||||
|
t.Errorf(`KeyValue("general.architecture").String() = %q, want %q`, got, "llama")
|
||||||
|
}
|
||||||
|
|
||||||
|
if got := f.TensorInfo("token_embd.weight"); got.Name != "token_embd.weight" {
|
||||||
|
t.Errorf(`TensorInfo("token_embd.weight").Name = %q, want %q`, got.Name, "token_embd.weight")
|
||||||
|
} else if diff := cmp.Diff(got.Shape, []uint64{2, 3}); diff != "" {
|
||||||
|
t.Errorf(`TensorInfo("token_embd.weight").Shape mismatch (-got +want):\n%s`, diff)
|
||||||
|
} else if got.Type != gguf.TensorTypeF32 {
|
||||||
|
t.Errorf(`TensorInfo("token_embd.weight").Type = %d, want %d`, got.Type, gguf.TensorTypeF32)
|
||||||
|
}
|
||||||
|
|
||||||
|
if got := f.KeyValue("block_count").Uint(); got != 8 {
|
||||||
|
t.Errorf(`KeyValue("block_count").Uint() = %d, want %d`, got, 8)
|
||||||
|
}
|
||||||
|
|
||||||
|
if diff := cmp.Diff(f.KeyValue("tokenizer.ggml.tokens").Strings(), []string{"hello", "world"}); diff != "" {
|
||||||
|
t.Errorf("KeyValue(\"tokenizer.ggml.tokens\").Strings() mismatch (-got +want):\n%s", diff)
|
||||||
|
}
|
||||||
|
|
||||||
|
if diff := cmp.Diff(f.KeyValue("tokenizer.ggml.scores").Floats(), []float64{0, 1}); diff != "" {
|
||||||
|
t.Errorf("KeyValue(\"tokenizer.ggml.scores\").Floats() mismatch (-got +want):\n%s", diff)
|
||||||
|
}
|
||||||
|
|
||||||
|
var kvs []string
|
||||||
|
for _, kv := range f.KeyValues() {
|
||||||
|
if !kv.Valid() {
|
||||||
|
t.Error("found invalid key-value pair:", kv)
|
||||||
|
}
|
||||||
|
|
||||||
|
kvs = append(kvs, kv.Key)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(kvs) != f.NumKeyValues() {
|
||||||
|
t.Errorf("iterated key count = %d, want %d", len(kvs), f.NumKeyValues())
|
||||||
|
}
|
||||||
|
|
||||||
|
if diff := cmp.Diff(kvs, []string{
|
||||||
|
"general.architecture",
|
||||||
|
"llama.block_count",
|
||||||
|
"llama.embedding_length",
|
||||||
|
"llama.attention.head_count",
|
||||||
|
"llama.attention.head_count_kv",
|
||||||
|
"llama.attention.key_length",
|
||||||
|
"llama.rope.dimension_count",
|
||||||
|
"llama.rope.freq_base",
|
||||||
|
"llama.rope.freq_scale",
|
||||||
|
"llama.attention.layer_norm_rms_epsilon",
|
||||||
|
"tokenizer.ggml.eos_token_id",
|
||||||
|
"tokenizer.ggml.eos_token_ids",
|
||||||
|
"tokenizer.ggml.tokens",
|
||||||
|
"tokenizer.ggml.scores",
|
||||||
|
}, cmpopts.SortSlices(strings.Compare)); diff != "" {
|
||||||
|
t.Errorf("KeyValues() mismatch (-got +want):\n%s", diff)
|
||||||
|
}
|
||||||
|
|
||||||
|
var tis []string
|
||||||
|
for _, ti := range f.TensorInfos() {
|
||||||
|
if !ti.Valid() {
|
||||||
|
t.Error("found invalid tensor info:", ti)
|
||||||
|
}
|
||||||
|
|
||||||
|
tis = append(tis, ti.Name)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(tis) != f.NumTensors() {
|
||||||
|
t.Errorf("iterated tensor count = %d, want %d", len(tis), f.NumTensors())
|
||||||
|
}
|
||||||
|
|
||||||
|
if diff := cmp.Diff(tis, []string{
|
||||||
|
"token_embd.weight",
|
||||||
|
"output.weight",
|
||||||
|
"blk.0.attn_q.weight",
|
||||||
|
"blk.0.attn_k.weight",
|
||||||
|
"blk.0.attn_v.weight",
|
||||||
|
"blk.0.attn_output.weight",
|
||||||
|
"blk.1.attn_q.weight",
|
||||||
|
"blk.1.attn_k.weight",
|
||||||
|
"blk.1.attn_v.weight",
|
||||||
|
"blk.1.attn_output.weight",
|
||||||
|
"blk.2.attn_q.weight",
|
||||||
|
"blk.2.attn_k.weight",
|
||||||
|
"blk.2.attn_v.weight",
|
||||||
|
"blk.2.attn_output.weight",
|
||||||
|
"blk.3.attn_q.weight",
|
||||||
|
"blk.3.attn_k.weight",
|
||||||
|
"blk.3.attn_v.weight",
|
||||||
|
"blk.3.attn_output.weight",
|
||||||
|
"blk.4.attn_q.weight",
|
||||||
|
"blk.4.attn_k.weight",
|
||||||
|
"blk.4.attn_v.weight",
|
||||||
|
"blk.4.attn_output.weight",
|
||||||
|
"blk.5.attn_q.weight",
|
||||||
|
"blk.5.attn_k.weight",
|
||||||
|
"blk.5.attn_v.weight",
|
||||||
|
"blk.5.attn_output.weight",
|
||||||
|
"blk.6.attn_q.weight",
|
||||||
|
"blk.6.attn_k.weight",
|
||||||
|
"blk.6.attn_v.weight",
|
||||||
|
"blk.6.attn_output.weight",
|
||||||
|
"blk.7.attn_q.weight",
|
||||||
|
"blk.7.attn_k.weight",
|
||||||
|
"blk.7.attn_v.weight",
|
||||||
|
"blk.7.attn_output.weight",
|
||||||
|
}, cmpopts.SortSlices(strings.Compare)); diff != "" {
|
||||||
|
t.Errorf("TensorInfos() mismatch (-got +want):\n%s", diff)
|
||||||
|
}
|
||||||
|
|
||||||
|
ti, r, err := f.TensorReader("output.weight")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf(`TensorReader("output.weight") error: %v`, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if ti.Name != "output.weight" {
|
||||||
|
t.Errorf(`TensorReader("output.weight").Name = %q, want %q`, ti.Name, "output.weight")
|
||||||
|
} else if diff := cmp.Diff(ti.Shape, []uint64{3, 2}); diff != "" {
|
||||||
|
t.Errorf(`TensorReader("output.weight").Shape mismatch (-got +want):\n%s`, diff)
|
||||||
|
} else if ti.Type != gguf.TensorTypeF32 {
|
||||||
|
t.Errorf(`TensorReader("output.weight").Type = %d, want %d`, ti.Type, gguf.TensorTypeF32)
|
||||||
|
}
|
||||||
|
|
||||||
|
var b bytes.Buffer
|
||||||
|
if _, err := b.ReadFrom(r); err != nil {
|
||||||
|
t.Fatalf(`ReadFrom TensorReader("output.weight") error: %v`, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if b.Len() != int(ti.NumBytes()) {
|
||||||
|
t.Errorf(`ReadFrom TensorReader("output.weight") length = %d, want %d`, b.Len(), ti.NumBytes())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func BenchmarkRead(b *testing.B) {
|
||||||
|
b.ReportAllocs()
|
||||||
|
|
||||||
|
p := createBinFile(b)
|
||||||
|
for b.Loop() {
|
||||||
|
f, err := gguf.Open(p)
|
||||||
|
if err != nil {
|
||||||
|
b.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if got := f.KeyValue("general.architecture").String(); got != "llama" {
|
||||||
|
b.Errorf("got = %q, want %q", got, "llama")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Iterate through some tensors
|
||||||
|
for range f.TensorInfos() {
|
||||||
|
}
|
||||||
|
|
||||||
|
f.Close()
|
||||||
|
}
|
||||||
|
}
|
||||||
90  fs/gguf/keyvalue.go  Normal file

@@ -0,0 +1,90 @@
package gguf

import (
	"reflect"
	"slices"
)

type KeyValue struct {
	Key string
	Value
}

func (kv KeyValue) Valid() bool {
	return kv.Key != "" && kv.Value.value != nil
}

type Value struct {
	value any
}

func value[T any](v Value, kinds ...reflect.Kind) (t T) {
	vv := reflect.ValueOf(v.value)
	if slices.Contains(kinds, vv.Kind()) {
		t = vv.Convert(reflect.TypeOf(t)).Interface().(T)
	}
	return
}

func values[T any](v Value, kinds ...reflect.Kind) (ts []T) {
	switch vv := reflect.ValueOf(v.value); vv.Kind() {
	case reflect.Slice:
		if slices.Contains(kinds, vv.Type().Elem().Kind()) {
			ts = make([]T, vv.Len())
			for i := range vv.Len() {
				ts[i] = vv.Index(i).Convert(reflect.TypeOf(ts[i])).Interface().(T)
			}
		}
	}
	return
}

// Int returns Value as a signed integer. If it is not a signed integer, it returns 0.
func (v Value) Int() int64 {
	return value[int64](v, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64)
}

// Ints returns Value as a signed integer slice. If it is not a signed integer slice, it returns nil.
func (v Value) Ints() (i64s []int64) {
	return values[int64](v, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64)
}

// Uint converts an unsigned integer value to uint64. If the value is not an unsigned integer, it returns 0.
func (v Value) Uint() uint64 {
	return value[uint64](v, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64)
}

// Uints returns Value as an unsigned integer slice. If it is not an unsigned integer slice, it returns nil.
func (v Value) Uints() (u64s []uint64) {
	return values[uint64](v, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64)
}

// Float returns Value as a float. If it is not a float, it returns 0.
func (v Value) Float() float64 {
	return value[float64](v, reflect.Float32, reflect.Float64)
}

// Floats returns Value as a float slice. If it is not a float slice, it returns nil.
func (v Value) Floats() (f64s []float64) {
	return values[float64](v, reflect.Float32, reflect.Float64)
}

// Bool returns Value as a boolean. If it is not a boolean, it returns false.
func (v Value) Bool() bool {
	return value[bool](v, reflect.Bool)
}

// Bools returns Value as a boolean slice. If it is not a boolean slice, it returns nil.
func (v Value) Bools() (bools []bool) {
	return values[bool](v, reflect.Bool)
}

// String returns Value as a string. If it is not a string, it returns an empty string.
func (v Value) String() string {
	return value[string](v, reflect.String)
}

// Strings returns Value as a string slice. If it is not a string slice, it returns nil.
func (v Value) Strings() (strings []string) {
	return values[string](v, reflect.String)
}
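As a quick illustration of the accessors above, the sketch below (not part of the change) reads a few fields through File.KeyValue using keys from the test fixture later in this diff; asking for the wrong kind yields the zero value rather than an error or panic. The path is a placeholder.

package main

import (
	"fmt"

	"github.com/ollama/ollama/fs/gguf"
)

func main() {
	f, err := gguf.Open("model.gguf") // placeholder path
	if err != nil {
		panic(err)
	}
	defer f.Close()

	fmt.Println(f.KeyValue("general.architecture").String()) // e.g. "llama"
	fmt.Println(f.KeyValue("block_count").Uint())            // e.g. 8 for the test fixture
	fmt.Println(f.KeyValue("block_count").Int())             // 0: stored as uint32, not a signed kind
	fmt.Println(f.KeyValue("does.not.exist").Valid())        // false
}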
208  fs/gguf/keyvalue_test.go  Normal file

@@ -0,0 +1,208 @@
package gguf
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/google/go-cmp/cmp"
|
||||||
|
)
|
||||||
|
|
||||||
|
func split(name string, values map[string][]any) (matched []any, unmatched []any) {
|
||||||
|
for key, value := range values {
|
||||||
|
if key == name {
|
||||||
|
matched = value
|
||||||
|
} else {
|
||||||
|
unmatched = append(unmatched, value...)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestValue(t *testing.T) {
|
||||||
|
values := map[string][]any{
|
||||||
|
"int64": {int(42), int8(42), int16(42), int32(42), int64(42)},
|
||||||
|
"uint64": {uint(42), uint8(42), uint16(42), uint32(42), uint64(42)},
|
||||||
|
"float64": {float32(42), float64(42)},
|
||||||
|
"string": {"42", "hello"},
|
||||||
|
"bool": {true, false},
|
||||||
|
}
|
||||||
|
|
||||||
|
t.Run("int64", func(t *testing.T) {
|
||||||
|
matched, unmatched := split("int64", values)
|
||||||
|
for _, v := range matched {
|
||||||
|
kv := KeyValue{"key", Value{v}}
|
||||||
|
if i64 := kv.Int(); i64 != 42 {
|
||||||
|
t.Errorf("expected 42, got %d", i64)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, v := range unmatched {
|
||||||
|
kv := KeyValue{"key", Value{v}}
|
||||||
|
if i64 := kv.Int(); i64 != 0 {
|
||||||
|
t.Errorf("expected 0, got %d", i64)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("uint64", func(t *testing.T) {
|
||||||
|
matched, unmatched := split("uint64", values)
|
||||||
|
for _, v := range matched {
|
||||||
|
kv := KeyValue{"key", Value{v}}
|
||||||
|
if u64 := kv.Uint(); u64 != 42 {
|
||||||
|
t.Errorf("expected 42, got %d", u64)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, v := range unmatched {
|
||||||
|
kv := KeyValue{"key", Value{v}}
|
||||||
|
if u64 := kv.Uint(); u64 != 0 {
|
||||||
|
t.Errorf("expected 0, got %d", u64)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("float64", func(t *testing.T) {
|
||||||
|
matched, unmatched := split("float64", values)
|
||||||
|
for _, v := range matched {
|
||||||
|
kv := KeyValue{"key", Value{v}}
|
||||||
|
if f64 := kv.Float(); f64 != 42 {
|
||||||
|
t.Errorf("expected 42, got %f", f64)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, v := range unmatched {
|
||||||
|
kv := KeyValue{"key", Value{v}}
|
||||||
|
if f64 := kv.Float(); f64 != 0 {
|
||||||
|
t.Errorf("expected 0, got %f", f64)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("string", func(t *testing.T) {
|
||||||
|
matched, unmatched := split("string", values)
|
||||||
|
for _, v := range matched {
|
||||||
|
kv := KeyValue{"key", Value{v}}
|
||||||
|
if s := kv.String(); s != v {
|
||||||
|
t.Errorf("expected 42, got %s", s)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, v := range unmatched {
|
||||||
|
kv := KeyValue{"key", Value{v}}
|
||||||
|
if s := kv.String(); s != "" {
|
||||||
|
t.Errorf("expected empty string, got %q", s)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("bool", func(t *testing.T) {
|
||||||
|
matched, unmatched := split("bool", values)
|
||||||
|
for _, v := range matched {
|
||||||
|
kv := KeyValue{"key", Value{v}}
|
||||||
|
if b := kv.Bool(); b != v {
|
||||||
|
t.Errorf("expected %v, got %v", v, b)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, v := range unmatched {
|
||||||
|
kv := KeyValue{"key", Value{v}}
|
||||||
|
if b := kv.Bool(); b != false {
|
||||||
|
t.Errorf("expected false, got %v", b)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestValues(t *testing.T) {
|
||||||
|
values := map[string][]any{
|
||||||
|
"int64s": {[]int{42}, []int8{42}, []int16{42}, []int32{42}, []int64{42}},
|
||||||
|
"uint64s": {[]uint{42}, []uint8{42}, []uint16{42}, []uint32{42}, []uint64{42}},
|
||||||
|
"float64s": {[]float32{42}, []float64{42}},
|
||||||
|
"strings": {[]string{"42"}, []string{"hello"}},
|
||||||
|
"bools": {[]bool{true}, []bool{false}},
|
||||||
|
}
|
||||||
|
|
||||||
|
t.Run("int64s", func(t *testing.T) {
|
||||||
|
matched, unmatched := split("int64s", values)
|
||||||
|
for _, v := range matched {
|
||||||
|
kv := KeyValue{"key", Value{v}}
|
||||||
|
if diff := cmp.Diff(kv.Ints(), []int64{42}); diff != "" {
|
||||||
|
t.Errorf("diff: %s", diff)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, v := range unmatched {
|
||||||
|
kv := KeyValue{"key", Value{v}}
|
||||||
|
if i64s := kv.Ints(); i64s != nil {
|
||||||
|
t.Errorf("expected nil, got %v", i64s)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("uint64s", func(t *testing.T) {
|
||||||
|
matched, unmatched := split("uint64s", values)
|
||||||
|
for _, v := range matched {
|
||||||
|
kv := KeyValue{"key", Value{v}}
|
||||||
|
if diff := cmp.Diff(kv.Uints(), []uint64{42}); diff != "" {
|
||||||
|
t.Errorf("diff: %s", diff)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, v := range unmatched {
|
||||||
|
kv := KeyValue{"key", Value{v}}
|
||||||
|
if u64s := kv.Uints(); u64s != nil {
|
||||||
|
t.Errorf("expected nil, got %v", u64s)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("float64s", func(t *testing.T) {
|
||||||
|
matched, unmatched := split("float64s", values)
|
||||||
|
for _, v := range matched {
|
||||||
|
kv := KeyValue{"key", Value{v}}
|
||||||
|
if diff := cmp.Diff(kv.Floats(), []float64{42}); diff != "" {
|
||||||
|
t.Errorf("diff: %s", diff)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, v := range unmatched {
|
||||||
|
kv := KeyValue{"key", Value{v}}
|
||||||
|
if f64s := kv.Floats(); f64s != nil {
|
||||||
|
t.Errorf("expected nil, got %v", f64s)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("strings", func(t *testing.T) {
|
||||||
|
matched, unmatched := split("strings", values)
|
||||||
|
for _, v := range matched {
|
||||||
|
kv := KeyValue{"key", Value{v}}
|
||||||
|
if diff := cmp.Diff(kv.Strings(), v); diff != "" {
|
||||||
|
t.Errorf("diff: %s", diff)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, v := range unmatched {
|
||||||
|
kv := KeyValue{"key", Value{v}}
|
||||||
|
if s := kv.Strings(); s != nil {
|
||||||
|
t.Errorf("expected nil, got %v", s)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("bools", func(t *testing.T) {
|
||||||
|
matched, unmatched := split("bools", values)
|
||||||
|
for _, v := range matched {
|
||||||
|
kv := KeyValue{"key", Value{v}}
|
||||||
|
if diff := cmp.Diff(kv.Bools(), v); diff != "" {
|
||||||
|
t.Errorf("diff: %s", diff)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, v := range unmatched {
|
||||||
|
kv := KeyValue{"key", Value{v}}
|
||||||
|
if b := kv.Bools(); b != nil {
|
||||||
|
t.Errorf("expected nil, got %v", b)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
89  fs/gguf/lazy.go  Normal file

@@ -0,0 +1,89 @@
package gguf

import (
	"encoding/binary"
	"iter"
	"log/slog"
)

type lazy[T any] struct {
	count  uint64
	next   func() (T, bool)
	stop   func()
	values []T

	// successFunc is called when all values have been successfully read.
	successFunc func() error
}

func newLazy[T any](f *File, fn func() (T, error)) (*lazy[T], error) {
	it := lazy[T]{}
	if err := binary.Read(f.reader, binary.LittleEndian, &it.count); err != nil {
		return nil, err
	}

	it.values = make([]T, 0)
	it.next, it.stop = iter.Pull(func(yield func(T) bool) {
		for i := range it.count {
			t, err := fn()
			if err != nil {
				slog.Error("error reading tensor", "index", i, "error", err)
				return
			}

			it.values = append(it.values, t)
			if !yield(t) {
				break
			}
		}

		if it.successFunc != nil {
			it.successFunc()
		}
	})

	return &it, nil
}

func (g *lazy[T]) Values() iter.Seq[T] {
	return func(yield func(T) bool) {
		for _, v := range g.All() {
			if !yield(v) {
				break
			}
		}
	}
}

func (g *lazy[T]) All() iter.Seq2[int, T] {
	return func(yield func(int, T) bool) {
		for i := range int(g.count) {
			if i < len(g.values) {
				if !yield(i, g.values[i]) {
					break
				}
			} else {
				t, ok := g.next()
				if !ok {
					break
				}

				if !yield(i, t) {
					break
				}
			}
		}
	}
}

func (g *lazy[T]) rest() (collected bool) {
	for {
		_, ok := g.next()
		collected = collected || ok
		if !ok {
			break
		}
	}

	return collected
}
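The lazy type above leans on iter.Pull to turn a push-style sequence into a pull-style one while memoizing everything it has read. The standalone sketch below is my own illustration of that pull-and-cache pattern, not the ollama code; the sequence of squares and the at helper are invented for the example.

package main

import (
	"fmt"
	"iter"
)

// cachedSeq mirrors the pull-and-cache pattern used by lazy[T] above: values are
// pulled from a one-shot sequence on demand and memoized, so earlier elements can
// be replayed without re-reading the source.
type cachedSeq[T any] struct {
	next   func() (T, bool)
	stop   func()
	values []T
}

func newCachedSeq[T any](seq iter.Seq[T]) *cachedSeq[T] {
	c := &cachedSeq[T]{}
	c.next, c.stop = iter.Pull(seq)
	return c
}

// at returns the i-th element, pulling (and caching) only as much as needed.
func (c *cachedSeq[T]) at(i int) (T, bool) {
	for len(c.values) <= i {
		v, ok := c.next()
		if !ok {
			var zero T
			return zero, false
		}
		c.values = append(c.values, v)
	}
	return c.values[i], true
}

func main() {
	squares := newCachedSeq[int](func(yield func(int) bool) {
		for i := range 5 {
			if !yield(i * i) {
				return
			}
		}
	})
	fmt.Println(squares.at(2)) // pulls 0, 1, 4 lazily: prints "4 true"
	fmt.Println(squares.at(1)) // served from the cache: prints "1 true"
}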
23  fs/gguf/reader.go  Normal file

@@ -0,0 +1,23 @@
package gguf

import (
	"bufio"
	"io"
)

type bufferedReader struct {
	offset int64
	*bufio.Reader
}

func newBufferedReader(rs io.ReadSeeker, size int) *bufferedReader {
	return &bufferedReader{
		Reader: bufio.NewReaderSize(rs, size),
	}
}

func (rs *bufferedReader) Read(p []byte) (n int, err error) {
	n, err = rs.Reader.Read(p)
	rs.offset += int64(n)
	return n, err
}
288  fs/gguf/tensor.go  Normal file

@@ -0,0 +1,288 @@
package gguf
|
||||||
|
|
||||||
|
import (
|
||||||
|
"log/slog"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
type TensorInfo struct {
|
||||||
|
Name string
|
||||||
|
Offset uint64
|
||||||
|
Shape []uint64
|
||||||
|
Type TensorType
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ti TensorInfo) Valid() bool {
|
||||||
|
return ti.Name != "" && ti.NumBytes() > 0
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ti TensorInfo) NumValues() int64 {
|
||||||
|
var numItems int64 = 1
|
||||||
|
for _, dim := range ti.Shape {
|
||||||
|
numItems *= int64(dim)
|
||||||
|
}
|
||||||
|
return numItems
|
||||||
|
}
|
||||||
|
|
||||||
|
// NumBytes returns the number of bytes in the tensor.
|
||||||
|
func (ti TensorInfo) NumBytes() int64 {
|
||||||
|
return int64(float64(ti.NumValues()) * ti.Type.NumBytes())
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ti TensorInfo) LogValue() slog.Value {
|
||||||
|
return slog.GroupValue(
|
||||||
|
slog.String("name", ti.Name),
|
||||||
|
slog.Int64("offset", int64(ti.Offset)),
|
||||||
|
slog.Any("shape", ti.Shape),
|
||||||
|
slog.Int64("num_values", ti.NumValues()),
|
||||||
|
slog.Int64("num_bytes", ti.NumBytes()),
|
||||||
|
slog.Any("type", ti.Type),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
type TensorType uint32
|
||||||
|
|
||||||
|
const (
|
||||||
|
TensorTypeF32 TensorType = iota
|
||||||
|
TensorTypeF16
|
||||||
|
TensorTypeQ4_0
|
||||||
|
TensorTypeQ4_1
|
||||||
|
|
||||||
|
// unexported // unused in gguf
|
||||||
|
tensorTypeQ4_2
|
||||||
|
tensorTypeQ4_3
|
||||||
|
|
||||||
|
TensorTypeQ5_0
|
||||||
|
TensorTypeQ5_1
|
||||||
|
TensorTypeQ8_0
|
||||||
|
TensorTypeQ8_1
|
||||||
|
TensorTypeQ2_K
|
||||||
|
TensorTypeQ3_K
|
||||||
|
TensorTypeQ4_K
|
||||||
|
TensorTypeQ5_K
|
||||||
|
TensorTypeQ6_K
|
||||||
|
TensorTypeQ8_K
|
||||||
|
|
||||||
|
// unexported // unquantizable by ollama
|
||||||
|
tensorTypeIQ2_XXS
|
||||||
|
tensorTypeIQ2_XS
|
||||||
|
tensorTypeIQ3_XXS
|
||||||
|
tensorTypeIQ1_S
|
||||||
|
tensorTypeIQ4_NL
|
||||||
|
tensorTypeIQ3_S
|
||||||
|
tensorTypeIQ2_S
|
||||||
|
tensorTypeIQ4_XS
|
||||||
|
|
||||||
|
TensorTypeI8
|
||||||
|
TensorTypeI16
|
||||||
|
TensorTypeI32
|
||||||
|
TensorTypeI64
|
||||||
|
TensorTypeF64
|
||||||
|
|
||||||
|
// unexported // unquantizable by ollama
|
||||||
|
tensorTypeIQ1_M
|
||||||
|
|
||||||
|
TensorTypeBF16
|
||||||
|
|
||||||
|
// unexported // unused in gguf
|
||||||
|
tensorTypeQ4_0_4_4
|
||||||
|
tensorTypeQ4_0_4_8
|
||||||
|
tensorTypeQ4_0_8_8
|
||||||
|
|
||||||
|
// unexported // unquantizable by ollama
|
||||||
|
tensorTypeTQ1_0
|
||||||
|
tensorTypeTQ2_0
|
||||||
|
|
||||||
|
// unexported // unused in gguf
|
||||||
|
tensorTypeIQ4_NL_4_4
|
||||||
|
tensorTypeIQ4_NL_4_8
|
||||||
|
tensorTypeIQ4_NL_8_8
|
||||||
|
)
|
||||||
|
|
||||||
|
func (tt TensorType) NumBytes() float64 {
|
||||||
|
return float64(tt.typeSize()) / float64(tt.blockSize())
|
||||||
|
}
|
||||||
|
|
||||||
|
func (tt TensorType) typeSize() int64 {
|
||||||
|
switch tt {
|
||||||
|
case TensorTypeF32:
|
||||||
|
return 4
|
||||||
|
case TensorTypeF16:
|
||||||
|
return 2
|
||||||
|
case TensorTypeQ4_0:
|
||||||
|
return 2 + tt.blockSize()/2
|
||||||
|
case TensorTypeQ4_1:
|
||||||
|
return 2 + 2 + tt.blockSize()/2
|
||||||
|
case TensorTypeQ5_0:
|
||||||
|
return 2 + 4 + tt.blockSize()/2
|
||||||
|
case TensorTypeQ5_1:
|
||||||
|
return 2 + 2 + 4 + tt.blockSize()/2
|
||||||
|
case TensorTypeQ8_0:
|
||||||
|
return 2 + tt.blockSize()
|
||||||
|
case TensorTypeQ8_1:
|
||||||
|
return 2 + 2 + tt.blockSize()
|
||||||
|
case TensorTypeQ2_K:
|
||||||
|
return tt.blockSize()/16 + tt.blockSize()/4 + 2 + 2
|
||||||
|
case TensorTypeQ3_K:
|
||||||
|
return tt.blockSize()/8 + tt.blockSize()/4 + 12 + 2
|
||||||
|
case TensorTypeQ4_K:
|
||||||
|
return 2 + 2 + 12 + tt.blockSize()/2
|
||||||
|
case TensorTypeQ5_K:
|
||||||
|
return 2 + 2 + 12 + tt.blockSize()/8 + tt.blockSize()/2
|
||||||
|
case TensorTypeQ6_K:
|
||||||
|
return tt.blockSize()/2 + tt.blockSize()/4 + tt.blockSize()/16 + 2
|
||||||
|
case TensorTypeQ8_K:
|
||||||
|
return 4 + tt.blockSize() + 2*tt.blockSize()/16
|
||||||
|
case tensorTypeIQ2_XXS:
|
||||||
|
return 2 + 2*tt.blockSize()/8
|
||||||
|
case tensorTypeIQ2_XS:
|
||||||
|
return 2 + 2*tt.blockSize()/8 + tt.blockSize()/32
|
||||||
|
case tensorTypeIQ3_XXS:
|
||||||
|
return 2 + tt.blockSize()/4 + tt.blockSize()/8
|
||||||
|
case tensorTypeIQ1_S:
|
||||||
|
return 2 + tt.blockSize()/8 + tt.blockSize()/16
|
||||||
|
case tensorTypeIQ4_NL:
|
||||||
|
return 2 + tt.blockSize()/2
|
||||||
|
case tensorTypeIQ3_S:
|
||||||
|
return 2 + tt.blockSize()/4 + tt.blockSize()/8 + tt.blockSize()/32 + 4
|
||||||
|
case tensorTypeIQ2_S:
|
||||||
|
return 2 + tt.blockSize()/4 + tt.blockSize()/16
|
||||||
|
case tensorTypeIQ4_XS:
|
||||||
|
return 2 + 2 + tt.blockSize()/2 + tt.blockSize()/64
|
||||||
|
case TensorTypeI8:
|
||||||
|
return 1
|
||||||
|
case TensorTypeI16:
|
||||||
|
return 2
|
||||||
|
case TensorTypeI32:
|
||||||
|
return 4
|
||||||
|
case TensorTypeI64:
|
||||||
|
return 8
|
||||||
|
case TensorTypeF64:
|
||||||
|
return 8
|
||||||
|
case tensorTypeIQ1_M:
|
||||||
|
return tt.blockSize()/8 + tt.blockSize()/16 + tt.blockSize()/32
|
||||||
|
case TensorTypeBF16:
|
||||||
|
return 2
|
||||||
|
default:
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (tt TensorType) blockSize() int64 {
|
||||||
|
switch tt {
|
||||||
|
case TensorTypeF32,
|
||||||
|
TensorTypeF16,
|
||||||
|
TensorTypeI8,
|
||||||
|
TensorTypeI16,
|
||||||
|
TensorTypeI32,
|
||||||
|
TensorTypeI64,
|
||||||
|
TensorTypeF64,
|
||||||
|
TensorTypeBF16:
|
||||||
|
return 1
|
||||||
|
case TensorTypeQ4_0,
|
||||||
|
TensorTypeQ4_1,
|
||||||
|
TensorTypeQ5_0,
|
||||||
|
TensorTypeQ5_1,
|
||||||
|
TensorTypeQ8_0,
|
||||||
|
TensorTypeQ8_1,
|
||||||
|
tensorTypeIQ4_NL:
|
||||||
|
return 32
|
||||||
|
default:
|
||||||
|
return 256
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (tt TensorType) String() string {
|
||||||
|
switch tt {
|
||||||
|
case TensorTypeF32:
|
||||||
|
return "f32"
|
||||||
|
case TensorTypeF16:
|
||||||
|
return "f16"
|
||||||
|
case TensorTypeQ4_0:
|
||||||
|
return "q4_0"
|
||||||
|
case TensorTypeQ4_1:
|
||||||
|
return "q4_1"
|
||||||
|
case tensorTypeQ4_2:
|
||||||
|
return "q4_2"
|
||||||
|
case tensorTypeQ4_3:
|
||||||
|
return "q4_3"
|
||||||
|
case TensorTypeQ5_0:
|
||||||
|
return "q5_0"
|
||||||
|
case TensorTypeQ5_1:
|
||||||
|
return "q5_1"
|
||||||
|
case TensorTypeQ8_0:
|
||||||
|
return "q8_0"
|
||||||
|
case TensorTypeQ8_1:
|
||||||
|
return "q8_1"
|
||||||
|
case TensorTypeQ2_K:
|
||||||
|
return "q2_k"
|
||||||
|
case TensorTypeQ3_K:
|
||||||
|
return "q3_k"
|
||||||
|
case TensorTypeQ4_K:
|
||||||
|
return "q4_k"
|
||||||
|
case TensorTypeQ5_K:
|
||||||
|
return "q5_k"
|
||||||
|
case TensorTypeQ6_K:
|
||||||
|
return "q6_k"
|
||||||
|
case TensorTypeQ8_K:
|
||||||
|
return "q8_k"
|
||||||
|
case tensorTypeIQ2_XXS:
|
||||||
|
return "iq2_xxs"
|
||||||
|
case tensorTypeIQ2_XS:
|
||||||
|
return "iq2_xs"
|
||||||
|
case tensorTypeIQ3_XXS:
|
||||||
|
return "iq3_xxs"
|
||||||
|
case tensorTypeIQ1_S:
|
||||||
|
return "iq1_s"
|
||||||
|
case tensorTypeIQ4_NL:
|
||||||
|
return "iq4_nl"
|
||||||
|
case tensorTypeIQ3_S:
|
||||||
|
return "iq3_s"
|
||||||
|
case tensorTypeIQ2_S:
|
||||||
|
return "iq2_s"
|
||||||
|
case tensorTypeIQ4_XS:
|
||||||
|
return "iq4_xs"
|
||||||
|
case TensorTypeI8:
|
||||||
|
return "i8"
|
||||||
|
case TensorTypeI16:
|
||||||
|
return "i16"
|
||||||
|
case TensorTypeI32:
|
||||||
|
return "i32"
|
||||||
|
case TensorTypeI64:
|
||||||
|
return "i64"
|
||||||
|
case TensorTypeF64:
|
||||||
|
return "f64"
|
||||||
|
case tensorTypeIQ1_M:
|
||||||
|
return "iq1_m"
|
||||||
|
case TensorTypeBF16:
|
||||||
|
return "bf16"
|
||||||
|
case tensorTypeQ4_0_4_4:
|
||||||
|
return "q4_0_4_4"
|
||||||
|
case tensorTypeQ4_0_4_8:
|
||||||
|
return "q4_0_4_8"
|
||||||
|
case tensorTypeQ4_0_8_8:
|
||||||
|
return "q4_0_8_8"
|
||||||
|
case tensorTypeTQ1_0:
|
||||||
|
return "tq1_0"
|
||||||
|
case tensorTypeTQ2_0:
|
||||||
|
return "tq2_0"
|
||||||
|
case tensorTypeIQ4_NL_4_4:
|
||||||
|
return "iq4_nl_4_4"
|
||||||
|
case tensorTypeIQ4_NL_4_8:
|
||||||
|
return "iq4_nl_4_8"
|
||||||
|
case tensorTypeIQ4_NL_8_8:
|
||||||
|
return "iq4_nl_8_8"
|
||||||
|
default:
|
||||||
|
return "unknown"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (tt TensorType) LogValue() slog.Value {
|
||||||
|
return slog.GroupValue(
|
||||||
|
slog.Uint64("value", uint64(tt)),
|
||||||
|
slog.String("name", strings.ToUpper(tt.String())),
|
||||||
|
slog.Int64("size", tt.typeSize()),
|
||||||
|
slog.Int64("block_size", tt.blockSize()),
|
||||||
|
slog.Float64("num_bytes", tt.NumBytes()),
|
||||||
|
)
|
||||||
|
}
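To make the size arithmetic above concrete, the short sketch below reproduces the NumBytes calculation by hand for a Q4_K tensor; the constants come from typeSize and blockSize as written, while the [4096, 4096] shape is invented for illustration.

package main

import "fmt"

func main() {
	// From typeSize/blockSize above: a Q4_K block holds 256 values in
	// 2 + 2 + 12 + 256/2 = 144 bytes.
	const blockSize = 256
	const blockBytes = 2 + 2 + 12 + blockSize/2
	bytesPerValue := float64(blockBytes) / blockSize // 0.5625

	numValues := int64(4096) * 4096                        // hypothetical [4096, 4096] weight
	fmt.Println(int64(float64(numValues) * bytesPerValue)) // 9437184 bytes (= 9 MiB)
}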
2  go.mod

@@ -19,7 +19,7 @@ require (
 	github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1
 	github.com/dlclark/regexp2 v1.11.4
 	github.com/emirpasic/gods/v2 v2.0.0-alpha
-	github.com/google/go-cmp v0.6.0
+	github.com/google/go-cmp v0.7.0
 	github.com/mattn/go-runewidth v0.0.14
 	github.com/nlpodyssey/gopickle v0.3.0
 	github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c
4  go.sum

@@ -112,8 +112,8 @@ github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
 github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
 github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
 github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
-github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
-github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
+github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
+github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
 github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
 github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
 github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=

@ -19,7 +19,7 @@ func TestVisionModels(t *testing.T) {
|
||||||
}
|
}
|
||||||
testCases := []testCase{
|
testCases := []testCase{
|
||||||
{
|
{
|
||||||
model: "llava:7b",
|
model: "qwen2.5vl",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
model: "llama3.2-vision",
|
model: "llama3.2-vision",
|
||||||
|
|
@ -60,6 +60,7 @@ func TestVisionModels(t *testing.T) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestIntegrationSplitBatch(t *testing.T) {
|
func TestIntegrationSplitBatch(t *testing.T) {
|
||||||
|
skipUnderMinVRAM(t, 6)
|
||||||
image, err := base64.StdEncoding.DecodeString(imageEncoding)
|
image, err := base64.StdEncoding.DecodeString(imageEncoding)
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
req := api.GenerateRequest{
|
req := api.GenerateRequest{
|
||||||
|
|
|
||||||
|
|
@ -45,6 +45,8 @@ var (
|
||||||
"qwen2.5-coder:latest",
|
"qwen2.5-coder:latest",
|
||||||
"qwen:latest",
|
"qwen:latest",
|
||||||
"solar-pro:latest",
|
"solar-pro:latest",
|
||||||
|
"codellama:latest",
|
||||||
|
"nous-hermes:latest",
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
||||||
1  integration/testdata/embed.json  vendored
File diff suppressed because one or more lines are too long
|
|
@ -30,6 +30,11 @@ type Causal struct {
|
||||||
|
|
||||||
// ** current forward pass **
|
// ** current forward pass **
|
||||||
|
|
||||||
|
// curReserve indicates that this forward pass is only for
|
||||||
|
// memory reservation and we should not update our metadata
|
||||||
|
// based on it.
|
||||||
|
curReserve bool
|
||||||
|
|
||||||
// the active layer for Get and Put
|
// the active layer for Get and Put
|
||||||
curLayer int
|
curLayer int
|
||||||
|
|
||||||
|
|
@ -159,12 +164,13 @@ func (c *Causal) Close() {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Causal) StartForward(ctx ml.Context, batch input.Batch, reserve bool) error {
|
func (c *Causal) StartForward(ctx ml.Context, batch input.Batch, reserve bool) error {
|
||||||
|
c.curReserve = reserve
|
||||||
c.curBatchSize = len(batch.Positions)
|
c.curBatchSize = len(batch.Positions)
|
||||||
c.curSequences = batch.Sequences
|
c.curSequences = batch.Sequences
|
||||||
c.curPositions = batch.Positions
|
c.curPositions = batch.Positions
|
||||||
c.opts.Except = nil
|
c.opts.Except = nil
|
||||||
|
|
||||||
if !reserve {
|
if !c.curReserve {
|
||||||
c.updateSlidingWindow()
|
c.updateSlidingWindow()
|
||||||
|
|
||||||
var err error
|
var err error
|
||||||
|
|
@ -211,10 +217,9 @@ func (c *Causal) StartForward(ctx ml.Context, batch input.Batch, reserve bool) e
|
||||||
c.curCellRange.max = len(c.cells) - 1
|
c.curCellRange.max = len(c.cells) - 1
|
||||||
}
|
}
|
||||||
|
|
||||||
var err error
|
c.curMask = c.buildMask(ctx)
|
||||||
c.curMask, err = c.buildMask(ctx)
|
|
||||||
|
|
||||||
return err
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func newRange() cellRange {
|
func newRange() cellRange {
|
||||||
|
|
@ -297,7 +302,7 @@ func roundUp(length, pad int) int {
|
||||||
// Builds a mask of history x batch indicating whether for each token in the batch the
|
// Builds a mask of history x batch indicating whether for each token in the batch the
|
||||||
// token in the history should apply. This is based on both the sequence and causality (the
|
// token in the history should apply. This is based on both the sequence and causality (the
|
||||||
// position of the history is not ahead of the token in the batch).
|
// position of the history is not ahead of the token in the batch).
|
||||||
func (c *Causal) buildMask(ctx ml.Context) (ml.Tensor, error) {
|
func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
|
||||||
// Align and pad the two dimensions as required by the backend
|
// Align and pad the two dimensions as required by the backend
|
||||||
batchSize := roundUp(c.curBatchSize, c.config.MaskBatchPadding)
|
batchSize := roundUp(c.curBatchSize, c.config.MaskBatchPadding)
|
||||||
|
|
||||||
|
|
@ -305,6 +310,11 @@ func (c *Causal) buildMask(ctx ml.Context) (ml.Tensor, error) {
|
||||||
c.curCellRange.max = roundUp(c.curCellRange.max+1, c.config.CachePadding) - 1
|
c.curCellRange.max = roundUp(c.curCellRange.max+1, c.config.CachePadding) - 1
|
||||||
|
|
||||||
length := c.curCellRange.max - c.curCellRange.min + 1
|
length := c.curCellRange.max - c.curCellRange.min + 1
|
||||||
|
|
||||||
|
if c.curReserve {
|
||||||
|
return ctx.Input().Empty(c.config.MaskDType, length, batchSize)
|
||||||
|
}
|
||||||
|
|
||||||
mask := make([]float32, batchSize*length)
|
mask := make([]float32, batchSize*length)
|
||||||
|
|
||||||
for i := range c.curBatchSize {
|
for i := range c.curBatchSize {
|
||||||
|
|
@ -325,10 +335,7 @@ func (c *Causal) buildMask(ctx ml.Context) (ml.Tensor, error) {
|
||||||
mask[i] = float32(math.Inf(-1))
|
mask[i] = float32(math.Inf(-1))
|
||||||
}
|
}
|
||||||
|
|
||||||
maskTensor, err := ctx.Input().FromFloatSlice(mask, length, batchSize)
|
maskTensor := ctx.Input().FromFloatSlice(mask, length, batchSize)
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
if c.config.MaskDType != ml.DTypeF32 {
|
if c.config.MaskDType != ml.DTypeF32 {
|
||||||
out := ctx.Input().Empty(c.config.MaskDType, maskTensor.Shape()...)
|
out := ctx.Input().Empty(c.config.MaskDType, maskTensor.Shape()...)
|
||||||
|
|
@ -336,7 +343,7 @@ func (c *Causal) buildMask(ctx ml.Context) (ml.Tensor, error) {
|
||||||
maskTensor = out
|
maskTensor = out
|
||||||
}
|
}
|
||||||
|
|
||||||
return maskTensor, nil
|
return maskTensor
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Causal) moveCells(ctx ml.Context, src, dst, length int) {
|
func (c *Causal) moveCells(ctx ml.Context, src, dst, length int) {
|
||||||
|
|
@ -491,12 +498,7 @@ func (c *Causal) SetCausal(ctx ml.Context, opts CausalOptions) {
|
||||||
if !slices.Equal(c.opts.Except, opts.Except) {
|
if !slices.Equal(c.opts.Except, opts.Except) {
|
||||||
c.opts = opts
|
c.opts = opts
|
||||||
if ctx != nil {
|
if ctx != nil {
|
||||||
var err error
|
c.curMask = c.buildMask(ctx)
|
||||||
c.curMask, err = c.buildMask(ctx)
|
|
||||||
if err != nil {
|
|
||||||
// This error should never occur because we have previously built a mask with the same shape
|
|
||||||
panic(fmt.Errorf("SetCausal: %w", err))
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -652,10 +654,7 @@ func (c *Causal) shift(seq int, beginIndex, offset int32) error {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
kShift, err := ctx.Input().FromIntSlice(offsets, len(offsets))
|
kShift := ctx.Input().FromIntSlice(offsets, len(offsets))
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
for i, key := range c.keys {
|
for i, key := range c.keys {
|
||||||
if key == nil {
|
if key == nil {
|
||||||
|
|
|
||||||
|
|
@ -344,7 +344,7 @@ func testCache(t *testing.T, backend ml.Backend, cache Cache, tests []testCase)
|
||||||
}
|
}
|
||||||
|
|
||||||
cache.SetLayer(0)
|
cache.SetLayer(0)
|
||||||
tensor, _ := context.FromFloatSlice(test.in, test.inShape...)
|
tensor := context.FromFloatSlice(test.in, test.inShape...)
|
||||||
cache.Put(context, tensor, tensor)
|
cache.Put(context, tensor, tensor)
|
||||||
|
|
||||||
out, _, mask := cache.Get(context)
|
out, _, mask := cache.Get(context)
|
||||||
|
|
@ -386,7 +386,7 @@ func TestCanResume(t *testing.T) {
|
||||||
}
|
}
|
||||||
|
|
||||||
cache.SetLayer(0)
|
cache.SetLayer(0)
|
||||||
tensor, _ := context.FromFloatSlice([]float32{1, 2, 3, 4}, 1, 1, 4)
|
tensor := context.FromFloatSlice([]float32{1, 2, 3, 4}, 1, 1, 4)
|
||||||
cache.Put(context, tensor, tensor)
|
cache.Put(context, tensor, tensor)
|
||||||
|
|
||||||
// with window size 4, nothing has slid out of the window yet
|
// with window size 4, nothing has slid out of the window yet
|
||||||
|
|
@ -413,7 +413,7 @@ func TestCanResume(t *testing.T) {
|
||||||
}
|
}
|
||||||
|
|
||||||
cache.SetLayer(0)
|
cache.SetLayer(0)
|
||||||
tensor, _ = context.FromFloatSlice([]float32{5, 6}, 1, 1, 2)
|
tensor = context.FromFloatSlice([]float32{5, 6}, 1, 1, 2)
|
||||||
cache.Put(context, tensor, tensor)
|
cache.Put(context, tensor, tensor)
|
||||||
|
|
||||||
// only the latest position has overlapping windows
|
// only the latest position has overlapping windows
|
||||||
|
|
@ -470,24 +470,24 @@ func (c *testContext) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
|
||||||
return c.Empty(dtype, shape...)
|
return c.Empty(dtype, shape...)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *testContext) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) {
|
func (c *testContext) FromFloatSlice(s []float32, shape ...int) ml.Tensor {
|
||||||
t := c.Empty(ml.DTypeF32, shape...).(*testTensor)
|
t := c.Empty(ml.DTypeF32, shape...).(*testTensor)
|
||||||
|
|
||||||
copy(t.data, s)
|
copy(t.data, s)
|
||||||
|
|
||||||
return t, nil
|
return t
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *testContext) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
|
func (c *testContext) FromIntSlice(s []int32, shape ...int) ml.Tensor {
|
||||||
f := make([]float32, len(s))
|
f := make([]float32, len(s))
|
||||||
for i := range f {
|
for i := range f {
|
||||||
f[i] = float32(s[i])
|
f[i] = float32(s[i])
|
||||||
}
|
}
|
||||||
|
|
||||||
out, _ := c.FromFloatSlice(f, shape...)
|
out := c.FromFloatSlice(f, shape...)
|
||||||
out.(*testTensor).dtype = ml.DTypeI32
|
out.(*testTensor).dtype = ml.DTypeI32
|
||||||
|
|
||||||
return out, nil
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *testContext) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
|
func (c *testContext) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
|
||||||
|
|
@ -496,7 +496,7 @@ func (c *testContext) Arange(start, stop, step float32, dtype ml.DType) ml.Tenso
|
||||||
s = append(s, i)
|
s = append(s, i)
|
||||||
}
|
}
|
||||||
|
|
||||||
out, _ := c.FromFloatSlice(s, len(s))
|
out := c.FromFloatSlice(s, len(s))
|
||||||
out.(*testTensor).dtype = dtype
|
out.(*testTensor).dtype = dtype
|
||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
@ -508,7 +508,7 @@ func (c *testContext) Forward(...ml.Tensor) ml.Context { return c }
|
||||||
|
|
||||||
func (c *testContext) Compute(...ml.Tensor) {}
|
func (c *testContext) Compute(...ml.Tensor) {}
|
||||||
|
|
||||||
func (c *testContext) Reserve() error { return nil }
|
func (c *testContext) Reserve() {}
|
||||||
|
|
||||||
func (c *testContext) MaxGraphNodes() int {
|
func (c *testContext) MaxGraphNodes() int {
|
||||||
return 10
|
return 10
|
||||||
|
|
|
||||||
2  llama/build-info.cpp  generated  vendored

@@ -1,4 +1,4 @@
|
||||||
int LLAMA_BUILD_NUMBER = 0;
|
int LLAMA_BUILD_NUMBER = 0;
|
||||||
char const *LLAMA_COMMIT = "e1e8e0991ffd9e99a445c6812bb519d5bac9f4b5";
|
char const *LLAMA_COMMIT = "de4c07f93783a1a96456a44dc16b9db538ee1618";
|
||||||
char const *LLAMA_COMPILER = "";
|
char const *LLAMA_COMPILER = "";
|
||||||
char const *LLAMA_BUILD_TARGET = "";
|
char const *LLAMA_BUILD_TARGET = "";
|
||||||
|
|
|
||||||
|
|
@ -10,11 +10,11 @@ include common/stb_image.*
|
||||||
include include/
|
include include/
|
||||||
include include/llama.*
|
include include/llama.*
|
||||||
include include/llama-*.*
|
include include/llama-*.*
|
||||||
include examples/
|
include tools/
|
||||||
include examples/llava/
|
include tools/mtmd/
|
||||||
include examples/llava/clip.*
|
include tools/mtmd/clip.*
|
||||||
include examples/llava/clip-impl.*
|
include tools/mtmd/clip-impl.*
|
||||||
include examples/llava/llava.*
|
include tools/mtmd/llava.*
|
||||||
include src/
|
include src/
|
||||||
include src/llama.*
|
include src/llama.*
|
||||||
include src/llama-*.*
|
include src/llama-*.*
|
||||||
|
|
|
||||||
19  llama/llama.cpp/common/common.cpp  vendored

@@ -1096,7 +1096,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
|
||||||
cparams.n_threads = params.cpuparams.n_threads;
|
cparams.n_threads = params.cpuparams.n_threads;
|
||||||
cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
|
cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
|
||||||
params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
|
params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
|
||||||
cparams.logits_all = params.logits_all;
|
|
||||||
cparams.embeddings = params.embedding;
|
cparams.embeddings = params.embedding;
|
||||||
cparams.rope_scaling_type = params.rope_scaling_type;
|
cparams.rope_scaling_type = params.rope_scaling_type;
|
||||||
cparams.rope_freq_base = params.rope_freq_base;
|
cparams.rope_freq_base = params.rope_freq_base;
|
||||||
|
|
@ -1114,6 +1113,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
|
||||||
cparams.offload_kqv = !params.no_kv_offload;
|
cparams.offload_kqv = !params.no_kv_offload;
|
||||||
cparams.flash_attn = params.flash_attn;
|
cparams.flash_attn = params.flash_attn;
|
||||||
cparams.no_perf = params.no_perf;
|
cparams.no_perf = params.no_perf;
|
||||||
|
cparams.op_offload = !params.no_op_offload;
|
||||||
|
|
||||||
if (params.reranking) {
|
if (params.reranking) {
|
||||||
cparams.embeddings = true;
|
cparams.embeddings = true;
|
||||||
|
|
@ -1565,3 +1565,20 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride) {
|
||||||
|
const int64_t ne_datapoint = llama_n_ctx(ctx);
|
||||||
|
const int64_t ndata = (tokens.size() - ne_datapoint - 1) / stride;
|
||||||
|
ggml_opt_dataset_t result = ggml_opt_dataset_init(
|
||||||
|
GGML_TYPE_I32, GGML_TYPE_I32, ne_datapoint, ne_datapoint, ndata, /*ndata_shard =*/ 1);
|
||||||
|
|
||||||
|
llama_token * data = (llama_token *) ggml_opt_dataset_data(result)->data;
|
||||||
|
llama_token * labels = (llama_token *) ggml_opt_dataset_labels(result)->data;
|
||||||
|
|
||||||
|
for (int64_t idata = 0; idata < ndata; ++idata) {
|
||||||
|
memcpy(data + idata*ne_datapoint, tokens.data() + idata*stride + 0, ne_datapoint*sizeof(llama_token));
|
||||||
|
memcpy(labels + idata*ne_datapoint, tokens.data() + idata*stride + 1, ne_datapoint*sizeof(llama_token));
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
|
||||||
18  llama/llama.cpp/common/common.h  vendored

@@ -66,7 +66,6 @@ enum llama_example {
|
||||||
LLAMA_EXAMPLE_COMMON,
|
LLAMA_EXAMPLE_COMMON,
|
||||||
LLAMA_EXAMPLE_SPECULATIVE,
|
LLAMA_EXAMPLE_SPECULATIVE,
|
||||||
LLAMA_EXAMPLE_MAIN,
|
LLAMA_EXAMPLE_MAIN,
|
||||||
LLAMA_EXAMPLE_INFILL,
|
|
||||||
LLAMA_EXAMPLE_EMBEDDING,
|
LLAMA_EXAMPLE_EMBEDDING,
|
||||||
LLAMA_EXAMPLE_PERPLEXITY,
|
LLAMA_EXAMPLE_PERPLEXITY,
|
||||||
LLAMA_EXAMPLE_RETRIEVAL,
|
LLAMA_EXAMPLE_RETRIEVAL,
|
||||||
|
|
@ -96,6 +95,7 @@ enum common_sampler_type {
|
||||||
COMMON_SAMPLER_TYPE_XTC = 8,
|
COMMON_SAMPLER_TYPE_XTC = 8,
|
||||||
COMMON_SAMPLER_TYPE_INFILL = 9,
|
COMMON_SAMPLER_TYPE_INFILL = 9,
|
||||||
COMMON_SAMPLER_TYPE_PENALTIES = 10,
|
COMMON_SAMPLER_TYPE_PENALTIES = 10,
|
||||||
|
COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
|
||||||
};
|
};
|
||||||
|
|
||||||
// dimensionality reduction methods, used by cvector-generator
|
// dimensionality reduction methods, used by cvector-generator
|
||||||
|
|
@ -161,6 +161,7 @@ struct common_params_sampling {
|
||||||
std::vector<enum common_sampler_type> samplers = {
|
std::vector<enum common_sampler_type> samplers = {
|
||||||
COMMON_SAMPLER_TYPE_PENALTIES,
|
COMMON_SAMPLER_TYPE_PENALTIES,
|
||||||
COMMON_SAMPLER_TYPE_DRY,
|
COMMON_SAMPLER_TYPE_DRY,
|
||||||
|
COMMON_SAMPLER_TYPE_TOP_N_SIGMA,
|
||||||
COMMON_SAMPLER_TYPE_TOP_K,
|
COMMON_SAMPLER_TYPE_TOP_K,
|
||||||
COMMON_SAMPLER_TYPE_TYPICAL_P,
|
COMMON_SAMPLER_TYPE_TYPICAL_P,
|
||||||
COMMON_SAMPLER_TYPE_TOP_P,
|
COMMON_SAMPLER_TYPE_TOP_P,
|
||||||
|
|
@ -323,7 +324,6 @@ struct common_params {
|
||||||
bool ctx_shift = true; // context shift on inifinite text generation
|
bool ctx_shift = true; // context shift on inifinite text generation
|
||||||
|
|
||||||
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
|
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
|
||||||
bool logits_all = false; // return logits for all tokens in the batch
|
|
||||||
bool use_mmap = true; // use mmap for faster loads
|
bool use_mmap = true; // use mmap for faster loads
|
||||||
bool use_mlock = false; // use mlock to keep model in memory
|
bool use_mlock = false; // use mlock to keep model in memory
|
||||||
bool verbose_prompt = false; // print prompt tokens before generation
|
bool verbose_prompt = false; // print prompt tokens before generation
|
||||||
|
|
@ -332,6 +332,7 @@ struct common_params {
|
||||||
bool no_kv_offload = false; // disable KV offloading
|
bool no_kv_offload = false; // disable KV offloading
|
||||||
bool warmup = true; // warmup run
|
bool warmup = true; // warmup run
|
||||||
bool check_tensors = false; // validate tensor data
|
bool check_tensors = false; // validate tensor data
|
||||||
|
bool no_op_offload = false; // globally disable offload host tensor operations to device
|
||||||
|
|
||||||
bool single_turn = false; // single turn chat conversation
|
bool single_turn = false; // single turn chat conversation
|
||||||
|
|
||||||
|
|
@ -340,7 +341,7 @@ struct common_params {
|
||||||
|
|
||||||
common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
|
common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
|
||||||
|
|
||||||
// multimodal models (see examples/llava)
|
// multimodal models (see tools/mtmd)
|
||||||
struct common_params_model mmproj;
|
struct common_params_model mmproj;
|
||||||
bool mmproj_use_gpu = true; // use GPU for multimodal model
|
bool mmproj_use_gpu = true; // use GPU for multimodal model
|
||||||
bool no_mmproj = false; // explicitly disable multimodal model
|
bool no_mmproj = false; // explicitly disable multimodal model
|
||||||
|
|
@ -409,13 +410,14 @@ struct common_params {
|
||||||
|
|
||||||
bool process_output = false; // collect data for the output tensor
|
bool process_output = false; // collect data for the output tensor
|
||||||
bool compute_ppl = true; // whether to compute perplexity
|
bool compute_ppl = true; // whether to compute perplexity
|
||||||
|
bool parse_special = false; // whether to parse special tokens during imatrix tokenization
|
||||||
|
|
||||||
// cvector-generator params
|
// cvector-generator params
|
||||||
int n_pca_batch = 100;
|
int n_pca_batch = 100;
|
||||||
int n_pca_iterations = 1000;
|
int n_pca_iterations = 1000;
|
||||||
dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
|
dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
|
||||||
std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
|
std::string cvector_positive_file = "tools/cvector-generator/positive.txt";
|
||||||
std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
|
std::string cvector_negative_file = "tools/cvector-generator/negative.txt";
|
||||||
|
|
||||||
bool spm_infill = false; // suffix/prefix/middle pattern for infill
|
bool spm_infill = false; // suffix/prefix/middle pattern for infill
|
||||||
|
|
||||||
|
|
@ -664,3 +666,9 @@ const char * const LLM_KV_SPLIT_COUNT = "split.count";
|
||||||
const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
|
const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// training utils
|
||||||
|
//
|
||||||
|
|
||||||
|
ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);
|
||||||
|
|
|
||||||
107  llama/llama.cpp/common/sampling.cpp  vendored

@@ -1,6 +1,7 @@
|
||||||
#include "sampling.h"
|
#include "sampling.h"
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
#include "log.h"
|
||||||
|
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
|
|
@ -229,51 +230,48 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
|
||||||
params.logit_bias.data()));
|
params.logit_bias.data()));
|
||||||
|
|
||||||
if (params.mirostat == 0) {
|
if (params.mirostat == 0) {
|
||||||
if (params.top_n_sigma >= 0) {
|
for (const auto & cnstr : params.samplers) {
|
||||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
|
switch (cnstr) {
|
||||||
llama_sampler_chain_add(result->chain, llama_sampler_init_temp (params.temp));
|
case COMMON_SAMPLER_TYPE_DRY:
|
||||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
|
{
|
||||||
} else {
|
std::vector<const char *> c_breakers;
|
||||||
for (const auto & cnstr : params.samplers) {
|
c_breakers.reserve(params.dry_sequence_breakers.size());
|
||||||
switch (cnstr) {
|
for (const auto & str : params.dry_sequence_breakers) {
|
||||||
case COMMON_SAMPLER_TYPE_DRY:
|
c_breakers.push_back(str.c_str());
|
||||||
{
|
|
||||||
std::vector<const char *> c_breakers;
|
|
||||||
c_breakers.reserve(params.dry_sequence_breakers.size());
|
|
||||||
                 for (const auto & str : params.dry_sequence_breakers) {
                     c_breakers.push_back(str.c_str());
                 }
 
                 llama_sampler_chain_add(result->chain, llama_sampler_init_dry      (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
             }
             break;
         case COMMON_SAMPLER_TYPE_TOP_K:
             llama_sampler_chain_add(result->chain, llama_sampler_init_top_k    (params.top_k));
             break;
         case COMMON_SAMPLER_TYPE_TOP_P:
             llama_sampler_chain_add(result->chain, llama_sampler_init_top_p    (params.top_p, params.min_keep));
             break;
+        case COMMON_SAMPLER_TYPE_TOP_N_SIGMA:
+            llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
+            break;
         case COMMON_SAMPLER_TYPE_MIN_P:
             llama_sampler_chain_add(result->chain, llama_sampler_init_min_p    (params.min_p, params.min_keep));
             break;
         case COMMON_SAMPLER_TYPE_XTC:
             llama_sampler_chain_add(result->chain, llama_sampler_init_xtc      (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
             break;
         case COMMON_SAMPLER_TYPE_TYPICAL_P:
             llama_sampler_chain_add(result->chain, llama_sampler_init_typical  (params.typ_p, params.min_keep));
             break;
         case COMMON_SAMPLER_TYPE_TEMPERATURE:
             llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
             break;
         case COMMON_SAMPLER_TYPE_INFILL:
             llama_sampler_chain_add(result->chain, llama_sampler_init_infill   (vocab));
             break;
         case COMMON_SAMPLER_TYPE_PENALTIES:
             llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
             break;
         default:
             GGML_ASSERT(false && "unknown sampler type");
         }
     }
 
     llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
@@ -475,6 +473,7 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
         case COMMON_SAMPLER_TYPE_TOP_K: return 'k';
         case COMMON_SAMPLER_TYPE_TYPICAL_P: return 'y';
         case COMMON_SAMPLER_TYPE_TOP_P: return 'p';
+        case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return 's';
         case COMMON_SAMPLER_TYPE_MIN_P: return 'm';
         case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
         case COMMON_SAMPLER_TYPE_XTC: return 'x';

@@ -490,6 +489,7 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
         case COMMON_SAMPLER_TYPE_TOP_K: return "top_k";
         case COMMON_SAMPLER_TYPE_TYPICAL_P: return "typ_p";
         case COMMON_SAMPLER_TYPE_TOP_P: return "top_p";
+        case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return "top_n_sigma";
         case COMMON_SAMPLER_TYPE_MIN_P: return "min_p";
         case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
         case COMMON_SAMPLER_TYPE_XTC: return "xtc";

@@ -504,6 +504,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
         { "dry", COMMON_SAMPLER_TYPE_DRY },
         { "top_k", COMMON_SAMPLER_TYPE_TOP_K },
         { "top_p", COMMON_SAMPLER_TYPE_TOP_P },
+        { "top_n_sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
         { "typ_p", COMMON_SAMPLER_TYPE_TYPICAL_P },
         { "min_p", COMMON_SAMPLER_TYPE_MIN_P },
         { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },

@@ -517,6 +518,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
     std::unordered_map<std::string, common_sampler_type> sampler_alt_name_map {
         { "top-k", COMMON_SAMPLER_TYPE_TOP_K },
         { "top-p", COMMON_SAMPLER_TYPE_TOP_P },
+        { "top-n-sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
         { "nucleus", COMMON_SAMPLER_TYPE_TOP_P },
         { "typical-p", COMMON_SAMPLER_TYPE_TYPICAL_P },
         { "typical", COMMON_SAMPLER_TYPE_TYPICAL_P },

@@ -533,14 +535,16 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
         auto sampler = sampler_canonical_name_map.find(name);
         if (sampler != sampler_canonical_name_map.end()) {
             samplers.push_back(sampler->second);
-        } else {
-            if (allow_alt_names) {
-                sampler = sampler_alt_name_map.find(name);
-                if (sampler != sampler_alt_name_map.end()) {
-                    samplers.push_back(sampler->second);
-                }
+            continue;
+        }
+        if (allow_alt_names) {
+            sampler = sampler_alt_name_map.find(name);
+            if (sampler != sampler_alt_name_map.end()) {
+                samplers.push_back(sampler->second);
+                continue;
             }
         }
+        LOG_WRN("%s: unable to match sampler by name '%s'\n", __func__, name.c_str());
     }
 
     return samplers;

@@ -552,6 +556,7 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K), COMMON_SAMPLER_TYPE_TOP_K },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P), COMMON_SAMPLER_TYPE_TYPICAL_P },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P), COMMON_SAMPLER_TYPE_TOP_P },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_N_SIGMA), COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P), COMMON_SAMPLER_TYPE_MIN_P },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC },

@@ -566,6 +571,8 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
         const auto sampler = sampler_name_map.find(c);
         if (sampler != sampler_name_map.end()) {
             samplers.push_back(sampler->second);
+        } else {
+            LOG_WRN("%s: unable to match sampler by char '%c'\n", __func__, c);
         }
     }
 
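Note: a minimal sketch of how the new top_n_sigma entries above become reachable from user-facing sampler lists. The helper signatures are taken from common/sampling.h as synced here, but the include path and the surrounding main() are assumptions for illustration.

    // Sketch only: resolve sampler types by name and by character, including the new top_n_sigma.
    #include <string>
    #include <vector>
    #include "sampling.h" // common/sampling.h in the llama.cpp tree (assumed include path)

    int main() {
        // canonical names; "top-n-sigma" also resolves through the alt-name map when allow_alt_names is true
        std::vector<common_sampler_type> by_name =
            common_sampler_types_from_names({"top_k", "top_n_sigma", "temperature"}, /*allow_alt_names=*/ true);

        // single-character form: 'k' = top_k, 's' = top_n_sigma, 't' = temperature
        std::vector<common_sampler_type> by_chr = common_sampler_types_from_chars("kst");

        return by_name.size() == by_chr.size() ? 0 : 1;
    }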
67 llama/llama.cpp/include/llama.h vendored
@@ -4,6 +4,7 @@
 #include "ggml.h"
 #include "ggml-cpu.h"
 #include "ggml-backend.h"
+#include "ggml-opt.h"
 
 #include <stddef.h>
 #include <stdint.h>

@@ -112,6 +113,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
         LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
         LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
+        LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
     };
 
     enum llama_rope_type {

@@ -256,7 +258,6 @@ extern "C" {
 
         llama_token * token;
         float * embd;
-        int32_t n_embd;
         llama_pos * pos;
         int32_t * n_seq_id;
         llama_seq_id ** seq_id;

@@ -352,20 +353,18 @@ extern "C" {
         enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
         enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]
 
-        // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
-        // TODO: move at the end of the struct
-        bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
-        bool embeddings; // if true, extract embeddings (together with logits)
-        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
-        bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
-        bool no_perf; // whether to measure performance timings
-        bool cross_attn; // whether to use cross attention
 
         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
         // currently works only with CPU execution
         ggml_abort_callback abort_callback;
         void * abort_callback_data;
 
+        // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
+        bool embeddings; // if true, extract embeddings (together with logits)
+        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
+        bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
+        bool no_perf; // whether to measure performance timings
+        bool op_offload; // whether to offload host tensor operations to device
     };
 
     // model quantization parameters

@@ -447,6 +446,10 @@ extern "C" {
             size_t n_paths,
             struct llama_model_params params);
 
+    LLAMA_API void llama_model_save_to_file(
+            const struct llama_model * model,
+            const char * path_model);
+
     DEPRECATED(LLAMA_API void llama_free_model(struct llama_model * model),
             "use llama_model_free instead");
 

@@ -461,10 +464,6 @@ extern "C" {
             struct llama_context_params params),
             "use llama_init_from_model instead");
 
-    // TODO (jmorganca): this should most likely be passed in as part of a batch
-    // and not set on the context for all batches.
-    LLAMA_API void llama_set_cross_attention(struct llama_context * ctx, bool cross_attn_state);
-
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
 

@@ -930,14 +929,19 @@ extern "C" {
     // Frees a batch of tokens allocated with llama_batch_init()
     LLAMA_API void llama_batch_free(struct llama_batch batch);
 
-    // Processes a batch of tokens with the ecoder part of the encoder-decoder model.
-    // Stores the encoder output internally for later use by the decoder cross-attention layers.
+    // Process a batch of tokens.
+    // In contrast to llama_decode() - this call does not use KV cache.
+    // For encode-decoder contexts, processes the batch using the encoder.
+    // Can store the encoder output internally for later use by the decoder's cross-attention layers.
     // 0 - success
     // < 0 - error. the KV cache state is restored to the state before this call
     LLAMA_API int32_t llama_encode(
             struct llama_context * ctx,
             struct llama_batch batch);
 
+    // Process a batch of tokens.
+    // Requires KV cache.
+    // For encode-decoder contexts, processes the batch using the decoder.
     // Positive return values does not mean a fatal error, but rather a warning.
     // 0 - success
     // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
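Note: a minimal sketch of the encode/decode split documented above, assuming an encoder-decoder model is already loaded into ctx and the prompt is tokenized elsewhere; signatures follow llama.h as synced here, and error handling and sampling are omitted.

    #include "llama.h"

    // Sketch only: one encoder pass followed by the first decoder step.
    static void encode_then_decode(struct llama_context * ctx, llama_token * tokens, int32_t n_tokens) {
        // encoder pass: does not use the KV cache; the encoder output is kept for cross-attention
        llama_batch enc_batch = llama_batch_get_one(tokens, n_tokens);
        if (llama_encode(ctx, enc_batch) < 0) {
            return; // error: the KV cache state is unchanged
        }

        // decoder pass: requires the KV cache; feed the decoder start token first
        llama_token dec_start = llama_model_decoder_start_token(llama_get_model(ctx));
        llama_batch dec_batch = llama_batch_get_one(&dec_start, 1);
        if (llama_decode(ctx, dec_batch) != 0) {
            return; // 1: no KV slot found - reduce the batch or increase the context
        }
    }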
@@ -1434,6 +1438,37 @@ extern "C" {
     LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
     LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain);
 
+    //
+    // training
+    //
+
+    // function that returns whether or not a given tensor contains trainable parameters
+    typedef bool (*llama_opt_param_filter)(const struct ggml_tensor * tensor, void * userdata);
+
+    // always returns true
+    LLAMA_API bool llama_opt_param_filter_all(const struct ggml_tensor * tensor, void * userdata);
+
+    struct llama_opt_params {
+        uint32_t n_ctx_train; // assumed context size post training, use context size specified in llama_context if 0
+
+        llama_opt_param_filter param_filter; // callback for determining which tensors contain trainable parameters
+        void * param_filter_ud; // userdata for determining which tensors contain trainable parameters
+
+        ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
+        void * get_opt_pars_ud; // userdata for calculating optimizer parameters
+    };
+
+    LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params);
+
+    LLAMA_API void llama_opt_epoch(
+            struct llama_context * lctx,
+            ggml_opt_dataset_t dataset,
+            ggml_opt_result_t result_train,
+            ggml_opt_result_t result_eval,
+            int64_t idata_split,
+            ggml_opt_epoch_callback callback_train,
+            ggml_opt_epoch_callback callback_eval);
+
 #ifdef __cplusplus
 }
 #endif
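Note: the hunk above only declares the new training API; the sketch below shows how the pieces could be wired together for a single epoch. It is an illustration under stated assumptions, not the upstream finetuning example: the wrapper function, its arguments, and the use of ggml_opt_get_default_optimizer_params / ggml_opt_result_init from ggml-opt.h are choices made here.

    #include "llama.h"
    #include "ggml-opt.h"

    // Sketch only: run one training/evaluation epoch over a dataset prepared elsewhere
    // with the ggml-opt.h helpers (tokenized data laid out as the optimizer expects).
    static void run_one_epoch(struct llama_context * ctx, struct llama_model * model,
                              ggml_opt_dataset_t dataset, int64_t idata_split) {
        struct llama_opt_params opt_params = {
            /*.n_ctx_train     =*/ 0,                          // 0: use the context size of ctx
            /*.param_filter    =*/ llama_opt_param_filter_all, // mark every tensor as trainable
            /*.param_filter_ud =*/ nullptr,
            /*.get_opt_pars    =*/ ggml_opt_get_default_optimizer_params, // assumed default from ggml-opt.h
            /*.get_opt_pars_ud =*/ nullptr,
        };
        llama_opt_init(ctx, model, opt_params);

        ggml_opt_result_t result_train = ggml_opt_result_init();
        ggml_opt_result_t result_eval  = ggml_opt_result_init();

        // data before idata_split is used for training, the rest for evaluation; callbacks are optional
        llama_opt_epoch(ctx, dataset, result_train, result_eval, idata_split, nullptr, nullptr);

        ggml_opt_result_free(result_train);
        ggml_opt_result_free(result_eval);
    }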
6 llama/llama.cpp/src/llama-adapter.cpp vendored
@@ -253,6 +253,9 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
     std::vector<ggml_backend_buffer_type_t> buft_extra;
     {
         auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+        if (!cpu_dev) {
+            throw std::runtime_error(format("%s: no CPU backend found", __func__));
+        }
         auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
 
         auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)

@@ -291,6 +294,9 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
                 LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));
 
                 auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+                if (!cpu_dev) {
+                    throw std::runtime_error(format("%s: no CPU backend found", __func__));
+                }
                 buft = ggml_backend_dev_buffer_type(cpu_dev);
 
                 break;
44 llama/llama.cpp/src/llama-arch.cpp vendored
@@ -6,7 +6,6 @@
 
 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_LLAMA, "llama" },
-    { LLM_ARCH_MLLAMA, "mllama" },
     { LLM_ARCH_LLAMA4, "llama4" },
     { LLM_ARCH_DECI, "deci" },
     { LLM_ARCH_FALCON, "falcon" },

@@ -145,7 +144,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
     { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
     { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
-    { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
     { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
     { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
 

@@ -275,40 +273,6 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
         },
     },
-    {
-        LLM_ARCH_MLLAMA,
-        {
-            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
-            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
-            { LLM_TENSOR_OUTPUT, "output" },
-            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
-            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
-            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
-            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
-            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
-            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
-            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
-            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
-            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
-            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
-            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
-            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
-            { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
-            { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
-            { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
-            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
-            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
-            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
-            { LLM_TENSOR_CROSS_ATTN_K_NORM, "blk.%d.cross_attn_k_norm" },
-            { LLM_TENSOR_CROSS_ATTN_K_PROJ, "blk.%d.cross_attn_k_proj" },
-            { LLM_TENSOR_CROSS_ATTN_O_PROJ, "blk.%d.cross_attn_o_proj" },
-            { LLM_TENSOR_CROSS_ATTN_Q_NORM, "blk.%d.cross_attn_q_norm" },
-            { LLM_TENSOR_CROSS_ATTN_Q_PROJ, "blk.%d.cross_attn_q_proj" },
-            { LLM_TENSOR_CROSS_ATTN_V_PROJ, "blk.%d.cross_attn_v_proj" },
-            { LLM_TENSOR_CROSS_ATTN_ATTN_GATE, "blk.%d.cross_attn_attn_gate" },
-            { LLM_TENSOR_CROSS_ATTN_MLP_GATE, "blk.%d.cross_attn_mlp_gate" },
-        },
-    },
     {
         LLM_ARCH_DECI,
         {

@@ -1737,14 +1701,6 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     // this tensor is loaded for T5, but never used
     {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
     {LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-    {LLM_TENSOR_CROSS_ATTN_K_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-    {LLM_TENSOR_CROSS_ATTN_K_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_CROSS_ATTN_O_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_CROSS_ATTN_Q_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-    {LLM_TENSOR_CROSS_ATTN_Q_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_CROSS_ATTN_V_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_CROSS_ATTN_ATTN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-    {LLM_TENSOR_CROSS_ATTN_MLP_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_CONV1D, {LLM_TENSOR_LAYER_INPUT, GGML_OP_IM2COL}},
     {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
10 llama/llama.cpp/src/llama-arch.h vendored
@@ -11,7 +11,6 @@
 enum llm_arch {
     LLM_ARCH_LLAMA,
     LLM_ARCH_LLAMA4,
-    LLM_ARCH_MLLAMA,
     LLM_ARCH_DECI,
     LLM_ARCH_FALCON,
     LLM_ARCH_BAICHUAN,

@@ -149,7 +148,6 @@ enum llm_kv {
     LLM_KV_ATTENTION_SLIDING_WINDOW,
     LLM_KV_ATTENTION_SCALE,
     LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
-    LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,
     LLM_KV_ATTENTION_KEY_LENGTH_MLA,
     LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
 

@@ -351,14 +349,6 @@ enum llm_tensor {
     LLM_TENSOR_CLS,
     LLM_TENSOR_CLS_OUT,
     LLM_TENSOR_BSKCN_TV,
-    LLM_TENSOR_CROSS_ATTN_K_NORM,
-    LLM_TENSOR_CROSS_ATTN_K_PROJ,
-    LLM_TENSOR_CROSS_ATTN_O_PROJ,
-    LLM_TENSOR_CROSS_ATTN_Q_NORM,
-    LLM_TENSOR_CROSS_ATTN_Q_PROJ,
-    LLM_TENSOR_CROSS_ATTN_V_PROJ,
-    LLM_TENSOR_CROSS_ATTN_ATTN_GATE,
-    LLM_TENSOR_CROSS_ATTN_MLP_GATE,
     LLM_TENSOR_CONV1D,
     LLM_TENSOR_CONVNEXT_DW,
     LLM_TENSOR_CONVNEXT_NORM,
9 llama/llama.cpp/src/llama-batch.cpp vendored
@@ -189,7 +189,7 @@ llama_ubatch llama_sbatch::split_seq(size_t n_ubatch) {
     return ubatch;
 }
 
-void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
+llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
     GGML_ASSERT(batch.n_tokens >= 0);
     this->batch = &batch;
     this->n_embd = n_embd;

@@ -203,6 +203,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
     for (size_t i = 0; i < n_tokens; ++i) {
         ids[i] = i;
     }
 
     if (simple_split) {
         seq.resize(1);
         llama_sbatch_seq & s = seq[0];

@@ -212,6 +213,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
         s.length = n_tokens;
         return;
     }
 
     std::sort(ids.begin(), ids.end(),
         [&batch](size_t a, size_t b) {
             int32_t n_seq_a = batch.n_seq_id ? batch.n_seq_id[a] : 1;

@@ -239,6 +241,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
             return n_seq_a > n_seq_b;
         }
     );
 
     // init seq
     llama_sbatch_seq * last_seq = nullptr;
 

@@ -262,6 +265,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
         seq.push_back(new_seq);
         last_seq = &seq.back();
     }
 
     // keep shared prompts first at the end, then sort by length descending.
     std::sort(seq.begin(), seq.end(),
         [](llama_sbatch_seq & a, llama_sbatch_seq & b) {

@@ -316,7 +320,6 @@ struct llama_batch llama_batch_get_one(
         /*n_tokens =*/ n_tokens,
         /*tokens   =*/ tokens,
         /*embd     =*/ nullptr,
-        /*n_embd   =*/ 0,
         /*pos      =*/ nullptr,
         /*n_seq_id =*/ nullptr,
         /*seq_id   =*/ nullptr,

@@ -329,7 +332,6 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
         /*n_tokens =*/ 0,
         /*tokens   =*/ nullptr,
         /*embd     =*/ nullptr,
-        /*n_embd   =*/ 0,
         /*pos      =*/ nullptr,
         /*n_seq_id =*/ nullptr,
         /*seq_id   =*/ nullptr,

@@ -338,7 +340,6 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
 
     if (embd) {
         batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
-        batch.n_embd = embd;
     } else {
         batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
     }
 
3 llama/llama.cpp/src/llama-batch.h vendored
@@ -70,7 +70,8 @@ struct llama_sbatch {
     // sequence-wise split
     llama_ubatch split_seq(size_t n_ubatch);
 
-    void from_batch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
+    llama_sbatch() = default;
+    llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
 };
 
 // temporary allocate memory for the input batch if needed
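Note: a hypothetical call site for the constructor introduced above; batch, n_embd and logits_all are placeholders, and the old from_batch() form is kept in a comment for comparison.

    // before: an existing llama_sbatch member was re-initialized in place
    //     sbatch.from_batch(batch, n_embd, /*simple_split=*/ true, logits_all);
    // after: the llama_sbatch is constructed directly from the batch
    llama_sbatch sbatch(batch, n_embd, /*simple_split=*/ true, logits_all);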
24 llama/llama.cpp/src/llama-chat.cpp vendored
@@ -35,6 +35,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "mistral-v3", LLM_CHAT_TEMPLATE_MISTRAL_V3 },
     { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
     { "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 },
+    { "mistral-v7-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN },
     { "phi3", LLM_CHAT_TEMPLATE_PHI_3 },
     { "phi4", LLM_CHAT_TEMPLATE_PHI_4 },
     { "falcon3", LLM_CHAT_TEMPLATE_FALCON_3 },

@@ -202,19 +203,20 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|im_start|>assistant\n";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7 || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN) {
         // Official mistral 'v7' template
         // See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7
+        //      https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503#basic-instruct-template-v7-tekken
+        const char * trailing_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7 ? " " : "";
         for (auto message : chat) {
             std::string role(message->role);
             std::string content(message->content);
             if (role == "system") {
-                ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]";
+                ss << "[SYSTEM_PROMPT]" << trailing_space << content << "[/SYSTEM_PROMPT]";
             } else if (role == "user") {
-                ss << "[INST] " << content << "[/INST]";
-            }
-            else {
-                ss << " " << content << "</s>";
+                ss << "[INST]" << trailing_space << content << "[/INST]";
+            } else {
+                ss << trailing_space << content << "</s>";
             }
         }
     } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1
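Note: a worked example of the change above, using a hypothetical two-message chat (system "Always answer briefly.", user "Hi"). The two template variants differ only in the space that trailing_space inserts after the bracketed tags:

    mistral-v7:        [SYSTEM_PROMPT] Always answer briefly.[/SYSTEM_PROMPT][INST] Hi[/INST]
    mistral-v7-tekken: [SYSTEM_PROMPT]Always answer briefly.[/SYSTEM_PROMPT][INST]Hi[/INST]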
@@ -447,8 +449,16 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|assistant|>";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4 || tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4) {
         ss << "[gMASK]" << "<sop>";
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|" << role << "|>" << "\n" << message->content;
+        }
+        if (add_ass) {
+            ss << "<|assistant|>\n";
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
         for (auto message : chat) {
             std::string role(message->role);
             ss << "<|" << role << "|>" << "\n" << message->content;
1 llama/llama.cpp/src/llama-chat.h vendored
@@ -14,6 +14,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_MISTRAL_V3,
     LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
     LLM_CHAT_TEMPLATE_MISTRAL_V7,
+    LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN,
     LLM_CHAT_TEMPLATE_PHI_3,
     LLM_CHAT_TEMPLATE_PHI_4,
     LLM_CHAT_TEMPLATE_FALCON_3,
914 llama/llama.cpp/src/llama-context.cpp vendored (diff suppressed because it is too large)
80 llama/llama.cpp/src/llama-context.h vendored
@@ -8,6 +8,7 @@
 #include "llama-kv-cache.h"
 
 #include "ggml-cpp.h"
+#include "ggml-opt.h"
 
 #include <map>
 #include <vector>

@@ -28,7 +29,12 @@ struct llama_context {
 
     void synchronize();
 
     const llama_model & get_model() const;
+    const llama_cparams & get_cparams() const;
+
+    ggml_backend_sched_t get_sched() const;
+
+    ggml_context * get_ctx_compute() const;
 
     uint32_t n_ctx() const;
     uint32_t n_ctx_per_seq() const;

@@ -66,7 +72,6 @@ struct llama_context {
     void set_embeddings (bool value);
     void set_causal_attn(bool value);
     void set_warmup(bool value);
-    void set_cross_attn(bool value);
 
     void set_adapter_lora(
             llama_adapter_lora * adapter,

@@ -130,6 +135,32 @@ struct llama_context {
     llama_perf_context_data perf_get_data() const;
     void perf_reset();
 
+    //
+    // training
+    //
+
+    void opt_init(struct llama_model * model, struct llama_opt_params lopt_params);
+
+    void opt_epoch(
+            ggml_opt_dataset_t dataset,
+            ggml_opt_result_t result_train,
+            ggml_opt_result_t result_eval,
+            int64_t idata_split,
+            ggml_opt_epoch_callback callback_train,
+            ggml_opt_epoch_callback callback_eval);
+
+    void opt_epoch_iter(
+            ggml_opt_dataset_t dataset,
+            ggml_opt_result_t result,
+            const std::vector<llama_token> & tokens,
+            const std::vector<llama_token> & labels_sparse,
+            llama_batch & batch,
+            ggml_opt_epoch_callback callback,
+            bool train,
+            int64_t idata_in_loop,
+            int64_t ndata_in_loop,
+            int64_t t_loop_start);
+
 private:
     //
     // output

@@ -139,50 +170,30 @@ private:
     // Returns max number of outputs for which space was reserved.
     int32_t output_reserve(int32_t n_outputs);
 
-    // make the outputs have the same order they had in the user-provided batch
-    // TODO: maybe remove this
-    void output_reorder();
-
     //
     // graph
     //
 
+public:
     int32_t graph_max_nodes() const;
 
     // zero-out inputs and create the ctx_compute for the compute graph
     ggml_cgraph * graph_init();
 
-    llm_graph_result_ptr graph_build(
-            ggml_context * ctx,
-            ggml_cgraph * gf,
-            const llama_ubatch & ubatch,
-            llm_graph_type gtype);
-
     // returns the result of ggml_backend_sched_graph_compute_async execution
     ggml_status graph_compute(
             ggml_cgraph * gf,
             bool batched);
 
+private:
+    llm_graph_result_ptr graph_build(
+            ggml_context * ctx,
+            ggml_cgraph * gf,
+            const llama_ubatch & ubatch,
+            llm_graph_type gtype);
+
     llm_graph_cb graph_get_cb() const;
 
-    // used by kv_self_update()
-    ggml_tensor * build_rope_shift(
-            ggml_context * ctx0,
-            ggml_tensor * cur,
-            ggml_tensor * shift,
-            ggml_tensor * factors,
-            float freq_base,
-            float freq_scale) const;
-
-    llm_graph_result_ptr build_kv_self_shift(
-            ggml_context * ctx0,
-            ggml_cgraph * gf) const;
-
-    llm_graph_result_ptr build_kv_self_defrag(
-            ggml_context * ctx0,
-            ggml_cgraph * gf,
-            const std::vector<struct llama_kv_defrag_move> & moves) const;
-
     // TODO: read/write lora adapters and cvec
     size_t state_write_data(llama_io_write_i & io);
     size_t state_read_data (llama_io_read_i & io);

@@ -199,14 +210,10 @@ private:
     llama_cparams cparams;
     llama_adapter_cvec cvec;
     llama_adapter_loras loras;
-    llama_sbatch sbatch;
 
     llama_cross cross; // TODO: tmp for handling cross-attention - need something better probably
 
-    std::unique_ptr<llama_kv_cache_unified> kv_self;
+    std::unique_ptr<llama_memory_i> memory;
 
-    // TODO: remove
-    bool logits_all = false;
-
     // decode output (2-dimensional array: [n_outputs][n_vocab])
     size_t logits_size = 0; // capacity (of floats) for logits

@@ -233,6 +240,9 @@ private:
 
     ggml_context_ptr ctx_compute;
 
+    // training
+    ggml_opt_context_t opt_ctx = nullptr;
+
     ggml_threadpool_t threadpool = nullptr;
     ggml_threadpool_t threadpool_batch = nullptr;
 
2 llama/llama.cpp/src/llama-cparams.h vendored
@@ -29,8 +29,8 @@ struct llama_cparams {
     bool offload_kqv;
     bool flash_attn;
     bool no_perf;
-    bool cross_attn;
     bool warmup;
+    bool op_offload;
 
     enum llama_pooling_type pooling_type;
 
83 llama/llama.cpp/src/llama-graph.cpp vendored
@@ -284,24 +284,7 @@ void llm_graph_input_s_copy::set_input(const llama_ubatch * ubatch) {
 
         // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
         for (uint32_t i = 0; i < n_kv; ++i) {
-            const uint32_t cell_id = i + kv_self->head;
-
-            //////////////////////////////////////////////
-            // TODO: this should not mutate the KV cache !
-            llama_kv_cell & kv_cell = const_cast<class llama_kv_cache_unified *>(kv_self)->cells[i];
-
-            // prevent out-of-bound sources
-            if (kv_cell.src < 0 || (uint32_t) kv_cell.src >= kv_self->size) {
-                kv_cell.src = cell_id;
-            }
-
-            data[i] = kv_cell.src;
-
-            // TODO: do not mutate the KV cache
-            // ensure copy only happens once
-            if (kv_cell.src != (int32_t) cell_id) {
-                kv_cell.src = cell_id;
-            }
+            data[i] = kv_self->s_copy(i);
         }
     }
 }

@@ -317,18 +300,7 @@ void llm_graph_input_s_mask::set_input(const llama_ubatch * ubatch) {
 
         // clear unused states
         for (int i = 0; i < n_kv; ++i) {
-            const uint32_t cell_id = i + kv_self->head;
-
-            //////////////////////////////////////////////
-            // TODO: this should not mutate the KV cache !
-            llama_kv_cell & kv_cell = const_cast<class llama_kv_cache_unified *>(kv_self)->cells[i];
-
-            data[i] = (float) (kv_cell.src >= 0);
-
-            // only clear once
-            if (kv_cell.src < 0) {
-                kv_cell.src = cell_id;
-            }
+            data[i] = kv_self->s_mask(i);
         }
     }
 }

@@ -560,12 +532,6 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
     }
 }
 
-void llm_graph_input_cross_attn_state::set_input(const llama_ubatch * ubatch) {
-    if (ubatch->embd) {
-        ggml_backend_tensor_set(cross_attn_state, ubatch->embd, 0, ggml_nbytes(cross_attn_state));
-    }
-}
-
 //
 // llm_graph_context
 //

@@ -816,7 +782,7 @@ ggml_tensor * llm_graph_context::build_ffn(
             } break;
     }
 
-    if (type_gate == LLM_FFN_PAR) {
+    if (gate && type_gate == LLM_FFN_PAR) {
         cur = ggml_mul(ctx0, cur, tmp);
         cb(cur, "ffn_gate_par", il);
     }

@@ -1005,6 +971,7 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
         inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
         //cb(inp->tokens, "inp_tokens", -1);
         ggml_set_input(inp->tokens);
+        res->t_tokens = inp->tokens;
 
         cur = ggml_get_rows(ctx0, tok_embd, inp->tokens);
 

@@ -1111,7 +1078,7 @@ ggml_tensor * llm_graph_context::build_inp_cls() const {
 }
 
 ggml_tensor * llm_graph_context::build_inp_s_copy() const {
-    const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+    const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
 
     auto inp = std::make_unique<llm_graph_input_s_copy>(kv_self);
 

@@ -1128,7 +1095,7 @@ ggml_tensor * llm_graph_context::build_inp_s_copy() const {
 }
 
 ggml_tensor * llm_graph_context::build_inp_s_mask() const {
-    const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+    const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
 
     auto inp = std::make_unique<llm_graph_input_s_mask>(kv_self);
 

@@ -1261,8 +1228,19 @@ ggml_tensor * llm_graph_context::build_attn_mha(
             ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
 
             if (v_mla) {
+#if 0
+                // v_mla can be applied as a matrix-vector multiplication with broadcasting across dimension 3 == n_tokens.
+                // However, the code is optimized for dimensions 0 and 1 being large, so this is ineffient.
                 cur = ggml_reshape_4d(ctx0, cur, v_mla->ne[0], 1, n_head, n_tokens);
                 cur = ggml_mul_mat(ctx0, v_mla, cur);
+#else
+                // It's preferable to do the calculation as a matrix-matrix multiplication with n_tokens in dimension 1.
+                // The permutations are noops and only change how the tensor data is interpreted.
+                cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+                cur = ggml_mul_mat(ctx0, v_mla, cur);
+                cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+                cur = ggml_cont(ctx0, cur); // Needed because ggml_reshape_2d expects contiguous inputs.
+#endif
             }
 
             cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens);
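Note: a short explanatory gloss on the #else branch above, phrased as comments; the dimension reasoning restates ggml's mul_mat conventions and is an interpretation, not text from the patch.

    // ggml_mul_mat(A, B) multiplies over dimensions 0/1 of B and broadcasts over dimensions 2/3.
    // With n_tokens sitting in a higher dimension, applying v_mla degenerates into many small
    // matrix-vector products (the #if 0 path). ggml_permute(ctx0, cur, 0, 2, 1, 3) swaps
    // dimensions 1 and 2 of cur without copying data, which moves n_tokens into dimension 1,
    // so v_mla is applied as one matrix-matrix product per broadcast slice instead.
    // The second permute undoes the swap and ggml_cont makes the result contiguous so that
    // the following ggml_reshape_2d is valid.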
@@ -1442,8 +1420,6 @@ ggml_tensor * llm_graph_context::build_attn(
 
     // store to KV cache
     {
-        GGML_ASSERT(!kv_self->recurrent);
-
         const auto kv_head = kv_self->head;
 
         GGML_ASSERT(kv_self->size == n_ctx);

@@ -1538,25 +1514,6 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
     return (llm_graph_input_attn_cross *) res->add_input(std::move(inp));
 }
 
-ggml_tensor * llm_graph_context::build_inp_cross_attn_state() const {
-    const int64_t n_embd = hparams.n_embd;
-
-    auto inp = std::make_unique<llm_graph_input_cross_attn_state>();
-
-    ggml_tensor * cur = nullptr;
-
-    inp->cross_attn_state = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd, 1601, 4);
-    ggml_set_input(inp->cross_attn_state);
-
-    cur = inp->cross_attn_state;
-
-    cb(cur, "inp_cross_attn_state", -1);
-
-    res->add_input(std::move(inp));
-
-    return cur;
-}
-
 ggml_tensor * llm_graph_context::build_attn(
         llm_graph_input_attn_cross * inp,
         ggml_cgraph * gf,

@@ -1612,7 +1569,7 @@ ggml_tensor * llm_graph_context::build_copy_mask_state(
         ggml_tensor * state_mask,
         int32_t n_state,
         int32_t n_seqs) const {
-    const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+    const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
 
     const auto n_kv = kv_self->n;
     const auto kv_head = kv_self->head;

@@ -1644,7 +1601,7 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(
         ggml_tensor * state_mask,
         const llama_ubatch & ubatch,
         int il) const {
-    const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+    const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
 
     const auto token_shift_count = hparams.token_shift_count;
 

@@ -1665,7 +1622,7 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_store(
         ggml_tensor * token_shift,
         const llama_ubatch & ubatch,
         int il) const {
-    const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+    const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
 
     const auto token_shift_count = hparams.token_shift_count;
     const auto n_embd = hparams.n_embd;
32 llama/llama.cpp/src/llama-graph.h vendored
@@ -19,6 +19,7 @@ struct llama_cparams;
 
 class llama_memory_i;
 class llama_kv_cache_unified;
+class llama_kv_cache_recurrent;
 
 // certain models (typically multi-modal) can produce different types of graphs
 enum llm_graph_type {

@@ -86,7 +87,6 @@ public:
 
     ggml_tensor * tokens = nullptr; // I32 [n_batch]
     ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch]
-    ggml_tensor * cross_attn_state; // F32 [4, n_embd, 1061]
 };
 
 class llm_graph_input_pos : public llm_graph_input_i {

@@ -187,26 +187,26 @@ public:
 
 class llm_graph_input_s_copy : public llm_graph_input_i {
 public:
-    llm_graph_input_s_copy(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {}
+    llm_graph_input_s_copy(const llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {}
     virtual ~llm_graph_input_s_copy() = default;
 
     void set_input(const llama_ubatch * ubatch) override;
 
     ggml_tensor * s_copy; // I32 [kv_size]
 
-    const llama_kv_cache_unified * kv_self;
+    const llama_kv_cache_recurrent * kv_self;
 };
 
 class llm_graph_input_s_mask : public llm_graph_input_i {
 public:
-    llm_graph_input_s_mask(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {}
+    llm_graph_input_s_mask(const llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {}
    virtual ~llm_graph_input_s_mask() = default;
 
     void set_input(const llama_ubatch * ubatch) override;
 
     ggml_tensor * s_mask; // F32 [1, n_kv]
 
-    const llama_kv_cache_unified * kv_self;
+    const llama_kv_cache_recurrent * kv_self;
 };
 
 class llm_graph_input_cross_embd : public llm_graph_input_i {

@@ -284,16 +284,6 @@ public:
     const llama_cross * cross = nullptr;
 };
 
-class llm_graph_input_cross_attn_state : public llm_graph_input_i {
-public:
-    llm_graph_input_cross_attn_state() = default;
-    virtual ~llm_graph_input_cross_attn_state() = default;
-
-    void set_input(const llama_ubatch * ubatch) override;
-
-    ggml_tensor * cross_attn_state; // F32 [4, n_embd, 1061]
-};
-
 //
 // llm_graph_result
 //

@@ -308,6 +298,7 @@ class llm_graph_result_i {
 public:
     virtual ~llm_graph_result_i() = default;
 
+    virtual ggml_tensor * get_tokens() = 0;
     virtual ggml_tensor * get_logits() = 0;
     virtual ggml_tensor * get_embd() = 0;
     virtual ggml_tensor * get_embd_pooled() = 0;

@@ -322,6 +313,7 @@ class llm_graph_result : public llm_graph_result_i {
 public:
     virtual ~llm_graph_result() = default;
 
+    ggml_tensor * get_tokens() override { return t_tokens; }
     ggml_tensor * get_logits() override { return t_logits; }
     ggml_tensor * get_embd() override { return t_embd; }
     ggml_tensor * get_embd_pooled() override { return t_embd_pooled; }

@@ -338,6 +330,7 @@ public:
     }
 
     // important graph nodes
+    ggml_tensor * t_tokens = nullptr;
     ggml_tensor * t_logits = nullptr;
     ggml_tensor * t_embd = nullptr;
     ggml_tensor * t_embd_pooled = nullptr;

@@ -361,8 +354,8 @@ struct llm_graph_params {
     const llama_cparams & cparams;
     const llama_ubatch & ubatch;
 
-    ggml_backend_sched * sched;
-    ggml_backend * backend_cpu;
+    ggml_backend_sched_t sched;
+    ggml_backend_t backend_cpu;
 
     const llama_adapter_cvec * cvec;
     const llama_adapter_loras * loras;

@@ -413,9 +406,9 @@ struct llm_graph_context {
 
     ggml_context * ctx0 = nullptr;
 
-    ggml_backend_sched * sched;
+    ggml_backend_sched_t sched;
 
-    ggml_backend * backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?
+    ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?
 
     const llama_adapter_cvec * cvec;
     const llama_adapter_loras * loras;

@@ -502,7 +495,6 @@ struct llm_graph_context {
     ggml_tensor * build_inp_cls() const;
     ggml_tensor * build_inp_s_copy() const;
     ggml_tensor * build_inp_s_mask() const;
-    ggml_tensor * build_inp_cross_attn_state() const;
 
     ggml_tensor * build_inp_cross_embd() const;
     ggml_tensor * build_inp_pos_bucket_enc() const;
4 llama/llama.cpp/src/llama-hparams.cpp vendored
@@ -85,7 +85,3 @@ bool llama_hparams::is_swa(uint32_t il) const {
 
     GGML_ABORT("fatal error");
 }
-
-bool llama_hparams::cross_attention_layers(uint32_t il) const {
-    return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
-}
7 llama/llama.cpp/src/llama-hparams.h vendored
@@ -2,8 +2,6 @@
 
 #include "llama.h"
 
-#include <algorithm>
-
 #include <array>
 
 // bump if necessary

@@ -44,7 +42,6 @@ struct llama_hparams {
     uint32_t n_expert = 0;
     uint32_t n_expert_used = 0;
     uint32_t n_rel_attn_bkts = 0;
-    uint32_t n_vocab = 0;
 
     // note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
     uint32_t n_embd_head_k_mla = 0;

@@ -59,7 +56,6 @@ struct llama_hparams {
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
 
     std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr = {};
-    std::array<uint32_t, LLAMA_MAX_LAYERS> cross_attn_layers;
 
     uint32_t n_layer_dense_lead = 0;
     uint32_t n_lora_q = 0;

@@ -163,9 +159,6 @@ struct llama_hparams {
     // Block skip connection
     bool n_bskcn(uint32_t n, uint32_t il) const;
 
-    // cross attention layers
-    bool cross_attention_layers(uint32_t il) const;
-
     bool is_swa(uint32_t il) const;
 };
 
1826 llama/llama.cpp/src/llama-kv-cache.cpp vendored
File diff suppressed because it is too large.
405 llama/llama.cpp/src/llama-kv-cache.h vendored
@@ -2,32 +2,72 @@
 
 #include "llama.h"
 #include "llama-io.h"
+#include "llama-graph.h"
 #include "llama-memory.h"
 
 #include "ggml-cpp.h"
 
-#include <functional>
 #include <set>
 #include <vector>
 
 struct llama_cparams;
 struct llama_hparams;
 struct llama_ubatch;
+struct llama_sbatch;
+struct llama_model;
+struct llama_context;
 
 struct llama_kv_cache : public llama_memory_i {
-    using llama_memory_i::llama_memory_i;
+    virtual ~llama_kv_cache() = default;
 
-    virtual void restore() = 0; // call if batch processing fails - restores the cache state
-    virtual void commit() = 0; // call after successful batch processing - clears any pending state
+    // call if batch processing fails - restores the cache state
+    virtual void restore() = 0;
 
-    virtual int32_t get_n_tokens() const = 0;
-    virtual int32_t get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache
+    // call after successful batch processing - clears any pending state
+    virtual void commit() = 0;
 
-    virtual bool get_can_shift() const = 0;
+    // process any pending defrag/shift/etc. operations
+    // optionally call once before processing a new batch
+    virtual bool update(llama_context & lctx) = 0;
+
+    // schedule a defrag if the fragmentation threshold is exceeded. otherwise, do nothing
+    virtual void defrag_sched(float thold) = 0;
+
+    // simulate full cache, used for allocating worst-case compute buffers
+    virtual void set_full() = 0;
+
+    //
+    // batch processing
+    //
+
+    virtual llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) = 0;
+
+    // different KV caches require different batch splitting strategies
+    virtual llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const = 0;
+
+    // find an empty slot of size "n_tokens" in the cache
+    virtual bool find_slot(const llama_ubatch & batch) = 0;
+
+    // getters
+    virtual int32_t get_n_tokens() const = 0;
+    virtual int32_t get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache
+    virtual llama_pos get_pos_max() const = 0;
+    virtual bool get_can_shift() const = 0;
 
     bool get_can_edit() const override { return get_can_shift(); }
 
+    //
+    // state write/read
+    //
+
+    virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const = 0;
+    virtual void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) = 0;
 };
 
+//
+// llama_kv_cache_guard
+//
+
 struct llama_kv_cache_guard {
     llama_kv_cache_guard(llama_kv_cache * kv) : kv(kv) {}
 
@@ -42,7 +82,7 @@ struct llama_kv_cache_guard {
 private:
     llama_kv_cache * kv;
 };
 
 // block of KV slots to move when defragging
 struct llama_kv_defrag_move {
     uint32_t src;
@@ -50,65 +90,50 @@ struct llama_kv_defrag_move {
     uint32_t len;
 };
 
-struct llama_kv_cell {
-    llama_pos pos   = -1;
-    llama_pos delta = 0;
-    int32_t   src   = -1; // used by recurrent state models to copy states
-    int32_t   tail  = -1;
-
-    std::set<llama_seq_id> seq_id;
-
-    bool has_seq_id(const llama_seq_id & id) const {
-        return seq_id.find(id) != seq_id.end();
-    }
-
-    bool is_empty() const {
-        return seq_id.empty();
-    }
-
-    bool is_same_seq(const llama_kv_cell & other) const {
-        return seq_id == other.seq_id;
-    }
-};
-
-// ring-buffer of cached KV data
-// TODO: pimpl
+//
+// llama_kv_cache_unified
+//
+
 // TODO: add notion of max sequences
 class llama_kv_cache_unified : public llama_kv_cache {
 public:
-    // can be used to query data from the model if needed
-    struct callbacks {
-        std::function<ggml_tensor * (uint32_t n_ctx_per_seq, int il)> get_rope_factors;
+    struct kv_cell {
+        llama_pos pos   = -1;
+        llama_pos delta = 0;
+
+        std::set<llama_seq_id> seq_id;
+
+        bool has_seq_id(const llama_seq_id & id) const {
+            return seq_id.find(id) != seq_id.end();
+        }
+
+        bool is_empty() const {
+            return seq_id.empty();
+        }
+
+        bool is_same_seq(const kv_cell & other) const {
+            return seq_id == other.seq_id;
+        }
     };
 
+    static uint32_t get_padding(const llama_cparams & cparams);
+
     llama_kv_cache_unified(
-            const llama_hparams & hparams,
-            callbacks cbs);
-
-    virtual ~llama_kv_cache_unified() = default;
-
-    // TODO: become constructor
-    bool init(
-            const llama_model & model, // TODO: do not reference the model
-            const llama_cparams & cparams,
+            const llama_model & model,
             ggml_type type_k,
             ggml_type type_v,
+            bool v_trans,
+            bool offload,
             uint32_t kv_size,
-            bool offload);
+            uint32_t padding);
 
-    int32_t get_n_tokens() const override;
-    int32_t get_used_cells() const override;
+    ~llama_kv_cache_unified() = default;
 
-    size_t total_size() const;
-
-    // TODO: better data structures to reduce the cost of this operation
-    llama_pos pos_max() const;
+    //
+    // llama_memory_i
+    //
 
     void clear() override;
-    void defrag() override;
-
-    virtual void restore() override;
-    virtual void commit() override;
 
     bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
     void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
@@ -118,63 +143,40 @@ public:
 
     llama_pos seq_pos_max(llama_seq_id seq_id) const override;
 
-    bool get_can_shift() const override;
+    //
+    // llama_kv_cache
+    //
+
+    void restore() override;
+    void commit() override;
+
+    bool update(llama_context & ctx) override;
+
+    void defrag_sched(float thold) override;
+
+    void set_full() override;
+
+    llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override;
+
+    llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override;
 
-    // find an empty slot of size "n_tokens" in the cache
     // updates the cache head
     // Note: On success, it's important that cache.head points
     // to the first cell of the slot.
-    bool find_slot(const llama_ubatch & batch);
+    bool find_slot(const llama_ubatch & batch) override;
 
-    // TODO: maybe not needed
-    uint32_t get_padding(const llama_cparams & cparams) const;
+    int32_t get_n_tokens() const override;
+    int32_t get_used_cells() const override;
 
-    // find how many cells are currently in use
-    uint32_t cell_max() const;
+    // TODO: better data structures to reduce the cost of this operation
+    llama_pos get_pos_max() const override;
 
-    size_t size_k_bytes() const;
-    size_t size_v_bytes() const;
-
-    // defrag
-
-    struct {
-        std::vector<llama_kv_defrag_move> moves;
-    } defrag_info;
-
-    // return true if cells have been moved
-    bool defrag_prepare(int32_t n_max_nodes);
-
-    // commit/restore cache
-
-    struct slot_range {
-        uint32_t c0 = 0; // note: these are cell indices, not sequence positions
-        uint32_t c1 = 0;
-    };
-
-    // pending cell updates that are not yet committed
-    struct {
-        std::vector<slot_range> ranges;
-    } pending;
+    bool get_can_shift() const override;
 
     // state write/load
 
-    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const;
-    void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1);
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
+    void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
 
-    // members
-
-    const llama_hparams & hparams;
-
-    callbacks cbs;
-
-    bool has_shift = false;
-    bool do_defrag = false;
-
-    // TODO: remove this and implement llama_kv_cache_recurrent instead
-    bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token
-
-    bool v_trans = true;  // the value tensor is transposed
-    bool can_shift = false;
-
     // Note: The value of head isn't only used to optimize searching
     // for a free KV slot. llama_decode_impl also uses it, so it
@@ -186,18 +188,214 @@ public:
     // computed before each graph build
     uint32_t n = 0;
 
-    std::vector<llama_kv_cell> cells;
+    std::vector<kv_cell> cells;
 
     std::vector<ggml_tensor *> k_l; // per layer
     std::vector<ggml_tensor *> v_l;
 
 private:
+    const llama_model & model;
+    const llama_hparams & hparams;
+
+    bool has_shift = false;
+    bool do_defrag = false;
+
+    bool v_trans = true;  // the value tensor is transposed
+    bool can_shift = false;
+
+    // required padding
+    uint32_t padding = 1;
+
     ggml_type type_k = GGML_TYPE_F16;
     ggml_type type_v = GGML_TYPE_F16;
 
     std::vector<ggml_context_ptr> ctxs;
     std::vector<ggml_backend_buffer_ptr> bufs;
 
+    // defrag
+    struct {
+        std::vector<llama_kv_defrag_move> moves;
+    } defrag_info;
+
+    // return true if cells have been moved
+    bool defrag_prepare(int32_t n_max_nodes);
+
+    // commit/restore cache
+    struct slot_range {
+        uint32_t c0 = 0; // note: these are cell indices, not sequence positions
+        uint32_t c1 = 0;
+    };
+
+    // pending cell updates that are not yet committed
+    struct {
+        std::vector<slot_range> ranges;
+    } pending;
+
+    // find how many cells are currently in use
+    uint32_t cell_max() const;
+
+    size_t total_size() const;
+
+    size_t size_k_bytes() const;
+    size_t size_v_bytes() const;
+
+    ggml_tensor * build_rope_shift(
+            const llama_cparams & cparams,
+            ggml_context * ctx,
+            ggml_tensor * cur,
+            ggml_tensor * shift,
+            ggml_tensor * factors,
+            float freq_base,
+            float freq_scale) const;
+
+    llm_graph_result_ptr build_graph_shift(
+            const llama_cparams & cparams,
+            ggml_context * ctx,
+            ggml_cgraph * gf) const;
+
+    llm_graph_result_ptr build_graph_defrag(
+            const llama_cparams & cparams,
+            ggml_context * ctx,
+            ggml_cgraph * gf,
+            const std::vector<llama_kv_defrag_move> & moves) const;
+
+    void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
+    void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
+
+    bool state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
+    bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
+};
+
+//
+// llama_kv_cache_recurrent
+//
+
+class llama_kv_cache_recurrent : public llama_kv_cache {
+public:
+    struct kv_cell {
+        llama_pos pos = -1;
+        int32_t   src = -1; // used to copy states
+        int32_t  tail = -1;
+
+        std::set<llama_seq_id> seq_id;
+
+        bool has_seq_id(const llama_seq_id & id) const {
+            return seq_id.find(id) != seq_id.end();
+        }
+
+        bool is_empty() const {
+            return seq_id.empty();
+        }
+
+        bool is_same_seq(const kv_cell & other) const {
+            return seq_id == other.seq_id;
+        }
+    };
+
+    llama_kv_cache_recurrent(
+            const llama_model & model,
+            ggml_type type_k,
+            ggml_type type_v,
+            bool offload,
+            uint32_t kv_size);
+
+    ~llama_kv_cache_recurrent() = default;
+
+    //
+    // llama_memory_i
+    //
+
+    void clear() override;
+
+    bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
+    void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
+    void seq_keep(llama_seq_id seq_id) override;
+    void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) override;
+    void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override;
+
+    llama_pos seq_pos_max(llama_seq_id seq_id) const override;
+
+    //
+    // llama_kv_cache
+    //
+
+    void restore() override;
+    void commit() override;
+
+    bool update(llama_context & lctx) override;
+
+    void defrag_sched(float thold) override;
+
+    void set_full() override;
+
+    llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override;
+
+    llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override;
+
+    bool find_slot(const llama_ubatch & batch) override;
+
+    int32_t get_n_tokens() const override;
+    int32_t get_used_cells() const override;
+
+    // TODO: better data structures to reduce the cost of this operation
+    llama_pos get_pos_max() const override;
+
+    bool get_can_shift() const override;
+
+    // TODO: temporary methods - they are not really const as they do const_cast<>, fix this
+    int32_t s_copy(int i) const;
+    float   s_mask(int i) const;
+
+    // state write/load
+
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
+    void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
+
+    // Note: The value of head isn't only used to optimize searching
+    // for a free KV slot. llama_decode_impl also uses it, so it
+    // cannot be freely changed after a slot has been allocated.
+    uint32_t head = 0;
+    uint32_t size = 0;
+    uint32_t used = 0; // used cells (i.e. at least one seq_id)
+
+    // computed before each graph build
+    uint32_t n = 0;
+
+    std::vector<kv_cell> cells;
+
+    std::vector<ggml_tensor *> k_l; // per layer
+    std::vector<ggml_tensor *> v_l;
+
+private:
+    //const llama_model & model;
+    const llama_hparams & hparams;
+
+    // commit/restore cache
+    // TODO: rework for recurrent cache
+    struct slot_range {
+        uint32_t c0 = 0; // note: these are cell indices, not sequence positions
+        uint32_t c1 = 0;
+    };
+
+    // pending cell updates that are not yet committed
+    struct {
+        std::vector<slot_range> ranges;
+    } pending;
+
+    ggml_type type_k = GGML_TYPE_F16;
+    ggml_type type_v = GGML_TYPE_F16;
+
+    std::vector<ggml_context_ptr> ctxs;
+    std::vector<ggml_backend_buffer_ptr> bufs;
+
+    // find how many cells are currently in use
+    uint32_t cell_max() const;
+
+    size_t total_size() const;
+
+    size_t size_k_bytes() const;
+    size_t size_v_bytes() const;
+
     void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
     void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
 
@@ -205,11 +403,6 @@ private:
     bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
 };
 
-// TODO: temporary reusing llama_kv_cache_unified -- implement recurrent cache and simplify llama_kv_cache_unified
-//class llama_kv_cache_recurrent : public llama_kv_cache_unified {
-//public:
-//    using llama_kv_cache_unified::llama_kv_cache_unified;
-//};
-
 //
 // kv cache view
||||||
12
llama/llama.cpp/src/llama-memory.h
vendored
12
llama/llama.cpp/src/llama-memory.h
vendored
|
|
@ -2,12 +2,22 @@
|
||||||
|
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
|
struct llama_memory_params {
|
||||||
|
// kv cache
|
||||||
|
ggml_type type_k;
|
||||||
|
ggml_type type_v;
|
||||||
|
|
||||||
|
// parameters for other types of memory
|
||||||
|
// ...
|
||||||
|
};
|
||||||
|
|
||||||
// general concept of LLM memory
|
// general concept of LLM memory
|
||||||
// the KV cache is a type of LLM memory, but there can be other types
|
// the KV cache is a type of LLM memory, but there can be other types
|
||||||
class llama_memory_i {
|
class llama_memory_i {
|
||||||
public:
|
public:
|
||||||
|
virtual ~llama_memory_i() = default;
|
||||||
|
|
||||||
virtual void clear() = 0;
|
virtual void clear() = 0;
|
||||||
virtual void defrag() = 0;
|
|
||||||
|
|
||||||
virtual bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) = 0;
|
virtual bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) = 0;
|
||||||
virtual void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) = 0;
|
virtual void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) = 0;
|
||||||
|
|
|
||||||
26 llama/llama.cpp/src/llama-model-loader.cpp vendored
@@ -301,12 +301,12 @@ namespace GGUFMeta {
             GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
 
         switch (arr_info.gt) {
-            case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
-            case GGUF_TYPE_INT32:   GGML_ASSERT(
-                                        (std::is_same<T, int32_t>::value) ||
-                                        (std::is_same<T, uint32_t>::value)); break;
+            case GGUF_TYPE_UINT32:
+            case GGUF_TYPE_INT32:   GGML_ASSERT((std::is_same<T, int32_t>::value) ||
+                                                (std::is_same<T, uint32_t>::value)); break;
+            case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
             default:
-                throw std::runtime_error(format("%s is not a float32, int32 array", key.c_str()));
+                throw std::runtime_error(format("%s is not a float32/uint32/int32 array", key.c_str()));
         }
 
         result.resize(arr_info.length);
@@ -315,8 +315,6 @@ namespace GGUFMeta {
         return true;
     }
 
-    template bool llama_model_loader::get_arr<std::array<unsigned int, 512>>(enum llm_kv kid, std::array<unsigned int, 512>& result, bool required);
-
     template<typename T, size_t N_MAX>
     bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
         const int kid = gguf_find_key(meta.get(), key.c_str());
@@ -332,12 +330,12 @@ namespace GGUFMeta {
             GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
 
         switch (arr_info.gt) {
-            case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
-            case GGUF_TYPE_INT32:   GGML_ASSERT(
-                                        (std::is_same<T, int32_t>::value) ||
-                                        (std::is_same<T, uint32_t>::value)); break;
+            case GGUF_TYPE_UINT32:
+            case GGUF_TYPE_INT32:   GGML_ASSERT((std::is_same<T, int32_t>::value) ||
+                                                (std::is_same<T, uint32_t>::value)); break;
+            case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
             default:
-                throw std::runtime_error(format("%s is not a float32, int32 array", key.c_str()));
+                throw std::runtime_error(format("%s is not a float32/uint32/int32 array", key.c_str()));
         }
 
         if (arr_info.length > N_MAX) {
@@ -826,6 +824,10 @@ void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps
     mmaps_used.reserve(files.size());
     for (const auto & file : files) {
         auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
+        if (!reg) {
+            throw std::runtime_error(format("%s: no CPU backend found", __func__));
+        }
+
         auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
         std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa_fn());
         mmaps_used.emplace_back(mapping->size(), 0);
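With the widened type check above, get_arr can fill a uint32_t container from a GGUF array stored as either int32 or uint32. A hedged sketch using the string-key overload shown in the surrounding context (the key name and array size are placeholders, not taken from this diff):

    // illustrative only: "example.per_layer_values" is a hypothetical key
    std::array<uint32_t, 512> values = {};
    const bool found = ml.get_arr("example.per_layer_values", values, /*required=*/false);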
281 llama/llama.cpp/src/llama-model-saver.cpp vendored Normal file
@@ -0,0 +1,281 @@
+#include "llama-model-saver.h"
+
+#include "gguf.h"
+
+#include "llama.h"
+#include "llama-hparams.h"
+#include "llama-model.h"
+#include "llama-vocab.h"
+
+#include <string>
+
+llama_model_saver::llama_model_saver(const struct llama_model & model) : model(model), llm_kv(model.arch) {
+    gguf_ctx = gguf_init_empty();
+}
+
+llama_model_saver::~llama_model_saver() {
+    gguf_free(gguf_ctx);
+}
+
+void llama_model_saver::add_kv(const enum llm_kv key, const uint32_t value) {
+    gguf_set_val_u32(gguf_ctx, llm_kv(key).c_str(), value);
+}
+
+void llama_model_saver::add_kv(const enum llm_kv key, const int32_t value) {
+    gguf_set_val_i32(gguf_ctx, llm_kv(key).c_str(), value);
+}
+
+void llama_model_saver::add_kv(const enum llm_kv key, const float value) {
+    gguf_set_val_f32(gguf_ctx, llm_kv(key).c_str(), value);
+}
+
+void llama_model_saver::add_kv(const enum llm_kv key, const bool value) {
+    gguf_set_val_bool(gguf_ctx, llm_kv(key).c_str(), value);
+}
+
+void llama_model_saver::add_kv(const enum llm_kv key, const char * value) {
+    gguf_set_val_str(gguf_ctx, llm_kv(key).c_str(), value);
+}
+
+[[noreturn]]
+void llama_model_saver::add_kv(const enum llm_kv key, const char value) {
+    GGML_UNUSED(key);
+    GGML_UNUSED(value);
+    GGML_ABORT("fatal error"); // this should never be called, only needed to make the template below compile
+}
+
+template <typename Container>
+void llama_model_saver::add_kv(const enum llm_kv key, const Container & value, const bool per_layer) {
+    const size_t n_values = per_layer ? size_t(model.hparams.n_layer) : value.size();
+    GGML_ASSERT(n_values <= value.size());
+
+    if (n_values == 0) {
+        return;
+    }
+
+    if (per_layer) {
+        bool all_values_the_same = true;
+        for (size_t i = 1; i < n_values; ++i) {
+            if (value[i] != value[0]) {
+                all_values_the_same = false;
+                break;
+            }
+        }
+        if (all_values_the_same) {
+            add_kv(key, value[0]);
+            return;
+        }
+    }
+
+    if (std::is_same<typename Container::value_type, uint8_t>::value) {
+        gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_UINT8, value.data(), n_values);
+    } else if (std::is_same<typename Container::value_type, int8_t>::value) {
+        gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_INT8, value.data(), n_values);
+    } else if (std::is_same<typename Container::value_type, uint32_t>::value) {
+        gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_UINT32, value.data(), n_values);
+    } else if (std::is_same<typename Container::value_type, int32_t>::value) {
+        gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_INT32, value.data(), n_values);
+    } else if (std::is_same<typename Container::value_type, float>::value) {
+        gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_FLOAT32, value.data(), n_values);
+    } else if (std::is_same<Container, std::string>::value) {
+        gguf_set_val_str(gguf_ctx, llm_kv(key).c_str(), reinterpret_cast<const char *>(value.data()));
+    } else {
+        GGML_ABORT("fatal error");
+    }
+}
+
+void llama_model_saver::add_kv(const enum llm_kv key, const std::vector<std::string> & value) {
+    std::vector<const char *> tmp(value.size());
+    for (size_t i = 0; i < value.size(); ++i) {
+        tmp[i] = value[i].c_str();
+    }
+    gguf_set_arr_str(gguf_ctx, llm_kv(key).c_str(), tmp.data(), tmp.size());
+}
+
+void llama_model_saver::add_tensor(const struct ggml_tensor * tensor) {
+    if (!tensor) {
+        return;
+    }
+    if (gguf_find_tensor(gguf_ctx, tensor->name) >= 0) {
+        GGML_ASSERT(std::string(tensor->name) == "rope_freqs.weight"); // FIXME
+        return;
+    }
+    gguf_add_tensor(gguf_ctx, tensor);
+}
+
+void llama_model_saver::add_kv_from_model() {
+    const llama_hparams & hparams = model.hparams;
+    const llama_vocab   & vocab   = model.vocab;
+
+    const int32_t n_vocab = vocab.n_tokens();
+    std::vector<std::string> tokens(n_vocab);
+    std::vector<float> scores(n_vocab);
+    std::vector<int32_t> token_types(n_vocab);
+
+    for (int32_t id = 0; id < n_vocab; ++id) {
+        const llama_vocab::token_data & token_data = vocab.get_token_data(id);
+
+        tokens[id] = token_data.text;
+        scores[id] = token_data.score;
+
+        switch(token_data.attr) {
+            case LLAMA_TOKEN_ATTR_UNKNOWN:      token_types[id] = LLAMA_TOKEN_TYPE_UNKNOWN;      break;
+            case LLAMA_TOKEN_ATTR_UNUSED:       token_types[id] = LLAMA_TOKEN_TYPE_UNUSED;       break;
+            case LLAMA_TOKEN_ATTR_NORMAL:       token_types[id] = LLAMA_TOKEN_TYPE_NORMAL;       break;
+            case LLAMA_TOKEN_ATTR_CONTROL:      token_types[id] = LLAMA_TOKEN_TYPE_CONTROL;      break;
+            case LLAMA_TOKEN_ATTR_USER_DEFINED: token_types[id] = LLAMA_TOKEN_TYPE_USER_DEFINED; break;
+            case LLAMA_TOKEN_ATTR_BYTE:         token_types[id] = LLAMA_TOKEN_TYPE_BYTE;         break;
+            case LLAMA_TOKEN_ATTR_UNDEFINED:
+            default:                            token_types[id] = LLAMA_TOKEN_TYPE_UNDEFINED;    break;
+        }
+    }
+
+    // add_kv(LLM_KV_GENERAL_TYPE, ???);
+    add_kv(LLM_KV_GENERAL_ARCHITECTURE, model.arch_name());
+    // add_kv(LLM_KV_GENERAL_QUANTIZATION_VERSION, ???);
+    // add_kv(LLM_KV_GENERAL_ALIGNMENT, ???);
+    add_kv(LLM_KV_GENERAL_NAME, model.name);
+    // add_kv(LLM_KV_GENERAL_AUTHOR, ???);
+    // add_kv(LLM_KV_GENERAL_VERSION, ???);
+    // add_kv(LLM_KV_GENERAL_URL, ???);
+    // add_kv(LLM_KV_GENERAL_DESCRIPTION, ???);
+    // add_kv(LLM_KV_GENERAL_LICENSE, ???);
+    // add_kv(LLM_KV_GENERAL_SOURCE_URL, ???);
+    // add_kv(LLM_KV_GENERAL_SOURCE_HF_REPO, ???);
+
+    add_kv(LLM_KV_VOCAB_SIZE, vocab.n_tokens());
+    add_kv(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
+    add_kv(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
+    add_kv(LLM_KV_BLOCK_COUNT, hparams.n_layer);
+    add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+    add_kv(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, true);
+    add_kv(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+    add_kv(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+    add_kv(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
+    // add_kv(LLM_KV_TENSOR_DATA_LAYOUT, ???);
+    add_kv(LLM_KV_EXPERT_COUNT, hparams.n_expert);
+    add_kv(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
+    add_kv(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+    add_kv(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+    add_kv(LLM_KV_POOLING_TYPE, uint32_t(hparams.pooling_type));
+    add_kv(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
+    add_kv(LLM_KV_DECODER_START_TOKEN_ID, hparams.dec_start_token_id);
+    add_kv(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping);
+    add_kv(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping);
+    add_kv(LLM_KV_SWIN_NORM, hparams.swin_norm);
+    add_kv(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers);
+    add_kv(LLM_KV_TIME_MIX_EXTRA_DIM, hparams.time_mix_extra_dim);
+    add_kv(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim);
+    add_kv(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
+    add_kv(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
+
+    add_kv(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, true);
+    add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, true);
+    add_kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
+    add_kv(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
+    add_kv(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k);
+    add_kv(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v);
+    add_kv(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+    add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    add_kv(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+    add_kv(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
+    add_kv(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
+    add_kv(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
+    add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+    add_kv(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
+
+    const float rope_scaling_factor = hparams.rope_freq_scale_train == 1.0f ? 0.0f : 1.0f/hparams.rope_freq_scale_train;
+
+    add_kv(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot);
+    add_kv(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train);
+    // add_kv(LLM_KV_ROPE_SCALE_LINEAR, rope_scaling_factor); // old name
+    add_kv(LLM_KV_ROPE_SCALING_TYPE, llama_rope_scaling_type_name(hparams.rope_scaling_type_train));
+    add_kv(LLM_KV_ROPE_SCALING_FACTOR, rope_scaling_factor);
+    add_kv(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor);
+    add_kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn);
+    add_kv(LLM_KV_ROPE_SCALING_FINETUNED, hparams.rope_finetuned);
+    add_kv(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
+
+    // TODO: implement split file support
+    // add_kv(LLM_KV_SPLIT_NO, ???);
+    // add_kv(LLM_KV_SPLIT_COUNT, ???);
+    // add_kv(LLM_KV_SPLIT_TENSORS_COUNT, ???);
+
+    add_kv(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
+    add_kv(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
+    add_kv(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
+    add_kv(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+    add_kv(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms);
+
+    add_kv(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
+
+    add_kv(LLM_KV_TOKENIZER_MODEL, vocab.get_tokenizer_model());
+    add_kv(LLM_KV_TOKENIZER_PRE, vocab.get_tokenizer_pre());
+    add_kv(LLM_KV_TOKENIZER_LIST, tokens);
+    add_kv(LLM_KV_TOKENIZER_TOKEN_TYPE, token_types);
+    add_kv(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, vocab.n_token_types());
+    add_kv(LLM_KV_TOKENIZER_SCORES, scores);
+    add_kv(LLM_KV_TOKENIZER_MERGES, vocab.get_bpe_merges());
+    // FIXME llama_token is type i32 but when reading in a GGUF file u32 is expected, not an issue for writing though
+    add_kv(LLM_KV_TOKENIZER_BOS_ID, uint32_t(vocab.token_bos()));
+    add_kv(LLM_KV_TOKENIZER_EOS_ID, uint32_t(vocab.token_eos()));
+    add_kv(LLM_KV_TOKENIZER_EOT_ID, uint32_t(vocab.token_eot()));
+    add_kv(LLM_KV_TOKENIZER_EOM_ID, uint32_t(vocab.token_eom()));
+    add_kv(LLM_KV_TOKENIZER_UNK_ID, uint32_t(vocab.token_unk()));
+    add_kv(LLM_KV_TOKENIZER_SEP_ID, uint32_t(vocab.token_sep()));
+    add_kv(LLM_KV_TOKENIZER_PAD_ID, uint32_t(vocab.token_pad()));
+    // add_kv(LLM_KV_TOKENIZER_CLS_ID, uint32_t(vocab.token_bos())); // deprecated
+    // add_kv(LLM_KV_TOKENIZER_MASK_ID, ???);
+    add_kv(LLM_KV_TOKENIZER_ADD_BOS, vocab.get_add_bos());
+    add_kv(LLM_KV_TOKENIZER_ADD_EOS, vocab.get_add_eos());
+    add_kv(LLM_KV_TOKENIZER_ADD_PREFIX, vocab.get_add_space_prefix());
+    add_kv(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, vocab.get_remove_extra_whitespaces());
+    add_kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, vocab.get_precompiled_charsmap());
+    // add_kv(LLM_KV_TOKENIZER_HF_JSON, ???);
+    // add_kv(LLM_KV_TOKENIZER_RWKV, ???);
+    add_kv(LLM_KV_TOKENIZER_FIM_PRE_ID, uint32_t(vocab.token_fim_pre()));
+    add_kv(LLM_KV_TOKENIZER_FIM_SUF_ID, uint32_t(vocab.token_fim_suf()));
+    add_kv(LLM_KV_TOKENIZER_FIM_MID_ID, uint32_t(vocab.token_fim_mid()));
+    add_kv(LLM_KV_TOKENIZER_FIM_PAD_ID, uint32_t(vocab.token_fim_pad()));
+    add_kv(LLM_KV_TOKENIZER_FIM_REP_ID, uint32_t(vocab.token_fim_rep()));
+    add_kv(LLM_KV_TOKENIZER_FIM_SEP_ID, uint32_t(vocab.token_fim_sep()));
+
+    // TODO: implement LoRA support
+    // add_kv(LLM_KV_ADAPTER_TYPE, ???);
+    // add_kv(LLM_KV_ADAPTER_LORA_ALPHA, ???);
+
+    // deprecated
+    // add_kv(LLM_KV_TOKENIZER_PREFIX_ID, ???);
+    // add_kv(LLM_KV_TOKENIZER_SUFFIX_ID, ???);
+    // add_kv(LLM_KV_TOKENIZER_MIDDLE_ID, ???);
+}
+
+void llama_model_saver::add_tensors_from_model() {
+    if (std::string(model.output->name) != std::string(model.tok_embd->name)) {
+        add_tensor(model.tok_embd); // some models use the same tensor for tok_embd and output
+    }
+    add_tensor(model.type_embd);
+    add_tensor(model.pos_embd);
+    add_tensor(model.tok_norm);
+    add_tensor(model.tok_norm_b);
+    add_tensor(model.output_norm);
+    add_tensor(model.output_norm_b);
+    add_tensor(model.output);
+    add_tensor(model.output_b);
+    add_tensor(model.output_norm_enc);
+    add_tensor(model.cls);
+    add_tensor(model.cls_b);
+    add_tensor(model.cls_out);
+    add_tensor(model.cls_out_b);
+
+    for (const struct llama_layer & layer : model.layers) {
+        for (size_t i = 0; i < sizeof(layer)/sizeof(struct ggml_tensor *); ++i) {
+            add_tensor(reinterpret_cast<const struct ggml_tensor * const *>(&layer)[i]);
+        }
+    }
+}
+
+void llama_model_saver::save(const std::string & path_model) {
+    gguf_write_to_file(gguf_ctx, path_model.c_str(), false);
+}
37 llama/llama.cpp/src/llama-model-saver.h vendored Normal file
@@ -0,0 +1,37 @@
+#pragma once
+
+#include "llama.h"
+#include "llama-arch.h"
+
+#include <vector>
+
+struct llama_model_saver {
+    struct gguf_context * gguf_ctx = nullptr;
+    const struct llama_model & model;
+    const struct LLM_KV llm_kv;
+
+    llama_model_saver(const struct llama_model & model);
+    ~llama_model_saver();
+
+    void add_kv(enum llm_kv key, uint32_t value);
+    void add_kv(enum llm_kv key, int32_t value);
+    void add_kv(enum llm_kv key, float value);
+    void add_kv(enum llm_kv key, bool value);
+    void add_kv(enum llm_kv key, const char * value);
+
+    [[noreturn]]
+    void add_kv(enum llm_kv key, char value); // needed to make the template below compile
+
+    template <typename Container>
+    void add_kv(enum llm_kv key, const Container & value, bool per_layer = false);
+
+    void add_kv(enum llm_kv key, const std::vector<std::string> & value);
+
+    void add_tensor(const struct ggml_tensor * tensor);
+
+    void add_kv_from_model();
+
+    void add_tensors_from_model();
+
+    void save(const std::string & path_model);
+};
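Pieced together from the declarations above, a plausible end-to-end use of the new saver looks like the sketch below; the wrapper function is illustrative and not part of the diff:

    #include "llama-model-saver.h"

    // hypothetical helper: write a loaded model back out as a GGUF file
    static void write_model_as_gguf(const llama_model & model, const std::string & path) {
        llama_model_saver saver(model); // creates an empty GGUF context keyed to model.arch
        saver.add_kv_from_model();      // hparams, rope/ssm settings, tokenizer metadata
        saver.add_tensors_from_model(); // registers tok_embd, output and per-layer tensors
        saver.save(path);               // gguf_write_to_file(gguf_ctx, path.c_str(), false)
    }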
460 llama/llama.cpp/src/llama-model.cpp vendored
@@ -40,6 +40,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_335M: return "335M";
         case LLM_TYPE_410M: return "410M";
         case LLM_TYPE_450M: return "450M";
+        case LLM_TYPE_475M: return "475M";
         case LLM_TYPE_770M: return "770M";
         case LLM_TYPE_780M: return "780M";
         case LLM_TYPE_0_5B: return "0.5B";
@@ -79,6 +80,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_236B: return "236B";
         case LLM_TYPE_290B: return "290B";
         case LLM_TYPE_314B: return "314B";
+        case LLM_TYPE_405B: return "405B";
         case LLM_TYPE_671B: return "671B";
         case LLM_TYPE_SMALL: return "0.1B";
         case LLM_TYPE_MEDIUM: return "0.4B";
@@ -115,6 +117,10 @@ static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_
     { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
 };
 
+std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type) {
+    return LLAMA_ROPE_SCALING_TYPES.at(rope_scaling_type);
+}
+
 static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
     for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
         if (kv.second == name) {
@@ -297,6 +303,10 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
     // add extra buffer types, only if no GPU device is present
     // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
     auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    if (cpu_dev == nullptr) {
+        throw std::runtime_error(format("%s: no CPU backend found", __func__));
+    }
+
     auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
     auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
         ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
@@ -423,7 +433,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 
     // get general kv
     ml.get_key(LLM_KV_GENERAL_NAME, name, false);
-    ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false);
 
     // everything past this point is not vocab-related
     if (hparams.vocab_only) {
@@ -435,7 +444,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
     ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
     ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
-    ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false);
 
     if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
         ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
@@ -459,11 +467,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
     std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
     std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
-    std::fill(hparams.cross_attn_layers.begin(), hparams.cross_attn_layers.end(), -1);
 
     ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
     ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
-    ml.get_arr(LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, hparams.cross_attn_layers, false);
 
     // n_head_kv is optional, default to n_head
     hparams.n_head_kv_arr = hparams.n_head_arr;
@@ -516,7 +522,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 
     ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
 
-    if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_MLLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
+    if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
         if (hparams.n_rot != hparams.n_embd_head_k) {
             throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
         }
@@ -579,22 +585,13 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     hparams.use_kq_norm = false;
                 }
             } break;
-        case LLM_ARCH_MLLAMA:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-
-                switch (hparams.n_layer) {
-                    case 40: type = LLM_TYPE_11B; break;
-                    case 100: type = LLM_TYPE_90B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
         case LLM_ARCH_DECI:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
                     case 32: type = LLM_TYPE_7B; break;
                     case 80: type = LLM_TYPE_70B; break;
+                    case 162: type = LLM_TYPE_405B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
@@ -721,7 +718,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0);
 
                 if (hparams.n_layer == 12 && hparams.n_embd == 768) {
-                    type = LLM_TYPE_137M;
+                    if (arch == LLM_ARCH_NOMIC_BERT) {
+                        type = LLM_TYPE_137M;
+                    } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) {
+                        type = LLM_TYPE_475M;
+                    }
                 }
             } break;
         case LLM_ARCH_BLOOM:
@@ -782,6 +783,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             // fall through
         case LLM_ARCH_QWEN2:
             {
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
                     case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break;
@@ -1505,6 +1507,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     }
 
     ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    if (cpu_dev == nullptr) {
+        throw std::runtime_error(format("%s: no CPU backend found", __func__));
+    }
     const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
     const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
     auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
@@ -1576,7 +1581,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     const int64_t n_embd_head_v = hparams.n_embd_head_v;
     const int64_t n_ff          = hparams.n_ff();
     const int64_t n_embd_gqa    = n_embd_v_gqa;
-    const int64_t n_vocab       = hparams.n_vocab;
+    const int64_t n_vocab       = vocab.n_tokens();
     const int64_t n_token_types = vocab.n_token_types();
    const int64_t n_rot         = hparams.n_rot;
     const int64_t n_expert      = hparams.n_expert;
@@ -1672,8 +1677,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
             std::regex pattern(overrides->pattern);
             if (std::regex_search(tensor_name, pattern)) {
-                LLAMA_LOG_DEBUG("tensor %s buffer type overriden to %s\n", tensor_name.c_str(), ggml_backend_buft_name(overrides->buft));
                 buft = overrides->buft;
+                LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
+                    tensor_name.c_str(),
+                    ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
+                    ggml_backend_buft_name(buft));
                 break;
             }
         }
@@ -1690,6 +1698,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         auto * buft_dev = ggml_backend_buft_get_device(buft);
         if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
             auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+            if (!cpu_dev) {
+                throw std::runtime_error("no CPU backend found");
+            }
             buft = ggml_backend_dev_buffer_type(cpu_dev);
         }
 
@@ -1829,52 +1840,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         }
                     }
                 } break;
-            case LLM_ARCH_MLLAMA:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab+8}, 0);
-
-                    // output
-                    {
-                        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                        output      = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
-                        // if output is NULL, init from the input tok embed
-                        if (output == NULL) {
-                            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
-                        }
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        if (hparams.cross_attention_layers(i)) {
-                            layer.cross_attn_k_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_NORM, "weight", i), {128}, 0);
-                            layer.cross_attn_k_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_PROJ, "weight", i), {n_embd, 1024}, 0);
-                            layer.cross_attn_o_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_O_PROJ, "weight", i), {n_embd, n_embd}, 0);
-                            layer.cross_attn_q_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_NORM, "weight", i), {128}, 0);
-                            layer.cross_attn_q_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_PROJ, "weight", i), {n_embd, n_embd}, 0);
-                            layer.cross_attn_v_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_V_PROJ, "weight", i), {n_embd, 1024}, 0);
-                            layer.cross_attn_attn_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_ATTN_GATE, i), {1}, 0);
-                            layer.cross_attn_mlp_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_MLP_GATE, i), {1}, 0);
-                            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-                            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-                            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                        } else {
-                            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
-                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
-                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
-                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
-                            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
-                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
-                            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-                        }
-                    }
-                } break;
             case LLM_ARCH_DECI:
                 {
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -1917,7 +1882,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
                         layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||||
|
|
||||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
if (n_ff > 0) {
|
||||||
|
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||||
|
}
|
||||||
|
|
||||||
if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
|
if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
|
||||||
layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
|
layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
|
||||||
|
|
@ -1927,9 +1894,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||||
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
|
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
|
||||||
}
|
}
|
||||||
|
|
||||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
if (n_ff > 0) {
|
||||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
||||||
|
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||||
|
}
|
||||||
|
|
||||||
// optional MLP bias
|
// optional MLP bias
|
||||||
layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
|
layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
|
||||||
|
|
@ -3573,7 +3542,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||||
|
|
||||||
// output
|
// output
|
||||||
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
||||||
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
||||||
|
// if output is NULL, init from the input tok embed
|
||||||
|
if (output == NULL) {
|
||||||
|
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
||||||
|
}
|
||||||
|
|
||||||
for (int i = 0; i < n_layer; ++i) {
|
for (int i = 0; i < n_layer; ++i) {
|
||||||
auto & layer = layers[i];
|
auto & layer = layers[i];
|
||||||
|
|
@ -4206,6 +4179,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||||
if (!dev) {
|
if (!dev) {
|
||||||
// FIXME: workaround for CPU backend buft having a NULL device
|
// FIXME: workaround for CPU backend buft having a NULL device
|
||||||
dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||||
|
if (!dev) {
|
||||||
|
throw std::runtime_error(format("%s: no CPU backend found", __func__));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
ggml_backend_dev_props props;
|
ggml_backend_dev_props props;
|
||||||
ggml_backend_dev_get_props(dev, &props);
|
ggml_backend_dev_get_props(dev, &props);
|
||||||
|
|
@ -4335,7 +4311,7 @@ uint64_t llama_model::n_elements() const {
|
||||||
}
|
}
|
||||||
|
|
||||||
void llama_model::print_info() const {
|
void llama_model::print_info() const {
|
||||||
const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
|
const std::string rope_scaling_type = llama_rope_scaling_type_name(hparams.rope_scaling_type_train);
|
||||||
|
|
||||||
auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
|
auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
|
||||||
bool is_var = false;
|
bool is_var = false;
|
||||||
|
|
@ -4396,7 +4372,7 @@ void llama_model::print_info() const {
|
||||||
LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
|
LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
|
||||||
LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
|
LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
|
||||||
LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
|
LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
|
||||||
LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
|
LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
|
||||||
LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
|
LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
|
||||||
LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
|
LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
|
||||||
LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
|
LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
|
||||||
|
|
@ -4543,6 +4519,19 @@ const ggml_tensor * llama_model::get_tensor(const char * name) const {
|
||||||
return it->second;
|
return it->second;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ggml_tensor * llama_model::get_rope_factors(uint32_t n_ctx_per_seq, int il) const {
|
||||||
|
// choose long/short freq factors based on the context size
|
||||||
|
if (layers[il].rope_freqs != nullptr) {
|
||||||
|
return layers[il].rope_freqs;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
|
||||||
|
return layers[il].rope_long;
|
||||||
|
}
|
||||||
|
|
||||||
|
return layers[il].rope_short;
|
||||||
|
}
|
||||||
|
|
||||||
struct llm_build_llama : public llm_graph_context {
|
struct llm_build_llama : public llm_graph_context {
|
||||||
llm_build_llama(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
llm_build_llama(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
||||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||||
|
|
@ -4583,7 +4572,7 @@ struct llm_build_llama : public llm_graph_context {
|
||||||
// self-attention
|
// self-attention
|
||||||
{
|
{
|
||||||
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
||||||
ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
|
ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
|
||||||
|
|
||||||
// compute Q and K and RoPE them
|
// compute Q and K and RoPE them
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||||
|
|
@ -4767,246 +4756,6 @@ struct llm_build_llama : public llm_graph_context {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
struct llm_build_mllama: public llm_graph_context {
|
|
||||||
llm_build_mllama(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
||||||
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
|
||||||
int32_t n_tokens = this->n_tokens;
|
|
||||||
|
|
||||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
||||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
||||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
||||||
|
|
||||||
ggml_tensor * cur;
|
|
||||||
ggml_tensor * inpL;
|
|
||||||
ggml_tensor * inpCAS;
|
|
||||||
|
|
||||||
inpL = build_inp_embd(model.tok_embd);
|
|
||||||
inpCAS = build_inp_cross_attn_state();
|
|
||||||
|
|
||||||
// inp_pos - contains the positions
|
|
||||||
ggml_tensor * inp_pos = build_inp_pos();
|
|
||||||
|
|
||||||
auto * inp_attn = build_attn_inp_kv_unified();
|
|
||||||
const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
|
|
||||||
|
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
|
||||||
ggml_tensor * inpSA = inpL;
|
|
||||||
|
|
||||||
// norm
|
|
||||||
cur = build_norm(inpL,
|
|
||||||
model.layers[il].attn_norm, NULL,
|
|
||||||
LLM_NORM_RMS, il);
|
|
||||||
cb(cur, "attn_norm", il);
|
|
||||||
|
|
||||||
if (hparams.cross_attention_layers(il)) {
|
|
||||||
if (!ubatch.embd && !cparams.cross_attn) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// cross attention layer
|
|
||||||
ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_q_proj, cur);
|
|
||||||
cb(Qcur, "Qcur", il);
|
|
||||||
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
||||||
cb(Qcur, "Qcur", il);
|
|
||||||
|
|
||||||
Qcur = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 0, 2, 1, 3));
|
|
||||||
cb(Qcur, "Qcur", il);
|
|
||||||
|
|
||||||
Qcur = build_norm(Qcur, model.layers[il].cross_attn_q_norm, NULL, LLM_NORM_RMS, il);
|
|
||||||
cb(Qcur, "Qcur", il);
|
|
||||||
|
|
||||||
ggml_tensor * Kcur, * Vcur;
|
|
||||||
if (ubatch.embd) {
|
|
||||||
Kcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_k_proj, inpCAS);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, 6404);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
|
|
||||||
Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
|
|
||||||
Kcur = build_norm(Kcur, model.layers[il].cross_attn_k_norm, NULL, LLM_NORM_RMS, il);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
|
|
||||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, kv_self->k_l[il]));
|
|
||||||
|
|
||||||
Vcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_v_proj, inpCAS);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
|
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, 6404);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
|
|
||||||
Vcur = ggml_permute(ctx0, Vcur, 0, 2, 1, 3);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
|
|
||||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, kv_self->v_l[il]));
|
|
||||||
} else {
|
|
||||||
Kcur = ggml_view_tensor(ctx0, kv_self->k_l[il]);
|
|
||||||
cb(Kcur, "Kcur (view)", il);
|
|
||||||
|
|
||||||
Vcur = ggml_view_tensor(ctx0, kv_self->v_l[il]);
|
|
||||||
cb(Vcur, "Vcur (view)", il);
|
|
||||||
}
|
|
||||||
|
|
||||||
struct ggml_tensor * kq = ggml_mul_mat(ctx0, Kcur, Qcur);
|
|
||||||
cb(kq, "kq", il);
|
|
||||||
|
|
||||||
// TODO: apply causal masks
|
|
||||||
struct ggml_tensor * kq_soft_max = ggml_soft_max_ext(ctx0, kq, nullptr, 1.f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
|
|
||||||
cb(kq_soft_max, "kq_soft_max", il);
|
|
||||||
|
|
||||||
Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, Vcur));
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
|
|
||||||
struct ggml_tensor * kqv = ggml_mul_mat(ctx0, Vcur, kq_soft_max);
|
|
||||||
cb(kqv, "kqv", il);
|
|
||||||
|
|
||||||
struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
|
|
||||||
cb(kqv_merged, "kqv_merged", il);
|
|
||||||
|
|
||||||
cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens);
|
|
||||||
cb(cur, "kqv_merged_cont", il);
|
|
||||||
|
|
||||||
cur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_o_proj, cur);
|
|
||||||
cb(cur, "cur", il);
|
|
||||||
|
|
||||||
// TODO: do this in place once?
|
|
||||||
cur = ggml_mul(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_attn_gate));
|
|
||||||
|
|
||||||
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
||||||
cb(ffn_inp, "ffn_inp", il);
|
|
||||||
|
|
||||||
// feed-forward network
|
|
||||||
cur = build_norm(ffn_inp,
|
|
||||||
model.layers[il].ffn_norm, NULL,
|
|
||||||
LLM_NORM_RMS, il);
|
|
||||||
cb(cur, "ffn_norm", il);
|
|
||||||
|
|
||||||
cur = build_ffn(cur,
|
|
||||||
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
|
||||||
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
|
|
||||||
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
|
||||||
NULL,
|
|
||||||
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
||||||
cb(cur, "ffn_out", il);
|
|
||||||
|
|
||||||
// TODO: do this inplace once?
|
|
||||||
cur = ggml_add_inplace(ctx0, ggml_mul_inplace(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_mlp_gate)), ffn_inp);
|
|
||||||
cb(cur, "ffn_out", il);
|
|
||||||
|
|
||||||
cur = build_cvec(cur, il);
|
|
||||||
cb(cur, "l_out", il);
|
|
||||||
|
|
||||||
// input for next layer
|
|
||||||
inpL = cur;
|
|
||||||
} else {
|
|
||||||
// self attention layer
|
|
||||||
|
|
||||||
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
|
||||||
ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
|
|
||||||
|
|
||||||
// compute Q and K and RoPE them
|
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
||||||
cb(Qcur, "Qcur", il);
|
|
||||||
if (model.layers[il].bq) {
|
|
||||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
||||||
cb(Qcur, "Qcur", il);
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
if (model.layers[il].bk) {
|
|
||||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
if (model.layers[il].bv) {
|
|
||||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
}
|
|
||||||
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
||||||
|
|
||||||
Qcur = ggml_rope_ext(
|
|
||||||
ctx0, Qcur, inp_pos, rope_factors,
|
|
||||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow
|
|
||||||
);
|
|
||||||
|
|
||||||
Kcur = ggml_rope_ext(
|
|
||||||
ctx0, Kcur, inp_pos, rope_factors,
|
|
||||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow
|
|
||||||
);
|
|
||||||
|
|
||||||
cb(Qcur, "Qcur", il);
|
|
||||||
cb(Kcur, "Kcur", il);
|
|
||||||
cb(Vcur, "Vcur", il);
|
|
||||||
|
|
||||||
cur = build_attn(inp_attn, gf,
|
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
|
||||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
||||||
|
|
||||||
if (il == n_layer - 1) {
|
|
||||||
// skip computing output for unused tokens
|
|
||||||
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
||||||
n_tokens = n_outputs;
|
|
||||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
||||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
||||||
}
|
|
||||||
|
|
||||||
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
||||||
cb(ffn_inp, "ffn_inp", il);
|
|
||||||
|
|
||||||
// feed-forward network
|
|
||||||
cur = build_norm(ffn_inp,
|
|
||||||
model.layers[il].ffn_norm, NULL,
|
|
||||||
LLM_NORM_RMS, il);
|
|
||||||
cb(cur, "ffn_norm", il);
|
|
||||||
|
|
||||||
cur = build_ffn(cur,
|
|
||||||
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
|
||||||
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
|
|
||||||
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
|
||||||
NULL,
|
|
||||||
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
||||||
cb(cur, "ffn_out", il);
|
|
||||||
|
|
||||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
||||||
cb(cur, "ffn_out", il);
|
|
||||||
|
|
||||||
cur = build_cvec(cur, il);
|
|
||||||
cb(cur, "l_out", il);
|
|
||||||
|
|
||||||
// input for next layer
|
|
||||||
inpL = cur;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
cur = inpL;
|
|
||||||
|
|
||||||
cur = build_norm(cur,
|
|
||||||
model.output_norm, NULL,
|
|
||||||
LLM_NORM_RMS, -1);
|
|
||||||
cb(cur, "result_norm", -1);
|
|
||||||
res->t_embd = cur;
|
|
||||||
|
|
||||||
// lm_head
|
|
||||||
cur = build_lora_mm(model.output, cur);
|
|
||||||
|
|
||||||
cb(cur, "result_output", -1);
|
|
||||||
res->t_logits = cur;
|
|
||||||
|
|
||||||
ggml_build_forward_expand(gf, cur);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
struct llm_build_deci : public llm_graph_context {
|
struct llm_build_deci : public llm_graph_context {
|
||||||
llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
||||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||||
|
|
@ -5029,6 +4778,7 @@ struct llm_build_deci : public llm_graph_context {
|
||||||
ggml_tensor * inpSA = inpL;
|
ggml_tensor * inpSA = inpL;
|
||||||
const int64_t n_head_kv = hparams.n_head_kv(il);
|
const int64_t n_head_kv = hparams.n_head_kv(il);
|
||||||
const int64_t n_head = hparams.n_head(il);
|
const int64_t n_head = hparams.n_head(il);
|
||||||
|
const int64_t n_ff = hparams.n_ff(il);
|
||||||
|
|
||||||
if (n_head == 0) {
|
if (n_head == 0) {
|
||||||
// attention-free layer of Llama-3_1-Nemotron-51B
|
// attention-free layer of Llama-3_1-Nemotron-51B
|
||||||
|
|
@ -5048,7 +4798,7 @@ struct llm_build_deci : public llm_graph_context {
|
||||||
} else if (n_head > 0) {
|
} else if (n_head > 0) {
|
||||||
// self-attention
|
// self-attention
|
||||||
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
||||||
ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
|
ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
|
||||||
|
|
||||||
// compute Q and K and RoPE them
|
// compute Q and K and RoPE them
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||||
|
|
@ -5104,6 +4854,11 @@ struct llm_build_deci : public llm_graph_context {
|
||||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// FFN-free layer of Llama-3_1-Nemotron-Ultra-253B
|
||||||
|
if (n_ff == 0) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
// For Granite architecture
|
// For Granite architecture
|
||||||
if (hparams.f_residual_scale) {
|
if (hparams.f_residual_scale) {
|
||||||
cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
|
cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
|
||||||
|
|
@ -7530,7 +7285,7 @@ struct llm_build_phi3 : public llm_graph_context {
|
||||||
// self-attention
|
// self-attention
|
||||||
{
|
{
|
||||||
// rope freq factors for 128k context
|
// rope freq factors for 128k context
|
||||||
ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
|
ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
|
||||||
|
|
||||||
ggml_tensor* attn_norm_output = build_norm(inpL,
|
ggml_tensor* attn_norm_output = build_norm(inpL,
|
||||||
model.layers[il].attn_norm,
|
model.layers[il].attn_norm,
|
||||||
|
|
@ -8282,7 +8037,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
ggml_tensor * inpSA = inpL;
|
ggml_tensor * inpSA = inpL;
|
||||||
|
|
||||||
ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
|
ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
|
||||||
|
|
||||||
// norm
|
// norm
|
||||||
cur = build_norm(inpL,
|
cur = build_norm(inpL,
|
||||||
|
|
@ -9049,7 +8804,7 @@ struct llm_build_mamba : public llm_graph_context {
|
||||||
ggml_tensor * state_mask,
|
ggml_tensor * state_mask,
|
||||||
const llama_ubatch & ubatch,
|
const llama_ubatch & ubatch,
|
||||||
int il) const {
|
int il) const {
|
||||||
const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
|
const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
|
||||||
|
|
||||||
const auto kv_head = kv_self->head;
|
const auto kv_head = kv_self->head;
|
||||||
|
|
||||||
|
|
@ -9350,7 +9105,7 @@ struct llm_build_cohere2 : public llm_graph_context {
|
||||||
// self-attention
|
// self-attention
|
||||||
{
|
{
|
||||||
// rope freq factors for 128k context
|
// rope freq factors for 128k context
|
||||||
ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
|
ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
|
||||||
|
|
||||||
// compute Q and K and RoPE them
|
// compute Q and K and RoPE them
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||||
|
|
@ -10288,7 +10043,7 @@ struct llm_build_deepseek : public llm_graph_context {
|
||||||
// self-attention
|
// self-attention
|
||||||
{
|
{
|
||||||
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
||||||
ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
|
ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
|
||||||
|
|
||||||
// compute Q and K and RoPE them
|
// compute Q and K and RoPE them
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||||
|
|
@ -11652,7 +11407,7 @@ struct llm_build_exaone : public llm_graph_context {
|
||||||
// self-attention
|
// self-attention
|
||||||
{
|
{
|
||||||
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
||||||
ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
|
ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
|
||||||
|
|
||||||
// compute Q and K and RoPE them
|
// compute Q and K and RoPE them
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||||
|
|
@ -11797,7 +11552,7 @@ struct llm_build_rwkv6_base : public llm_graph_context {
|
||||||
ggml_tensor * state_mask,
|
ggml_tensor * state_mask,
|
||||||
const llama_ubatch & ubatch,
|
const llama_ubatch & ubatch,
|
||||||
int il) const {
|
int il) const {
|
||||||
const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
|
const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
|
||||||
|
|
||||||
const auto n_tokens = ubatch.n_tokens;
|
const auto n_tokens = ubatch.n_tokens;
|
||||||
const auto n_seqs = ubatch.n_seqs;
|
const auto n_seqs = ubatch.n_seqs;
|
||||||
|
|
@ -12193,7 +11948,7 @@ struct llm_build_rwkv7_base : public llm_graph_context {
|
||||||
ggml_tensor *& first_layer_value,
|
ggml_tensor *& first_layer_value,
|
||||||
const llama_ubatch & ubatch,
|
const llama_ubatch & ubatch,
|
||||||
int il) const {
|
int il) const {
|
||||||
const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
|
const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
|
||||||
|
|
||||||
const auto n_tokens = ubatch.n_tokens;
|
const auto n_tokens = ubatch.n_tokens;
|
||||||
const auto n_seqs = ubatch.n_seqs;
|
const auto n_seqs = ubatch.n_seqs;
|
||||||
|
|
@ -12741,7 +12496,7 @@ struct llm_build_solar : public llm_graph_context {
|
||||||
// self-attention
|
// self-attention
|
||||||
{
|
{
|
||||||
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
||||||
ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
|
ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
|
||||||
|
|
||||||
// compute Q and K and RoPE them
|
// compute Q and K and RoPE them
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||||
|
|
@ -13192,7 +12947,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
|
||||||
// self-attention
|
// self-attention
|
||||||
{
|
{
|
||||||
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
||||||
ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
|
ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
|
||||||
|
|
||||||
// compute Q and K and RoPE them
|
// compute Q and K and RoPE them
|
||||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||||
|
|
@ -13312,36 +13067,46 @@ struct llm_build_bailingmoe : public llm_graph_context {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
llama_memory_i * llama_model::create_memory() const {
|
llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
|
||||||
llama_memory_i * res;
|
llama_memory_i * res;
|
||||||
|
|
||||||
switch (arch) {
|
switch (arch) {
|
||||||
|
case LLM_ARCH_BERT:
|
||||||
|
case LLM_ARCH_JINA_BERT_V2:
|
||||||
|
case LLM_ARCH_NOMIC_BERT:
|
||||||
|
case LLM_ARCH_NOMIC_BERT_MOE:
|
||||||
|
{
|
||||||
|
res = nullptr;
|
||||||
|
} break;
|
||||||
case LLM_ARCH_MAMBA:
|
case LLM_ARCH_MAMBA:
|
||||||
case LLM_ARCH_RWKV6:
|
case LLM_ARCH_RWKV6:
|
||||||
case LLM_ARCH_RWKV6QWEN2:
|
case LLM_ARCH_RWKV6QWEN2:
|
||||||
case LLM_ARCH_RWKV7:
|
case LLM_ARCH_RWKV7:
|
||||||
case LLM_ARCH_ARWKV7:
|
case LLM_ARCH_ARWKV7:
|
||||||
{
|
{
|
||||||
res = new llama_kv_cache_unified(hparams, {
|
res = new llama_kv_cache_recurrent(
|
||||||
/*.get_rope_factors =*/ nullptr
|
*this,
|
||||||
});
|
GGML_TYPE_F32,
|
||||||
|
GGML_TYPE_F32,
|
||||||
|
cparams.offload_kqv,
|
||||||
|
std::max((uint32_t) 1, cparams.n_seq_max));
|
||||||
} break;
|
} break;
|
||||||
default:
|
default:
|
||||||
{
|
{
|
||||||
res = new llama_kv_cache_unified(hparams, {
|
const auto padding = llama_kv_cache_unified::get_padding(cparams);
|
||||||
/*.get_rope_factors =*/ [this](uint32_t n_ctx_per_seq, int il) {
|
|
||||||
// choose long/short freq factors based on the context size
|
|
||||||
if (layers[il].rope_freqs != nullptr) {
|
|
||||||
return layers[il].rope_freqs;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
|
cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
|
||||||
return layers[il].rope_long;
|
|
||||||
}
|
|
||||||
|
|
||||||
return layers[il].rope_short;
|
LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
|
||||||
}
|
|
||||||
});
|
res = new llama_kv_cache_unified(
|
||||||
|
*this,
|
||||||
|
params.type_k,
|
||||||
|
params.type_v,
|
||||||
|
!cparams.flash_attn,
|
||||||
|
cparams.offload_kqv,
|
||||||
|
cparams.n_ctx,
|
||||||
|
padding);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -13363,10 +13128,6 @@ llm_graph_result_ptr llama_model::build_graph(
|
||||||
{
|
{
|
||||||
llm = std::make_unique<llm_build_llama>(*this, params, gf);
|
llm = std::make_unique<llm_build_llama>(*this, params, gf);
|
||||||
} break;
|
} break;
|
||||||
case LLM_ARCH_MLLAMA:
|
|
||||||
{
|
|
||||||
llm = std::make_unique<llm_build_mllama>(*this, params, gf);
|
|
||||||
} break;
|
|
||||||
case LLM_ARCH_DECI:
|
case LLM_ARCH_DECI:
|
||||||
{
|
{
|
||||||
llm = std::make_unique<llm_build_deci>(*this, params, gf);
|
llm = std::make_unique<llm_build_deci>(*this, params, gf);
|
||||||
|
|
@ -13728,12 +13489,9 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||||
// use what we call a normal RoPE, operating on pairs of consecutive head values
|
// use what we call a normal RoPE, operating on pairs of consecutive head values
|
||||||
case LLM_ARCH_LLAMA:
|
case LLM_ARCH_LLAMA:
|
||||||
case LLM_ARCH_LLAMA4:
|
case LLM_ARCH_LLAMA4:
|
||||||
case LLM_ARCH_MLLAMA:
|
|
||||||
case LLM_ARCH_DECI:
|
case LLM_ARCH_DECI:
|
||||||
case LLM_ARCH_BAICHUAN:
|
case LLM_ARCH_BAICHUAN:
|
||||||
case LLM_ARCH_STARCODER:
|
case LLM_ARCH_STARCODER:
|
||||||
case LLM_ARCH_PLAMO:
|
|
||||||
case LLM_ARCH_ORION:
|
|
||||||
case LLM_ARCH_INTERNLM2:
|
case LLM_ARCH_INTERNLM2:
|
||||||
case LLM_ARCH_MINICPM:
|
case LLM_ARCH_MINICPM:
|
||||||
case LLM_ARCH_XVERSE:
|
case LLM_ARCH_XVERSE:
|
||||||
|
|
@ -13772,6 +13530,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||||
case LLM_ARCH_PHI2:
|
case LLM_ARCH_PHI2:
|
||||||
case LLM_ARCH_PHI3:
|
case LLM_ARCH_PHI3:
|
||||||
case LLM_ARCH_PHIMOE:
|
case LLM_ARCH_PHIMOE:
|
||||||
|
case LLM_ARCH_PLAMO:
|
||||||
case LLM_ARCH_GEMMA:
|
case LLM_ARCH_GEMMA:
|
||||||
case LLM_ARCH_GEMMA2:
|
case LLM_ARCH_GEMMA2:
|
||||||
case LLM_ARCH_GEMMA3:
|
case LLM_ARCH_GEMMA3:
|
||||||
|
|
@ -13779,6 +13538,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||||
case LLM_ARCH_OPENELM:
|
case LLM_ARCH_OPENELM:
|
||||||
case LLM_ARCH_GPTNEOX:
|
case LLM_ARCH_GPTNEOX:
|
||||||
case LLM_ARCH_CODESHELL:
|
case LLM_ARCH_CODESHELL:
|
||||||
|
case LLM_ARCH_ORION:
|
||||||
case LLM_ARCH_NEMOTRON:
|
case LLM_ARCH_NEMOTRON:
|
||||||
case LLM_ARCH_EXAONE:
|
case LLM_ARCH_EXAONE:
|
||||||
case LLM_ARCH_MINICPM3:
|
case LLM_ARCH_MINICPM3:
|
||||||
|
|
@ -13851,6 +13611,14 @@ const char * llama_model_chat_template(const llama_model * model, const char * n
|
||||||
: LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
|
: LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
|
||||||
const auto & it = model->gguf_kv.find(key);
|
const auto & it = model->gguf_kv.find(key);
|
||||||
if (it == model->gguf_kv.end()) {
|
if (it == model->gguf_kv.end()) {
|
||||||
|
// one-off fix for very popular models (so we are not flooded with issues)
|
||||||
|
// do not extend this list unless absolutely necessary
|
||||||
|
// Mistral-Small-2503 does not have built-in chat template
|
||||||
|
llama_vocab_pre_type pre_type = model->vocab.get_pre_type();
|
||||||
|
if (pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
|
||||||
|
return "mistral-v7-tekken";
|
||||||
|
}
|
||||||
|
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
21
llama/llama.cpp/src/llama-model.h
vendored
|
|
@ -11,7 +11,6 @@
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <stdexcept>
|
|
||||||
|
|
||||||
struct llama_cparams;
|
struct llama_cparams;
|
||||||
struct llama_ubatch;
|
struct llama_ubatch;
|
||||||
|
|
@ -37,6 +36,7 @@ enum llm_type {
|
||||||
LLM_TYPE_335M,
|
LLM_TYPE_335M,
|
||||||
LLM_TYPE_410M,
|
LLM_TYPE_410M,
|
||||||
LLM_TYPE_450M,
|
LLM_TYPE_450M,
|
||||||
|
LLM_TYPE_475M,
|
||||||
LLM_TYPE_770M,
|
LLM_TYPE_770M,
|
||||||
LLM_TYPE_780M,
|
LLM_TYPE_780M,
|
||||||
LLM_TYPE_0_5B,
|
LLM_TYPE_0_5B,
|
||||||
|
|
@ -74,10 +74,10 @@ enum llm_type {
|
||||||
LLM_TYPE_40B,
|
LLM_TYPE_40B,
|
||||||
LLM_TYPE_65B,
|
LLM_TYPE_65B,
|
||||||
LLM_TYPE_70B,
|
LLM_TYPE_70B,
|
||||||
LLM_TYPE_90B,
|
|
||||||
LLM_TYPE_236B,
|
LLM_TYPE_236B,
|
||||||
LLM_TYPE_290B,
|
LLM_TYPE_290B,
|
||||||
LLM_TYPE_314B,
|
LLM_TYPE_314B,
|
||||||
|
LLM_TYPE_405B,
|
||||||
LLM_TYPE_671B,
|
LLM_TYPE_671B,
|
||||||
LLM_TYPE_SMALL,
|
LLM_TYPE_SMALL,
|
||||||
LLM_TYPE_MEDIUM,
|
LLM_TYPE_MEDIUM,
|
||||||
|
|
@ -97,6 +97,8 @@ enum llm_type {
|
||||||
LLM_TYPE_235B_A22B,
|
LLM_TYPE_235B_A22B,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type);
|
||||||
|
|
||||||
struct llama_layer_posnet {
|
struct llama_layer_posnet {
|
||||||
// resnet
|
// resnet
|
||||||
struct ggml_tensor * norm1 = nullptr;
|
struct ggml_tensor * norm1 = nullptr;
|
||||||
|
|
@ -316,16 +318,6 @@ struct llama_layer {
|
||||||
|
|
||||||
struct ggml_tensor * bskcn_tv = nullptr;
|
struct ggml_tensor * bskcn_tv = nullptr;
|
||||||
|
|
||||||
// cross attention
|
|
||||||
struct ggml_tensor * cross_attn_k_norm = nullptr;
|
|
||||||
struct ggml_tensor * cross_attn_k_proj = nullptr;
|
|
||||||
struct ggml_tensor * cross_attn_o_proj = nullptr;
|
|
||||||
struct ggml_tensor * cross_attn_q_norm = nullptr;
|
|
||||||
struct ggml_tensor * cross_attn_q_proj = nullptr;
|
|
||||||
struct ggml_tensor * cross_attn_v_proj = nullptr;
|
|
||||||
struct ggml_tensor * cross_attn_attn_gate = nullptr;
|
|
||||||
struct ggml_tensor * cross_attn_mlp_gate = nullptr;
|
|
||||||
|
|
||||||
struct llama_layer_posnet posnet;
|
struct llama_layer_posnet posnet;
|
||||||
|
|
||||||
struct llama_layer_convnext convnext;
|
struct llama_layer_convnext convnext;
|
||||||
|
|
@ -409,8 +401,11 @@ struct llama_model {
|
||||||
|
|
||||||
const struct ggml_tensor * get_tensor(const char * name) const;
|
const struct ggml_tensor * get_tensor(const char * name) const;
|
||||||
|
|
||||||
|
ggml_tensor * get_rope_factors(uint32_t n_ctx_per_seq, int il) const;
|
||||||
|
|
||||||
|
// note: can mutate `cparams`
|
||||||
// TODO: move this to new llm_arch_model_i interface
|
// TODO: move this to new llm_arch_model_i interface
|
||||||
llama_memory_i * create_memory() const; // TODO: params
|
llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const;
|
||||||
|
|
||||||
// TODO: move this to new llm_arch_model_i interface
|
// TODO: move this to new llm_arch_model_i interface
|
||||||
llm_graph_result_ptr build_graph(
|
llm_graph_result_ptr build_graph(
|
||||||
|
|
|
||||||
8
llama/llama.cpp/src/llama-quant.cpp
vendored
|
|
@ -519,7 +519,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||||
nthread = std::thread::hardware_concurrency();
|
nthread = std::thread::hardware_concurrency();
|
||||||
}
|
}
|
||||||
|
|
||||||
// mmap consistently increases speed Linux, and also increases speed on Windows with
|
// mmap consistently increases speed on Linux, and also increases speed on Windows with
|
||||||
// hot cache. It may cause a slowdown on macOS, possibly related to free memory.
|
// hot cache. It may cause a slowdown on macOS, possibly related to free memory.
|
||||||
#if defined(__linux__) || defined(_WIN32)
|
#if defined(__linux__) || defined(_WIN32)
|
||||||
constexpr bool use_mmap = true;
|
constexpr bool use_mmap = true;
|
||||||
|
|
@ -529,7 +529,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||||
|
|
||||||
llama_model_kv_override * kv_overrides = nullptr;
|
llama_model_kv_override * kv_overrides = nullptr;
|
||||||
if (params->kv_overrides) {
|
if (params->kv_overrides) {
|
||||||
auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
|
auto * v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
|
||||||
kv_overrides = v->data();
|
kv_overrides = v->data();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -639,9 +639,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||||
if (llama_model_has_encoder(&model)) {
|
if (llama_model_has_encoder(&model)) {
|
||||||
n_attn_layer *= 3;
|
n_attn_layer *= 3;
|
||||||
}
|
}
|
||||||
if (qs.n_attention_wv != n_attn_layer) {
|
GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
|
||||||
LLAMA_LOG_WARN("%s: n_attention_wv is unexpected, expected: %d, found: %d\n", __func__, n_attn_layer, qs.n_attention_wv);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t total_size_org = 0;
|
size_t total_size_org = 0;
|
||||||
|
|
|
||||||
24
llama/llama.cpp/src/llama-sampling.cpp
vendored
|
|
@ -1750,23 +1750,35 @@ static const char * llama_sampler_top_n_sigma_name(const struct llama_sampler *
|
||||||
static void llama_sampler_top_n_sigma_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
|
static void llama_sampler_top_n_sigma_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
|
||||||
const auto * ctx = (llama_sampler_top_n_sigma *) smpl->ctx;
|
const auto * ctx = (llama_sampler_top_n_sigma *) smpl->ctx;
|
||||||
|
|
||||||
|
if (ctx->n <= 0.0f || cur_p->size <= 1) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
// find max logit and calculate mean
|
// find max logit and calculate mean
|
||||||
float max = cur_p->data[0].logit;
|
float max = cur_p->data[0].logit;
|
||||||
float logits_sum = 0;
|
float logits_sum = 0;
|
||||||
|
size_t valid_count = 0;
|
||||||
for (size_t i = 0; i < cur_p->size; ++i) {
|
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||||
if (cur_p->data[i].logit > max) {
|
// Only count non-negative infinity values
|
||||||
max = cur_p->data[i].logit;
|
if (cur_p->data[i].logit != -INFINITY) {
|
||||||
|
if (cur_p->data[i].logit > max) {
|
||||||
|
max = cur_p->data[i].logit;
|
||||||
|
}
|
||||||
|
logits_sum += cur_p->data[i].logit;
|
||||||
|
valid_count++;
|
||||||
}
|
}
|
||||||
logits_sum += cur_p->data[i].logit;
|
|
||||||
}
|
}
|
||||||
float mean = logits_sum/cur_p->size;
|
float mean = valid_count > 0 ? logits_sum/valid_count : 0;
|
||||||
|
|
||||||
// calculate standard deviation
|
// calculate standard deviation
|
||||||
float acc = 0;
|
float acc = 0;
|
||||||
for (size_t i = 0; i < cur_p->size; ++i) {
|
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||||
acc += pow(cur_p->data[i].logit - mean, 2);
|
// Skip -infinity in std calculation
|
||||||
|
if (cur_p->data[i].logit != -INFINITY) {
|
||||||
|
acc += pow(cur_p->data[i].logit - mean, 2);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
float std = sqrt(acc/cur_p->size);
|
float std = valid_count > 0 ? sqrt(acc/valid_count) : 0;
|
||||||
|
|
||||||
//apply mask
|
//apply mask
|
||||||
for (size_t i = 0; i < cur_p->size; ++i) {
|
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||||
|
|
|
||||||
44
llama/llama.cpp/src/llama-vocab.cpp
vendored
|
|
@ -1,5 +1,7 @@
|
||||||
#include "llama-vocab.h"
|
#include "llama-vocab.h"
|
||||||
|
|
||||||
|
#include "ggml.h"
|
||||||
|
#include "gguf.h"
|
||||||
#include "llama-impl.h"
|
#include "llama-impl.h"
|
||||||
#include "llama-model-loader.h"
|
#include "llama-model-loader.h"
|
||||||
|
|
||||||
|
|
@ -415,6 +417,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
||||||
"'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
|
"'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
|
||||||
};
|
};
|
||||||
break;
|
break;
|
||||||
|
case LLAMA_VOCAB_PRE_TYPE_SEED_CODER:
|
||||||
|
regex_exprs = {
|
||||||
|
// original regex from tokenizer.json
|
||||||
|
// "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\r\n]+|\\s*[\r\n]+|\\s+(?!\\S)|\\s+"
|
||||||
|
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||||
|
};
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
// default regex for BPE tokenization pre-processing
|
// default regex for BPE tokenization pre-processing
|
||||||
regex_exprs = {
|
regex_exprs = {
|
||||||
|
|
@ -1227,6 +1236,9 @@ struct fragment_buffer_variant {
|
||||||
struct llama_vocab::impl {
|
struct llama_vocab::impl {
|
||||||
uint32_t n_token_types = 0; // for BERT-style token types
|
uint32_t n_token_types = 0; // for BERT-style token types
|
||||||
|
|
||||||
|
std::string tokenizer_model;
|
||||||
|
std::string tokenizer_pre;
|
||||||
|
|
||||||
enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
|
enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
|
||||||
enum llama_vocab_pre_type pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
enum llama_vocab_pre_type pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||||
|
|
||||||
|
|
@ -1362,9 +1374,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||||
|
|
||||||
// determine vocab type
|
// determine vocab type
|
||||||
{
|
{
|
||||||
std::string tokenizer_model;
|
|
||||||
std::string tokenizer_pre;
|
|
||||||
|
|
||||||
ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
|
ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
|
||||||
ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
|
ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
|
||||||
|
|
||||||
|
|
@ -1459,7 +1468,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||||
|
|
||||||
const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
|
const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
|
||||||
if (precompiled_charsmap_keyidx != -1) {
|
if (precompiled_charsmap_keyidx != -1) {
|
||||||
size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx);
|
const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);
|
||||||
|
const size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx);
|
||||||
const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
|
const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
|
||||||
precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
|
precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
|
||||||
#ifdef IS_BIG_ENDIAN
|
#ifdef IS_BIG_ENDIAN
|
||||||
|
|
@ -1625,6 +1635,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||||
tokenizer_pre == "bailingmoe") {
|
tokenizer_pre == "bailingmoe") {
|
||||||
pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
|
pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
|
||||||
clean_spaces = false;
|
clean_spaces = false;
|
||||||
|
} else if (
|
||||||
|
tokenizer_pre == "seed-coder") {
|
||||||
|
pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER;
|
||||||
|
clean_spaces = false;
|
||||||
} else {
|
} else {
|
||||||
LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
|
LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
|
||||||
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||||
|
|
@ -2770,6 +2784,14 @@ void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||||
pimpl->load(ml, kv);
|
pimpl->load(ml, kv);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::string llama_vocab::get_tokenizer_model() const {
|
||||||
|
return pimpl->tokenizer_model;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string llama_vocab::get_tokenizer_pre() const {
|
||||||
|
return pimpl->tokenizer_pre;
|
||||||
|
}
|
||||||
|
|
||||||
enum llama_vocab_type llama_vocab::get_type() const {
|
enum llama_vocab_type llama_vocab::get_type() const {
|
||||||
return pimpl->type;
|
return pimpl->type;
|
||||||
}
|
}
|
||||||
|
|
@ -2992,6 +3014,20 @@ int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string
|
||||||
return it->second;
|
return it->second;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::vector<std::string> llama_vocab::get_bpe_merges() const {
|
||||||
|
std::vector<std::string> result(pimpl->bpe_ranks.size());
|
||||||
|
|
||||||
|
for (const auto & pair : pimpl->bpe_ranks) {
|
||||||
|
result[pair.second] = pair.first.first + " " + pair.first.second;
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<char> llama_vocab::get_precompiled_charsmap() const {
|
||||||
|
return pimpl->precompiled_charsmap;
|
||||||
|
}
|
||||||
|
|
||||||
int32_t llama_vocab::tokenize(
|
int32_t llama_vocab::tokenize(
|
||||||
const char * text,
|
const char * text,
|
||||||
int32_t text_len,
|
int32_t text_len,
|
||||||
|
|
|
||||||
6
llama/llama.cpp/src/llama-vocab.h
vendored
|
|
@ -21,6 +21,9 @@ struct llama_vocab {
|
||||||
|
|
||||||
void load(llama_model_loader & ml, const LLM_KV & kv);
|
void load(llama_model_loader & ml, const LLM_KV & kv);
|
||||||
|
|
||||||
|
std::string get_tokenizer_model() const;
|
||||||
|
std::string get_tokenizer_pre() const;
|
||||||
|
|
||||||
enum llama_vocab_type get_type() const;
|
enum llama_vocab_type get_type() const;
|
||||||
enum llama_vocab_pre_type get_pre_type() const;
|
enum llama_vocab_pre_type get_pre_type() const;
|
||||||
|
|
||||||
|
|
@ -80,6 +83,9 @@ struct llama_vocab {
|
||||||
int max_token_len() const;
|
int max_token_len() const;
|
||||||
|
|
||||||
int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
|
int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
|
||||||
|
std::vector<std::string> get_bpe_merges() const;
|
||||||
|
|
||||||
|
std::vector<char> get_precompiled_charsmap() const;
|
||||||
|
|
||||||
int32_t tokenize(
|
int32_t tokenize(
|
||||||
const char * text,
|
const char * text,
|
||||||
|
|
|
||||||
9
llama/llama.cpp/src/llama.cpp
vendored
|
|
@ -4,6 +4,7 @@
|
||||||
#include "llama-mmap.h"
|
#include "llama-mmap.h"
|
||||||
#include "llama-vocab.h"
|
#include "llama-vocab.h"
|
||||||
#include "llama-model-loader.h"
|
#include "llama-model-loader.h"
|
||||||
|
#include "llama-model-saver.h"
|
||||||
#include "llama-model.h"
|
#include "llama-model.h"
|
||||||
|
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
|
|
@ -253,6 +254,13 @@ struct llama_model * llama_model_load_from_splits(
|
||||||
return llama_model_load_from_file_impl(splits.front(), splits, params);
|
return llama_model_load_from_file_impl(splits.front(), splits, params);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
|
||||||
|
llama_model_saver ms(*model);
|
||||||
|
ms.add_kv_from_model();
|
||||||
|
ms.add_tensors_from_model();
|
||||||
|
ms.save(path_model);
|
||||||
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// chat templates
|
// chat templates
|
||||||
//
|
//
|
||||||
|
|
@ -338,3 +346,4 @@ const char * llama_print_system_info(void) {
|
||||||
|
|
||||||
return s.c_str();
|
return s.c_str();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -31,9 +31,7 @@
|
||||||
#define KEY_FEATURE_LAYER "clip.vision.feature_layer"
|
#define KEY_FEATURE_LAYER "clip.vision.feature_layer"
|
||||||
#define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor"
|
#define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor"
|
||||||
#define KEY_PROJ_TYPE "clip.projector_type"
|
#define KEY_PROJ_TYPE "clip.projector_type"
|
||||||
|
#define KEY_SPATIAL_MERGE_SIZE "clip.vision.spatial_merge_size"
|
||||||
#define KEY_USE_GLU_MLP "clip.use_glu_mlp" // for qwen2.5vl
|
|
||||||
#define KEY_USE_RMS_NORM "clip.use_rms_norm" // for qwen2.5vl
|
|
||||||
|
|
||||||
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
|
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
|
||||||
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
|
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
|
||||||
|
|
@ -55,12 +53,16 @@
|
||||||
#define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
|
#define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
|
||||||
#define TN_ATTN_V "%s.blk.%d.attn_v.%s"
|
#define TN_ATTN_V "%s.blk.%d.attn_v.%s"
|
||||||
#define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s"
|
#define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s"
|
||||||
|
#define TN_ATTN_K_NORM "%s.blk.%d.attn_k_norm.%s"
|
||||||
|
#define TN_ATTN_Q_NORM "%s.blk.%d.attn_q_norm.%s"
|
||||||
#define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s"
|
#define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s"
|
||||||
#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s"
|
#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s"
|
||||||
#define TN_FFN_UP "%s.blk.%d.ffn_up.%s"
|
#define TN_FFN_UP "%s.blk.%d.ffn_up.%s"
|
||||||
#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s"
|
#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s"
|
||||||
#define TN_LN_1 "%s.blk.%d.ln1.%s"
|
#define TN_LN_1 "%s.blk.%d.ln1.%s" // layer norm
|
||||||
#define TN_LN_2 "%s.blk.%d.ln2.%s"
|
#define TN_LN_2 "%s.blk.%d.ln2.%s" // layer norm
|
||||||
|
#define TN_LS_1 "%s.blk.%d.ls1.%s" // layer scale
|
||||||
|
#define TN_LS_2 "%s.blk.%d.ls2.%s" // layer scale
|
||||||
#define TN_LN_PRE "%s.pre_ln.%s"
|
#define TN_LN_PRE "%s.pre_ln.%s"
|
||||||
#define TN_LN_POST "%s.post_ln.%s"
|
#define TN_LN_POST "%s.post_ln.%s"
|
||||||
#define TN_LLAVA_PROJ "mm.%d.%s"
|
#define TN_LLAVA_PROJ "mm.%d.%s"
|
||||||
|
|
@ -68,10 +70,14 @@
|
||||||
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
|
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
|
||||||
#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
|
#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
|
||||||
#define TN_IMAGE_NEWLINE "model.image_newline"
|
#define TN_IMAGE_NEWLINE "model.image_newline"
|
||||||
|
#define TN_MM_INP_NORM "mm.input_norm.weight"
|
||||||
#define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3
|
#define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3
|
||||||
#define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3
|
#define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3
|
||||||
#define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3
|
#define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3
|
||||||
|
#define TN_MM_PATCH_MERGER "mm.patch_merger.weight" // mistral small 3.1
|
||||||
#define TN_TOK_IMG_BREAK "v.token_embd.img_break" // pixtral
|
#define TN_TOK_IMG_BREAK "v.token_embd.img_break" // pixtral
|
||||||
|
#define TN_TOK_GLM_BOI "adapter.boi" // glm-edge (these embeddings are not in text model)
|
||||||
|
#define TN_TOK_GLM_EOI "adapter.eoi" // glm-edge (these embeddings are not in text model)
|
||||||
|
|
||||||
// mimicpmv
|
// mimicpmv
|
||||||
#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
|
#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
|
||||||
|
|
@ -88,6 +94,9 @@
|
||||||
#define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s"
|
#define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s"
|
||||||
#define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"
|
#define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"
|
||||||
|
|
||||||
|
// align x to upper multiple of n
|
||||||
|
#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
|
||||||
|
|
||||||
enum projector_type {
|
enum projector_type {
|
||||||
PROJECTOR_TYPE_MLP,
|
PROJECTOR_TYPE_MLP,
|
||||||
PROJECTOR_TYPE_MLP_NORM,
|
PROJECTOR_TYPE_MLP_NORM,
|
||||||
|
|
@ -100,6 +109,7 @@ enum projector_type {
|
||||||
PROJECTOR_TYPE_IDEFICS3,
|
PROJECTOR_TYPE_IDEFICS3,
|
||||||
PROJECTOR_TYPE_PIXTRAL,
|
PROJECTOR_TYPE_PIXTRAL,
|
||||||
PROJECTOR_TYPE_QWEN25VL,
|
PROJECTOR_TYPE_QWEN25VL,
|
||||||
|
PROJECTOR_TYPE_INTERNVL,
|
||||||
PROJECTOR_TYPE_UNKNOWN,
|
PROJECTOR_TYPE_UNKNOWN,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
@ -114,6 +124,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
||||||
{ PROJECTOR_TYPE_GEMMA3, "gemma3"},
|
{ PROJECTOR_TYPE_GEMMA3, "gemma3"},
|
||||||
{ PROJECTOR_TYPE_IDEFICS3, "idefics3"},
|
{ PROJECTOR_TYPE_IDEFICS3, "idefics3"},
|
||||||
{ PROJECTOR_TYPE_PIXTRAL, "pixtral"},
|
{ PROJECTOR_TYPE_PIXTRAL, "pixtral"},
|
||||||
|
{ PROJECTOR_TYPE_INTERNVL, "internvl"},
|
||||||
};
|
};
|
||||||
|
|
||||||
static projector_type clip_projector_type_from_string(const std::string & str) {
|
static projector_type clip_projector_type_from_string(const std::string & str) {
|
||||||
|
|
@ -228,6 +239,15 @@ struct clip_image_u8_batch {
 struct clip_image_f32_batch {
     std::vector<clip_image_f32_ptr> entries;

+    clip_image_f32_batch clone() const {
+        clip_image_f32_batch new_batch;
+        new_batch.entries.reserve(entries.size());
+        for (const auto & entry : entries) {
+            new_batch.entries.emplace_back(new clip_image_f32(*entry));
+        }
+        return new_batch;
+    }
 };

 //
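The added clone() deep-copies every entry instead of copying the owning pointers, so the returned batch can be mutated or freed independently of the original. Below is a self-contained analogue of the same pattern; image_f32 and image_f32_ptr are stand-ins for clip_image_f32 and clip_image_f32_ptr (assumed here to be a unique_ptr-style owning handle), and the pixel values are illustrative:

    #include <memory>
    #include <vector>

    struct image_f32 {
        int nx, ny;
        std::vector<float> buf;
    };
    using image_f32_ptr = std::unique_ptr<image_f32>;

    struct image_f32_batch {
        std::vector<image_f32_ptr> entries;

        // deep copy: each entry gets its own heap allocation
        image_f32_batch clone() const {
            image_f32_batch new_batch;
            new_batch.entries.reserve(entries.size());
            for (const auto & entry : entries) {
                new_batch.entries.emplace_back(new image_f32(*entry));
            }
            return new_batch;
        }
    };

    int main() {
        image_f32_batch batch;
        batch.entries.emplace_back(new image_f32{2, 2, {0.f, 1.f, 2.f, 3.f}});
        image_f32_batch copy = batch.clone();
        copy.entries[0]->buf[0] = 42.f; // does not affect the original batch
        return 0;
    }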
File diff suppressed because it is too large.
@ -78,10 +78,10 @@ CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
 CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
 CLIP_API struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip);

-CLIP_API struct clip_image_size * clip_image_size_init();
-CLIP_API struct clip_image_u8 * clip_image_u8_init ();
-CLIP_API struct clip_image_f32 * clip_image_f32_init();
-CLIP_API struct clip_image_f32_batch * clip_image_f32_batch_init(); // only used by libllava
+CLIP_API struct clip_image_size * clip_image_size_init(void);
+CLIP_API struct clip_image_u8 * clip_image_u8_init (void);
+CLIP_API struct clip_image_f32 * clip_image_f32_init(void);
+CLIP_API struct clip_image_f32_batch * clip_image_f32_batch_init(void); // only used by libllava

 // nx, ny are the output image dimensions
 CLIP_API unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny);
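The only change in this hunk is () becoming (void). Presumably the motivation is the C rule that an empty parameter list declares a function taking an unspecified number of arguments, whereas (void) states "no arguments"; in C++ the two forms are equivalent, so only C consumers of this header are affected. A two-line illustration with hypothetical functions f and g:

    int f();     /* C: unspecified parameters; C++: no parameters */
    int g(void); /* C and C++: explicitly no parameters - the style the header now uses */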
@ -2,6 +2,7 @@
 #include "llava.h"

 #include "llama.h"
+#include "ggml-cpp.h"

 #include <algorithm>
 #include <cerrno>
@ -209,7 +210,11 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
 struct ggml_tensor *flatten = ggml_view_2d(model.ctx, permuted_cont, clip_n_mmproj_embd(ctx_clip), num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, size_ele * clip_n_mmproj_embd(ctx_clip), 0);
 // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
 ggml_build_forward_expand(gf, flatten);
-ggml_graph_compute_with_ctx(model.ctx, gf, 1);
+
+ggml_backend_ptr backend { ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr) };
+GGML_ASSERT(backend != nullptr && "failed to initialize CPU backend");
+ggml_backend_graph_compute(backend.get(), gf);
+
 struct ggml_tensor* result = ggml_graph_node(gf, -1);

 memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
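This hunk replaces ggml_graph_compute_with_ctx with explicit backend initialization: a CPU backend is created, asserted, and used to run the graph, and ggml_backend_ptr (an RAII handle from the ggml-cpp.h header included earlier) releases it automatically when it goes out of scope. A minimal sketch of the same pattern on a trivial graph, assuming a ggml build that ships ggml-cpp.h and the backend-device API; the tensor sizes and the add op are illustrative:

    #include "ggml.h"
    #include "ggml-backend.h"
    #include "ggml-cpp.h"

    int main() {
        // small context that also allocates tensor data on the host
        ggml_init_params params = {
            /*.mem_size   =*/ 16 * 1024 * 1024,
            /*.mem_buffer =*/ nullptr,
            /*.no_alloc   =*/ false,
        };
        ggml_context * ctx = ggml_init(params);

        ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        ggml_tensor * c = ggml_add(ctx, a, b);

        ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, c);

        // the pattern introduced in the diff: init a CPU backend and run the graph on it
        ggml_backend_ptr backend { ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr) };
        GGML_ASSERT(backend != nullptr && "failed to initialize CPU backend");
        ggml_backend_graph_compute(backend.get(), gf);

        ggml_free(ctx);
        return 0; // backend freed by ggml_backend_ptr's deleter
    }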
@ -457,7 +462,7 @@ struct llava_embd_batch {
     std::vector<llama_seq_id *> seq_ids;
     std::vector<int8_t> logits;
     llama_batch batch;
-    llava_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+    llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
         pos     .resize(n_tokens);
         n_seq_id.resize(n_tokens);
         seq_ids .resize(n_tokens + 1);
@ -469,7 +474,6 @@ struct llava_embd_batch {
             /*n_tokens =*/ n_tokens,
             /*tokens   =*/ nullptr,
             /*embd     =*/ embd,
-            /*n_embd   =*/ n_embd,
             /*pos      =*/ pos.data(),
             /*n_seq_id =*/ n_seq_id.data(),
             /*seq_id   =*/ seq_ids.data(),
@ -493,7 +497,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
             n_eval = n_batch;
         }
         float * embd = image_embed->embed+i*n_embd;
-        llava_embd_batch llava_batch = llava_embd_batch(embd, n_embd, n_eval, *n_past, 0);
+        llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0);
         if (llama_decode(ctx_llama, llava_batch.batch)) {
             LOG_ERR("%s : failed to eval\n", __func__);
             return false;
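Consistent with the struct change above, this call site now constructs llava_embd_batch without the n_embd argument. For context, the surrounding loop walks the image embedding in chunks of at most n_batch positions and offsets the float pointer by i*n_embd each iteration; a self-contained sketch of just that chunking arithmetic (stand-in buffer, illustrative sizes):

    #include <cstdio>
    #include <vector>

    int main() {
        const int n_embd  = 8;   // illustrative embedding width
        const int n_pos   = 10;  // number of image-embedding positions
        const int n_batch = 4;   // decode at most this many positions per call

        std::vector<float> embed(n_pos * n_embd, 0.0f);

        for (int i = 0; i < n_pos; i += n_batch) {
            int n_eval = n_pos - i;
            if (n_eval > n_batch) {
                n_eval = n_batch;                      // same clamp as in the diff
            }
            float * chunk = embed.data() + i * n_embd; // same pointer arithmetic as embed+i*n_embd
            std::printf("decode %d positions starting at float offset %d\n", n_eval, i * n_embd);
            (void) chunk; // the real code hands this chunk to llava_embd_batch / llama_decode
        }
        return 0;
    }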
@ -1,4 +1,4 @@
-package llava
+package mtmd

 // #cgo CXXFLAGS: -std=c++11
 // #cgo CPPFLAGS: -I${SRCDIR}/../../include -I${SRCDIR}/../../common
Some files were not shown because too many files have changed in this diff.