Update GGML to b6646 (#12245)

Notable EOLs with this change:
- MacOS v12 and v13 are no longer supported (v14+ required)
- AMD gfx900 and gfx906 are no longer supported
Daniel Hiltgen 2025-10-02 14:47:10 -07:00 committed by GitHub
parent fdb109469f
commit c68f367ef6
326 changed files with 30615 additions and 20624 deletions


@@ -89,9 +89,9 @@ if(CMAKE_CUDA_COMPILER)
)
endif()
-set(WINDOWS_AMDGPU_TARGETS_EXCLUDE_REGEX "^gfx(906|908|90a|1200|1201):xnack[+-]$"
+set(WINDOWS_AMDGPU_TARGETS_EXCLUDE_REGEX "^gfx(908|90a|1200|1201):xnack[+-]$"
CACHE STRING
-"Regular expression describing AMDGPU_TARGETS not supported on Windows. Override to force building these targets. Default \"^gfx(906|908|90a|1200|1201):xnack[+-]$\"."
+"Regular expression describing AMDGPU_TARGETS not supported on Windows. Override to force building these targets. Default \"^gfx(908|90a|1200|1201):xnack[+-]$\"."
)
check_language(HIP)
@@ -100,7 +100,7 @@ if(CMAKE_HIP_COMPILER)
if(NOT AMDGPU_TARGETS)
find_package(hip REQUIRED)
-list(FILTER AMDGPU_TARGETS INCLUDE REGEX "^gfx(900|94[012]|101[02]|1030|110[012]|120[01])$")
+list(FILTER AMDGPU_TARGETS INCLUDE REGEX "^gfx(94[012]|101[02]|1030|110[012]|120[01])$")
endif()
if(WIN32 AND WINDOWS_AMDGPU_TARGETS_EXCLUDE_REGEX)


@@ -68,7 +68,7 @@
"inherits": [ "ROCm" ],
"cacheVariables": {
"CMAKE_HIP_FLAGS": "-parallel-jobs=4",
-"AMDGPU_TARGETS": "gfx900;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-"
+"AMDGPU_TARGETS": "gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-"
}
}
],


@@ -1,6 +1,6 @@
UPSTREAM=https://github.com/ggml-org/llama.cpp.git
WORKDIR=llama/vendor
-FETCH_HEAD=e54d41befcc1575f4c898c5ff4ef43970cead75f
+FETCH_HEAD=364a7a6d4a786e98947c8a90430ea581213c0ba9
.PHONY: help
help:


@@ -51,14 +51,14 @@ sudo modprobe nvidia_uvm`
Ollama supports the following AMD GPUs:
### Linux Support
| Family | Cards and accelerators |
-| -------------- | -------------------------------------------------------------------------------------------------------------------------------------- |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------ |
-| AMD Radeon RX | `7900 XTX` `7900 XT` `7900 GRE` `7800 XT` `7700 XT` `7600 XT` `7600` `6950 XT` `6900 XTX` `6900XT` `6800 XT` `6800` `Vega 64` `Vega 56` |
+| AMD Radeon RX | `7900 XTX` `7900 XT` `7900 GRE` `7800 XT` `7700 XT` `7600 XT` `7600` `6950 XT` `6900 XTX` `6900XT` `6800 XT` `6800` `Vega 64` |
-| AMD Radeon PRO | `W7900` `W7800` `W7700` `W7600` `W7500` `W6900X` `W6800X Duo` `W6800X` `W6800` `V620` `V420` `V340` `V320` `Vega II Duo` `Vega II` `VII` `SSG` |
+| AMD Radeon PRO | `W7900` `W7800` `W7700` `W7600` `W7500` `W6900X` `W6800X Duo` `W6800X` `W6800` `V620` `V420` `V340` `V320` `Vega II Duo` `Vega II` `SSG` |
-| AMD Instinct | `MI300X` `MI300A` `MI300` `MI250X` `MI250` `MI210` `MI200` `MI100` `MI60` `MI50` |
+| AMD Instinct | `MI300X` `MI300A` `MI300` `MI250X` `MI250` `MI210` `MI200` `MI100` `MI60` |
### Windows Support
-With ROCm v6.1, the following GPUs are supported on Windows.
+With ROCm v6.2, the following GPUs are supported on Windows.
| Family | Cards and accelerators |
| -------------- | -------------------------------------------------------------------------------------------------------------------------------------- |
@@ -88,8 +88,6 @@ At this time, the known supported GPU types on linux are the following LLVM Targets
This table shows some example GPUs that map to these LLVM targets:
| **LLVM Target** | **An Example GPU** |
|-----------------|---------------------|
-| gfx900 | Radeon RX Vega 56 |
-| gfx906 | Radeon Instinct MI50 |
| gfx908 | Radeon Instinct MI100 |
| gfx90a | Radeon Instinct MI210 |
| gfx940 | Radeon Instinct MI300 |


@@ -2,7 +2,7 @@
## System Requirements
-* MacOS Monterey (v12) or newer
+* MacOS Sonoma (v14) or newer
* Apple M series (CPU and GPU support) or x86 (CPU only)


@@ -36,7 +36,7 @@ func TestLongInputContext(t *testing.T) {
if err := PullIfMissing(ctx, client, req.Model); err != nil {
t.Fatalf("PullIfMissing failed: %v", err)
}
-DoGenerate(ctx, t, client, req, []string{"russia", "germany", "france", "england", "austria", "prussia", "europe", "individuals", "coalition", "conflict"}, 120*time.Second, 10*time.Second)
+DoGenerate(ctx, t, client, req, []string{"russia", "german", "france", "england", "austria", "prussia", "europe", "individuals", "coalition", "conflict"}, 120*time.Second, 10*time.Second)
}
func TestContextExhaustion(t *testing.T) {

llama/build-info.cpp (generated, vendored)

@@ -1,4 +1,4 @@
int LLAMA_BUILD_NUMBER = 0;
-char const *LLAMA_COMMIT = "e54d41befcc1575f4c898c5ff4ef43970cead75f";
+char const *LLAMA_COMMIT = "364a7a6d4a786e98947c8a90430ea581213c0ba9";
char const *LLAMA_COMPILER = "";
char const *LLAMA_BUILD_TARGET = "";


@ -14,6 +14,7 @@
#include <climits> #include <climits>
#include <cmath> #include <cmath>
#include <codecvt> #include <codecvt>
#include <chrono>
#include <cstdarg> #include <cstdarg>
#include <cstring> #include <cstring>
#include <ctime> #include <ctime>
@ -41,6 +42,7 @@
#endif #endif
#include <locale> #include <locale>
#include <windows.h> #include <windows.h>
#include <string.h>
#include <fcntl.h> #include <fcntl.h>
#include <io.h> #include <io.h>
#else #else
@ -49,6 +51,11 @@
#include <unistd.h> #include <unistd.h>
#endif #endif
#if defined(__linux__)
#include <sys/types.h>
#include <pwd.h>
#endif
#if defined(_MSC_VER) #if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data #pragma warning(disable: 4244 4267) // possible loss of data
#endif #endif
@ -557,13 +564,6 @@ std::string string_from(const struct llama_context * ctx, const std::vector<llam
auto detokenized = common_token_to_piece(ctx, token); auto detokenized = common_token_to_piece(ctx, token);
detokenized.erase(
std::remove_if(
detokenized.begin(),
detokenized.end(),
[](const unsigned char c) { return !std::isprint(c); }),
detokenized.end());
buf << "'" << detokenized << "'" buf << "'" << detokenized << "'"
<< ":" << std::to_string(token); << ":" << std::to_string(token);
} }
@ -588,13 +588,6 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat
auto detokenized = common_token_to_piece(ctx, batch.token[i]); auto detokenized = common_token_to_piece(ctx, batch.token[i]);
detokenized.erase(
std::remove_if(
detokenized.begin(),
detokenized.end(),
[](const unsigned char c) { return !std::isprint(c); }),
detokenized.end());
buf << "\n" << std::to_string(i) buf << "\n" << std::to_string(i)
<< ", token '" << detokenized << "'" << ", token '" << detokenized << "'"
<< ", pos " << std::to_string(batch.pos[i]) << ", pos " << std::to_string(batch.pos[i])
@ -877,8 +870,20 @@ std::string fs_get_cache_directory() {
#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) || defined(__OpenBSD__) #if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) || defined(__OpenBSD__)
if (std::getenv("XDG_CACHE_HOME")) { if (std::getenv("XDG_CACHE_HOME")) {
cache_directory = std::getenv("XDG_CACHE_HOME"); cache_directory = std::getenv("XDG_CACHE_HOME");
} else { } else if (std::getenv("HOME")) {
cache_directory = std::getenv("HOME") + std::string("/.cache/"); cache_directory = std::getenv("HOME") + std::string("/.cache/");
} else {
#if defined(__linux__)
/* no $HOME is defined, fallback to getpwuid */
struct passwd *pw = getpwuid(getuid());
if ((!pw) || (!pw->pw_dir)) {
throw std::runtime_error("Failed to find $HOME directory");
}
cache_directory = std::string(pw->pw_dir) + std::string("/.cache/");
#else /* defined(__linux__) */
throw std::runtime_error("Failed to find $HOME directory");
#endif /* defined(__linux__) */
} }
#elif defined(__APPLE__) #elif defined(__APPLE__)
cache_directory = std::getenv("HOME") + std::string("/Library/Caches/"); cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
@ -914,7 +919,8 @@ struct common_init_result common_init_from_params(common_params & params) {
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams); llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
if (model == NULL) { if (model == NULL) {
LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str()); LOG_ERR("%s: failed to load model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
__func__, params.model.path.c_str());
return iparams; return iparams;
} }
@ -924,7 +930,8 @@ struct common_init_result common_init_from_params(common_params & params) {
llama_context * lctx = llama_init_from_model(model, cparams); llama_context * lctx = llama_init_from_model(model, cparams);
if (lctx == NULL) { if (lctx == NULL) {
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str()); LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
__func__, params.model.path.c_str());
llama_model_free(model); llama_model_free(model);
return iparams; return iparams;
} }
@ -971,15 +978,13 @@ struct common_init_result common_init_from_params(common_params & params) {
bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL; bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL; bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
bool has_rerank_prompt = llama_model_chat_template(model, "rerank") != NULL;
if (!has_eos && !has_sep) { if (!has_eos && !has_sep && !has_rerank_prompt) {
LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__); LOG_WRN("%s: warning: vocab does not have an EOS token, SEP token, or rerank prompt. Reranking will not work\n", __func__);
ok = false; ok = false;
} else if (!has_eos) { } else if (!has_eos) {
LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__); LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
} else if (!has_sep) {
LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
ok = false;
} }
if (!ok) { if (!ok) {
@ -1001,7 +1006,12 @@ struct common_init_result common_init_from_params(common_params & params) {
return iparams; return iparams;
} }
char buf[1024];
la.ptr = lora.get(); la.ptr = lora.get();
llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
la.task_name = buf;
llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
la.prompt_prefix = buf;
iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
} }
@ -1165,11 +1175,10 @@ struct llama_context_params common_context_params_to_llama(const common_params &
cparams.yarn_orig_ctx = params.yarn_orig_ctx; cparams.yarn_orig_ctx = params.yarn_orig_ctx;
cparams.pooling_type = params.pooling_type; cparams.pooling_type = params.pooling_type;
cparams.attention_type = params.attention_type; cparams.attention_type = params.attention_type;
cparams.defrag_thold = params.defrag_thold; cparams.flash_attn_type = params.flash_attn_type;
cparams.cb_eval = params.cb_eval; cparams.cb_eval = params.cb_eval;
cparams.cb_eval_user_data = params.cb_eval_user_data; cparams.cb_eval_user_data = params.cb_eval_user_data;
cparams.offload_kqv = !params.no_kv_offload; cparams.offload_kqv = !params.no_kv_offload;
cparams.flash_attn = params.flash_attn;
cparams.no_perf = params.no_perf; cparams.no_perf = params.no_perf;
cparams.op_offload = !params.no_op_offload; cparams.op_offload = !params.no_op_offload;
cparams.swa_full = params.swa_full; cparams.swa_full = params.swa_full;
@ -1565,3 +1574,56 @@ ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std
return result; return result;
} }
ggml_opt_optimizer_params common_opt_lr_pars(void * userdata) {
ggml_opt_optimizer_params result = ggml_opt_get_default_optimizer_params(nullptr);
const lr_opt & d = *(lr_opt *) userdata;
result.adamw.alpha = result.sgd.alpha = d.get_lr(d.epoch);
result.sgd.wd = result.adamw.wd = d.wd;
return result;
}
// TODO make all command line args case-insensitive
static inline bool eq_case_insensitive(char const* a, char const* b) {
return !
#if defined(_MSC_VER)
_stricmp
#else
strcasecmp
#endif // defined(_MSC_VER)
(a, b);
}
enum ggml_opt_optimizer_type common_opt_get_optimizer(const char * n) {
if (eq_case_insensitive("adamw", n)) {
return GGML_OPT_OPTIMIZER_TYPE_ADAMW;
}
if (eq_case_insensitive("sgd", n)) {
return GGML_OPT_OPTIMIZER_TYPE_SGD;
}
return GGML_OPT_OPTIMIZER_TYPE_COUNT;
}
// TODO simplify to use just log and exp
static float const k_log_2 = std::log(2.f);
void lr_opt::init() {
if (lr_min > 0 && lr_min < lr0) {
float nhalf = std::log(lr0 / lr_min) / k_log_2;
float e = epochs;
if (decay_epochs > 0 && decay_epochs < e) {
e = decay_epochs;
} else {
decay_epochs = e;
}
scale_epoch = nhalf / e;
}
}
float lr_opt::get_lr(float epoch) const {
float r = lr_min <= 0 ? lr0 :
epoch >= decay_epochs ? lr_min :
lr0 * std::pow(0.5f, epoch * scale_epoch);
LOG_INF("epoch %.2g lr=%.2g\n", epoch, r);
return r;
}
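To make the schedule above concrete (the numbers here are illustrative, not from this change): when `lr_min > 0`, `init()` computes `nhalf = log2(lr0 / lr_min)` and `scale_epoch = nhalf / decay_epochs`, so `get_lr(e)` evaluates `lr0 * 0.5^(e * scale_epoch)`, i.e. the learning rate is halved `nhalf` times spread evenly over `decay_epochs` and is then clamped at `lr_min`. For example, with `lr0 = 1e-5`, `lr_min = 1e-6`, `decay_epochs = 10`, and `epochs = 20`, we get `nhalf ≈ 3.32`; epoch 5 yields roughly `3.2e-6`, and every epoch from 10 onward returns `1e-6`.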


@ -2,14 +2,17 @@
#pragma once #pragma once
#include "llama-cpp.h"
#include <set> #include <set>
#include <sstream>
#include <string> #include <string>
#include <string_view> #include <string_view>
#include <vector> #include <vector>
#include <map> #include <map>
#include <sstream> #include <sstream>
#include <cmath>
#include "ggml-opt.h"
#include "llama-cpp.h"
#ifdef _WIN32 #ifdef _WIN32
#define DIRECTORY_SEPARATOR '\\' #define DIRECTORY_SEPARATOR '\\'
@ -31,6 +34,9 @@ struct common_adapter_lora_info {
std::string path; std::string path;
float scale; float scale;
std::string task_name;
std::string prompt_prefix;
struct llama_adapter_lora * ptr; struct llama_adapter_lora * ptr;
}; };
@ -82,6 +88,7 @@ enum llama_example {
LLAMA_EXAMPLE_PARALLEL, LLAMA_EXAMPLE_PARALLEL,
LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_TTS,
LLAMA_EXAMPLE_DIFFUSION, LLAMA_EXAMPLE_DIFFUSION,
LLAMA_EXAMPLE_FINETUNE,
LLAMA_EXAMPLE_COUNT, LLAMA_EXAMPLE_COUNT,
}; };
@ -186,10 +193,11 @@ struct common_params_sampling {
}; };
struct common_params_model { struct common_params_model {
std::string path = ""; // model local path // NOLINT std::string path = ""; // model local path // NOLINT
std::string url = ""; // model url to download // NOLINT std::string url = ""; // model url to download // NOLINT
std::string hf_repo = ""; // HF repo // NOLINT std::string hf_repo = ""; // HF repo // NOLINT
std::string hf_file = ""; // HF file // NOLINT std::string hf_file = ""; // HF file // NOLINT
std::string docker_repo = ""; // Docker repo // NOLINT
}; };
struct common_params_speculative { struct common_params_speculative {
@ -202,6 +210,7 @@ struct common_params_speculative {
float p_split = 0.1f; // speculative decoding split probability float p_split = 0.1f; // speculative decoding split probability
float p_min = 0.75f; // minimum speculative decoding probability (greedy) float p_min = 0.75f; // minimum speculative decoding probability (greedy)
std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
@ -234,14 +243,36 @@ struct common_params_diffusion {
bool add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0 bool add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0
}; };
// reasoning API response format (not to be confused as chat template's reasoning format)
enum common_reasoning_format { enum common_reasoning_format {
COMMON_REASONING_FORMAT_NONE, COMMON_REASONING_FORMAT_NONE,
COMMON_REASONING_FORMAT_AUTO, COMMON_REASONING_FORMAT_AUTO, // Same as deepseek, using `message.reasoning_content`
COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas. COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
COMMON_REASONING_FORMAT_GRANITE, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas. // do not extend this enum unless you absolutely have to
// in most cases, use COMMON_REASONING_FORMAT_AUTO
// see: https://github.com/ggml-org/llama.cpp/pull/15408
}; };
struct lr_opt {
float lr0 = 1e-5; // learning rate at first epoch
float lr_min = -1;
float decay_epochs = -1; // if >0, the learning rate starts at lr0 and decays to lr_min after this many epochs
float scale_epoch = 0;
float wd = 0;
unsigned epochs = 2;
unsigned epoch; // set by optimizer outer (epochs) loop
// learning rate decay - constant LR per epoch only for now
float get_lr(float e) const;
float get_lr() const { return get_lr(epoch); }
// must call after arg parse, before get_lr
void init();
};
struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);
struct common_params { struct common_params {
int32_t n_predict = -1; // new tokens to predict int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 4096; // context size int32_t n_ctx = 4096; // context size
@ -257,11 +288,10 @@ struct common_params {
float rope_freq_base = 0.0f; // RoPE base frequency float rope_freq_base = 0.0f; // RoPE base frequency
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor float yarn_attn_factor = -1.0f; // YaRN magnitude scaling factor
float yarn_beta_fast = 32.0f; // YaRN low correction dim float yarn_beta_fast = -1.0f; // YaRN low correction dim
float yarn_beta_slow = 1.0f; // YaRN high correction dim float yarn_beta_slow = -1.0f; // YaRN high correction dim
int32_t yarn_orig_ctx = 0; // YaRN original context length int32_t yarn_orig_ctx = 0; // YaRN original context length
float defrag_thold = 0.1f; // KV cache defragmentation threshold
// offload params // offload params
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
@ -283,6 +313,7 @@ struct common_params {
enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
enum llama_flash_attn_type flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; // whether to use Flash Attention
struct common_params_sampling sampling; struct common_params_sampling sampling;
struct common_params_speculative speculative; struct common_params_speculative speculative;
@ -346,9 +377,8 @@ struct common_params {
bool multiline_input = false; // reverse the usage of `\` bool multiline_input = false; // reverse the usage of `\`
bool simple_io = false; // improves compatibility with subprocesses and limited consoles bool simple_io = false; // improves compatibility with subprocesses and limited consoles
bool cont_batching = true; // insert new sequences for decoding on-the-fly bool cont_batching = true; // insert new sequences for decoding on-the-fly
bool flash_attn = false; // flash attention
bool no_perf = false; // disable performance metrics bool no_perf = false; // disable performance metrics
bool ctx_shift = true; // context shift on inifinite text generation bool ctx_shift = false; // context shift on infinite text generation
bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055) bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
bool kv_unified = false; // enable unified KV cache bool kv_unified = false; // enable unified KV cache
@ -376,6 +406,11 @@ struct common_params {
bool no_mmproj = false; // explicitly disable multimodal model bool no_mmproj = false; // explicitly disable multimodal model
std::vector<std::string> image; // path to image file(s) std::vector<std::string> image; // path to image file(s)
// finetune
struct lr_opt lr;
enum ggml_opt_optimizer_type optimizer = GGML_OPT_OPTIMIZER_TYPE_ADAMW;
float val_split = 0.05f; // fraction of the data used for the validation set
// embedding // embedding
bool embedding = false; // get only sentence embedding bool embedding = false; // get only sentence embedding
int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm) int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
@ -384,11 +419,12 @@ struct common_params {
std::string cls_sep = "\t"; // separator of classification sequences std::string cls_sep = "\t"; // separator of classification sequences
// server params // server params
int32_t port = 8080; // server listens on this network port int32_t port = 8080; // server listens on this network port
int32_t timeout_read = 600; // http read timeout in seconds int32_t timeout_read = 600; // http read timeout in seconds
int32_t timeout_write = timeout_read; // http write timeout in seconds int32_t timeout_write = timeout_read; // http write timeout in seconds
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool) int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
int32_t n_swa_checkpoints = 3; // max number of SWA checkpoints per slot
std::string hostname = "127.0.0.1"; std::string hostname = "127.0.0.1";
std::string public_path = ""; // NOLINT std::string public_path = ""; // NOLINT
@ -409,7 +445,7 @@ struct common_params {
// "advanced" endpoints are disabled by default for better security // "advanced" endpoints are disabled by default for better security
bool webui = true; bool webui = true;
bool endpoint_slots = false; bool endpoint_slots = true;
bool endpoint_props = false; // only control POST requests, not GET bool endpoint_props = false; // only control POST requests, not GET
bool endpoint_metrics = false; bool endpoint_metrics = false;
@ -417,7 +453,7 @@ struct common_params {
std::string slot_save_path; std::string slot_save_path;
float slot_prompt_similarity = 0.5f; float slot_prompt_similarity = 0.1f;
// batched-bench params // batched-bench params
bool is_pp_shared = false; bool is_pp_shared = false;
@ -698,8 +734,25 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
} }
//
// MoE utils
//
const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_(ch|)exps";
static std::string llm_ffn_exps_block_regex(int idx) {
return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
}
static llama_model_tensor_buft_override llm_ffn_exps_cpu_override() {
return { LLM_FFN_EXPS_REGEX, ggml_backend_cpu_buffer_type() };
}
// //
// training utils // training utils
// //
ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride); ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);
// "adamw" or "sgd" (case insensitive)
enum ggml_opt_optimizer_type common_opt_get_optimizer(const char *);


@ -257,12 +257,13 @@ std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
}; };
static bool is_reserved_name(const std::string & name) { static bool is_reserved_name(const std::string & name) {
static std::unordered_set<std::string> RESERVED_NAMES; static const std::unordered_set<std::string> RESERVED_NAMES = [] {
if (RESERVED_NAMES.empty()) { std::unordered_set<std::string> s;
RESERVED_NAMES.insert("root"); s.insert("root");
for (const auto &p : PRIMITIVE_RULES) RESERVED_NAMES.insert(p.first); for (const auto & p : PRIMITIVE_RULES) s.insert(p.first);
for (const auto &p : STRING_FORMAT_RULES) RESERVED_NAMES.insert(p.first); for (const auto & p : STRING_FORMAT_RULES) s.insert(p.first);
} return s;
}();
return RESERVED_NAMES.find(name) != RESERVED_NAMES.end(); return RESERVED_NAMES.find(name) != RESERVED_NAMES.end();
} }
@ -843,9 +844,10 @@ public:
_build_object_rule( _build_object_rule(
properties, required, name, properties, required, name,
schema.contains("additionalProperties") ? schema["additionalProperties"] : json())); schema.contains("additionalProperties") ? schema["additionalProperties"] : json()));
} else if ((schema_type.is_null() || schema_type == "object") && schema.contains("allOf")) { } else if ((schema_type.is_null() || schema_type == "object" || schema_type == "string") && schema.contains("allOf")) {
std::unordered_set<std::string> required; std::unordered_set<std::string> required;
std::vector<std::pair<std::string, json>> properties; std::vector<std::pair<std::string, json>> properties;
std::map<std::string, size_t> enum_values;
std::string hybrid_name = name; std::string hybrid_name = name;
std::function<void(const json &, bool)> add_component = [&](const json & comp_schema, bool is_required) { std::function<void(const json &, bool)> add_component = [&](const json & comp_schema, bool is_required) {
if (comp_schema.contains("$ref")) { if (comp_schema.contains("$ref")) {
@ -857,6 +859,14 @@ public:
required.insert(prop.key()); required.insert(prop.key());
} }
} }
} else if (comp_schema.contains("enum")) {
for (const auto & v : comp_schema["enum"]) {
const auto rule = _generate_constant_rule(v);
if (enum_values.find(rule) == enum_values.end()) {
enum_values[rule] = 0;
}
enum_values[rule] += 1;
}
} else { } else {
// todo warning // todo warning
} }
@ -870,6 +880,17 @@ public:
add_component(t, true); add_component(t, true);
} }
} }
if (!enum_values.empty()) {
std::vector<std::string> enum_intersection;
for (const auto & p : enum_values) {
if (p.second == schema["allOf"].size()) {
enum_intersection.push_back(p.first);
}
}
if (!enum_intersection.empty()) {
return _add_rule(rule_name, "(" + string_join(enum_intersection, " | ") + ") space");
}
}
return _add_rule(rule_name, _build_object_rule(properties, required, hybrid_name, json())); return _add_rule(rule_name, _build_object_rule(properties, required, hybrid_name, json()));
} else if ((schema_type.is_null() || schema_type == "array") && (schema.contains("items") || schema.contains("prefixItems"))) { } else if ((schema_type.is_null() || schema_type == "array") && (schema.contains("items") || schema.contains("prefixItems"))) {
json items = schema.contains("items") ? schema["items"] : schema["prefixItems"]; json items = schema.contains("items") ? schema["items"] : schema["prefixItems"];


@ -4,17 +4,52 @@
#include <condition_variable> #include <condition_variable>
#include <cstdarg> #include <cstdarg>
#include <cstdio> #include <cstdio>
#include <cstdlib>
#include <cstring>
#include <mutex> #include <mutex>
#include <sstream> #include <sstream>
#include <thread> #include <thread>
#include <vector> #include <vector>
#if defined(_WIN32)
# include <io.h>
# include <windows.h>
# define isatty _isatty
# define fileno _fileno
#else
# include <unistd.h>
#endif // defined(_WIN32)
int common_log_verbosity_thold = LOG_DEFAULT_LLAMA; int common_log_verbosity_thold = LOG_DEFAULT_LLAMA;
void common_log_set_verbosity_thold(int verbosity) { void common_log_set_verbosity_thold(int verbosity) {
common_log_verbosity_thold = verbosity; common_log_verbosity_thold = verbosity;
} }
// Auto-detect if colors should be enabled based on terminal and environment
static bool common_log_should_use_colors_auto() {
// Check NO_COLOR environment variable (https://no-color.org/)
if (const char * no_color = std::getenv("NO_COLOR")) {
if (no_color[0] != '\0') {
return false;
}
}
// Check TERM environment variable
if (const char * term = std::getenv("TERM")) {
if (std::strcmp(term, "dumb") == 0) {
return false;
}
}
// Check if stdout and stderr are connected to a terminal
// We check both because log messages can go to either
bool stdout_is_tty = isatty(fileno(stdout));
bool stderr_is_tty = isatty(fileno(stderr));
return stdout_is_tty || stderr_is_tty;
}
static int64_t t_us() { static int64_t t_us() {
return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count(); return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
} }
@ -353,6 +388,11 @@ struct common_log * common_log_init() {
struct common_log * common_log_main() { struct common_log * common_log_main() {
static struct common_log log; static struct common_log log;
static std::once_flag init_flag;
std::call_once(init_flag, [&]() {
// Set default to auto-detect colors
log.set_colors(common_log_should_use_colors_auto());
});
return &log; return &log;
} }
@ -380,8 +420,19 @@ void common_log_set_file(struct common_log * log, const char * file) {
log->set_file(file); log->set_file(file);
} }
void common_log_set_colors(struct common_log * log, bool colors) { void common_log_set_colors(struct common_log * log, log_colors colors) {
log->set_colors(colors); if (colors == LOG_COLORS_AUTO) {
log->set_colors(common_log_should_use_colors_auto());
return;
}
if (colors == LOG_COLORS_DISABLED) {
log->set_colors(false);
return;
}
GGML_ASSERT(colors == LOG_COLORS_ENABLED);
log->set_colors(true);
} }
void common_log_set_prefix(struct common_log * log, bool prefix) { void common_log_set_prefix(struct common_log * log, bool prefix) {


@ -24,6 +24,12 @@
#define LOG_DEFAULT_DEBUG 1 #define LOG_DEFAULT_DEBUG 1
#define LOG_DEFAULT_LLAMA 0 #define LOG_DEFAULT_LLAMA 0
enum log_colors {
LOG_COLORS_AUTO = -1,
LOG_COLORS_DISABLED = 0,
LOG_COLORS_ENABLED = 1,
};
// needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower // needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
// set via common_log_set_verbosity() // set via common_log_set_verbosity()
extern int common_log_verbosity_thold; extern int common_log_verbosity_thold;
@ -65,10 +71,10 @@ void common_log_add(struct common_log * log, enum ggml_log_level level, const ch
// D - debug (stderr, V = LOG_DEFAULT_DEBUG) // D - debug (stderr, V = LOG_DEFAULT_DEBUG)
// //
void common_log_set_file (struct common_log * log, const char * file); // not thread-safe void common_log_set_file (struct common_log * log, const char * file); // not thread-safe
void common_log_set_colors (struct common_log * log, bool colors); // not thread-safe void common_log_set_colors (struct common_log * log, log_colors colors); // not thread-safe
void common_log_set_prefix (struct common_log * log, bool prefix); // whether to output prefix to each log void common_log_set_prefix (struct common_log * log, bool prefix); // whether to output prefix to each log
void common_log_set_timestamps(struct common_log * log, bool timestamps); // whether to output timestamps in the prefix void common_log_set_timestamps(struct common_log * log, bool timestamps); // whether to output timestamps in the prefix
// helper macros for logging // helper macros for logging
// use these to avoid computing log arguments if the verbosity of the log is higher than the threshold // use these to avoid computing log arguments if the verbosity of the log is higher than the threshold


@ -332,6 +332,7 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
} }
if (ctx) { if (ctx) {
llama_perf_context_print(ctx); llama_perf_context_print(ctx);
llama_memory_breakdown_print(ctx);
} }
} }
@ -426,8 +427,29 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
// helpers // helpers
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) { llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort) {
return &gsmpl->cur_p; auto * res = &gsmpl->cur_p;
if (do_sort && !res->sorted) {
// remember the selected token before sorting
const llama_token id = res->data[res->selected].id;
std::sort(res->data, res->data + res->size, [](const llama_token_data & a, const llama_token_data & b) {
return a.p > b.p;
});
// restore the selected token after sorting
for (size_t i = 0; i < res->size; ++i) {
if (res->data[i].id == id) {
res->selected = i;
break;
}
}
res->sorted = true;
}
return res;
} }
llama_token common_sampler_last(const struct common_sampler * gsmpl) { llama_token common_sampler_last(const struct common_sampler * gsmpl) {


@ -86,7 +86,9 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
// helpers // helpers
// access the internal list of current candidate tokens // access the internal list of current candidate tokens
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl); // if do_sort == true, the candidates are guaranteed to be sorted afterwards (in descending order of probability)
// the .sorted flag of the result indicates whether the returned candidates are sorted
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort);
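A minimal usage sketch of the new `do_sort` parameter (not part of the diff; the helper name and include paths are assumptions):

```cpp
#include "common.h"
#include "sampling.h"
#include "log.h"

// Sketch only: print the most probable candidates after the sampler has produced a token.
// Passing do_sort = true guarantees the array is in descending order of probability;
// otherwise the .sorted flag must be checked before assuming any ordering.
static void print_top_candidates(common_sampler * gsmpl, size_t top_n) {
    llama_token_data_array * cur_p = common_sampler_get_candidates(gsmpl, /*do_sort=*/true);
    for (size_t i = 0; i < cur_p->size && i < top_n; ++i) {
        LOG_INF("candidate %zu: token=%d p=%.3f\n", i, cur_p->data[i].id, cur_p->data[i].p);
    }
}
```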
// get the last accepted token // get the last accepted token
llama_token common_sampler_last(const struct common_sampler * gsmpl); llama_token common_sampler_last(const struct common_sampler * gsmpl);


@ -64,8 +64,6 @@ extern "C" {
typedef struct llama_memory_i * llama_memory_t; typedef struct llama_memory_i * llama_memory_t;
struct llama_kv_cache; // DEPRECATED (use llama_memory instead)
typedef int32_t llama_pos; typedef int32_t llama_pos;
typedef int32_t llama_token; typedef int32_t llama_token;
typedef int32_t llama_seq_id; typedef int32_t llama_seq_id;
@ -181,6 +179,14 @@ extern "C" {
LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1, LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1,
}; };
enum llama_flash_attn_type {
LLAMA_FLASH_ATTN_TYPE_AUTO = -1,
LLAMA_FLASH_ATTN_TYPE_DISABLED = 0,
LLAMA_FLASH_ATTN_TYPE_ENABLED = 1,
};
LLAMA_API const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type);
enum llama_split_mode { enum llama_split_mode {
LLAMA_SPLIT_MODE_NONE = 0, // single GPU LLAMA_SPLIT_MODE_NONE = 0, // single GPU
LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
@ -200,7 +206,7 @@ extern "C" {
llama_token_data * data; llama_token_data * data;
size_t size; size_t size;
int64_t selected; // this is the index in the data array (i.e. not the token id) int64_t selected; // this is the index in the data array (i.e. not the token id)
bool sorted; bool sorted; // note: do not assume the data is sorted - always check this flag
} llama_token_data_array; } llama_token_data_array;
typedef bool (*llama_progress_callback)(float progress, void * user_data); typedef bool (*llama_progress_callback)(float progress, void * user_data);
@ -305,6 +311,7 @@ extern "C" {
enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
enum llama_attention_type attention_type; // attention type to use for embeddings enum llama_attention_type attention_type; // attention type to use for embeddings
enum llama_flash_attn_type flash_attn_type; // when to enable Flash Attention
// ref: https://github.com/ggml-org/llama.cpp/pull/2054 // ref: https://github.com/ggml-org/llama.cpp/pull/2054
float rope_freq_base; // RoPE base frequency, 0 = from model float rope_freq_base; // RoPE base frequency, 0 = from model
@ -314,7 +321,7 @@ extern "C" {
float yarn_beta_fast; // YaRN low correction dim float yarn_beta_fast; // YaRN low correction dim
float yarn_beta_slow; // YaRN high correction dim float yarn_beta_slow; // YaRN high correction dim
uint32_t yarn_orig_ctx; // YaRN original context size uint32_t yarn_orig_ctx; // YaRN original context size
float defrag_thold; // defragment the KV cache if holes/size > thold, <= 0 disabled (default) float defrag_thold; // [DEPRECATED] defragment the KV cache if holes/size > thold, <= 0 disabled (default)
ggml_backend_sched_eval_callback cb_eval; ggml_backend_sched_eval_callback cb_eval;
void * cb_eval_user_data; void * cb_eval_user_data;
@ -331,7 +338,6 @@ extern "C" {
// Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value. // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
bool embeddings; // if true, extract embeddings (together with logits) bool embeddings; // if true, extract embeddings (together with logits)
bool offload_kqv; // offload the KQV ops (including the KV cache) to GPU bool offload_kqv; // offload the KQV ops (including the KV cache) to GPU
bool flash_attn; // use flash attention [EXPERIMENTAL]
bool no_perf; // measure performance timings bool no_perf; // measure performance timings
bool op_offload; // offload host tensor operations to device bool op_offload; // offload host tensor operations to device
bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055) bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
@ -469,8 +475,6 @@ extern "C" {
LLAMA_API llama_memory_t llama_get_memory (const struct llama_context * ctx); LLAMA_API llama_memory_t llama_get_memory (const struct llama_context * ctx);
LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type
DEPRECATED(LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx), "use llama_get_memory instead");
LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model); LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model); LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model);
@ -557,10 +561,32 @@ extern "C" {
struct llama_model * model, struct llama_model * model,
const char * path_lora); const char * path_lora);
// Functions to access the adapter's GGUF metadata scalar values
// - The functions return the length of the string on success, or -1 on failure
// - The output string is always null-terminated and cleared on failure
// - When retrieving a string, an extra byte must be allocated to account for the null terminator
// - GGUF array values are not supported by these functions
// Get metadata value as a string by key name
LLAMA_API int32_t llama_adapter_meta_val_str(const struct llama_adapter_lora * adapter, const char * key, char * buf, size_t buf_size);
// Get the number of metadata key/value pairs
LLAMA_API int32_t llama_adapter_meta_count(const struct llama_adapter_lora * adapter);
// Get metadata key name by index
LLAMA_API int32_t llama_adapter_meta_key_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);
// Get metadata value as a string by index
LLAMA_API int32_t llama_adapter_meta_val_str_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);
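The new metadata accessors above can be exercised as in the following minimal sketch (not part of the diff; the helper name and buffer sizes are assumptions):

```cpp
#include <cstdio>
#include "llama.h"

// Sketch only: enumerate the GGUF metadata of a loaded LoRA adapter via the new accessors.
// Each accessor returns the string length on success or -1 on failure, and always
// null-terminates the output buffer.
static void dump_adapter_meta(const llama_adapter_lora * adapter) {
    char key[256];
    char val[1024];
    const int32_t n = llama_adapter_meta_count(adapter);
    for (int32_t i = 0; i < n; ++i) {
        if (llama_adapter_meta_key_by_index(adapter, i, key, sizeof(key)) >= 0 &&
            llama_adapter_meta_val_str_by_index(adapter, i, val, sizeof(val)) >= 0) {
            printf("%s = %s\n", key, val);
        }
    }
}
```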
// Manually free a LoRA adapter // Manually free a LoRA adapter
// Note: loaded adapters will be free when the associated model is deleted // Note: loaded adapters will be free when the associated model is deleted
LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter); LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
// Get the invocation tokens if the current lora is an alora
LLAMA_API uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter);
LLAMA_API const llama_token * llama_adapter_get_alora_invocation_tokens (const struct llama_adapter_lora * adapter);
// The following functions operate on a llama_context, hence the naming: llama_verb_... // The following functions operate on a llama_context, hence the naming: llama_verb_...
// Add a loaded LoRA adapter to given context // Add a loaded LoRA adapter to given context
@ -667,111 +693,6 @@ extern "C" {
// Check if the memory supports shifting // Check if the memory supports shifting
LLAMA_API bool llama_memory_can_shift(llama_memory_t mem); LLAMA_API bool llama_memory_can_shift(llama_memory_t mem);
//
// KV cache for self-attention (TODO: deprecate in favor of llama_memory)
//
// Returns the number of tokens in the KV cache (slow, use only for debug)
// If a KV cell has multiple sequences assigned to it, it will be counted multiple times
DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx),
"Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
// Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx),
"Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
// Clear the KV cache - both cell info is erased and KV data is zeroed
DEPRECATED(LLAMA_API void llama_kv_self_clear(
struct llama_context * ctx),
"Use llama_memory_clear() instead");
// Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
// Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
// seq_id < 0 : match any sequence
// p0 < 0 : [0, p1]
// p1 < 0 : [p0, inf)
DEPRECATED(LLAMA_API bool llama_kv_self_seq_rm(
struct llama_context * ctx,
llama_seq_id seq_id,
llama_pos p0,
llama_pos p1),
"Use llama_memory_seq_rm() instead");
// Copy all tokens that belong to the specified sequence to another sequence
// Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
// p0 < 0 : [0, p1]
// p1 < 0 : [p0, inf)
DEPRECATED(LLAMA_API void llama_kv_self_seq_cp(
struct llama_context * ctx,
llama_seq_id seq_id_src,
llama_seq_id seq_id_dst,
llama_pos p0,
llama_pos p1),
"Use llama_memory_seq_cp() instead");
// Removes all tokens that do not belong to the specified sequence
DEPRECATED(LLAMA_API void llama_kv_self_seq_keep(
struct llama_context * ctx,
llama_seq_id seq_id),
"Use llama_memory_seq_keep() instead");
// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
// If the KV cache is RoPEd, the KV data is updated accordingly:
// - lazily on next llama_decode()
// p0 < 0 : [0, p1]
// p1 < 0 : [p0, inf)
DEPRECATED(LLAMA_API void llama_kv_self_seq_add(
struct llama_context * ctx,
llama_seq_id seq_id,
llama_pos p0,
llama_pos p1,
llama_pos delta),
"Use llama_memory_seq_add() instead");
// Integer division of the positions by factor of `d > 1`
// If the KV cache is RoPEd, the KV data is updated accordingly:
// - lazily on next llama_decode()
// p0 < 0 : [0, p1]
// p1 < 0 : [p0, inf)
DEPRECATED(LLAMA_API void llama_kv_self_seq_div(
struct llama_context * ctx,
llama_seq_id seq_id,
llama_pos p0,
llama_pos p1,
int d),
"Use llama_memory_seq_div() instead");
// Returns the smallest position present in the KV cache for the specified sequence
// This is typically non-zero only for SWA caches
// Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
// Return -1 if the sequence is empty
DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_min(
struct llama_context * ctx,
llama_seq_id seq_id),
"Use llama_memory_seq_pos_min() instead");
// Returns the largest position present in the KV cache for the specified sequence
// Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
// Return -1 if the sequence is empty
DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_max(
struct llama_context * ctx,
llama_seq_id seq_id),
"Use llama_memory_seq_pos_max() instead");
// Defragment the KV cache
// This will be applied:
// - lazily on next llama_decode()
DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx),
"simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'");
// Check if the context supports KV cache shifting
DEPRECATED(LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx),
"use llama_memory_can_shift() instead");
// Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx),
"simply remove this call, updates are applied lazily on the next llama_decode()");
// //
// State / sessions // State / sessions
// //
@ -870,6 +791,29 @@ extern "C" {
size_t n_token_capacity, size_t n_token_capacity,
size_t * n_token_count_out); size_t * n_token_count_out);
#define LLAMA_STATE_SEQ_FLAGS_SWA_ONLY 1
typedef uint32_t llama_state_seq_flags;
LLAMA_API size_t llama_state_seq_get_size_ext(
struct llama_context * ctx,
llama_seq_id seq_id,
llama_state_seq_flags flags);
LLAMA_API size_t llama_state_seq_get_data_ext(
struct llama_context * ctx,
uint8_t * dst,
size_t size,
llama_seq_id seq_id,
llama_state_seq_flags flags);
LLAMA_API size_t llama_state_seq_set_data_ext(
struct llama_context * ctx,
const uint8_t * src,
size_t size,
llama_seq_id dest_seq_id,
llama_state_seq_flags flags);
// //
// Decoding // Decoding
// //
@ -1216,11 +1160,6 @@ extern "C" {
LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void); LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed); LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed);
/// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
/// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void),
"will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");
/// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
/// Setting k <= 0 makes this a noop /// Setting k <= 0 makes this a noop
LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k); LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
@ -1390,24 +1329,25 @@ extern "C" {
// //
// Performance utils // Performance utils
// //
// NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements. // NOTE: Used by llama.cpp examples/tools, avoid using in third-party apps. Instead, do your own performance measurements.
// //
struct llama_perf_context_data { struct llama_perf_context_data {
double t_start_ms; // ms == milliseconds
double t_load_ms; double t_start_ms; // absolute start time
double t_p_eval_ms; double t_load_ms; // time needed for loading the model
double t_eval_ms; double t_p_eval_ms; // time needed for processing the prompt
double t_eval_ms; // time needed for generating tokens
int32_t n_p_eval; int32_t n_p_eval; // number of prompt tokens
int32_t n_eval; int32_t n_eval; // number of generated tokens
int32_t n_reused; // number of times a ggml compute graph had been reused int32_t n_reused; // number of times a ggml compute graph had been reused
}; };
struct llama_perf_sampler_data { struct llama_perf_sampler_data {
double t_sample_ms; double t_sample_ms; // time needed for sampling in ms
int32_t n_sample; int32_t n_sample; // number of sampled tokens
}; };
LLAMA_API struct llama_perf_context_data llama_perf_context (const struct llama_context * ctx); LLAMA_API struct llama_perf_context_data llama_perf_context (const struct llama_context * ctx);
@ -1419,6 +1359,9 @@ extern "C" {
LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain); LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain); LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain);
// print a breakdown of per-device memory use via LLAMA_LOG:
LLAMA_API void llama_memory_breakdown_print(const struct llama_context * ctx);
// //
// training // training
// //
@ -1437,6 +1380,8 @@ extern "C" {
ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
void * get_opt_pars_ud; // userdata for calculating optimizer parameters void * get_opt_pars_ud; // userdata for calculating optimizer parameters
enum ggml_opt_optimizer_type optimizer_type;
}; };
LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params); LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params);


@ -6,6 +6,7 @@
#include <map> #include <map>
#include <cassert> #include <cassert>
#include <sstream>
#include <stdexcept> #include <stdexcept>
// vec // vec
@ -163,13 +164,38 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
// check metadata // check metadata
{ {
const gguf_context * gguf_ctx = ctx_gguf.get();
LLAMA_LOG_INFO("%s: Dumping metadata keys/values.\n", __func__);
// get metadata as string
for (int i = 0; i < gguf_get_n_kv(gguf_ctx); i++) {
gguf_type type = gguf_get_kv_type(gguf_ctx, i);
const std::string type_name =
type == GGUF_TYPE_ARRAY
? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(gguf_ctx, i)), gguf_get_arr_n(gguf_ctx, i))
: gguf_type_name(type);
const char * name = gguf_get_key(gguf_ctx, i);
const std::string value = gguf_kv_to_str(gguf_ctx, i);
if (type != GGUF_TYPE_ARRAY) {
adapter.gguf_kv.emplace(name, value);
}
const size_t MAX_VALUE_LEN = 40;
std::string print_value = value.size() > MAX_VALUE_LEN ? format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str()) : value;
replace_all(print_value, "\n", "\\n");
LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), print_value.c_str());
}
auto get_kv_str = [&](const std::string & key) -> std::string { auto get_kv_str = [&](const std::string & key) -> std::string {
int id = gguf_find_key(ctx_gguf.get(), key.c_str()); int id = gguf_find_key(gguf_ctx, key.c_str());
return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf.get(), id)); return id < 0 ? "" : std::string(gguf_get_val_str(gguf_ctx, id));
}; };
auto get_kv_f32 = [&](const std::string & key) -> float { auto get_kv_f32 = [&](const std::string & key) -> float {
int id = gguf_find_key(ctx_gguf.get(), key.c_str()); int id = gguf_find_key(gguf_ctx, key.c_str());
return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf.get(), id); return id < 0 ? 0.0f : gguf_get_val_f32(gguf_ctx, id);
}; };
LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN); LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
@ -190,6 +216,26 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
} }
adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA)); adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA));
// parse alora invocation sequence vector
const auto & key = llm_kv(LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS);
const int kid = gguf_find_key(ctx_gguf.get(), key.c_str());
if (kid >= 0) {
if (gguf_get_kv_type(ctx_gguf.get(), kid) != GGUF_TYPE_ARRAY) {
throw std::runtime_error("invalid gguf type for " + key);
}
const auto arr_type = gguf_get_arr_type(ctx_gguf.get(), kid);
if (arr_type != GGUF_TYPE_UINT32) {
throw std::runtime_error("invalid gguf element type for " + key);
}
const size_t seq_len = gguf_get_arr_n(ctx_gguf.get(), kid);
const void * data = gguf_get_arr_data(ctx_gguf.get(), kid);
adapter.alora_invocation_tokens.resize(seq_len);
std::copy(
(const llama_token *)data,
(const llama_token *)data + seq_len,
adapter.alora_invocation_tokens.begin());
}
} }
int n_tensors = gguf_get_n_tensors(ctx_gguf.get()); int n_tensors = gguf_get_n_tensors(ctx_gguf.get());
@ -383,6 +429,57 @@ llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * p
return nullptr; return nullptr;
} }
int32_t llama_adapter_meta_val_str(const llama_adapter_lora * adapter, const char * key, char * buf, size_t buf_size) {
const auto & it = adapter->gguf_kv.find(key);
if (it == adapter->gguf_kv.end()) {
if (buf_size > 0) {
buf[0] = '\0';
}
return -1;
}
return snprintf(buf, buf_size, "%s", it->second.c_str());
}
int32_t llama_adapter_meta_count(const llama_adapter_lora * adapter) {
return (int)adapter->gguf_kv.size();
}
int32_t llama_adapter_meta_key_by_index(const llama_adapter_lora * adapter, int i, char * buf, size_t buf_size) {
if (i < 0 || i >= (int)adapter->gguf_kv.size()) {
if (buf_size > 0) {
buf[0] = '\0';
}
return -1;
}
auto it = adapter->gguf_kv.begin();
std::advance(it, i);
return snprintf(buf, buf_size, "%s", it->first.c_str());
}
int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size) {
if (i < 0 || i >= (int)adapter->gguf_kv.size()) {
if (buf_size > 0) {
buf[0] = '\0';
}
return -1;
}
auto it = adapter->gguf_kv.begin();
std::advance(it, i);
return snprintf(buf, buf_size, "%s", it->second.c_str());
}
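// A hedged usage sketch for the metadata accessors defined above: enumerate all
// GGUF key/value pairs stored on a loaded LoRA adapter. The adapter handle is
// assumed to come from llama_adapter_lora_init(); buffer sizes are arbitrary.
#include <cstdio>

static void dump_adapter_meta(const llama_adapter_lora * adapter) {
    const int32_t n = llama_adapter_meta_count(adapter);
    for (int32_t i = 0; i < n; i++) {
        char key[128];
        char val[256];
        if (llama_adapter_meta_key_by_index(adapter, i, key, sizeof(key)) < 0) {
            continue;
        }
        if (llama_adapter_meta_val_str_by_index(adapter, i, val, sizeof(val)) < 0) {
            continue;
        }
        printf("%s = %s\n", key, val);
    }
}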
void llama_adapter_lora_free(llama_adapter_lora * adapter) {
delete adapter;
}
uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter) {
if (!adapter) {
return 0;
}
return adapter->alora_invocation_tokens.size();
}
const llama_token * llama_adapter_get_alora_invocation_tokens(const llama_adapter_lora * adapter) {
GGML_ASSERT(adapter);
return adapter->alora_invocation_tokens.data();
}
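// A hedged sketch built on the two aLoRA getters above: check whether a tokenized
// prompt ends with the adapter's invocation sequence. The prompt vector is assumed
// to already be tokenized with the model's vocabulary.
#include <cstring>
#include <vector>

static bool ends_with_alora_invocation(const llama_adapter_lora * adapter,
                                       const std::vector<llama_token> & prompt) {
    const uint64_t n_inv = llama_adapter_get_alora_n_invocation_tokens(adapter);
    if (n_inv == 0 || prompt.size() < n_inv) {
        return false;
    }
    const llama_token * inv = llama_adapter_get_alora_invocation_tokens(adapter);
    return std::memcmp(prompt.data() + (prompt.size() - n_inv), inv, n_inv * sizeof(llama_token)) == 0;
}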

View File

@ -67,6 +67,12 @@ struct llama_adapter_lora {
float alpha;
// gguf metadata
std::unordered_map<std::string, std::string> gguf_kv;
// activated lora (aLoRA)
std::vector<llama_token> alora_invocation_tokens;
llama_adapter_lora() = default;
~llama_adapter_lora() = default;

View File

@ -22,6 +22,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" },
{ LLM_ARCH_NEO_BERT, "neo-bert" },
{ LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
{ LLM_ARCH_JINA_BERT_V3, "jina-bert-v3" },
{ LLM_ARCH_BLOOM, "bloom" },
{ LLM_ARCH_STABLELM, "stablelm" },
{ LLM_ARCH_QWEN, "qwen" },
@ -44,6 +45,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GEMMA2, "gemma2" },
{ LLM_ARCH_GEMMA3, "gemma3" },
{ LLM_ARCH_GEMMA3N, "gemma3n" },
{ LLM_ARCH_GEMMA_EMBEDDING, "gemma-embedding" },
{ LLM_ARCH_STARCODER2, "starcoder2" },
{ LLM_ARCH_MAMBA, "mamba" },
{ LLM_ARCH_MAMBA2, "mamba2" },
@ -68,6 +70,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_T5ENCODER, "t5encoder" },
{ LLM_ARCH_JAIS, "jais" },
{ LLM_ARCH_NEMOTRON, "nemotron" },
{ LLM_ARCH_NEMOTRON_H, "nemotron_h" },
{ LLM_ARCH_EXAONE, "exaone" },
{ LLM_ARCH_EXAONE4, "exaone4" },
{ LLM_ARCH_RWKV6, "rwkv6" },
@ -94,6 +97,9 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_DREAM, "dream" },
{ LLM_ARCH_SMALLTHINKER, "smallthinker" },
{ LLM_ARCH_LLADA, "llada" },
{ LLM_ARCH_LLADA_MOE, "llada-moe" },
{ LLM_ARCH_SEED_OSS, "seed_oss" },
{ LLM_ARCH_GROVEMOE, "grovemoe" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
@ -121,6 +127,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
{ LLM_KV_EXPERT_FEED_FORWARD_LENGTH, "%s.expert_feed_forward_length" },
{ LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, "%s.expert_shared_feed_forward_length" },
{ LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH, "%s.expert_chunk_feed_forward_length" },
{ LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
{ LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
{ LLM_KV_EXPERT_COUNT, "%s.expert_count" },
@ -129,12 +136,16 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
{ LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" },
{ LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" },
{ LLM_KV_EXPERT_GROUP_SCALE, "%s.expert_group_scale" },
{ LLM_KV_EXPERTS_PER_GROUP, "%s.experts_per_group" },
{ LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" },
{ LLM_KV_NEXTN_PREDICT_LAYERS, "%s.nextn_predict_layers" },
{ LLM_KV_POOLING_TYPE, "%s.pooling_type" },
{ LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
{ LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
{ LLM_KV_DECODER_BLOCK_COUNT, "%s.decoder_block_count" },
{ LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" },
{ LLM_KV_ROUTER_LOGIT_SOFTCAPPING, "%s.router_logit_softcapping" },
{ LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" },
{ LLM_KV_SWIN_NORM, "%s.swin_norm" },
{ LLM_KV_RESCALE_EVERY_N_LAYERS, "%s.rescale_every_n_layers" },
@ -165,20 +176,26 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
{ LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
{ LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
{ LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
{ LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
{ LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
{ LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
{ LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
{ LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" },
{ LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, "%s.rope.scaling.yarn_ext_factor" },
{ LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, "%s.rope.scaling.yarn_attn_factor" },
{ LLM_KV_ROPE_SCALING_YARN_BETA_FAST, "%s.rope.scaling.yarn_beta_fast" },
{ LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, "%s.rope.scaling.yarn_beta_slow" },
{ LLM_KV_SPLIT_NO, "split.no" },
{ LLM_KV_SPLIT_COUNT, "split.count" },
@ -235,8 +252,11 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_TOKENIZER_FIM_REP_ID, "tokenizer.ggml.fim_rep_token_id" },
{ LLM_KV_TOKENIZER_FIM_SEP_ID, "tokenizer.ggml.fim_sep_token_id" },
{ LLM_KV_ADAPTER_TYPE, "adapter.type" },
{ LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
{ LLM_KV_ADAPTER_LORA_TASK_NAME, "adapter.lora.task_name" },
{ LLM_KV_ADAPTER_LORA_PROMPT_PREFIX, "adapter.lora.prompt_prefix" },
{ LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS, "adapter.alora.invocation_tokens" },
// deprecated
{ LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
@ -392,12 +412,16 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
{ LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
{ LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
{ LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
{ LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
},
@ -576,6 +600,20 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_CLS, "cls" },
},
},
{
LLM_ARCH_JINA_BERT_V3,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
{ LLM_TENSOR_TOKEN_TYPES, "token_types" },
{ LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
},
},
{
LLM_ARCH_BLOOM,
{
@ -689,6 +727,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_CLS_OUT, "cls.output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
@ -1021,6 +1060,27 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_LAUREL_POST_NORM, "blk.%d.laurel_post_norm" },
},
},
{
LLM_ARCH_GEMMA_EMBEDDING,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
},
},
{
LLM_ARCH_STARCODER2,
{
@ -1534,6 +1594,31 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_NEMOTRON_H,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
// mamba(2) ssm layers
{ LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
{ LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
{ LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
{ LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
{ LLM_TENSOR_SSM_D, "blk.%d.ssm_d" },
{ LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" },
{ LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
// attention layers
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
// dense FFN
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_EXAONE,
{
@ -2030,6 +2115,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_SHORTCONV_OUTPROJ, "blk.%d.shortconv.out_proj" },
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
}
},
{
@ -2087,6 +2173,66 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_LLADA_MOE,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
},
},
{
LLM_ARCH_SEED_OSS,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_GROVEMOE,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
{ LLM_TENSOR_FFN_GATE_CHEXPS, "blk.%d.ffn_gate_chexps" },
{ LLM_TENSOR_FFN_DOWN_CHEXPS, "blk.%d.ffn_down_chexps" },
{ LLM_TENSOR_FFN_UP_CHEXPS, "blk.%d.ffn_up_chexps" },
},
},
{
LLM_ARCH_UNKNOWN,
{
@ -2219,6 +2365,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_FFN_DOWN_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
{LLM_TENSOR_FFN_GATE_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
{LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
{LLM_TENSOR_FFN_DOWN_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
{LLM_TENSOR_FFN_GATE_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
{LLM_TENSOR_FFN_UP_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
{LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
// altup / laurel (gemma 3n)
{LLM_TENSOR_PER_LAYER_TOKEN_EMBD, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
@ -2340,6 +2489,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
case LLM_ARCH_PLAMO2:
case LLM_ARCH_GRANITE_HYBRID:
case LLM_ARCH_LFM2:
case LLM_ARCH_NEMOTRON_H:
return true;
default:
return false;
@ -2350,6 +2500,7 @@ bool llm_arch_is_diffusion(const llm_arch & arch) {
switch (arch) {
case LLM_ARCH_DREAM:
case LLM_ARCH_LLADA:
case LLM_ARCH_LLADA_MOE:
return true;
default:
return false;
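// A hedged sketch of how the per-layer patterns in LLM_TENSOR_NAMES above expand
// into concrete tensor names, using the new GroveMoE chunk-expert gate entry as
// an example; the layer count is arbitrary.
#include <cstdio>

int main() {
    const char * pattern = "blk.%d.ffn_gate_chexps"; // from the GROVEMOE table above
    for (int il = 0; il < 3; il++) {
        char name[64];
        snprintf(name, sizeof(name), pattern, il);
        printf("%s\n", name); // blk.0.ffn_gate_chexps, blk.1.ffn_gate_chexps, blk.2.ffn_gate_chexps
    }
}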

View File

@ -26,6 +26,7 @@ enum llm_arch {
LLM_ARCH_NOMIC_BERT_MOE,
LLM_ARCH_NEO_BERT,
LLM_ARCH_JINA_BERT_V2,
LLM_ARCH_JINA_BERT_V3,
LLM_ARCH_BLOOM,
LLM_ARCH_STABLELM,
LLM_ARCH_QWEN,
@ -48,6 +49,7 @@ enum llm_arch {
LLM_ARCH_GEMMA2,
LLM_ARCH_GEMMA3,
LLM_ARCH_GEMMA3N,
LLM_ARCH_GEMMA_EMBEDDING,
LLM_ARCH_STARCODER2,
LLM_ARCH_MAMBA,
LLM_ARCH_MAMBA2,
@ -72,6 +74,7 @@ enum llm_arch {
LLM_ARCH_T5ENCODER,
LLM_ARCH_JAIS,
LLM_ARCH_NEMOTRON,
LLM_ARCH_NEMOTRON_H,
LLM_ARCH_EXAONE,
LLM_ARCH_EXAONE4,
LLM_ARCH_RWKV6,
@ -98,6 +101,9 @@ enum llm_arch {
LLM_ARCH_DREAM,
LLM_ARCH_SMALLTHINKER,
LLM_ARCH_LLADA,
LLM_ARCH_LLADA_MOE,
LLM_ARCH_SEED_OSS,
LLM_ARCH_GROVEMOE,
LLM_ARCH_UNKNOWN,
};
@ -125,6 +131,7 @@ enum llm_kv {
LLM_KV_FEED_FORWARD_LENGTH,
LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH,
LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH,
LLM_KV_USE_PARALLEL_RESIDUAL,
LLM_KV_TENSOR_DATA_LAYOUT,
LLM_KV_EXPERT_COUNT,
@ -133,12 +140,16 @@ enum llm_kv {
LLM_KV_EXPERT_WEIGHTS_SCALE,
LLM_KV_EXPERT_WEIGHTS_NORM,
LLM_KV_EXPERT_GATING_FUNC,
LLM_KV_EXPERT_GROUP_SCALE,
LLM_KV_EXPERTS_PER_GROUP,
LLM_KV_MOE_EVERY_N_LAYERS,
LLM_KV_NEXTN_PREDICT_LAYERS,
LLM_KV_POOLING_TYPE,
LLM_KV_LOGIT_SCALE,
LLM_KV_DECODER_START_TOKEN_ID,
LLM_KV_DECODER_BLOCK_COUNT,
LLM_KV_ATTN_LOGIT_SOFTCAPPING,
LLM_KV_ROUTER_LOGIT_SOFTCAPPING,
LLM_KV_FINAL_LOGIT_SOFTCAPPING,
LLM_KV_SWIN_NORM,
LLM_KV_RESCALE_EVERY_N_LAYERS,
@ -169,6 +180,8 @@ enum llm_kv {
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_OUTPUT_SCALE,
LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
@ -183,6 +196,10 @@ enum llm_kv {
LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
LLM_KV_ROPE_SCALING_FINETUNED,
LLM_KV_ROPE_SCALING_YARN_LOG_MUL,
LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR,
LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR,
LLM_KV_ROPE_SCALING_YARN_BETA_FAST,
LLM_KV_ROPE_SCALING_YARN_BETA_SLOW,
LLM_KV_SPLIT_NO,
LLM_KV_SPLIT_COUNT,
@ -231,6 +248,9 @@ enum llm_kv {
LLM_KV_ADAPTER_TYPE,
LLM_KV_ADAPTER_LORA_ALPHA,
LLM_KV_ADAPTER_LORA_TASK_NAME,
LLM_KV_ADAPTER_LORA_PROMPT_PREFIX,
LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS,
LLM_KV_POSNET_EMBEDDING_LENGTH,
LLM_KV_POSNET_BLOCK_COUNT,
@ -287,6 +307,9 @@ enum llm_tensor {
LLM_TENSOR_FFN_DOWN_SHEXP,
LLM_TENSOR_FFN_GATE_SHEXP,
LLM_TENSOR_FFN_UP_SHEXP,
LLM_TENSOR_FFN_DOWN_CHEXPS,
LLM_TENSOR_FFN_GATE_CHEXPS,
LLM_TENSOR_FFN_UP_CHEXPS,
LLM_TENSOR_FFN_EXP_PROBS_B,
LLM_TENSOR_ATTN_Q_NORM,
LLM_TENSOR_ATTN_K_NORM,

View File

@ -477,7 +477,7 @@ llama_ubatch llama_batch_allocr::split_simple(uint32_t n_ubatch) {
llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch, bool sequential) {
if (sequential && has_cpl) {
LLAMA_LOG_ERROR("%s: sequential split is not supported when there are coupled sequences in the input batch (you may need to use the -kvu flag)\n", __func__);
return {};
}

View File

@ -16,10 +16,10 @@
static std::string trim(const std::string & str) {
size_t start = 0;
size_t end = str.size();
while (start < end && isspace(static_cast<unsigned char>(str[start]))) {
start += 1;
}
while (end > start && isspace(static_cast<unsigned char>(str[end - 1]))) {
end -= 1;
}
return str.substr(start, end - start);
@ -69,6 +69,8 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
{ "gpt-oss", LLM_CHAT_TEMPLATE_OPENAI_MOE }, { "gpt-oss", LLM_CHAT_TEMPLATE_OPENAI_MOE },
{ "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE }, { "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE },
{ "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 }, { "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 },
{ "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS },
{ "grok-2", LLM_CHAT_TEMPLATE_GROK_2 },
}; };
llm_chat_template llm_chat_template_from_str(const std::string & name) { llm_chat_template llm_chat_template_from_str(const std::string & name) {
@ -201,6 +203,10 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE;
} else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
return LLM_CHAT_TEMPLATE_KIMI_K2;
} else if (tmpl_contains("<seed:bos>")) {
return LLM_CHAT_TEMPLATE_SEED_OSS;
} else if (tmpl_contains("'Assistant: ' + message['content'] + '<|separator|>")) {
return LLM_CHAT_TEMPLATE_GROK_2;
}
return LLM_CHAT_TEMPLATE_UNKNOWN;
}
@ -752,6 +758,28 @@ int32_t llm_chat_apply_template(
if (add_ass) {
ss << "<|im_assistant|>assistant<|im_middle|>";
}
} else if (tmpl == LLM_CHAT_TEMPLATE_SEED_OSS) {
for (auto message: chat) {
std::string role(message->role);
ss << "<seed:bos>" << role << "\n" << (role == "assistant" ? trim(message->content) : message->content) << "<seed:eos>";
}
if (add_ass) {
ss << "<seed:bos>assistant\n";
}
} else if (tmpl == LLM_CHAT_TEMPLATE_GROK_2) {
for (auto message : chat) {
std::string role(message->role);
if (role == "system") {
ss << "System: " << trim(message->content) << "<|separator|>\n\n";
} else if (role == "user") {
ss << "Human: " << trim(message->content) << "<|separator|>\n\n";
} else if (role == "assistant") {
ss << "Assistant: " << message->content << "<|separator|>\n\n";
}
}
if (add_ass) {
ss << "Assistant:";
}
} else {
// template not supported
return -1;
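// A hedged illustration of what the two new branches above render, via the public
// llama_chat_apply_template() helper; the chat contents are hypothetical.
#include "llama.h"
#include <cstdio>

int main() {
    const llama_chat_message chat[] = {
        { "system", "Be brief." },
        { "user",   "Hi"        },
    };
    char buf[512];
    const int32_t n = llama_chat_apply_template("seed_oss", chat, 2, /*add_ass=*/true, buf, sizeof(buf));
    if (n > 0 && n < (int32_t) sizeof(buf)) {
        // Expected per the Seed-OSS branch:
        //   <seed:bos>system\nBe brief.<seed:eos><seed:bos>user\nHi<seed:eos><seed:bos>assistant\n
        // The "grok-2" template would instead yield:
        //   System: Be brief.<|separator|>\n\nHuman: Hi<|separator|>\n\nAssistant:
        printf("%.*s\n", n, buf);
    }
}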

View File

@ -49,6 +49,8 @@ enum llm_chat_template {
LLM_CHAT_TEMPLATE_OPENAI_MOE,
LLM_CHAT_TEMPLATE_HUNYUAN_DENSE,
LLM_CHAT_TEMPLATE_KIMI_K2,
LLM_CHAT_TEMPLATE_SEED_OSS,
LLM_CHAT_TEMPLATE_GROK_2,
LLM_CHAT_TEMPLATE_UNKNOWN,
};

View File

@ -35,14 +35,12 @@ llama_context::llama_context(
cparams.n_threads = params.n_threads;
cparams.n_threads_batch = params.n_threads_batch;
cparams.yarn_ext_factor = params.yarn_ext_factor >= 0.0f ? params.yarn_ext_factor : hparams.yarn_ext_factor;
cparams.yarn_attn_factor = params.yarn_attn_factor >= 0.0f ? params.yarn_attn_factor : hparams.yarn_attn_factor;
cparams.yarn_beta_fast = params.yarn_beta_fast >= 0.0f ? params.yarn_beta_fast : hparams.yarn_beta_fast;
cparams.yarn_beta_slow = params.yarn_beta_slow >= 0.0f ? params.yarn_beta_slow : hparams.yarn_beta_slow;
cparams.defrag_thold = params.defrag_thold;
cparams.embeddings = params.embeddings;
cparams.offload_kqv = params.offload_kqv;
cparams.flash_attn = params.flash_attn;
cparams.no_perf = params.no_perf;
cparams.pooling_type = params.pooling_type;
cparams.warmup = false;
@ -87,13 +85,15 @@ llama_context::llama_context(
cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL;
}
cparams.flash_attn = params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED;
// with causal attention, the batch size is limited by the context size
cparams.n_batch = cparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
// the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
// this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
// ref: https://github.com/ggerganov/llama.cpp/pull/5021
// TODO: this padding is not needed for the cache-less context so we should probably move it to llama_memory
if (cparams.n_batch < GGML_KQ_MASK_PAD) {
LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
cparams.n_batch = GGML_KQ_MASK_PAD;
@ -103,16 +103,6 @@ llama_context::llama_context(
cparams.op_offload = params.op_offload;
cparams.kv_unified = params.kv_unified;
{
const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : supports_set_rows;
if (!supports_set_rows && !cparams.kv_unified) {
LLAMA_LOG_WARN("%s: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n", __func__);
cparams.kv_unified = true;
}
}
{
const char * LLAMA_GRAPH_REUSE_DISABLE = getenv("LLAMA_GRAPH_REUSE_DISABLE");
graph_reuse_disable = LLAMA_GRAPH_REUSE_DISABLE ? (atoi(LLAMA_GRAPH_REUSE_DISABLE) != 0) : graph_reuse_disable;
@ -130,7 +120,7 @@ llama_context::llama_context(
LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch); LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch); LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
LLAMA_LOG_INFO("%s: causal_attn = %d\n", __func__, cparams.causal_attn); LLAMA_LOG_INFO("%s: causal_attn = %d\n", __func__, cparams.causal_attn);
LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn); LLAMA_LOG_INFO("%s: flash_attn = %s\n", __func__, llama_flash_attn_type_name(params.flash_attn_type));
LLAMA_LOG_INFO("%s: kv_unified = %s\n", __func__, cparams.kv_unified ? "true" : "false"); LLAMA_LOG_INFO("%s: kv_unified = %s\n", __func__, cparams.kv_unified ? "true" : "false");
LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale); LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
@ -145,11 +135,6 @@ llama_context::llama_context(
__func__, n_ctx_per_seq, hparams.n_ctx_train);
}
if (!params.swa_full && cparams.n_seq_max > 1 && hparams.is_swa_any()) {
LLAMA_LOG_WARN("%s: requested n_seq_max (%u) > 1, but swa_full is not enabled -- performance may be degraded: %s\n",
__func__, cparams.n_seq_max, "https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573");
}
if (!hparams.vocab_only) {
// GPU backends
for (auto * dev : model.devices) {
@ -196,7 +181,7 @@ llama_context::llama_context(
// graph outputs buffer
{
// resized during inference when a batch uses more outputs
if (output_reserve(params.n_seq_max) < params.n_seq_max) {
throw std::runtime_error("failed to reserve initial output buffer");
}
@ -285,28 +270,75 @@ llama_context::llama_context(
}
}
if (!hparams.vocab_only) {
llama_memory_context_ptr mctx;
if (memory) {
LLAMA_LOG_DEBUG("%s: reserving full memory module\n", __func__);
mctx = memory->init_full();
if (!mctx) {
throw std::runtime_error("failed to initialize memory module");
}
}
cross.v_embd.clear();
const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
// avoid reserving graphs with zero outputs - assume one output per sequence
n_outputs = n_seqs;
LLAMA_LOG_DEBUG("%s: worst-case: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);
// resolve automatic Flash Attention use
if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO) {
auto * gf = graph_reserve(1, n_seqs, n_outputs, mctx.get(), true);
if (!gf) {
throw std::runtime_error("failed to split graph for Flash Attention check");
}
const size_t prefix_len = strlen(LLAMA_TENSOR_NAME_FATTN) + 1;
bool fa_device_mismatch = false;
for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
ggml_tensor * n = ggml_graph_node(gf, i);
if (n->op != GGML_OP_FLASH_ATTN_EXT) {
continue;
}
ggml_backend_dev_t device_fa = ggml_backend_get_device(
ggml_backend_sched_get_tensor_backend(sched.get(), n));
// TODO: instead of the tensor names, use a map to keep track of which (FA) tensors belong to which layer
GGML_ASSERT(strncmp(n->name, LLAMA_TENSOR_NAME_FATTN "-", prefix_len) == 0);
const int il = std::stoi(n->name + prefix_len);
ggml_backend_dev_t device_kv = model.dev_layer(il);
if (device_fa != device_kv) {
LLAMA_LOG_WARN("%s: layer %d is assigned to device %s but the Flash Attention tensor "
"is assigned to device %s (usually due to missing support)\n",
__func__, il, ggml_backend_dev_name(device_kv), ggml_backend_dev_name(device_fa));
// FIXME: fa_device_mismatch logic is wrong for --no-kv-offload, but this is broken anyways
fa_device_mismatch = true;
break;
}
}
if (fa_device_mismatch) {
cparams.flash_attn = false;
LLAMA_LOG_WARN("%s: Flash Attention was auto, set to disabled\n", __func__);
if (ggml_is_quantized(params.type_v)) {
throw std::runtime_error("quantized V cache was requested, but this requires Flash Attention");
}
} else {
cparams.flash_attn = true;
LLAMA_LOG_INFO("%s: Flash Attention was auto, set to enabled\n", __func__);
}
}
// reserve worst-case graph
int n_splits_pp = -1;
int n_nodes_pp = -1;
int n_splits_tg = -1;
int n_nodes_tg = -1;
// simulate full KV cache
const auto mctx = memory->init_full();
if (!mctx) {
throw std::runtime_error("failed to initialize KV cache");
}
cross.v_embd.clear();
// reserve pp (prompt processing) graph first so that buffers are only allocated once
{
auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
@ -444,26 +476,12 @@ llama_memory_t llama_context::get_memory() const {
return memory.get();
}
bool llama_context::memory_update(bool optimize) {
void llama_context::kv_self_defrag_sched() {
if (!memory) {
return;
}
memory_force_optimize = true;
}
// deprecated
bool llama_context::kv_self_update(bool optimize) {
if (!memory) {
return false;
}
{
// TODO: remove in the future
optimize |= memory_force_optimize;
memory_force_optimize = false;
const auto mctx = memory->init_update(this, optimize);
switch (mctx->get_status()) {
case LLAMA_MEMORY_STATUS_SUCCESS:
@ -908,12 +926,6 @@ int llama_context::encode(const llama_batch & batch_inp) {
}
}
if (!supports_set_rows) {
// Reset state for the next token before backend sync, to allow the CPU activities in the reset to
// overlap with device computation.
ggml_backend_sched_reset(sched.get());
}
// TODO: hacky solution
if (model.arch == LLM_ARCH_T5 && t_embd) {
//cross.t_embd = t_embd;
@ -996,8 +1008,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
bool did_optimize = false;
// handle any pending shifts/copies
memory_update(false);
llama_memory_context_ptr mctx;
@ -1022,7 +1034,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
if (!did_optimize) {
did_optimize = true;
if (memory_update(true)) {
LLAMA_LOG_DEBUG("%s: retrying batch size %d after cache optimization\n", __func__, balloc->get_n_tokens());
continue;
@ -1075,7 +1087,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status);
if (!res) {
// the last ubatch failed or was aborted -> remove all positions of that ubatch from the memory module
llama_pos pos_min[LLAMA_MAX_SEQ];
for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
pos_min[s] = std::numeric_limits<llama_pos>::max();
@ -1092,7 +1104,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
continue;
}
LLAMA_LOG_WARN("%s: removing memory module entries for seq_id = %d, pos = [%d, +inf)\n", __func__, s, pos_min[s]);
memory->seq_rm(s, pos_min[s], -1);
}
@ -1243,12 +1255,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
// wait for the computation to finish (automatically done when obtaining the model output)
//synchronize();
if (!supports_set_rows) {
// Reset state for the next token before backend sync, to allow the CPU activities in the reset to
// overlap with device computation.
ggml_backend_sched_reset(sched.get());
}
return 0;
}
@ -1362,8 +1368,9 @@ llm_graph_result * llama_context::get_gf_res_reserve() const {
return static_cast<llm_graph_result *>(gf_res_reserve.get());
}
ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only) {
LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs);
GGML_ASSERT(n_outputs >= 1);
if (n_tokens % n_seqs != 0) {
n_tokens = ((n_tokens + (n_seqs - 1)) / n_seqs) * n_seqs; // round to next multiple of n_seqs
@ -1397,7 +1404,9 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
this->n_outputs = save_n_outputs;
// initialize scheduler with the specified graph
if (split_only) {
ggml_backend_sched_split_graph(sched.get(), gf);
} else if (!ggml_backend_sched_reserve(sched.get(), gf)) {
LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
return nullptr;
}
@ -1437,7 +1446,9 @@ ggml_status llama_context::graph_compute(
if (backend_cpu != nullptr) {
auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_cpu));
auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool");
if (set_threadpool_fn) {
set_threadpool_fn(backend_cpu, tp);
}
}
// set the number of threads for all the backends
@ -1656,30 +1667,30 @@ size_t llama_context::state_set_data(const uint8_t * src, size_t size) {
}
}
size_t llama_context::state_seq_get_size(llama_seq_id seq_id, llama_state_seq_flags flags) {
llama_io_write_dummy io;
try {
return state_seq_write_data(io, seq_id, flags);
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what());
return 0;
}
}
size_t llama_context::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size, llama_state_seq_flags flags) {
llama_io_write_buffer io(dst, size);
try {
return state_seq_write_data(io, seq_id, flags);
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what());
return 0;
}
}
size_t llama_context::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size, llama_state_seq_flags flags) {
llama_io_read_buffer io(src, size);
try {
return state_seq_read_data(io, seq_id, flags);
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what());
return 0;
@ -1777,7 +1788,7 @@ size_t llama_context::state_seq_load_file(llama_seq_id seq_id, const char * file
{ {
const size_t state_size = file.size() - file.tell();
llama_io_read_file io(&file);
const size_t nread = state_seq_read_data(io, seq_id, 0);
if (!nread) {
LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__);
return 0;
@ -1801,7 +1812,7 @@ size_t llama_context::state_seq_save_file(llama_seq_id seq_id, const char * file
// save the context state using stream saving
llama_io_write_file io(&file);
state_seq_write_data(io, seq_id, 0);
const size_t res = file.tell();
GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + io.n_bytes());
@ -1876,7 +1887,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
}
if (memory != nullptr) {
LLAMA_LOG_DEBUG("%s: - writing memory module\n", __func__);
memory->state_write(io);
}
@ -1962,7 +1973,7 @@ size_t llama_context::state_read_data(llama_io_read_i & io) {
}
if (memory) {
LLAMA_LOG_DEBUG("%s: - reading memory module\n", __func__);
memory->state_read(io);
}
@ -1970,21 +1981,21 @@ size_t llama_context::state_read_data(llama_io_read_i & io) {
return io.n_bytes();
}
size_t llama_context::state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
GGML_UNUSED(seq_id);
if (memory) {
memory->state_write(io, seq_id, flags);
}
return io.n_bytes();
}
size_t llama_context::state_seq_read_data(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
GGML_UNUSED(seq_id);
if (memory) {
memory->state_read(io, seq_id, flags);
}
return io.n_bytes();
@ -2015,6 +2026,21 @@ void llama_context::perf_reset() {
n_reused = 0;
}
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> llama_context::memory_breakdown() const {
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> ret;
for (const auto & buft_size : model.memory_breakdown()) {
ret[buft_size.first].model += buft_size.second;
}
for (const auto & buft_size : memory->memory_breakdown()) {
ret[buft_size.first].context += buft_size.second;
}
for (const auto & backend_ptr : backends) {
ggml_backend_t backend = backend_ptr.get();
ret[ggml_backend_sched_get_buffer_type(sched.get(), backend)].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend);
}
return ret;
}
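// The per-buffer-type totals assembled above are what llama_memory_breakdown_print()
// (defined further down in this file) renders as a table; a hedged usage sketch,
// assuming ctx is an already-initialized llama_context *:
static void log_memory(const llama_context * ctx) {
    llama_memory_breakdown_print(ctx); // prints the table built from memory_breakdown()
}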
//
// training
//
@ -2047,7 +2073,7 @@ void llama_context::opt_init(struct llama_model * model, struct llama_opt_params
opt_params.opt_period = n_batch / n_ubatch;
opt_params.get_opt_pars = lopt_params.get_opt_pars;
opt_params.get_opt_pars_ud = lopt_params.get_opt_pars_ud;
opt_params.optimizer = lopt_params.optimizer_type;
opt_ctx = ggml_opt_init(opt_params);
llama_opt_param_filter param_filter = lopt_params.param_filter;
@ -2247,12 +2273,13 @@ llama_context_params llama_context_default_params() {
/*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
/*.pooling_type =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
/*.attention_type =*/ LLAMA_ATTENTION_TYPE_UNSPECIFIED,
/*.flash_attn_type =*/ LLAMA_FLASH_ATTN_TYPE_AUTO,
/*.rope_freq_base =*/ 0.0f,
/*.rope_freq_scale =*/ 0.0f,
/*.yarn_ext_factor =*/ -1.0f,
/*.yarn_attn_factor =*/ -1.0f,
/*.yarn_beta_fast =*/ -1.0f,
/*.yarn_beta_slow =*/ -1.0f,
/*.yarn_orig_ctx =*/ 0,
/*.defrag_thold =*/ -1.0f,
/*.cb_eval =*/ nullptr,
@ -2263,7 +2290,6 @@ llama_context_params llama_context_default_params() {
/*.abort_callback_data =*/ nullptr,
/*.embeddings =*/ false,
/*.offload_kqv =*/ true,
/*.flash_attn =*/ false,
/*.no_perf =*/ true,
/*.op_offload =*/ true,
/*.swa_full =*/ true,
@ -2291,12 +2317,30 @@ llama_context * llama_init_from_model(
return nullptr;
}
if (params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED && model->arch == LLM_ARCH_GROK) {
LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
}
if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO && ggml_is_quantized(params.type_k)) {
const uint32_t blck_size = ggml_blck_size(params.type_k);
if (model->hparams.n_embd_head_k % blck_size != 0) {
LLAMA_LOG_ERROR("%s: K cache type %s with block size %u does not divide n_embd_head_k=%u\n",
__func__, ggml_type_name(params.type_k), blck_size, model->hparams.n_embd_head_k);
return nullptr;
}
}
if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO && ggml_is_quantized(params.type_v)) {
const uint32_t blck_size = ggml_blck_size(params.type_v);
if (model->hparams.n_embd_head_v % blck_size != 0) {
LLAMA_LOG_ERROR("%s: V cache type %s with block size %u does not divide n_embd_head_k=%u\n",
__func__, ggml_type_name(params.type_v), blck_size, model->hparams.n_embd_head_v);
return nullptr;
}
}
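// A hedged numeric illustration of the divisibility checks above, assuming a typical
// head size of 128: ggml_blck_size(GGML_TYPE_Q8_0) is 32 and ggml_blck_size(GGML_TYPE_Q4_K)
// is 256, so a Q8_0 K/V cache passes while a Q4_K cache is rejected.
#include "ggml.h"
#include <cstdio>

int main() {
    const int64_t n_embd_head = 128; // assumed typical value
    const ggml_type types[] = { GGML_TYPE_Q8_0, GGML_TYPE_Q4_K };
    for (ggml_type t : types) {
        const int64_t blck = ggml_blck_size(t);
        printf("%s: block size %lld -> %s\n", ggml_type_name(t), (long long) blck,
               n_embd_head % blck == 0 ? "accepted" : "rejected");
    }
}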
if (ggml_is_quantized(params.type_v) && params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_DISABLED) {
LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__); LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
return nullptr; return nullptr;
} }
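// A hedged sketch of configuring the flash_attn_type field that replaces the old
// boolean flash_attn flag; only the values shown in this diff are assumed to exist.
#include "llama.h"

static llama_context_params make_ctx_params() {
    llama_context_params p = llama_context_default_params();
    // default: LLAMA_FLASH_ATTN_TYPE_AUTO - resolved during context creation (see above)
    // force off (note: a quantized V cache then causes llama_init_from_model to fail):
    // p.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
    p.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
    return p;
}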
@ -2342,16 +2386,6 @@ const llama_model * llama_get_model(const llama_context * ctx) {
return &ctx->get_model();
}
// deprecated
llama_kv_cache * llama_get_kv_self(llama_context * ctx) {
return dynamic_cast<llama_kv_cache *>(ctx->get_memory());
}
// deprecated
void llama_kv_self_update(llama_context * ctx) {
ctx->kv_self_update(false);
}
enum llama_pooling_type llama_pooling_type(const llama_context * ctx) {
return ctx->pooling_type();
}
@ -2569,168 +2603,6 @@ bool llama_memory_can_shift(llama_memory_t mem) {
return mem->get_can_shift();
}
//
// kv cache
//
// deprecated
int32_t llama_kv_self_n_tokens(const llama_context * ctx) {
const auto * kv = llama_get_memory(ctx);
if (!kv) {
return 0;
}
int32_t res = 0;
for (uint32_t s = 0; s < ctx->get_cparams().n_seq_max; s++) {
const llama_pos p0 = kv->seq_pos_min(s);
const llama_pos p1 = kv->seq_pos_max(s);
if (p0 >= 0) {
res += (p1 - p0) + 1;
}
}
return res;
}
// deprecated
// note: this is the same as above - will be removed anyway, so it's ok
int32_t llama_kv_self_used_cells(const llama_context * ctx) {
const auto * kv = llama_get_memory(ctx);
if (!kv) {
return 0;
}
int32_t res = 0;
for (uint32_t s = 0; s < ctx->get_cparams().n_seq_max; s++) {
const llama_pos p0 = kv->seq_pos_min(s);
const llama_pos p1 = kv->seq_pos_max(s);
if (p0 >= 0) {
res += (p1 - p0) + 1;
}
}
return res;
}
// deprecated
void llama_kv_self_clear(llama_context * ctx) {
auto * kv = llama_get_memory(ctx);
if (!kv) {
return;
}
llama_memory_clear(kv, true);
}
// deprecated
bool llama_kv_self_seq_rm(
llama_context * ctx,
llama_seq_id seq_id,
llama_pos p0,
llama_pos p1) {
auto * kv = llama_get_memory(ctx);
if (!kv) {
return true;
}
return llama_memory_seq_rm(kv, seq_id, p0, p1);
}
// deprecated
void llama_kv_self_seq_cp(
llama_context * ctx,
llama_seq_id seq_id_src,
llama_seq_id seq_id_dst,
llama_pos p0,
llama_pos p1) {
auto * kv = llama_get_memory(ctx);
if (!kv) {
return;
}
llama_memory_seq_cp(kv, seq_id_src, seq_id_dst, p0, p1);
}
// deprecated
void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) {
auto * kv = llama_get_memory(ctx);
if (!kv) {
return;
}
llama_memory_seq_keep(kv, seq_id);
}
// deprecated
void llama_kv_self_seq_add(
llama_context * ctx,
llama_seq_id seq_id,
llama_pos p0,
llama_pos p1,
llama_pos delta) {
auto * kv = llama_get_memory(ctx);
if (!kv) {
return;
}
llama_memory_seq_add(kv, seq_id, p0, p1, delta);
}
// deprecated
void llama_kv_self_seq_div(
llama_context * ctx,
llama_seq_id seq_id,
llama_pos p0,
llama_pos p1,
int d) {
auto * kv = llama_get_memory(ctx);
if (!kv) {
return;
}
llama_memory_seq_div(kv, seq_id, p0, p1, d);
}
// deprecated
llama_pos llama_kv_self_seq_pos_min(llama_context * ctx, llama_seq_id seq_id) {
auto * kv = llama_get_memory(ctx);
if (!kv) {
return -1;
}
return llama_memory_seq_pos_min(kv, seq_id);
}
// deprecated
llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) {
auto * kv = llama_get_memory(ctx);
if (!kv) {
return -1;
}
return llama_memory_seq_pos_max(kv, seq_id);
}
// deprecated
void llama_kv_self_defrag(llama_context * ctx) {
// force defrag
ctx->kv_self_defrag_sched();
}
// deprecated
bool llama_kv_self_can_shift(const llama_context * ctx) {
auto * kv = llama_get_memory(ctx);
if (!kv) {
return false;
}
return llama_memory_can_shift(kv);
}
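// A hedged migration sketch: the deprecated llama_kv_self_* wrappers removed above
// forwarded to the llama_memory_* API, so call sites can switch directly to it.
#include "llama.h"

static void drop_tail(llama_context * ctx, llama_seq_id seq_id, llama_pos from_pos) {
    llama_memory_t mem = llama_get_memory(ctx);
    // was: llama_kv_self_seq_rm(ctx, seq_id, from_pos, -1);
    llama_memory_seq_rm(mem, seq_id, from_pos, -1);
    // was: llama_kv_self_clear(ctx);
    // now: llama_memory_clear(mem, /*data=*/true);
}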
// llama state API
// deprecated
@ -2800,19 +2672,31 @@ bool llama_state_save_file(llama_context * ctx, const char * path_session, const
}
size_t llama_state_seq_get_size(llama_context * ctx, llama_seq_id seq_id) {
return llama_state_seq_get_size_ext(ctx, seq_id, 0);
}
size_t llama_state_seq_get_data(llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id) {
return llama_state_seq_get_data_ext(ctx, dst, size, seq_id, 0);
}
size_t llama_state_seq_set_data(llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id seq_id) {
return llama_state_seq_set_data_ext(ctx, src, size, seq_id, 0);
}
size_t llama_state_seq_get_size_ext(llama_context * ctx, llama_seq_id seq_id, llama_state_seq_flags flags) {
return ctx->state_seq_get_size(seq_id, flags);
}
size_t llama_state_seq_get_data_ext(llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id, llama_state_seq_flags flags) {
ctx->synchronize();
return ctx->state_seq_get_data(seq_id, dst, size, flags);
}
size_t llama_state_seq_set_data_ext(llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id seq_id, llama_state_seq_flags flags) {
ctx->synchronize();
return ctx->state_seq_set_data(seq_id, src, size, flags);
}
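// A hedged sketch of the new *_ext sequence-state entry points defined above;
// passing 0 for the flags argument reproduces the behavior of the non-ext wrappers.
#include "llama.h"
#include <vector>

static std::vector<uint8_t> snapshot_seq(llama_context * ctx, llama_seq_id seq_id) {
    std::vector<uint8_t> buf(llama_state_seq_get_size_ext(ctx, seq_id, 0));
    const size_t written = llama_state_seq_get_data_ext(ctx, buf.data(), buf.size(), seq_id, 0);
    buf.resize(written);
    return buf;
}

static bool restore_seq(llama_context * ctx, llama_seq_id seq_id, const std::vector<uint8_t> & buf) {
    return llama_state_seq_set_data_ext(ctx, buf.data(), buf.size(), seq_id, 0) != 0;
}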
size_t llama_state_seq_save_file(llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
@ -2895,6 +2779,142 @@ void llama_perf_context_reset(llama_context * ctx) {
ctx->perf_reset(); ctx->perf_reset();
} }
void llama_memory_breakdown_print(const struct llama_context * ctx) {
const std::vector<ggml_backend_dev_t> & devices = ctx->get_model().devices;
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();
std::vector<std::array<std::string, 9>> table_data;
table_data.reserve(devices.size());
const std::string template_header = "%s: | %s | %s %s %s %s %s %s %s |\n";
const std::string template_gpu = "%s: | %s | %s = %s + (%s = %s + %s + %s) + %s |\n";
const std::string template_other = "%s: | %s | %s %s %s = %s + %s + %s %s |\n";
table_data.push_back({template_header, "memory breakdown [MiB]", "total", "free", "self", "model", "context", "compute", "unaccounted"});
constexpr size_t MiB = 1024 * 1024;
const std::vector<std::string> desc_prefixes_strip = {"NVIDIA ", "GeForce ", "Tesla ", "AMD ", "Radeon ", "Instinct "};
// track seen buffer types to avoid double counting:
std::set<ggml_backend_buffer_type_t> seen_buffer_types;
// accumulative memory breakdown for each device and for host:
std::vector<llama_memory_breakdown_data> mb_dev(devices.size());
llama_memory_breakdown_data mb_host;
for (const auto & buft_mb : memory_breakdown) {
ggml_backend_buffer_type_t buft = buft_mb.first;
const llama_memory_breakdown_data & mb = buft_mb.second;
if (ggml_backend_buft_is_host(buft)) {
mb_host.model += mb.model;
mb_host.context += mb.context;
mb_host.compute += mb.compute;
seen_buffer_types.insert(buft);
continue;
}
ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
if (dev) {
int i_dev = -1;
for (size_t i = 0; i < devices.size(); i++) {
if (devices[i] == dev) {
i_dev = i;
break;
}
}
if (i_dev != -1) {
mb_dev[i_dev].model += mb.model;
mb_dev[i_dev].context += mb.context;
mb_dev[i_dev].compute += mb.compute;
seen_buffer_types.insert(buft);
continue;
}
}
}
// print memory breakdown for each device:
for (size_t i = 0; i < devices.size(); i++) {
ggml_backend_dev_t dev = devices[i];
llama_memory_breakdown_data mb = mb_dev[i];
const std::string name = ggml_backend_dev_name(dev);
std::string desc = ggml_backend_dev_description(dev);
for (const std::string & prefix : desc_prefixes_strip) {
if (desc.length() >= prefix.length() && desc.substr(0, prefix.length()) == prefix) {
desc = desc.substr(prefix.length());
}
}
size_t free, total;
ggml_backend_dev_memory(dev, &free, &total);
const size_t self = mb.model + mb.context + mb.compute;
const size_t unaccounted = total - self - free;
table_data.push_back({
template_gpu,
" - " + name + " (" + desc + ")",
std::to_string(total / MiB),
std::to_string(free / MiB),
std::to_string(self / MiB),
std::to_string(mb.model / MiB),
std::to_string(mb.context / MiB),
std::to_string(mb.compute / MiB),
std::to_string(unaccounted / MiB)});
}
// print memory breakdown for host:
{
const size_t self = mb_host.model + mb_host.context + mb_host.compute;
table_data.push_back({
template_other,
" - Host",
"", // total
"", // free
std::to_string(self / MiB),
std::to_string(mb_host.model / MiB),
std::to_string(mb_host.context / MiB),
std::to_string(mb_host.compute / MiB),
""}); // unaccounted
}
// print memory breakdown for all remaining buffer types:
for (const auto & buft_mb : memory_breakdown) {
ggml_backend_buffer_type_t buft = buft_mb.first;
const llama_memory_breakdown_data & mb = buft_mb.second;
if (seen_buffer_types.count(buft) == 1) {
continue;
}
const std::string name = ggml_backend_buft_name(buft);
const size_t self = mb.model + mb.context + mb.compute;
table_data.push_back({
template_other,
" - " + name,
"", // total
"", // free
std::to_string(self / MiB),
std::to_string(mb.model / MiB),
std::to_string(mb.context / MiB),
std::to_string(mb.compute / MiB),
""}); // unaccounted
seen_buffer_types.insert(buft);
}
for (size_t j = 1; j < table_data[0].size(); j++) {
size_t max_len = 0;
for (const auto & td : table_data) {
max_len = std::max(max_len, td[j].length());
}
for (auto & td : table_data) {
td[j].insert(j == 1 ? td[j].length() : 0, max_len - td[j].length(), ' ');
}
}
for (const auto & td : table_data) {
LLAMA_LOG_INFO(td[0].c_str(),
__func__, td[1].c_str(), td[2].c_str(), td[3].c_str(), td[4].c_str(), td[5].c_str(),
td[6].c_str(), td[7].c_str(), td[8].c_str());
}
}
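The table above reports, per device, self = model + context + compute and unaccounted = total - self - free, all rounded down to MiB. An illustrative calculation with made-up numbers (not taken from any real device; assumes a 64-bit size_t):

#include <cstddef>
#include <cstdio>

int main() {
    constexpr std::size_t MiB = 1024 * 1024;
    // hypothetical values, as ggml_backend_dev_memory() would report them
    const std::size_t total   = 24576 * MiB;
    const std::size_t free_b  = 1024  * MiB;
    // hypothetical allocations tracked in llama_memory_breakdown_data
    const std::size_t model   = 13000 * MiB;
    const std::size_t context = 2048  * MiB;
    const std::size_t compute = 512   * MiB;

    const std::size_t self        = model + context + compute; // 15560 MiB
    const std::size_t unaccounted = total - self - free_b;     // 7992 MiB: other processes, driver, fragmentation
    std::printf("self = %zu MiB, unaccounted = %zu MiB\n", self / MiB, unaccounted / MiB);
    return 0;
}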
// //
// training // training
// //

View File

@ -17,9 +17,17 @@ class llama_batch_allocr;
class llama_io_read_i; class llama_io_read_i;
class llama_io_write_i; class llama_io_write_i;
// "memory" as in abstract memory for the context
struct llama_memory_i; struct llama_memory_i;
struct llama_memory_context_i; struct llama_memory_context_i;
// "memory" as in physical memory for a buffer type, in bytes
struct llama_memory_breakdown_data {
size_t model = 0; // memory allocated for the model
size_t context = 0; // memory allocated for the context
size_t compute = 0; // memory allocated for temporary compute buffers
};
struct llama_context { struct llama_context {
// init scheduler and compute buffers, reserve worst-case graphs // init scheduler and compute buffers, reserve worst-case graphs
llama_context( llama_context(
@ -46,10 +54,8 @@ struct llama_context {
llama_memory_t get_memory() const; llama_memory_t get_memory() const;
// return true of the KV cache was updated // return true if the memory was updated
// TODO: remove bool memory_update(bool optimize);
bool kv_self_update(bool optimize);
void kv_self_defrag_sched();
enum llama_pooling_type pooling_type() const; enum llama_pooling_type pooling_type() const;
@ -111,9 +117,9 @@ struct llama_context {
size_t state_get_data( uint8_t * dst, size_t size); size_t state_get_data( uint8_t * dst, size_t size);
size_t state_set_data(const uint8_t * src, size_t size); size_t state_set_data(const uint8_t * src, size_t size);
size_t state_seq_get_size(llama_seq_id seq_id); size_t state_seq_get_size(llama_seq_id seq_id, llama_state_seq_flags flags);
size_t state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size); size_t state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size, llama_state_seq_flags flags);
size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size); size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size, llama_state_seq_flags flags);
bool state_load_file( bool state_load_file(
const char * filepath, const char * filepath,
@ -146,12 +152,15 @@ struct llama_context {
llama_perf_context_data perf_get_data() const; llama_perf_context_data perf_get_data() const;
void perf_reset(); void perf_reset();
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown() const;
// //
// training // training
// //
void opt_init(struct llama_model * model, struct llama_opt_params lopt_params); void opt_init(struct llama_model * model, struct llama_opt_params lopt_params);
// TODO: more flexible combinations of logical/physical batch size and context size
void opt_epoch( void opt_epoch(
ggml_opt_dataset_t dataset, ggml_opt_dataset_t dataset,
ggml_opt_result_t result_train, ggml_opt_result_t result_train,
@ -197,7 +206,7 @@ public:
ggml_status graph_compute(ggml_cgraph * gf, bool batched); ggml_status graph_compute(ggml_cgraph * gf, bool batched);
// reserve a graph with a dummy ubatch of the specified size // reserve a graph with a dummy ubatch of the specified size
ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx); ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false);
private: private:
llm_graph_params graph_params( llm_graph_params graph_params(
@ -212,8 +221,8 @@ private:
size_t state_write_data(llama_io_write_i & io); size_t state_write_data(llama_io_write_i & io);
size_t state_read_data (llama_io_read_i & io); size_t state_read_data (llama_io_read_i & io);
size_t state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id); size_t state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags);
size_t state_seq_read_data (llama_io_read_i & io, llama_seq_id seq_id); size_t state_seq_read_data (llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags);
// //
// members // members
@ -229,9 +238,6 @@ private:
std::unique_ptr<llama_memory_i> memory; std::unique_ptr<llama_memory_i> memory;
// TODO: temporary, until the llama_kv_self_defrag() API is removed
bool memory_force_optimize = false;
// decode output (2-dimensional array: [n_outputs][n_vocab]) // decode output (2-dimensional array: [n_outputs][n_vocab])
size_t logits_size = 0; // capacity (of floats) for logits size_t logits_size = 0; // capacity (of floats) for logits
float * logits = nullptr; float * logits = nullptr;
@ -287,10 +293,6 @@ private:
bool has_evaluated_once = false; bool has_evaluated_once = false;
// env: LLAMA_SET_ROWS (temporary)
// ref: https://github.com/ggml-org/llama.cpp/pull/14285
bool supports_set_rows = true;
// env: LLAMA_GRAPH_REUSE_DISABLE // env: LLAMA_GRAPH_REUSE_DISABLE
bool graph_reuse_disable = false; bool graph_reuse_disable = false;

View File

@ -4,7 +4,7 @@
#include <cstdint> #include <cstdint>
#define LLAMA_MAX_SEQ 64 #define LLAMA_MAX_SEQ 256
struct llama_cparams { struct llama_cparams {
uint32_t n_ctx; // context size used during inference uint32_t n_ctx; // context size used during inference
@ -24,7 +24,6 @@ struct llama_cparams {
float yarn_attn_factor; float yarn_attn_factor;
float yarn_beta_fast; float yarn_beta_fast;
float yarn_beta_slow; float yarn_beta_slow;
float defrag_thold;
bool embeddings; bool embeddings;
bool causal_attn; bool causal_attn;

View File

@ -4,8 +4,8 @@
#include "llama-batch.h" #include "llama-batch.h"
#include "llama-cparams.h" #include "llama-cparams.h"
#include "llama-kv-cache-unified.h" #include "llama-kv-cache.h"
#include "llama-kv-cache-unified-iswa.h" #include "llama-kv-cache-iswa.h"
#include "llama-memory-hybrid.h" #include "llama-memory-hybrid.h"
#include "llama-memory-recurrent.h" #include "llama-memory-recurrent.h"
@ -204,7 +204,10 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
std::vector<int> target_pos(n_seqs_unq, -1); std::vector<int> target_pos(n_seqs_unq, -1);
std::vector<int> target_row(n_seqs_unq, -1); std::vector<int> target_row(n_seqs_unq, -1);
bool last = cparams.pooling_type == LLAMA_POOLING_TYPE_LAST; const bool last = (
cparams.pooling_type == LLAMA_POOLING_TYPE_LAST ||
(cparams.pooling_type == LLAMA_POOLING_TYPE_RANK && arch == LLM_ARCH_QWEN3) // qwen3 reranking & embedding models use last token
);
for (int i = 0; i < n_tokens; ++i) { for (int i = 0; i < n_tokens; ++i) {
const llama_pos pos = ubatch->pos[i]; const llama_pos pos = ubatch->pos[i];
@ -258,6 +261,36 @@ void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
} }
} }
static void print_mask(float * data, int64_t n_tokens, int64_t n_kv, int64_t n_swa, llama_swa_type swa_type) {
LLAMA_LOG_DEBUG("%s: === Attention mask ===\n", __func__);
const char * swa_type_str = (swa_type == LLAMA_SWA_TYPE_NONE) ? "LLAMA_SWA_TYPE_NONE" :
(swa_type == LLAMA_SWA_TYPE_STANDARD) ? "LLAMA_SWA_TYPE_STANDARD" :
(swa_type == LLAMA_SWA_TYPE_CHUNKED) ? "LLAMA_SWA_TYPE_CHUNKED" :
(swa_type == LLAMA_SWA_TYPE_SYMMETRIC) ? "LLAMA_SWA_TYPE_SYMMETRIC" : "unknown";
LLAMA_LOG_DEBUG("%s: n_swa : %d, n_kv: %d, swq_type: %s\n", __func__, (int)n_swa, (int)n_kv, swa_type_str);
LLAMA_LOG_DEBUG("%s: '0' = can attend, '∞' = masked\n", __func__);
LLAMA_LOG_DEBUG("%s: Rows = query tokens, Columns = key/value tokens\n\n", __func__);
LLAMA_LOG_DEBUG(" ");
for (int j = 0; j < std::min((int64_t)20, n_kv); ++j) {
LLAMA_LOG_DEBUG("%2d", j);
}
LLAMA_LOG_DEBUG("\n");
for (int i = 0; i < std::min((int64_t)20, n_tokens); ++i) {
LLAMA_LOG_DEBUG(" %2d ", i);
for (int j = 0; j < std::min((int64_t)20, n_kv); ++j) {
float val = data[i * n_kv + j];
if (val == -INFINITY) {
LLAMA_LOG_DEBUG("");
} else {
LLAMA_LOG_DEBUG(" 0");
}
}
LLAMA_LOG_DEBUG("\n");
}
}
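print_mask() above renders at most a 20×20 preview of the KQ mask, using '0' for attendable entries and '∞' for masked ones. A tiny standalone illustration of the same convention on a 4×4 causal mask (not library code):

#include <cmath>
#include <cstdio>

int main() {
    const int n = 4;
    float mask[4][4];
    for (int i = 0; i < n; ++i) {          // rows    = query tokens
        for (int j = 0; j < n; ++j) {      // columns = key/value tokens
            mask[i][j] = (j > i) ? -INFINITY : 0.0f; // causal: no attending to future tokens
        }
    }
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < n; ++j) {
            std::printf("%s", mask[i][j] == -INFINITY ? " ∞" : " 0");
        }
        std::printf("\n");
    }
    return 0;
}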
void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) { void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
const int64_t n_kv = ubatch->n_tokens; const int64_t n_kv = ubatch->n_tokens;
const int64_t n_tokens = ubatch->n_tokens; const int64_t n_tokens = ubatch->n_tokens;
@ -267,6 +300,9 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
float * data = (float *) kq_mask->data; float * data = (float *) kq_mask->data;
// [TAG_NO_CACHE_ISWA]
GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "TODO: implement");
for (int h = 0; h < 1; ++h) { for (int h = 0; h < 1; ++h) {
for (int i1 = 0; i1 < n_tokens; ++i1) { for (int i1 = 0; i1 < n_tokens; ++i1) {
const llama_seq_id s1 = ubatch->seq_id[i1][0]; const llama_seq_id s1 = ubatch->seq_id[i1][0];
@ -277,32 +313,44 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
for (int s = 0; s < ubatch->n_seq_id[i0]; ++s) { for (int s = 0; s < ubatch->n_seq_id[i0]; ++s) {
const llama_seq_id s0 = ubatch->seq_id[i0][0]; const llama_seq_id s0 = ubatch->seq_id[i0][0];
if (s0 != s1) {
continue; // skip different sequences
}
if (cparams.causal_attn && ubatch->pos[i0] > ubatch->pos[i1]) {
continue; // skip future tokens for causal attention
}
// TODO: this does not take into account that some layers are SWA and others are not (i.e. iSWA) [TAG_NO_CACHE_ISWA]
//if (hparams.is_masked_swa(ubatch->pos[i0], ubatch->pos[i1])) {
// continue; // skip masked tokens for SWA
//}
// TODO: reimplement this like in llama_kv_cache_unified // TODO: reimplement this like in llama_kv_cache_unified
if (s0 == s1 && (!cparams.causal_attn || ubatch->pos[i0] <= ubatch->pos[i1])) { if (hparams.use_alibi) {
if (hparams.use_alibi) { f = -std::abs(ubatch->pos[i0] - ubatch->pos[i1]);
f = -std::abs(ubatch->pos[i0] - ubatch->pos[i1]); } else {
} else { f = 0.0f;
f = 0.0f;
}
break;
} }
} }
data[h*(n_kv*n_tokens) + i1*n_kv + i0] = f; data[h*(n_kv*n_tokens) + i1*n_kv + i0] = f;
} }
} }
} }
if (debug) {
print_mask(data, n_tokens, n_kv, hparams.n_swa, hparams.swa_type);
}
} }
void llm_graph_input_attn_kv_unified::set_input(const llama_ubatch * ubatch) { void llm_graph_input_attn_kv::set_input(const llama_ubatch * ubatch) {
mctx->set_input_k_idxs(self_k_idxs, ubatch); mctx->set_input_k_idxs(self_k_idxs, ubatch);
mctx->set_input_v_idxs(self_v_idxs, ubatch); mctx->set_input_v_idxs(self_v_idxs, ubatch);
mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
} }
bool llm_graph_input_attn_kv_unified::can_reuse(const llm_graph_params & params) { bool llm_graph_input_attn_kv::can_reuse(const llm_graph_params & params) {
const auto * mctx = static_cast<const llama_kv_cache_unified_context *>(params.mctx); const auto * mctx = static_cast<const llama_kv_cache_context *>(params.mctx);
this->mctx = mctx; this->mctx = mctx;
@ -314,12 +362,10 @@ bool llm_graph_input_attn_kv_unified::can_reuse(const llm_graph_params & params)
res &= self_kq_mask->ne[0] == mctx->get_n_kv(); res &= self_kq_mask->ne[0] == mctx->get_n_kv();
res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD); res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
res &= mctx->get_supports_set_rows(); // TODO: tmp
return res; return res;
} }
void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch) { void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) {
mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch); mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch);
mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch); mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch);
@ -331,8 +377,8 @@ void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch
mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn); mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
} }
bool llm_graph_input_attn_kv_unified_iswa::can_reuse(const llm_graph_params & params) { bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
const auto * mctx = static_cast<const llama_kv_cache_unified_iswa_context *>(params.mctx); const auto * mctx = static_cast<const llama_kv_cache_iswa_context *>(params.mctx);
this->mctx = mctx; this->mctx = mctx;
@ -350,8 +396,6 @@ bool llm_graph_input_attn_kv_unified_iswa::can_reuse(const llm_graph_params & pa
res &= self_kq_mask_swa->ne[0] == mctx->get_swa()->get_n_kv(); res &= self_kq_mask_swa->ne[0] == mctx->get_swa()->get_n_kv();
res &= self_kq_mask_swa->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD); res &= self_kq_mask_swa->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
res &= mctx->get_base()->get_supports_set_rows(); // TODO: tmp
return res; return res;
} }
@ -879,15 +923,29 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
selection_probs = logits; selection_probs = logits;
} }
if (arch == LLM_ARCH_GROVEMOE) {
selection_probs = ggml_sigmoid(ctx0, logits); // [n_expert, n_tokens]
cb(selection_probs, "ffn_moe_probs_biased", il);
}
// select experts // select experts
ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens] ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
cb(selected_experts->src[0], "ffn_moe_argsort", il); cb(selected_experts->src[0], "ffn_moe_argsort", il);
cb(selected_experts, "ffn_moe_topk", il); cb(selected_experts, "ffn_moe_topk", il);
ggml_tensor * weights = ggml_get_rows(ctx0, if (arch == LLM_ARCH_GROVEMOE && n_expert != hparams.n_expert) {
ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens] // TODO: Use scalar div instead when/if implemented
ggml_tensor * f_sel = ggml_cast(ctx0, selected_experts, GGML_TYPE_F32);
selected_experts = ggml_cast(ctx0, ggml_scale(ctx0, f_sel, 1.0f / float(hparams.n_group_experts)), GGML_TYPE_I32);
probs = ggml_reshape_3d(ctx0, probs, 1, hparams.n_expert, n_tokens);
} else {
probs = ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens);
}
ggml_tensor * weights = ggml_get_rows(ctx0, probs, selected_experts); // [1, n_expert_used, n_tokens]
cb(weights, "ffn_moe_weights", il); cb(weights, "ffn_moe_weights", il);
if (gating_op == LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT) { if (gating_op == LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT) {
weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);
weights = ggml_soft_max(ctx0, weights); // [n_expert_used, n_tokens] weights = ggml_soft_max(ctx0, weights); // [n_expert_used, n_tokens]
@ -911,6 +969,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
cb(weights, "ffn_moe_weights_scaled", il); cb(weights, "ffn_moe_weights_scaled", il);
} }
// call early so that topk-moe can be used
ggml_build_forward_expand(gf, weights);
cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens); cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);
if (weight_before_ffn) { if (weight_before_ffn) {
@ -1136,7 +1197,7 @@ ggml_tensor * llm_graph_context::build_inp_mean() const {
} }
ggml_tensor * llm_graph_context::build_inp_cls() const { ggml_tensor * llm_graph_context::build_inp_cls() const {
auto inp = std::make_unique<llm_graph_input_cls>(cparams); auto inp = std::make_unique<llm_graph_input_cls>(cparams, arch);
auto & cur = inp->cls; auto & cur = inp->cls;
@ -1186,7 +1247,7 @@ ggml_tensor * llm_graph_context::build_inp_pos_bucket_enc() const {
} }
ggml_tensor * llm_graph_context::build_inp_pos_bucket_dec() const { ggml_tensor * llm_graph_context::build_inp_pos_bucket_dec() const {
const auto * mctx_cur = static_cast<const llama_kv_cache_unified_context *>(mctx); const auto * mctx_cur = static_cast<const llama_kv_cache_context *>(mctx);
auto inp = std::make_unique<llm_graph_input_pos_bucket_kv>(hparams, mctx_cur); auto inp = std::make_unique<llm_graph_input_pos_bucket_kv>(hparams, mctx_cur);
@ -1223,15 +1284,16 @@ ggml_tensor * llm_graph_context::build_attn_mha(
ggml_tensor * v, ggml_tensor * v,
ggml_tensor * kq_b, ggml_tensor * kq_b,
ggml_tensor * kq_mask, ggml_tensor * kq_mask,
ggml_tensor * v_mla,
ggml_tensor * sinks, ggml_tensor * sinks,
float kq_scale) const { ggml_tensor * v_mla,
float kq_scale,
int il) const {
const bool v_trans = v->nb[1] > v->nb[2]; const bool v_trans = v->nb[1] > v->nb[2];
// split the batch into streams if needed // split the batch into streams if needed
const auto n_stream = k->ne[3]; const auto n_stream = k->ne[3];
q = ggml_reshape_4d(ctx0, q, q->ne[0], q->ne[1], q->ne[2]/n_stream, n_stream); q = ggml_view_4d(ctx0, q, q->ne[0], q->ne[1], q->ne[2]/n_stream, n_stream, q->nb[1], q->nb[2], q->nb[3]/n_stream, 0);
q = ggml_permute(ctx0, q, 0, 2, 1, 3); q = ggml_permute(ctx0, q, 0, 2, 1, 3);
k = ggml_permute(ctx0, k, 0, 2, 1, 3); k = ggml_permute(ctx0, k, 0, 2, 1, 3);
@ -1260,6 +1322,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias, cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias,
hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f); hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
cb(cur, LLAMA_TENSOR_NAME_FATTN, il);
ggml_flash_attn_ext_add_sinks(cur, sinks); ggml_flash_attn_ext_add_sinks(cur, sinks);
ggml_flash_attn_ext_set_prec (cur, GGML_PREC_F32); ggml_flash_attn_ext_set_prec (cur, GGML_PREC_F32);
@ -1275,6 +1338,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
// The permutations are noops and only change how the tensor data is interpreted. // The permutations are noops and only change how the tensor data is interpreted.
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
cur = ggml_mul_mat(ctx0, v_mla, cur); cur = ggml_mul_mat(ctx0, v_mla, cur);
cb(cur, "fattn_mla", il);
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
cur = ggml_cont(ctx0, cur); // Needed because ggml_reshape_2d expects contiguous inputs. cur = ggml_cont(ctx0, cur); // Needed because ggml_reshape_2d expects contiguous inputs.
#endif #endif
@ -1283,6 +1347,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*cur->ne[1], cur->ne[2]*cur->ne[3]); cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*cur->ne[1], cur->ne[2]*cur->ne[3]);
} else { } else {
ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
cb(kq, "kq", il);
// note: this op tends to require high floating point range // note: this op tends to require high floating point range
// while for some models F16 is enough, for others it is not, so we default to F32 here // while for some models F16 is enough, for others it is not, so we default to F32 here
@ -1290,38 +1355,48 @@ ggml_tensor * llm_graph_context::build_attn_mha(
if (arch == LLM_ARCH_GROK) { if (arch == LLM_ARCH_GROK) {
// need to do the following: // need to do the following:
// multiply by attn_output_multiplyer of 0.08838834764831845 // multiply by attn_output_multiplier
// and then : // and then :
// kq = 30 * tanh(kq / 30) // kq = 30 * tanh(kq / 30)
// before the softmax below // before the softmax below
kq = ggml_tanh(ctx0, ggml_scale(ctx0, kq, 0.08838834764831845f/30.0f)); kq = ggml_tanh(ctx0, ggml_scale(ctx0, kq, hparams.f_attn_out_scale / hparams.f_attn_logit_softcapping));
kq = ggml_scale(ctx0, kq, 30); cb(kq, "kq_tanh", il);
kq = ggml_scale(ctx0, kq, hparams.f_attn_logit_softcapping);
cb(kq, "kq_scaled", il);
} }
if (hparams.attn_soft_cap) { if (hparams.attn_soft_cap) {
kq = ggml_scale(ctx0, kq, 1.0f / hparams.f_attn_logit_softcapping); kq = ggml_scale(ctx0, kq, 1.0f / hparams.f_attn_logit_softcapping);
cb(kq, "kq_scaled_1", il);
kq = ggml_tanh (ctx0, kq); kq = ggml_tanh (ctx0, kq);
cb(kq, "kq_tanh", il);
kq = ggml_scale(ctx0, kq, hparams.f_attn_logit_softcapping); kq = ggml_scale(ctx0, kq, hparams.f_attn_logit_softcapping);
cb(kq, "kq_scaled_2", il);
} }
if (kq_b) { if (kq_b) {
kq = ggml_add(ctx0, kq, kq_b); kq = ggml_add(ctx0, kq, kq_b);
cb(kq, "kq_plus_kq_b", il);
} }
kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias); kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
ggml_soft_max_add_sinks(kq, sinks); ggml_soft_max_add_sinks(kq, sinks);
cb(kq, "kq_soft_max", il);
if (!v_trans) { if (!v_trans) {
// note: avoid this branch // note: avoid this branch
v = ggml_cont(ctx0, ggml_transpose(ctx0, v)); v = ggml_cont(ctx0, ggml_transpose(ctx0, v));
cb(v, "v_cont", il);
} }
ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);
cb(kqv, "kqv", il);
// for MLA with the absorption optimization, we need to "decompress" from MQA back to MHA // for MLA with the absorption optimization, we need to "decompress" from MQA back to MHA
if (v_mla) { if (v_mla) {
kqv = ggml_mul_mat(ctx0, v_mla, kqv); kqv = ggml_mul_mat(ctx0, v_mla, kqv);
cb(kqv, "kqv_mla", il);
} }
cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3); cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
@ -1360,6 +1435,7 @@ ggml_tensor * llm_graph_context::build_attn(
ggml_tensor * k_cur, ggml_tensor * k_cur,
ggml_tensor * v_cur, ggml_tensor * v_cur,
ggml_tensor * kq_b, ggml_tensor * kq_b,
ggml_tensor * sinks,
ggml_tensor * v_mla, ggml_tensor * v_mla,
float kq_scale, float kq_scale,
int il) const { int il) const {
@ -1375,13 +1451,14 @@ ggml_tensor * llm_graph_context::build_attn(
// [TAG_NO_CACHE_PAD] // [TAG_NO_CACHE_PAD]
// TODO: if ubatch.equal_seqs() == true, we can split the three tensors below into ubatch.n_seqs_unq streams // TODO: if ubatch.equal_seqs() == true, we can split the three tensors below into ubatch.n_seqs_unq streams
assert(!ubatch.equal_seqs()); // but it might not be worth it: https://github.com/ggml-org/llama.cpp/pull/15636
//assert(!ubatch.equal_seqs() || (k_cur->ne[3] == 1 && k_cur->ne[3] == ubatch.n_seqs_unq));
ggml_tensor * q = q_cur; ggml_tensor * q = q_cur;
ggml_tensor * k = k_cur; ggml_tensor * k = k_cur;
ggml_tensor * v = v_cur; ggml_tensor * v = v_cur;
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, nullptr, kq_scale); ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
if (wo) { if (wo) {
@ -1399,17 +1476,17 @@ ggml_tensor * llm_graph_context::build_attn(
return cur; return cur;
} }
static std::unique_ptr<llm_graph_input_attn_kv_unified> build_attn_inp_kv_unified_impl( static std::unique_ptr<llm_graph_input_attn_kv> build_attn_inp_kv_impl(
ggml_context * ctx0, ggml_context * ctx0,
const llama_ubatch & ubatch, const llama_ubatch & ubatch,
const llama_hparams & hparams, const llama_hparams & hparams,
const llama_cparams & cparams, const llama_cparams & cparams,
const llama_kv_cache_unified_context * mctx_cur) { const llama_kv_cache_context * mctx_cur) {
auto inp = std::make_unique<llm_graph_input_attn_kv_unified>(hparams, cparams, mctx_cur); auto inp = std::make_unique<llm_graph_input_attn_kv>(hparams, cparams, mctx_cur);
{ {
GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_unified_iswa for SWA"); GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_iswa for SWA");
const auto n_kv = mctx_cur->get_n_kv(); const auto n_kv = mctx_cur->get_n_kv();
const auto n_tokens = ubatch.n_tokens; const auto n_tokens = ubatch.n_tokens;
@ -1427,22 +1504,23 @@ static std::unique_ptr<llm_graph_input_attn_kv_unified> build_attn_inp_kv_unifie
return inp; return inp;
} }
llm_graph_input_attn_kv_unified * llm_graph_context::build_attn_inp_kv_unified() const { llm_graph_input_attn_kv * llm_graph_context::build_attn_inp_kv() const {
const auto * mctx_cur = static_cast<const llama_kv_cache_unified_context *>(mctx); const auto * mctx_cur = static_cast<const llama_kv_cache_context *>(mctx);
auto inp = build_attn_inp_kv_unified_impl(ctx0, ubatch, hparams, cparams, mctx_cur); auto inp = build_attn_inp_kv_impl(ctx0, ubatch, hparams, cparams, mctx_cur);
return (llm_graph_input_attn_kv_unified *) res->add_input(std::move(inp)); return (llm_graph_input_attn_kv *) res->add_input(std::move(inp));
} }
ggml_tensor * llm_graph_context::build_attn( ggml_tensor * llm_graph_context::build_attn(
llm_graph_input_attn_kv_unified * inp, llm_graph_input_attn_kv * inp,
ggml_tensor * wo, ggml_tensor * wo,
ggml_tensor * wo_b, ggml_tensor * wo_b,
ggml_tensor * q_cur, ggml_tensor * q_cur,
ggml_tensor * k_cur, ggml_tensor * k_cur,
ggml_tensor * v_cur, ggml_tensor * v_cur,
ggml_tensor * kq_b, ggml_tensor * kq_b,
ggml_tensor * sinks,
ggml_tensor * v_mla, ggml_tensor * v_mla,
float kq_scale, float kq_scale,
int il) const { int il) const {
@ -1469,7 +1547,7 @@ ggml_tensor * llm_graph_context::build_attn(
ggml_tensor * k = mctx_cur->get_k(ctx0, il); ggml_tensor * k = mctx_cur->get_k(ctx0, il);
ggml_tensor * v = mctx_cur->get_v(ctx0, il); ggml_tensor * v = mctx_cur->get_v(ctx0, il);
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, nullptr, kq_scale); ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
if (wo) { if (wo) {
@ -1488,40 +1566,15 @@ ggml_tensor * llm_graph_context::build_attn(
} }
ggml_tensor * llm_graph_context::build_attn( ggml_tensor * llm_graph_context::build_attn(
llm_graph_input_attn_kv_unified_iswa * inp, llm_graph_input_attn_kv_iswa * inp,
ggml_tensor * wo, ggml_tensor * wo,
ggml_tensor * wo_b, ggml_tensor * wo_b,
ggml_tensor * q_cur, ggml_tensor * q_cur,
ggml_tensor * k_cur, ggml_tensor * k_cur,
ggml_tensor * v_cur, ggml_tensor * v_cur,
ggml_tensor * kq_b, ggml_tensor * kq_b,
ggml_tensor * v_mla,
float kq_scale,
int il) const {
return build_attn_with_sinks(
inp,
wo,
wo_b,
q_cur,
k_cur,
v_cur,
kq_b,
v_mla,
nullptr,
kq_scale,
il);
}
ggml_tensor * llm_graph_context::build_attn_with_sinks(
llm_graph_input_attn_kv_unified_iswa * inp,
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * q_cur,
ggml_tensor * k_cur,
ggml_tensor * v_cur,
ggml_tensor * kq_b,
ggml_tensor * v_mla,
ggml_tensor * sinks, ggml_tensor * sinks,
ggml_tensor * v_mla,
float kq_scale, float kq_scale,
int il) const { int il) const {
// these nodes are added to the graph together so that they are not reordered // these nodes are added to the graph together so that they are not reordered
@ -1561,7 +1614,7 @@ ggml_tensor * llm_graph_context::build_attn_with_sinks(
ggml_tensor * k = mctx_cur->get_k(ctx0, il); ggml_tensor * k = mctx_cur->get_k(ctx0, il);
ggml_tensor * v = mctx_cur->get_v(ctx0, il); ggml_tensor * v = mctx_cur->get_v(ctx0, il);
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, sinks, kq_scale); ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
if (wo) { if (wo) {
@ -1600,6 +1653,7 @@ ggml_tensor * llm_graph_context::build_attn(
ggml_tensor * k_cur, ggml_tensor * k_cur,
ggml_tensor * v_cur, ggml_tensor * v_cur,
ggml_tensor * kq_b, ggml_tensor * kq_b,
ggml_tensor * sinks,
ggml_tensor * v_mla, ggml_tensor * v_mla,
float kq_scale, float kq_scale,
int il) const { int il) const {
@ -1615,7 +1669,7 @@ ggml_tensor * llm_graph_context::build_attn(
ggml_tensor * k = k_cur; ggml_tensor * k = k_cur;
ggml_tensor * v = v_cur; ggml_tensor * v = v_cur;
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, nullptr, kq_scale); ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
if (wo) { if (wo) {
@ -1636,10 +1690,10 @@ ggml_tensor * llm_graph_context::build_attn(
// TODO: maybe separate the inner implementation into a separate function // TODO: maybe separate the inner implementation into a separate function
// like with the non-sliding window equivalent // like with the non-sliding window equivalent
// once sliding-window hybrid caches are a thing. // once sliding-window hybrid caches are a thing.
llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unified_iswa() const { llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const {
const auto * mctx_cur = static_cast<const llama_kv_cache_unified_iswa_context *>(mctx); const auto * mctx_cur = static_cast<const llama_kv_cache_iswa_context *>(mctx);
auto inp = std::make_unique<llm_graph_input_attn_kv_unified_iswa>(hparams, cparams, mctx_cur); auto inp = std::make_unique<llm_graph_input_attn_kv_iswa>(hparams, cparams, mctx_cur);
const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq; const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq;
@ -1656,7 +1710,7 @@ llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unif
} }
{ {
GGML_ASSERT(hparams.swa_type != LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_unified for non-SWA"); GGML_ASSERT(hparams.swa_type != LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache for non-SWA");
const auto n_kv = mctx_cur->get_swa()->get_n_kv(); const auto n_kv = mctx_cur->get_swa()->get_n_kv();
@ -1669,7 +1723,7 @@ llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unif
inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa; inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
} }
return (llm_graph_input_attn_kv_unified_iswa *) res->add_input(std::move(inp)); return (llm_graph_input_attn_kv_iswa *) res->add_input(std::move(inp));
} }
ggml_tensor * llm_graph_context::build_rs( ggml_tensor * llm_graph_context::build_rs(
@ -1792,7 +1846,7 @@ llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx); const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx);
auto inp_rs = build_rs_inp_impl(ctx0, ubatch, mctx_cur->get_recr()); auto inp_rs = build_rs_inp_impl(ctx0, ubatch, mctx_cur->get_recr());
auto inp_attn = build_attn_inp_kv_unified_impl(ctx0, ubatch, hparams, cparams, mctx_cur->get_attn()); auto inp_attn = build_attn_inp_kv_impl(ctx0, ubatch, hparams, cparams, mctx_cur->get_attn());
auto inp = std::make_unique<llm_graph_input_mem_hybrid>(std::move(inp_attn), std::move(inp_rs), mctx_cur); auto inp = std::make_unique<llm_graph_input_mem_hybrid>(std::move(inp_attn), std::move(inp_rs), mctx_cur);
@ -1843,34 +1897,32 @@ void llm_graph_context::build_pooling(
case LLAMA_POOLING_TYPE_RANK: case LLAMA_POOLING_TYPE_RANK:
{ {
ggml_tensor * inp_cls = build_inp_cls(); ggml_tensor * inp_cls = build_inp_cls();
inp = ggml_get_rows(ctx0, inp, inp_cls); cur = ggml_get_rows(ctx0, inp, inp_cls);
// classification head
// https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
if (cls) { if (cls) {
// classification head cur = ggml_mul_mat(ctx0, cls, cur);
// https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
cur = ggml_mul_mat(ctx0, cls, inp);
if (cls_b) { if (cls_b) {
cur = ggml_add(ctx0, cur, cls_b); cur = ggml_add(ctx0, cur, cls_b);
} }
cur = ggml_tanh(ctx0, cur); cur = ggml_tanh(ctx0, cur);
}
// some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
// https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896 // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
if (cls_out) { // Single layer classification head (direct projection)
cur = ggml_mul_mat(ctx0, cls_out, cur); // https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476
if (cls_out_b) { if (cls_out) {
cur = ggml_add(ctx0, cur, cls_out_b); cur = ggml_mul_mat(ctx0, cls_out, cur);
}
}
} else if (cls_out) {
// Single layer classification head (direct projection)
// https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476
cur = ggml_mul_mat(ctx0, cls_out, inp);
if (cls_out_b) { if (cls_out_b) {
cur = ggml_add(ctx0, cur, cls_out_b); cur = ggml_add(ctx0, cur, cls_out_b);
} }
} else { }
GGML_ABORT("RANK pooling requires either cls+cls_b or cls_out+cls_out_b");
// softmax for qwen3 reranker
if (arch == LLM_ARCH_QWEN3) {
cur = ggml_soft_max(ctx0, cur);
} }
} break; } break;
default: default:

View File

@ -19,8 +19,8 @@ struct llama_cparams;
struct llama_memory_context_i; struct llama_memory_context_i;
class llama_kv_cache_unified_context; class llama_kv_cache_context;
class llama_kv_cache_unified_iswa_context; class llama_kv_cache_iswa_context;
class llama_memory_recurrent_context; class llama_memory_recurrent_context;
class llama_memory_hybrid_context; class llama_memory_hybrid_context;
@ -78,6 +78,11 @@ struct llm_graph_params;
class llm_graph_input_i { class llm_graph_input_i {
public: public:
llm_graph_input_i() {
const char * LLAMA_GRAPH_INPUT_DEBUG = getenv("LLAMA_GRAPH_INPUT_DEBUG");
debug = LLAMA_GRAPH_INPUT_DEBUG ? atoi(LLAMA_GRAPH_INPUT_DEBUG) : 0;
}
virtual ~llm_graph_input_i() = default; virtual ~llm_graph_input_i() = default;
virtual void set_input(const llama_ubatch * ubatch) = 0; virtual void set_input(const llama_ubatch * ubatch) = 0;
@ -90,6 +95,9 @@ public:
GGML_UNUSED(params); GGML_UNUSED(params);
return false; return false;
} }
protected:
// env: LLAMA_GRAPH_INPUT_DEBUG
int debug = 0;
}; };
using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>; using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>;
@ -152,7 +160,7 @@ class llm_graph_input_pos_bucket_kv : public llm_graph_input_i {
public: public:
llm_graph_input_pos_bucket_kv( llm_graph_input_pos_bucket_kv(
const llama_hparams & hparams, const llama_hparams & hparams,
const llama_kv_cache_unified_context * mctx) : hparams(hparams), mctx(mctx) {} const llama_kv_cache_context * mctx) : hparams(hparams), mctx(mctx) {}
virtual ~llm_graph_input_pos_bucket_kv() = default; virtual ~llm_graph_input_pos_bucket_kv() = default;
void set_input(const llama_ubatch * ubatch) override; void set_input(const llama_ubatch * ubatch) override;
@ -161,7 +169,7 @@ public:
const llama_hparams hparams; const llama_hparams hparams;
const llama_kv_cache_unified_context * mctx; const llama_kv_cache_context * mctx;
}; };
class llm_graph_input_out_ids : public llm_graph_input_i { class llm_graph_input_out_ids : public llm_graph_input_i {
@ -198,7 +206,7 @@ public:
class llm_graph_input_cls : public llm_graph_input_i { class llm_graph_input_cls : public llm_graph_input_i {
public: public:
llm_graph_input_cls(const llama_cparams & cparams) : cparams(cparams) {} llm_graph_input_cls(const llama_cparams & cparams, const llm_arch arch) : cparams(cparams), arch(arch) {}
virtual ~llm_graph_input_cls() = default; virtual ~llm_graph_input_cls() = default;
void set_input(const llama_ubatch * ubatch) override; void set_input(const llama_ubatch * ubatch) override;
@ -206,6 +214,7 @@ public:
ggml_tensor * cls; // I32 [n_batch] ggml_tensor * cls; // I32 [n_batch]
const llama_cparams cparams; const llama_cparams cparams;
const llm_arch arch;
}; };
class llm_graph_input_rs : public llm_graph_input_i { class llm_graph_input_rs : public llm_graph_input_i {
@ -257,17 +266,17 @@ public:
const llama_cparams cparams; const llama_cparams cparams;
}; };
class llm_graph_input_attn_kv_unified : public llm_graph_input_i { class llm_graph_input_attn_kv : public llm_graph_input_i {
public: public:
llm_graph_input_attn_kv_unified( llm_graph_input_attn_kv(
const llama_hparams & hparams, const llama_hparams & hparams,
const llama_cparams & cparams, const llama_cparams & cparams,
const llama_kv_cache_unified_context * mctx) : const llama_kv_cache_context * mctx) :
hparams(hparams), hparams(hparams),
cparams(cparams), cparams(cparams),
mctx(mctx) { mctx(mctx) {
} }
~llm_graph_input_attn_kv_unified() = default; ~llm_graph_input_attn_kv() = default;
void set_input(const llama_ubatch * ubatch) override; void set_input(const llama_ubatch * ubatch) override;
@ -290,20 +299,20 @@ public:
const llama_hparams hparams; const llama_hparams hparams;
const llama_cparams cparams; const llama_cparams cparams;
const llama_kv_cache_unified_context * mctx; const llama_kv_cache_context * mctx;
}; };
class llm_graph_input_attn_kv_unified_iswa : public llm_graph_input_i { class llm_graph_input_attn_kv_iswa : public llm_graph_input_i {
public: public:
llm_graph_input_attn_kv_unified_iswa( llm_graph_input_attn_kv_iswa(
const llama_hparams & hparams, const llama_hparams & hparams,
const llama_cparams & cparams, const llama_cparams & cparams,
const llama_kv_cache_unified_iswa_context * mctx) : const llama_kv_cache_iswa_context * mctx) :
hparams(hparams), hparams(hparams),
cparams(cparams), cparams(cparams),
mctx(mctx) { mctx(mctx) {
} }
~llm_graph_input_attn_kv_unified_iswa() = default; ~llm_graph_input_attn_kv_iswa() = default;
void set_input(const llama_ubatch * ubatch) override; void set_input(const llama_ubatch * ubatch) override;
@ -330,7 +339,7 @@ public:
const llama_hparams hparams; const llama_hparams hparams;
const llama_cparams cparams; const llama_cparams cparams;
const llama_kv_cache_unified_iswa_context * mctx; const llama_kv_cache_iswa_context * mctx;
}; };
class llm_graph_input_attn_cross : public llm_graph_input_i { class llm_graph_input_attn_cross : public llm_graph_input_i {
@ -351,7 +360,7 @@ public:
class llm_graph_input_mem_hybrid : public llm_graph_input_i { class llm_graph_input_mem_hybrid : public llm_graph_input_i {
public: public:
llm_graph_input_mem_hybrid( llm_graph_input_mem_hybrid(
std::unique_ptr<llm_graph_input_attn_kv_unified> inp_attn, std::unique_ptr<llm_graph_input_attn_kv> inp_attn,
std::unique_ptr<llm_graph_input_rs> inp_rs, std::unique_ptr<llm_graph_input_rs> inp_rs,
const llama_memory_hybrid_context * mctx) : const llama_memory_hybrid_context * mctx) :
inp_attn(std::move(inp_attn)), inp_attn(std::move(inp_attn)),
@ -361,11 +370,11 @@ public:
void set_input(const llama_ubatch * ubatch) override; void set_input(const llama_ubatch * ubatch) override;
std::unique_ptr<llm_graph_input_attn_kv_unified> inp_attn; std::unique_ptr<llm_graph_input_attn_kv> inp_attn;
std::unique_ptr<llm_graph_input_rs> inp_rs; std::unique_ptr<llm_graph_input_rs> inp_rs;
llm_graph_input_attn_kv_unified * get_attn() const { return inp_attn.get(); } llm_graph_input_attn_kv * get_attn() const { return inp_attn.get(); }
llm_graph_input_rs * get_recr() const { return inp_rs.get(); } llm_graph_input_rs * get_recr() const { return inp_rs.get(); }
const llama_memory_hybrid_context * mctx; const llama_memory_hybrid_context * mctx;
}; };
@ -680,14 +689,15 @@ struct llm_graph_context {
// //
ggml_tensor * build_attn_mha( ggml_tensor * build_attn_mha(
ggml_tensor * q, // [n_embd_head_q, n_head_q, n_tokens] ggml_tensor * q, // [n_embd_head_q, n_head_q, n_tokens]
ggml_tensor * k, // [n_embd_head_k, n_head_k, n_tokens] ggml_tensor * k, // [n_embd_head_k, n_head_k, n_tokens]
ggml_tensor * v, // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false) ggml_tensor * v, // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false)
ggml_tensor * kq_b, ggml_tensor * kq_b,
ggml_tensor * kq_mask, ggml_tensor * kq_mask,
ggml_tensor * sinks, ggml_tensor * sinks, // [n_head_q]
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
float kq_scale) const; float kq_scale,
int il) const;
llm_graph_input_attn_no_cache * build_attn_inp_no_cache() const; llm_graph_input_attn_no_cache * build_attn_inp_no_cache() const;
@ -699,50 +709,39 @@ struct llm_graph_context {
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
ggml_tensor * kq_b, ggml_tensor * kq_b,
ggml_tensor * sinks, // [n_head_q]
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
float kq_scale, float kq_scale,
int il) const; int il) const;
llm_graph_input_attn_kv_unified * build_attn_inp_kv_unified() const; llm_graph_input_attn_kv * build_attn_inp_kv() const;
ggml_tensor * build_attn( ggml_tensor * build_attn(
llm_graph_input_attn_kv_unified * inp, llm_graph_input_attn_kv * inp,
ggml_tensor * wo, ggml_tensor * wo,
ggml_tensor * wo_b, ggml_tensor * wo_b,
ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens] ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
ggml_tensor * kq_b, ggml_tensor * kq_b,
ggml_tensor * sinks, // [n_head_q]
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
float kq_scale, float kq_scale,
int il) const; int il) const;
llm_graph_input_attn_kv_unified_iswa * build_attn_inp_kv_unified_iswa() const; llm_graph_input_attn_kv_iswa * build_attn_inp_kv_iswa() const;
// note: if k_cur or v_cur are not provided, they will not be stored in the memory // note: if k_cur or v_cur are not provided, they will not be stored in the memory
ggml_tensor * build_attn( ggml_tensor * build_attn(
llm_graph_input_attn_kv_unified_iswa * inp, llm_graph_input_attn_kv_iswa * inp,
ggml_tensor * wo, ggml_tensor * wo,
ggml_tensor * wo_b, ggml_tensor * wo_b,
ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens] ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] optional ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] optional
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] optional ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] optional
ggml_tensor * kq_b, ggml_tensor * kq_b,
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
float kq_scale,
int il) const;
// TODO: temporary to keep the diff small. after the code is public will refactor to simplify this
ggml_tensor * build_attn_with_sinks(
llm_graph_input_attn_kv_unified_iswa * inp,
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] optional
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] optional
ggml_tensor * kq_b,
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
ggml_tensor * sinks, // [n_head_q] ggml_tensor * sinks, // [n_head_q]
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
float kq_scale, float kq_scale,
int il) const; int il) const;
@ -756,6 +755,7 @@ struct llm_graph_context {
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
ggml_tensor * kq_b, ggml_tensor * kq_b,
ggml_tensor * sinks, // [n_head_q]
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
float kq_scale, float kq_scale,
int il) const; int il) const;
@ -765,7 +765,7 @@ struct llm_graph_context {
// //
// TODO: move this implementation to llama_memory_recurrent. // TODO: move this implementation to llama_memory_recurrent.
// this is analogous to llama_kv_cache_unified::cpy_k / cpy_v // this is analogous to llama_kv_cache::cpy_k / cpy_v
// when moving, avoid passing `ggml_cgraph` - only pass `ggml_context`. would likely need to split the // when moving, avoid passing `ggml_cgraph` - only pass `ggml_context`. would likely need to split the
// implementation in 2 separate methods. the goal is to avoid calling `ggml_build_forward_expand` in // implementation in 2 separate methods. the goal is to avoid calling `ggml_build_forward_expand` in
// `llama_memory_recurrent` // `llama_memory_recurrent`

View File

@ -1,6 +1,7 @@
#include "llama-hparams.h" #include "llama-hparams.h"
#include "ggml.h" #include "ggml.h"
#include <cassert>
void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) { void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
if (dense_first) { if (dense_first) {
@ -161,3 +162,64 @@ bool llama_hparams::is_swa(uint32_t il) const {
GGML_ABORT("fatal error"); GGML_ABORT("fatal error");
} }
bool llama_hparams::has_kv(uint32_t il) const {
if (n_layer_kv_from_start >= 0) {
if (il < (uint32_t) n_layer_kv_from_start) {
return true;
}
return false;
}
// by default, all layers have kv
return true;
}
uint32_t llama_hparams::n_layer_kv() const {
uint32_t res = 0;
for (uint32_t il = 0; il < n_layer; ++il) {
if (has_kv(il)) {
res++;
}
}
return res;
}
bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1) {
assert(p0 >= 0 && p1 >= 0);
switch (swa_type) {
case LLAMA_SWA_TYPE_NONE:
{
} break;
case LLAMA_SWA_TYPE_STANDARD:
{
if (p1 - p0 >= (int32_t) n_swa) {
return true;
}
} break;
case LLAMA_SWA_TYPE_CHUNKED:
{
const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
if (p0 < pos_chunk_start) {
return true;
}
} break;
case LLAMA_SWA_TYPE_SYMMETRIC:
{
const int32_t half_n_swa = (int32_t) n_swa / 2;
const int32_t pos_diff = p1 - p0;
// Mask if outside the symmetric window
if (pos_diff < -half_n_swa || pos_diff > half_n_swa) {
return true;
}
} break;
}
return false;
}
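A standalone restatement of the three window rules implemented above, checked on concrete positions with n_swa = 8; this mirrors llama_hparams::is_masked_swa() but does not call into the library:

#include <cassert>
#include <cstdlib>

enum swa_type { STANDARD, CHUNKED, SYMMETRIC };

static bool masked(swa_type t, int n_swa, int p0, int p1) {
    switch (t) {
        case STANDARD:  return p1 - p0 >= n_swa;              // window trails the query token
        case CHUNKED:   return p0 < (p1 / n_swa) * n_swa;     // window is the query token's chunk
        case SYMMETRIC: return std::abs(p1 - p0) > n_swa / 2; // window centered on the query token
    }
    return false;
}

int main() {
    assert( masked(STANDARD,  8, 13, 21)); // 21 - 13 = 8 >= 8
    assert(!masked(STANDARD,  8, 14, 21)); // 21 - 14 = 7 <  8
    assert( masked(CHUNKED,   8, 15, 21)); // chunk start = (21 / 8) * 8 = 16, and 15 < 16
    assert(!masked(CHUNKED,   8, 16, 21));
    assert( masked(SYMMETRIC, 8, 16, 21)); // |21 - 16| = 5 > 8 / 2 = 4
    assert(!masked(SYMMETRIC, 8, 17, 21));
    return 0;
}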

View File

@ -16,9 +16,10 @@ enum llama_expert_gating_func_type {
}; };
enum llama_swa_type { enum llama_swa_type {
LLAMA_SWA_TYPE_NONE = 0, LLAMA_SWA_TYPE_NONE = 0,
LLAMA_SWA_TYPE_STANDARD = 1, LLAMA_SWA_TYPE_STANDARD = 1,
LLAMA_SWA_TYPE_CHUNKED = 2, LLAMA_SWA_TYPE_CHUNKED = 2,
LLAMA_SWA_TYPE_SYMMETRIC = 3,
}; };
struct llama_hparams_posnet { struct llama_hparams_posnet {
@ -41,6 +42,7 @@ struct llama_hparams {
uint32_t n_embd; uint32_t n_embd;
uint32_t n_embd_features = 0; uint32_t n_embd_features = 0;
uint32_t n_layer; uint32_t n_layer;
int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
uint32_t n_rot; uint32_t n_rot;
uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
@ -69,10 +71,13 @@ struct llama_hparams {
uint32_t n_lora_kv = 0; uint32_t n_lora_kv = 0;
uint32_t n_ff_exp = 0; uint32_t n_ff_exp = 0;
uint32_t n_ff_shexp = 0; uint32_t n_ff_shexp = 0;
uint32_t n_ff_chexp = 0;
uint32_t n_expert_shared = 0; uint32_t n_expert_shared = 0;
uint32_t n_norm_groups = 0; uint32_t n_norm_groups = 0;
uint32_t n_group_experts = 0;
float expert_weights_scale = 0.0; float expert_group_scale = 0.05f;
float expert_weights_scale = 0.0f;
bool expert_weights_norm = false; bool expert_weights_norm = false;
uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE; uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
uint32_t moe_every_n_layers = 0; uint32_t moe_every_n_layers = 0;
@ -82,8 +87,9 @@ struct llama_hparams {
float f_norm_rms_eps; float f_norm_rms_eps;
float f_norm_group_eps; float f_norm_group_eps;
float f_attn_logit_softcapping = 50.0f; float f_attn_logit_softcapping = 50.0f;
float f_final_logit_softcapping = 30.0f; float f_router_logit_softcapping = 30.0f;
float f_final_logit_softcapping = 30.0f;
// for RWKV // for RWKV
uint32_t rescale_every_n_layers = 0; uint32_t rescale_every_n_layers = 0;
@ -104,6 +110,11 @@ struct llama_hparams {
uint32_t n_ctx_orig_yarn; uint32_t n_ctx_orig_yarn;
float rope_yarn_log_mul = 0.0f; float rope_yarn_log_mul = 0.0f;
float yarn_ext_factor = -1.0f;
float yarn_attn_factor = 1.0f;
float yarn_beta_fast = 32.0f;
float yarn_beta_slow = 1.0f;
std::array<int, 4> rope_sections; std::array<int, 4> rope_sections;
// Sliding Window Attention (SWA) // Sliding Window Attention (SWA)
@ -136,10 +147,14 @@ struct llama_hparams {
float f_embedding_scale = 0.0f; float f_embedding_scale = 0.0f;
float f_attention_scale = 0.0f; float f_attention_scale = 0.0f;
// grok-2
float f_attn_out_scale = 0.0f;
uint32_t attn_temp_length = 0;
bool causal_attn = true; bool causal_attn = true;
bool use_alibi = false; bool use_alibi = false;
bool attn_soft_cap = false; bool attn_soft_cap = false;
bool use_kq_norm = true; bool use_kq_norm = false;
// for Classifiers // for Classifiers
uint32_t n_cls_out = 1; uint32_t n_cls_out = 1;
@ -159,6 +174,7 @@ struct llama_hparams {
// needed by encoder-decoder models (e.g. T5, FLAN-T5) // needed by encoder-decoder models (e.g. T5, FLAN-T5)
// ref: https://github.com/ggerganov/llama.cpp/pull/8141 // ref: https://github.com/ggerganov/llama.cpp/pull/8141
llama_token dec_start_token_id = LLAMA_TOKEN_NULL; llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
uint32_t dec_n_layer = 0;
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE; enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE; enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
@ -226,6 +242,16 @@ struct llama_hparams {
bool n_bskcn(uint32_t n, uint32_t il) const; bool n_bskcn(uint32_t n, uint32_t il) const;
bool is_swa(uint32_t il) const; bool is_swa(uint32_t il) const;
bool has_kv(uint32_t il) const;
// number of layers for which has_kv() returns true
uint32_t n_layer_kv() const;
// note that this function uses different SWA parameters from those in the hparams
// TODO: think of a better place for this function
// TODO: pack the SWA params in a struct?
static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1);
}; };
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable"); static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");

View File

@ -59,3 +59,5 @@ std::string llama_format_tensor_shape(const std::vector<int64_t> & ne);
std::string llama_format_tensor_shape(const struct ggml_tensor * t); std::string llama_format_tensor_shape(const struct ggml_tensor * t);
std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i); std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i);
#define LLAMA_TENSOR_NAME_FATTN "__fattn__"

View File

@ -1,4 +1,4 @@
#include "llama-kv-cache-unified-iswa.h" #include "llama-kv-cache-iswa.h"
#include "llama-impl.h" #include "llama-impl.h"
#include "llama-batch.h" #include "llama-batch.h"
@ -8,10 +8,10 @@
#include <cassert> #include <cassert>
// //
// llama_kv_cache_unified_iswa // llama_kv_cache_iswa
// //
llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa( llama_kv_cache_iswa::llama_kv_cache_iswa(
const llama_model & model, const llama_model & model,
ggml_type type_k, ggml_type type_k,
ggml_type type_v, ggml_type type_v,
@ -22,9 +22,26 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
uint32_t kv_size, uint32_t kv_size,
uint32_t n_seq_max, uint32_t n_seq_max,
uint32_t n_ubatch, uint32_t n_ubatch,
uint32_t n_pad) : hparams(model.hparams), unified(unified) { uint32_t n_pad,
llama_kv_cache_unified::layer_filter_cb filter_base = [&](int32_t il) { return !model.hparams.is_swa(il); }; const layer_filter_cb & filter,
llama_kv_cache_unified::layer_filter_cb filter_swa = [&](int32_t il) { return model.hparams.is_swa(il); }; const layer_reuse_cb & reuse) : hparams(model.hparams), unified(unified) {
// chain filters
const layer_filter_cb filter_base = [&](int32_t il) {
if (filter && !filter(il)) {
return false;
}
return !model.hparams.is_swa(il);
};
const layer_filter_cb filter_swa = [&](int32_t il) {
if (filter && !filter(il)) {
return false;
}
return model.hparams.is_swa(il);
};
const uint32_t size_base = kv_size; const uint32_t size_base = kv_size;
@ -40,25 +57,25 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, size_base); LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, size_base);
kv_base = std::make_unique<llama_kv_cache_unified>( kv_base = std::make_unique<llama_kv_cache>(
model, std::move(filter_base), type_k, type_v, model, type_k, type_v,
v_trans, offload, unified, size_base, n_seq_max, n_pad, v_trans, offload, unified, size_base, n_seq_max, n_pad,
0, LLAMA_SWA_TYPE_NONE); 0, LLAMA_SWA_TYPE_NONE, filter_base, reuse);
LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa); LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa);
kv_swa = std::make_unique<llama_kv_cache_unified>( kv_swa = std::make_unique<llama_kv_cache>(
model, std::move(filter_swa), type_k, type_v, model, type_k, type_v,
v_trans, offload, unified, size_swa, n_seq_max, n_pad, v_trans, offload, unified, size_swa, n_seq_max, n_pad,
hparams.n_swa, hparams.swa_type); hparams.n_swa, hparams.swa_type, filter_swa, reuse);
} }
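The constructor now takes an optional caller-supplied layer_filter_cb and chains it with the model's is_swa() predicate when splitting layers between the two child caches. A minimal sketch of that chaining pattern (placeholder names, not the llama.cpp types):

    #include <cstdint>
    #include <functional>

    using layer_filter_cb = std::function<bool(int32_t il)>;

    // a layer passes only if the optional outer filter (when set) accepts it AND the predicate holds
    static layer_filter_cb chain_filter(layer_filter_cb outer, std::function<bool(int32_t)> pred) {
        return [outer, pred](int32_t il) {
            if (outer && !outer(il)) {
                return false;
            }
            return pred(il);
        };
    }

    // usage sketch: route SWA layers to one cache and the rest to the other,
    // while still honouring a caller-provided filter
    // auto filter_swa  = chain_filter(user_filter, [&](int32_t il) { return  is_swa(il); });
    // auto filter_base = chain_filter(user_filter, [&](int32_t il) { return !is_swa(il); });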
void llama_kv_cache_unified_iswa::clear(bool data) { void llama_kv_cache_iswa::clear(bool data) {
kv_base->clear(data); kv_base->clear(data);
kv_swa ->clear(data); kv_swa ->clear(data);
} }
bool llama_kv_cache_unified_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { bool llama_kv_cache_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
bool res = true; bool res = true;
res = res & kv_base->seq_rm(seq_id, p0, p1); res = res & kv_base->seq_rm(seq_id, p0, p1);
@ -67,36 +84,44 @@ bool llama_kv_cache_unified_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llam
return res; return res;
} }
void llama_kv_cache_unified_iswa::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { void llama_kv_cache_iswa::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
kv_base->seq_cp(seq_id_src, seq_id_dst, p0, p1); kv_base->seq_cp(seq_id_src, seq_id_dst, p0, p1);
kv_swa ->seq_cp(seq_id_src, seq_id_dst, p0, p1); kv_swa ->seq_cp(seq_id_src, seq_id_dst, p0, p1);
} }
void llama_kv_cache_unified_iswa::seq_keep(llama_seq_id seq_id) { void llama_kv_cache_iswa::seq_keep(llama_seq_id seq_id) {
kv_base->seq_keep(seq_id); kv_base->seq_keep(seq_id);
kv_swa ->seq_keep(seq_id); kv_swa ->seq_keep(seq_id);
} }
void llama_kv_cache_unified_iswa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) { void llama_kv_cache_iswa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
kv_base->seq_add(seq_id, p0, p1, shift); kv_base->seq_add(seq_id, p0, p1, shift);
kv_swa ->seq_add(seq_id, p0, p1, shift); kv_swa ->seq_add(seq_id, p0, p1, shift);
} }
void llama_kv_cache_unified_iswa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { void llama_kv_cache_iswa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
kv_base->seq_div(seq_id, p0, p1, d); kv_base->seq_div(seq_id, p0, p1, d);
kv_swa ->seq_div(seq_id, p0, p1, d); kv_swa ->seq_div(seq_id, p0, p1, d);
} }
llama_pos llama_kv_cache_unified_iswa::seq_pos_min(llama_seq_id seq_id) const { llama_pos llama_kv_cache_iswa::seq_pos_min(llama_seq_id seq_id) const {
// the base cache is a superset of the SWA cache, so we can just check the SWA cache // the base cache is a superset of the SWA cache, so we can just check the SWA cache
return kv_swa->seq_pos_min(seq_id); return kv_swa->seq_pos_min(seq_id);
} }
llama_pos llama_kv_cache_unified_iswa::seq_pos_max(llama_seq_id seq_id) const { llama_pos llama_kv_cache_iswa::seq_pos_max(llama_seq_id seq_id) const {
return kv_swa->seq_pos_max(seq_id); return kv_swa->seq_pos_max(seq_id);
} }
llama_memory_context_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) { std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache_iswa::memory_breakdown() const {
std::map<ggml_backend_buffer_type_t, size_t> mb = kv_base->memory_breakdown();
for (const auto & buft_size : kv_swa->memory_breakdown()) {
mb[buft_size.first] += buft_size.second;
}
return mb;
}
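The memory_breakdown() override above merges the per-buffer-type totals of the base and SWA caches by summing values under the same key. The same aggregation, sketched with a generic key type standing in for ggml_backend_buffer_type_t:

    #include <cstddef>
    #include <map>

    // sum two breakdowns keyed by buffer type; absent keys start at zero via operator[]
    template <typename K>
    std::map<K, size_t> merge_breakdown(std::map<K, size_t> a, const std::map<K, size_t> & b) {
        for (const auto & kv : b) {
            a[kv.first] += kv.second;
        }
        return a;
    }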
llama_memory_context_ptr llama_kv_cache_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
GGML_UNUSED(embd_all); GGML_UNUSED(embd_all);
// first try simple split // first try simple split
@ -136,7 +161,7 @@ llama_memory_context_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_all
assert(sinfos_base.size() == sinfos_swa.size()); assert(sinfos_base.size() == sinfos_swa.size());
return std::make_unique<llama_kv_cache_unified_iswa_context>( return std::make_unique<llama_kv_cache_iswa_context>(
this, std::move(sinfos_base), std::move(sinfos_swa), std::move(ubatches)); this, std::move(sinfos_base), std::move(sinfos_swa), std::move(ubatches));
} while (false); } while (false);
@ -172,61 +197,67 @@ llama_memory_context_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_all
assert(sinfos_base.size() == sinfos_swa.size()); assert(sinfos_base.size() == sinfos_swa.size());
return std::make_unique<llama_kv_cache_unified_iswa_context>( return std::make_unique<llama_kv_cache_iswa_context>(
this, std::move(sinfos_base), std::move(sinfos_swa), std::move(ubatches)); this, std::move(sinfos_base), std::move(sinfos_swa), std::move(ubatches));
} while (false); } while (false);
// TODO: if we fail again, we should attempt different splitting strategies // TODO: if we fail again, we should attempt different splitting strategies
// but to do that properly, we first have to refactor the batches to be more flexible // but to do that properly, we first have to refactor the batches to be more flexible
return std::make_unique<llama_kv_cache_unified_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE); return std::make_unique<llama_kv_cache_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
} }
llama_memory_context_ptr llama_kv_cache_unified_iswa::init_full() { llama_memory_context_ptr llama_kv_cache_iswa::init_full() {
return std::make_unique<llama_kv_cache_unified_iswa_context>(this); return std::make_unique<llama_kv_cache_iswa_context>(this);
} }
llama_memory_context_ptr llama_kv_cache_unified_iswa::init_update(llama_context * lctx, bool optimize) { llama_memory_context_ptr llama_kv_cache_iswa::init_update(llama_context * lctx, bool optimize) {
return std::make_unique<llama_kv_cache_unified_iswa_context>(this, lctx, optimize); return std::make_unique<llama_kv_cache_iswa_context>(this, lctx, optimize);
} }
bool llama_kv_cache_unified_iswa::get_can_shift() const { bool llama_kv_cache_iswa::get_can_shift() const {
return kv_base->get_size() == kv_swa->get_size(); return kv_base->get_size() == kv_swa->get_size();
} }
void llama_kv_cache_unified_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id) const { void llama_kv_cache_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
kv_base->state_write(io, seq_id); if ((flags & LLAMA_STATE_SEQ_FLAGS_SWA_ONLY) == 0) {
kv_swa ->state_write(io, seq_id); kv_base->state_write(io, seq_id, flags);
}
kv_swa->state_write(io, seq_id, flags);
} }
void llama_kv_cache_unified_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id) { void llama_kv_cache_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
kv_base->state_read(io, seq_id); if ((flags & LLAMA_STATE_SEQ_FLAGS_SWA_ONLY) == 0) {
kv_swa ->state_read(io, seq_id); kv_base->state_read(io, seq_id, flags);
}
kv_swa->state_read(io, seq_id, flags);
} }
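state_write()/state_read() now carry a llama_state_seq_flags bitmask, and the base cache is skipped when the SWA-only flag is set. A sketch of that flag gating with hypothetical flag and writer types (the real flag constant and I/O interface come from llama.cpp):

    #include <cstdint>

    using state_flags = uint32_t;
    constexpr state_flags FLAG_SWA_ONLY = 1u << 0; // hypothetical stand-in for LLAMA_STATE_SEQ_FLAGS_SWA_ONLY

    struct writer_sketch {       // hypothetical sink
        void write_base() {}
        void write_swa () {}
    };

    static void state_write_sketch(writer_sketch & io, state_flags flags) {
        if ((flags & FLAG_SWA_ONLY) == 0) {
            io.write_base();     // full snapshots include the base cache
        }
        io.write_swa();          // the SWA cache is always written
    }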
llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_base() const { llama_kv_cache * llama_kv_cache_iswa::get_base() const {
return kv_base.get(); return kv_base.get();
} }
llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_swa() const { llama_kv_cache * llama_kv_cache_iswa::get_swa() const {
return kv_swa.get(); return kv_swa.get();
} }
// //
// llama_kv_cache_unified_iswa_context // llama_kv_cache_iswa_context
// //
llama_kv_cache_unified_iswa_context::llama_kv_cache_unified_iswa_context(llama_memory_status status) : status(status) {} llama_kv_cache_iswa_context::llama_kv_cache_iswa_context(llama_memory_status status) : status(status) {}
llama_kv_cache_unified_iswa_context::llama_kv_cache_unified_iswa_context( llama_kv_cache_iswa_context::llama_kv_cache_iswa_context(
llama_kv_cache_unified_iswa * kv) : llama_kv_cache_iswa * kv) :
ctx_base(kv->get_base()->init_full()), ctx_base(kv->get_base()->init_full()),
ctx_swa (kv->get_swa ()->init_full()), ctx_swa (kv->get_swa ()->init_full()),
status(llama_memory_status_combine(ctx_base->get_status(), ctx_swa->get_status())) { status(llama_memory_status_combine(ctx_base->get_status(), ctx_swa->get_status())) {
} }
llama_kv_cache_unified_iswa_context::llama_kv_cache_unified_iswa_context( llama_kv_cache_iswa_context::llama_kv_cache_iswa_context(
llama_kv_cache_unified_iswa * kv, llama_kv_cache_iswa * kv,
llama_context * lctx, llama_context * lctx,
bool optimize) : bool optimize) :
ctx_base(kv->get_base()->init_update(lctx, optimize)), ctx_base(kv->get_base()->init_update(lctx, optimize)),
@ -234,21 +265,21 @@ llama_kv_cache_unified_iswa_context::llama_kv_cache_unified_iswa_context(
status(llama_memory_status_combine(ctx_base->get_status(), ctx_swa->get_status())) { status(llama_memory_status_combine(ctx_base->get_status(), ctx_swa->get_status())) {
} }
llama_kv_cache_unified_iswa_context::llama_kv_cache_unified_iswa_context( llama_kv_cache_iswa_context::llama_kv_cache_iswa_context(
llama_kv_cache_unified_iswa * kv, llama_kv_cache_iswa * kv,
slot_info_vec_t sinfos_base, slot_info_vec_t sinfos_base,
slot_info_vec_t sinfos_swa, slot_info_vec_t sinfos_swa,
std::vector<llama_ubatch> ubatches) : std::vector<llama_ubatch> ubatches) :
ubatches(std::move(ubatches)), ubatches(std::move(ubatches)),
// note: here we copy the ubatches. not sure if this is ideal // note: here we copy the ubatches. not sure if this is ideal
ctx_base(new llama_kv_cache_unified_context(kv->get_base(), std::move(sinfos_base), this->ubatches)), ctx_base(new llama_kv_cache_context(kv->get_base(), std::move(sinfos_base), this->ubatches)),
ctx_swa (new llama_kv_cache_unified_context(kv->get_swa (), std::move(sinfos_swa), this->ubatches)), ctx_swa (new llama_kv_cache_context(kv->get_swa (), std::move(sinfos_swa), this->ubatches)),
status(llama_memory_status_combine(ctx_base->get_status(), ctx_swa->get_status())) { status(llama_memory_status_combine(ctx_base->get_status(), ctx_swa->get_status())) {
} }
llama_kv_cache_unified_iswa_context:: ~llama_kv_cache_unified_iswa_context() = default; llama_kv_cache_iswa_context:: ~llama_kv_cache_iswa_context() = default;
bool llama_kv_cache_unified_iswa_context::next() { bool llama_kv_cache_iswa_context::next() {
assert(status == LLAMA_MEMORY_STATUS_SUCCESS); assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
ctx_base->next(); ctx_base->next();
@ -261,7 +292,7 @@ bool llama_kv_cache_unified_iswa_context::next() {
return true; return true;
} }
bool llama_kv_cache_unified_iswa_context::apply() { bool llama_kv_cache_iswa_context::apply() {
assert(!llama_memory_status_is_fail(status)); assert(!llama_memory_status_is_fail(status));
bool res = true; bool res = true;
@ -272,24 +303,24 @@ bool llama_kv_cache_unified_iswa_context::apply() {
return res; return res;
} }
llama_memory_status llama_kv_cache_unified_iswa_context::get_status() const { llama_memory_status llama_kv_cache_iswa_context::get_status() const {
return status; return status;
} }
const llama_ubatch & llama_kv_cache_unified_iswa_context::get_ubatch() const { const llama_ubatch & llama_kv_cache_iswa_context::get_ubatch() const {
assert(status == LLAMA_MEMORY_STATUS_SUCCESS); assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
return ubatches[i_next]; return ubatches[i_next];
} }
const llama_kv_cache_unified_context * llama_kv_cache_unified_iswa_context::get_base() const { const llama_kv_cache_context * llama_kv_cache_iswa_context::get_base() const {
assert(status == LLAMA_MEMORY_STATUS_SUCCESS); assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
return static_cast<const llama_kv_cache_unified_context *>(ctx_base.get()); return static_cast<const llama_kv_cache_context *>(ctx_base.get());
} }
const llama_kv_cache_unified_context * llama_kv_cache_unified_iswa_context::get_swa() const { const llama_kv_cache_context * llama_kv_cache_iswa_context::get_swa() const {
assert(status == LLAMA_MEMORY_STATUS_SUCCESS); assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
return static_cast<const llama_kv_cache_unified_context *>(ctx_swa.get()); return static_cast<const llama_kv_cache_context *>(ctx_swa.get());
} }

View File

@ -1,19 +1,19 @@
#pragma once #pragma once
#include "llama-kv-cache-unified.h" #include "llama-kv-cache.h"
#include <vector> #include <vector>
// //
// llama_kv_cache_unified_iswa // llama_kv_cache_iswa
// //
// utilizes two instances of llama_kv_cache_unified // utilizes two instances of llama_kv_cache
// the first instance is for the non-SWA layers of the model and the second instance is for the SWA layers // the first instance is for the non-SWA layers of the model and the second instance is for the SWA layers
class llama_kv_cache_unified_iswa : public llama_memory_i { class llama_kv_cache_iswa : public llama_memory_i {
public: public:
llama_kv_cache_unified_iswa( llama_kv_cache_iswa(
const llama_model & model, const llama_model & model,
ggml_type type_k, ggml_type type_k,
ggml_type type_v, ggml_type type_v,
@ -24,9 +24,11 @@ public:
uint32_t kv_size, uint32_t kv_size,
uint32_t n_seq_max, uint32_t n_seq_max,
uint32_t n_ubatch, uint32_t n_ubatch,
uint32_t n_pad); uint32_t n_pad,
const layer_filter_cb & filter,
const layer_reuse_cb & reuse);
~llama_kv_cache_unified_iswa() = default; ~llama_kv_cache_iswa() = default;
// //
// llama_memory_i // llama_memory_i
@ -54,52 +56,54 @@ public:
llama_pos seq_pos_min(llama_seq_id seq_id) const override; llama_pos seq_pos_min(llama_seq_id seq_id) const override;
llama_pos seq_pos_max(llama_seq_id seq_id) const override; llama_pos seq_pos_max(llama_seq_id seq_id) const override;
std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
// state write/load // state write/load
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override; void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override; void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
// //
// llama_kv_cache_unified_iswa specific API // llama_kv_cache_iswa specific API
// //
llama_kv_cache_unified * get_base() const; llama_kv_cache * get_base() const;
llama_kv_cache_unified * get_swa () const; llama_kv_cache * get_swa () const;
private: private:
const llama_hparams & hparams; const llama_hparams & hparams;
const bool unified; const bool unified;
std::unique_ptr<llama_kv_cache_unified> kv_base; std::unique_ptr<llama_kv_cache> kv_base;
std::unique_ptr<llama_kv_cache_unified> kv_swa; std::unique_ptr<llama_kv_cache> kv_swa;
}; };
class llama_kv_cache_unified_iswa_context : public llama_memory_context_i { class llama_kv_cache_iswa_context : public llama_memory_context_i {
public: public:
using slot_info_vec_t = llama_kv_cache_unified::slot_info_vec_t; using slot_info_vec_t = llama_kv_cache::slot_info_vec_t;
// used for errors // used for errors
llama_kv_cache_unified_iswa_context(llama_memory_status status); llama_kv_cache_iswa_context(llama_memory_status status);
// used to create a full-cache context // used to create a full-cache context
llama_kv_cache_unified_iswa_context( llama_kv_cache_iswa_context(
llama_kv_cache_unified_iswa * kv); llama_kv_cache_iswa * kv);
// used to create an update context // used to create an update context
llama_kv_cache_unified_iswa_context( llama_kv_cache_iswa_context(
llama_kv_cache_unified_iswa * kv, llama_kv_cache_iswa * kv,
llama_context * lctx, llama_context * lctx,
bool optimize); bool optimize);
// used to create a batch processing context from a batch // used to create a batch processing context from a batch
llama_kv_cache_unified_iswa_context( llama_kv_cache_iswa_context(
llama_kv_cache_unified_iswa * kv, llama_kv_cache_iswa * kv,
slot_info_vec_t sinfos_base, slot_info_vec_t sinfos_base,
slot_info_vec_t sinfos_swa, slot_info_vec_t sinfos_swa,
std::vector<llama_ubatch> ubatches); std::vector<llama_ubatch> ubatches);
virtual ~llama_kv_cache_unified_iswa_context(); virtual ~llama_kv_cache_iswa_context();
// //
// llama_memory_context_i // llama_memory_context_i
@ -112,14 +116,14 @@ public:
const llama_ubatch & get_ubatch() const override; const llama_ubatch & get_ubatch() const override;
// //
// llama_kv_cache_unified_iswa_context specific API // llama_kv_cache_iswa_context specific API
// //
const llama_kv_cache_unified_context * get_base() const; const llama_kv_cache_context * get_base() const;
const llama_kv_cache_unified_context * get_swa() const; const llama_kv_cache_context * get_swa() const;
private: private:
//llama_kv_cache_unified_iswa * kv; //llama_kv_cache_iswa * kv;
// the index of the next ubatch to process // the index of the next ubatch to process
size_t i_next = 0; size_t i_next = 0;

View File

@ -14,27 +14,13 @@ struct llama_model;
struct llama_context; struct llama_context;
// //
// llama_kv_cache_unified // llama_kv_cache
// //
class llama_kv_cache_unified : public llama_memory_i { class llama_kv_cache : public llama_memory_i {
public: public:
static uint32_t get_padding(const llama_cparams & cparams); static uint32_t get_padding(const llama_cparams & cparams);
// this callback is used to filter out layers that should not be included in the cache
using layer_filter_cb = std::function<bool(int32_t il)>;
struct defrag_info {
bool empty() const {
return ids.empty();
}
// contains information about which cell moves where:
// - cell i moves to ids[i]
// - if ids[i] == i || ids[i] == ids.size(), then cell i is not moved
std::vector<uint32_t> ids;
};
struct stream_copy_info { struct stream_copy_info {
bool empty() const { bool empty() const {
assert(ssrc.size() == sdst.size()); assert(ssrc.size() == sdst.size());
@ -52,8 +38,8 @@ public:
using idx_vec_t = std::vector<uint32_t>; using idx_vec_t = std::vector<uint32_t>;
// number of streams: ns = s1 - s0 + 1 // number of streams: ns = s1 - s0 + 1
llama_seq_id s0; uint32_t s0;
llama_seq_id s1; uint32_t s1;
std::vector<llama_seq_id> strm; // [ns] std::vector<llama_seq_id> strm; // [ns]
std::vector<idx_vec_t> idxs; // [ns] std::vector<idx_vec_t> idxs; // [ns]
@ -92,21 +78,22 @@ public:
using slot_info_vec_t = std::vector<slot_info>; using slot_info_vec_t = std::vector<slot_info>;
llama_kv_cache_unified( llama_kv_cache(
const llama_model & model, const llama_model & model,
layer_filter_cb && filter, ggml_type type_k,
ggml_type type_k, ggml_type type_v,
ggml_type type_v, bool v_trans,
bool v_trans, bool offload,
bool offload, bool unified,
bool unified, uint32_t kv_size,
uint32_t kv_size, uint32_t n_seq_max,
uint32_t n_seq_max, uint32_t n_pad,
uint32_t n_pad, uint32_t n_swa,
uint32_t n_swa, llama_swa_type swa_type,
llama_swa_type swa_type); const layer_filter_cb & filter,
const layer_reuse_cb & reuse);
~llama_kv_cache_unified() = default; ~llama_kv_cache() = default;
// //
// llama_memory_i // llama_memory_i
@ -134,13 +121,15 @@ public:
llama_pos seq_pos_min(llama_seq_id seq_id) const override; llama_pos seq_pos_min(llama_seq_id seq_id) const override;
llama_pos seq_pos_max(llama_seq_id seq_id) const override; llama_pos seq_pos_max(llama_seq_id seq_id) const override;
std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
// state write/load // state write/load
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override; void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override; void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
// //
// llama_kv_cache_unified specific API // llama_kv_cache specific API
// //
uint32_t get_size() const; uint32_t get_size() const;
@ -152,10 +141,7 @@ public:
// graph_build API // graph_build API
// //
uint32_t get_n_kv() const; uint32_t get_n_kv(const slot_info & sinfo) const;
// TODO: temporary
bool get_supports_set_rows() const;
// get views of the current state of the cache // get views of the current state of the cache
ggml_tensor * get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const; ggml_tensor * get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const;
@ -173,7 +159,7 @@ public:
// return empty vector on failure // return empty vector on failure
slot_info_vec_t prepare(const std::vector<llama_ubatch> & ubatches); slot_info_vec_t prepare(const std::vector<llama_ubatch> & ubatches);
bool update(llama_context * lctx, bool do_shift, const defrag_info & dinfo, const stream_copy_info & sc_info); bool update(llama_context * lctx, bool do_shift, const stream_copy_info & sc_info);
// find a slot of kv cells that can hold the ubatch // find a slot of kv cells that can hold the ubatch
// if cont == true, then the slot must be continuous // if cont == true, then the slot must be continuous
@ -228,10 +214,7 @@ private:
// env: LLAMA_KV_CACHE_DEBUG // env: LLAMA_KV_CACHE_DEBUG
int debug = 0; int debug = 0;
// env: LLAMA_SET_ROWS (temporary) // this is the SWA type of the cache - not to be confused with the model SWA type
// ref: https://github.com/ggml-org/llama.cpp/pull/14285
bool supports_set_rows = true;
const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE; const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
std::vector<ggml_context_ptr> ctxs; std::vector<ggml_context_ptr> ctxs;
@ -241,7 +224,7 @@ private:
// note: this is not part of the KV state and it's only used to speed-up the find_slot() method // note: this is not part of the KV state and it's only used to speed-up the find_slot() method
std::vector<uint32_t> v_heads; std::vector<uint32_t> v_heads;
std::vector<llama_kv_cells_unified> v_cells; std::vector<llama_kv_cells> v_cells;
// maps from a sequence id to a stream id // maps from a sequence id to a stream id
std::vector<uint32_t> seq_to_stream; std::vector<uint32_t> seq_to_stream;
@ -254,9 +237,6 @@ private:
// model layer id -> KV cache layer id // model layer id -> KV cache layer id
std::unordered_map<int32_t, int32_t> map_layer_ids; std::unordered_map<int32_t, int32_t> map_layer_ids;
// return non-empty vector if cells have been moved
defrag_info defrag_prepare(int32_t n_max_nodes) const;
size_t total_size() const; size_t total_size() const;
size_t size_k_bytes() const; size_t size_k_bytes() const;
@ -277,11 +257,6 @@ private:
llm_graph_result * res, llm_graph_result * res,
llama_context * lctx) const; llama_context * lctx) const;
ggml_cgraph * build_graph_defrag(
llm_graph_result * res,
llama_context * lctx,
const defrag_info & dinfo) const;
struct cell_ranges_t { struct cell_ranges_t {
uint32_t strm; uint32_t strm;
@ -295,35 +270,33 @@ private:
bool state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count); bool state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count);
}; };
class llama_kv_cache_unified_context : public llama_memory_context_i { class llama_kv_cache_context : public llama_memory_context_i {
public: public:
// some shorthands // some shorthands
using slot_info_vec_t = llama_kv_cache_unified::slot_info_vec_t; using slot_info_vec_t = llama_kv_cache::slot_info_vec_t;
using defrag_info = llama_kv_cache_unified::defrag_info; using stream_copy_info = llama_kv_cache::stream_copy_info;
using stream_copy_info = llama_kv_cache_unified::stream_copy_info;
// used for errors // used for errors
llama_kv_cache_unified_context(llama_memory_status status); llama_kv_cache_context(llama_memory_status status);
// used to create a full-cache context // used to create a full-cache context
llama_kv_cache_unified_context( llama_kv_cache_context(
llama_kv_cache_unified * kv); llama_kv_cache * kv);
// used to create an update context // used to create an update context
llama_kv_cache_unified_context( llama_kv_cache_context(
llama_kv_cache_unified * kv, llama_kv_cache * kv,
llama_context * lctx, llama_context * lctx,
bool do_shift, bool do_shift,
defrag_info dinfo,
stream_copy_info sc_info); stream_copy_info sc_info);
// used to create a batch processing context from a batch // used to create a batch processing context from a batch
llama_kv_cache_unified_context( llama_kv_cache_context(
llama_kv_cache_unified * kv, llama_kv_cache * kv,
slot_info_vec_t sinfos, slot_info_vec_t sinfos,
std::vector<llama_ubatch> ubatches); std::vector<llama_ubatch> ubatches);
virtual ~llama_kv_cache_unified_context(); virtual ~llama_kv_cache_context();
// //
// llama_memory_context_i // llama_memory_context_i
@ -336,22 +309,27 @@ public:
const llama_ubatch & get_ubatch() const override; const llama_ubatch & get_ubatch() const override;
// //
// llama_kv_cache_unified_context specific API // llama_kv_cache_context specific API
// //
uint32_t get_n_kv() const; uint32_t get_n_kv() const;
// TODO: temporary
bool get_supports_set_rows() const;
// get views of the current state of the cache // get views of the current state of the cache
ggml_tensor * get_k(ggml_context * ctx, int32_t il) const; ggml_tensor * get_k(ggml_context * ctx, int32_t il) const;
ggml_tensor * get_v(ggml_context * ctx, int32_t il) const; ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;
// store k_cur and v_cur in the cache based on the provided head location // store k_cur and v_cur in the cache based on the provided head location
// note: the heads in k_cur and v_cur should be laid out contiguously in memory
// - k_cur [n_embd_head_k, n_head_k, n_tokens]
// - k_idxs [n_tokens]
// - v_cur [n_embd_head_v, n_head_v, n_tokens]
// - v_idxs [n_tokens] or [n_tokens*n_embd_v_gqa] depending if V cache is transposed
ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const; ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const;
ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il) const; ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il) const;
// create destination indices for each head of the current batch for where it would be written in the KV cache
// the indices address the global KV cache (not per stream) - this is not relevant for the user of this API, but
// helps understand the implementation logic of cpy_k and cpy_v
ggml_tensor * build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const; ggml_tensor * build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
ggml_tensor * build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const; ggml_tensor * build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
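The new comments above spell out the shapes involved in the index-based copy: each token's K/V rows are scattered into the cache cells selected by k_idxs/v_idxs built from the ubatch. A plain-C++ sketch of that scatter-by-index idea, with a row-major buffer standing in for one K-cache layer (illustrative only, not the ggml graph code):

    #include <cstddef>
    #include <vector>

    // copy each source row i into the destination row idxs[i]
    // dst: row-major [n_cells x row_size] cache buffer, src: [n_tokens x row_size]
    static void scatter_rows(std::vector<float> & dst, size_t row_size,
                             const std::vector<float> & src,
                             const std::vector<size_t> & idxs) {
        for (size_t i = 0; i < idxs.size(); ++i) {
            const float * s = src.data() + i * row_size;
            float       * d = dst.data() + idxs[i] * row_size;
            for (size_t j = 0; j < row_size; ++j) {
                d[j] = s[j];
            }
        }
    }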
@ -365,7 +343,7 @@ public:
private: private:
llama_memory_status status; llama_memory_status status;
llama_kv_cache_unified * kv; llama_kv_cache * kv;
llama_context * lctx; llama_context * lctx;
// //
@ -374,8 +352,6 @@ private:
bool do_shift = false; bool do_shift = false;
defrag_info dinfo;
stream_copy_info sc_info; stream_copy_info sc_info;
// //

View File

@ -11,7 +11,7 @@
// meta information about KV cells that can be part of multiple sequences at the same time // meta information about KV cells that can be part of multiple sequences at the same time
// TODO: add unit tests // TODO: add unit tests
class llama_kv_cells_unified { class llama_kv_cells {
public: public:
void reset() { void reset() {
for (uint32_t i = 0; i < pos.size(); ++i) { for (uint32_t i = 0; i < pos.size(); ++i) {
@ -77,30 +77,30 @@ public:
} }
// move cell isrc to idst (used during defrag) // move cell isrc to idst (used during defrag)
void mv(uint32_t isrc, uint32_t idst) { //void mv(uint32_t isrc, uint32_t idst) {
assert(isrc < pos.size()); // assert(isrc < pos.size());
assert(idst < pos.size()); // assert(idst < pos.size());
assert(pos[idst] == -1); // assert(pos[idst] == -1);
assert(pos[isrc] != -1); // assert(pos[isrc] != -1);
pos [idst] = pos [isrc]; // pos [idst] = pos [isrc];
shift[idst] = shift[isrc]; // shift[idst] = shift[isrc];
seq [idst] = seq [isrc]; // seq [idst] = seq [isrc];
pos [isrc] = -1; // pos [isrc] = -1;
shift[isrc] = 0; // shift[isrc] = 0;
seq [isrc].reset(); // seq [isrc].reset();
used.erase (isrc); // used.erase (isrc);
used.insert(idst); // used.insert(idst);
} //}
// copy the state of cells [i, i + n) (used for save/restore the state of the cells) // copy the state of cells [i, i + n) (used for save/restore the state of the cells)
llama_kv_cells_unified cp(uint32_t i, uint32_t n) const { llama_kv_cells cp(uint32_t i, uint32_t n) const {
assert(i + n <= pos.size()); assert(i + n <= pos.size());
llama_kv_cells_unified res; llama_kv_cells res;
res.resize(n); res.resize(n);
@ -117,8 +117,8 @@ public:
} }
// copy the state of cells [idxs[0], idxs[1], ..., idxs[idxs.size() - 1]) // copy the state of cells [idxs[0], idxs[1], ..., idxs[idxs.size() - 1])
llama_kv_cells_unified cp(const std::vector<uint32_t> & idxs) const { llama_kv_cells cp(const std::vector<uint32_t> & idxs) const {
llama_kv_cells_unified res; llama_kv_cells res;
res.resize(idxs.size()); res.resize(idxs.size());
@ -135,7 +135,7 @@ public:
} }
// set the state of cells [i, i + other.pos.size()) (used for save/restore the state of the cells) // set the state of cells [i, i + other.pos.size()) (used for save/restore the state of the cells)
void set(uint32_t i, const llama_kv_cells_unified & other) { void set(uint32_t i, const llama_kv_cells & other) {
assert(i + other.pos.size() <= pos.size()); assert(i + other.pos.size() <= pos.size());
for (uint32_t j = 0; j < other.pos.size(); ++j) { for (uint32_t j = 0; j < other.pos.size(); ++j) {
@ -165,7 +165,7 @@ public:
} }
// set the state of cells [idxs[0], idxs[1], ..., idxs[idxs.size() - 1]) // set the state of cells [idxs[0], idxs[1], ..., idxs[idxs.size() - 1])
void set(const std::vector<uint32_t> & idxs, const llama_kv_cells_unified & other) { void set(const std::vector<uint32_t> & idxs, const llama_kv_cells & other) {
assert(idxs.size() == other.pos.size()); assert(idxs.size() == other.pos.size());
for (uint32_t j = 0; j < other.pos.size(); ++j) { for (uint32_t j = 0; j < other.pos.size(); ++j) {

View File

@ -9,32 +9,29 @@
// //
llama_memory_hybrid::llama_memory_hybrid( llama_memory_hybrid::llama_memory_hybrid(
const llama_model & model, const llama_model & model,
/* attn */ /* attn */
ggml_type type_k, ggml_type type_k,
ggml_type type_v, ggml_type type_v,
bool v_trans, bool v_trans,
uint32_t kv_size, uint32_t kv_size,
uint32_t n_pad, uint32_t n_pad,
uint32_t n_swa, uint32_t n_swa,
llama_swa_type swa_type, llama_swa_type swa_type,
/* recurrent */ /* recurrent */
ggml_type type_r, ggml_type type_r,
ggml_type type_s, ggml_type type_s,
uint32_t rs_size, uint32_t rs_size,
/* common */ /* common */
uint32_t n_seq_max, uint32_t n_seq_max,
bool offload, bool offload,
bool unified, bool unified,
/* layer filters */ /* layer filters */
layer_filter_cb && filter_attn, const layer_filter_cb & filter_attn,
layer_filter_cb && filter_recr) : const layer_filter_cb & filter_recr) :
hparams(model.hparams), hparams(model.hparams),
mem_attn(new llama_kv_cache_unified( mem_attn(new llama_kv_cache(
model, model,
filter_attn == nullptr ?
[&](int32_t il) { return !hparams.is_recurrent(il); }
: filter_attn,
type_k, type_k,
type_v, type_v,
v_trans, v_trans,
@ -44,18 +41,22 @@ llama_memory_hybrid::llama_memory_hybrid(
n_seq_max, n_seq_max,
n_pad, n_pad,
n_swa, n_swa,
swa_type swa_type,
filter_attn == nullptr ?
[&](int32_t il) { return !hparams.is_recurrent(il); }
: filter_attn,
nullptr
)), )),
mem_recr(new llama_memory_recurrent( mem_recr(new llama_memory_recurrent(
model, model,
filter_recr == nullptr ?
[&](int32_t il) { return hparams.is_recurrent(il); }
: filter_recr,
type_r, type_r,
type_s, type_s,
offload, offload,
rs_size, rs_size,
n_seq_max n_seq_max,
filter_recr == nullptr ?
[&](int32_t il) { return hparams.is_recurrent(il); }
: filter_recr
)) {} )) {}
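When no explicit filters are supplied, the hybrid memory above defaults to sending non-recurrent layers to the attention cache and recurrent layers to the recurrent cache. The fallback selection, sketched with a placeholder is_recurrent() predicate:

    #include <cstdint>
    #include <functional>

    using layer_filter_cb = std::function<bool(int32_t il)>;

    // use the caller's filter if present, otherwise derive the default split from is_recurrent()
    static layer_filter_cb attn_filter_or_default(const layer_filter_cb & user,
                                                  std::function<bool(int32_t)> is_recurrent) {
        if (user) {
            return user;
        }
        return [is_recurrent](int32_t il) { return !is_recurrent(il); };
    }
    // the recurrent-side filter is the mirror image: keep layers where is_recurrent(il) is true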
llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) { llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
@ -165,17 +166,29 @@ llama_pos llama_memory_hybrid::seq_pos_max(llama_seq_id seq_id) const {
return std::min(mem_attn->seq_pos_max(seq_id), mem_recr->seq_pos_max(seq_id)); return std::min(mem_attn->seq_pos_max(seq_id), mem_recr->seq_pos_max(seq_id));
} }
void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id) const { std::map<ggml_backend_buffer_type_t, size_t> llama_memory_hybrid::memory_breakdown() const {
std::map<ggml_backend_buffer_type_t, size_t> mb = mem_attn->memory_breakdown();
for (const auto & buft_size : mem_recr->memory_breakdown()) {
mb[buft_size.first] += buft_size.second;
}
return mb;
}
void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
GGML_UNUSED(flags);
mem_attn->state_write(io, seq_id); mem_attn->state_write(io, seq_id);
mem_recr->state_write(io, seq_id); mem_recr->state_write(io, seq_id);
} }
void llama_memory_hybrid::state_read(llama_io_read_i & io, llama_seq_id seq_id) { void llama_memory_hybrid::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
GGML_UNUSED(flags);
mem_attn->state_read(io, seq_id); mem_attn->state_read(io, seq_id);
mem_recr->state_read(io, seq_id); mem_recr->state_read(io, seq_id);
} }
llama_kv_cache_unified * llama_memory_hybrid::get_mem_attn() const { llama_kv_cache * llama_memory_hybrid::get_mem_attn() const {
return mem_attn.get(); return mem_attn.get();
} }
@ -206,7 +219,7 @@ llama_memory_hybrid_context::llama_memory_hybrid_context(
std::vector<llama_ubatch> ubatches) : std::vector<llama_ubatch> ubatches) :
ubatches(std::move(ubatches)), ubatches(std::move(ubatches)),
// note: here we copy the ubatches. not sure if this is ideal // note: here we copy the ubatches. not sure if this is ideal
ctx_attn(new llama_kv_cache_unified_context(mem->get_mem_attn(), std::move(sinfos_attn), this->ubatches)), ctx_attn(new llama_kv_cache_context(mem->get_mem_attn(), std::move(sinfos_attn), this->ubatches)),
ctx_recr(new llama_memory_recurrent_context(mem->get_mem_recr(), this->ubatches)), ctx_recr(new llama_memory_recurrent_context(mem->get_mem_recr(), this->ubatches)),
status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) { status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
} }
@ -244,8 +257,8 @@ const llama_ubatch & llama_memory_hybrid_context::get_ubatch() const {
return ubatches[i_next]; return ubatches[i_next];
} }
const llama_kv_cache_unified_context * llama_memory_hybrid_context::get_attn() const { const llama_kv_cache_context * llama_memory_hybrid_context::get_attn() const {
return static_cast<const llama_kv_cache_unified_context *>(ctx_attn.get()); return static_cast<const llama_kv_cache_context *>(ctx_attn.get());
} }
const llama_memory_recurrent_context * llama_memory_hybrid_context::get_recr() const { const llama_memory_recurrent_context * llama_memory_hybrid_context::get_recr() const {

View File

@ -2,7 +2,7 @@
#include "llama-batch.h" #include "llama-batch.h"
#include "llama-graph.h" #include "llama-graph.h"
#include "llama-kv-cache-unified.h" #include "llama-kv-cache.h"
#include "llama-memory.h" #include "llama-memory.h"
#include "llama-memory-recurrent.h" #include "llama-memory-recurrent.h"
@ -13,36 +13,32 @@
// llama_memory_hybrid // llama_memory_hybrid
// //
// utilizes instances of llama_memory_recurrent and llama_kv_cache_unified to // utilizes instances of llama_memory_recurrent and llama_kv_cache to
// support models where each layer may be either attention-based or recurrent // support models where each layer may be either attention-based or recurrent
class llama_memory_hybrid : public llama_memory_i { class llama_memory_hybrid : public llama_memory_i {
public: public:
// this callback is used to filter out layers that should not be included in the cache
using layer_filter_cb = std::function<bool(int32_t il)>;
llama_memory_hybrid( llama_memory_hybrid(
const llama_model & model, const llama_model & model,
/* attn */ /* attn */
ggml_type type_k, ggml_type type_k,
ggml_type type_v, ggml_type type_v,
bool v_trans, bool v_trans,
uint32_t kv_size, uint32_t kv_size,
uint32_t n_pad, uint32_t n_pad,
uint32_t n_swa, uint32_t n_swa,
llama_swa_type swa_type, llama_swa_type swa_type,
/* recurrent */ /* recurrent */
ggml_type type_r, ggml_type type_r,
ggml_type type_s, ggml_type type_s,
uint32_t rs_size, uint32_t rs_size,
/* common */ /* common */
uint32_t n_seq_max, uint32_t n_seq_max,
bool offload, bool offload,
bool unified, bool unified,
/* layer filters */ /* layer filters */
layer_filter_cb && filter_attn = nullptr, const layer_filter_cb & filter_attn = nullptr,
layer_filter_cb && filter_recr = nullptr); const layer_filter_cb & filter_recr = nullptr);
~llama_memory_hybrid() = default; ~llama_memory_hybrid() = default;
@ -72,28 +68,30 @@ public:
llama_pos seq_pos_min(llama_seq_id seq_id) const override; llama_pos seq_pos_min(llama_seq_id seq_id) const override;
llama_pos seq_pos_max(llama_seq_id seq_id) const override; llama_pos seq_pos_max(llama_seq_id seq_id) const override;
std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
// state write/load // state write/load
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override; void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override; void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
// //
// llama_memory_hybrid specific API // llama_memory_hybrid specific API
// //
llama_kv_cache_unified * get_mem_attn() const; llama_kv_cache * get_mem_attn() const;
llama_memory_recurrent * get_mem_recr() const; llama_memory_recurrent * get_mem_recr() const;
private: private:
const llama_hparams & hparams; const llama_hparams & hparams;
const std::unique_ptr<llama_kv_cache_unified> mem_attn; const std::unique_ptr<llama_kv_cache> mem_attn;
const std::unique_ptr<llama_memory_recurrent> mem_recr; const std::unique_ptr<llama_memory_recurrent> mem_recr;
}; };
class llama_memory_hybrid_context : public llama_memory_context_i { class llama_memory_hybrid_context : public llama_memory_context_i {
public: public:
using slot_info_vec_t = llama_kv_cache_unified::slot_info_vec_t; using slot_info_vec_t = llama_kv_cache::slot_info_vec_t;
// init failure // init failure
explicit llama_memory_hybrid_context(llama_memory_status status); explicit llama_memory_hybrid_context(llama_memory_status status);
@ -125,7 +123,7 @@ public:
// llama_memory_hybrid_context // llama_memory_hybrid_context
// //
const llama_kv_cache_unified_context * get_attn() const; const llama_kv_cache_context * get_attn() const;
const llama_memory_recurrent_context * get_recr() const; const llama_memory_recurrent_context * get_recr() const;
private: private:

View File

@ -16,13 +16,13 @@
// //
llama_memory_recurrent::llama_memory_recurrent( llama_memory_recurrent::llama_memory_recurrent(
const llama_model & model, const llama_model & model,
layer_filter_cb && filter, ggml_type type_r,
ggml_type type_r, ggml_type type_s,
ggml_type type_s, bool offload,
bool offload, uint32_t mem_size,
uint32_t mem_size, uint32_t n_seq_max,
uint32_t n_seq_max) : hparams(model.hparams), n_seq_max(n_seq_max) { const layer_filter_cb & filter) : hparams(model.hparams), n_seq_max(n_seq_max) {
const int32_t n_layer = hparams.n_layer; const int32_t n_layer = hparams.n_layer;
head = 0; head = 0;
@ -359,6 +359,14 @@ llama_pos llama_memory_recurrent::seq_pos_max(llama_seq_id seq_id) const {
return result; return result;
} }
std::map<ggml_backend_buffer_type_t, size_t> llama_memory_recurrent::memory_breakdown() const {
std::map<ggml_backend_buffer_type_t, size_t> ret;
for (const ggml_backend_buffer_ptr & buf_ptr : bufs) {
ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
}
return ret;
}
llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) { llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
do { do {
balloc.split_reset(); balloc.split_reset();
@ -680,7 +688,9 @@ size_t llama_memory_recurrent::size_s_bytes() const {
return size_s_bytes; return size_s_bytes;
} }
void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq_id) const { void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
GGML_UNUSED(flags);
std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
uint32_t cell_count = 0; uint32_t cell_count = 0;
@ -718,7 +728,9 @@ void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq
state_write_data(io, cell_ranges); state_write_data(io, cell_ranges);
} }
void llama_memory_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq_id) { void llama_memory_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
GGML_UNUSED(flags);
uint32_t cell_count; uint32_t cell_count;
io.read_to(&cell_count, sizeof(cell_count)); io.read_to(&cell_count, sizeof(cell_count));

View File

@ -4,6 +4,7 @@
#include "llama-graph.h" #include "llama-graph.h"
#include "llama-memory.h" #include "llama-memory.h"
#include <map>
#include <set> #include <set>
#include <vector> #include <vector>
@ -12,21 +13,17 @@
// //
// TODO: extract the cache state used for graph computation into llama_memory_recurrent_context_i // TODO: extract the cache state used for graph computation into llama_memory_recurrent_context_i
// see the implementation of llama_kv_cache_unified_context_i for an example how to do it // see the implementation of llama_kv_cache_context_i for an example how to do it
class llama_memory_recurrent : public llama_memory_i { class llama_memory_recurrent : public llama_memory_i {
public: public:
// this callback is used to filter out layers that should not be included in the cache
using layer_filter_cb = std::function<bool(int32_t il)>;
llama_memory_recurrent( llama_memory_recurrent(
const llama_model & model, const llama_model & model,
layer_filter_cb && filter, ggml_type type_r,
ggml_type type_r, ggml_type type_s,
ggml_type type_s, bool offload,
bool offload, uint32_t mem_size,
uint32_t mem_size, uint32_t n_seq_max,
uint32_t n_seq_max); const layer_filter_cb & filter);
~llama_memory_recurrent() = default; ~llama_memory_recurrent() = default;
@ -54,6 +51,8 @@ public:
llama_pos seq_pos_min(llama_seq_id seq_id) const override; llama_pos seq_pos_min(llama_seq_id seq_id) const override;
llama_pos seq_pos_max(llama_seq_id seq_id) const override; llama_pos seq_pos_max(llama_seq_id seq_id) const override;
std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
bool prepare(const std::vector<llama_ubatch> & ubatches); bool prepare(const std::vector<llama_ubatch> & ubatches);
// find a contiguous slot of memory cells and emplace the ubatch there // find a contiguous slot of memory cells and emplace the ubatch there
@ -63,8 +62,8 @@ public:
// state write/load // state write/load
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override; void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override; void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot()) uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot())
uint32_t size = 0; // total number of cells, shared across all sequences uint32_t size = 0; // total number of cells, shared across all sequences

View File

@ -2,7 +2,9 @@
#include "llama.h" #include "llama.h"
#include <map>
#include <memory> #include <memory>
#include <functional>
struct llama_ubatch; struct llama_ubatch;
@ -36,8 +38,8 @@ bool llama_memory_status_is_fail(llama_memory_status status);
// the interface for managing the memory context during batch processing // the interface for managing the memory context during batch processing
// this interface is implemented per memory type. see: // this interface is implemented per memory type. see:
// - llama_kv_cache_unified_context // - llama_kv_cache_context
// - llama_kv_cache_unified_iswa_context // - llama_kv_cache_iswa_context
// ... // ...
// //
// the only method that should mutate the memory and the memory context is llama_memory_i::apply() // the only method that should mutate the memory and the memory context is llama_memory_i::apply()
@ -64,6 +66,13 @@ using llama_memory_context_ptr = std::unique_ptr<llama_memory_context_i>;
// general concept of LLM memory // general concept of LLM memory
// the KV cache is a type of LLM memory, but there can be other types // the KV cache is a type of LLM memory, but there can be other types
struct llama_memory_i { struct llama_memory_i {
// this callback is used to filter out layers that should not be included in the cache
using layer_filter_cb = std::function<bool(int32_t il)>;
// this callback is used to specify which layers should reuse memory from other layers
// return negative value to indicate that the layer il should not reuse memory
using layer_reuse_cb = std::function<int32_t(int32_t il)>;
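layer_reuse_cb is new: it lets a cache point a layer at another layer's KV storage instead of allocating its own, with a negative return meaning "no reuse". A hypothetical example policy and consumer (the real consumer lives inside the cache constructors):

    #include <cstdint>
    #include <functional>

    // return the layer whose buffers layer il should share, or a negative value for "own storage"
    using layer_reuse_cb = std::function<int32_t(int32_t il)>;

    // example policy: every odd layer reuses the buffers of the preceding even layer
    static const layer_reuse_cb reuse_every_other = [](int32_t il) -> int32_t {
        return (il % 2 == 1) ? il - 1 : -1;
    };

    // resolve which layer actually owns the storage backing layer il
    static int32_t storage_layer(int32_t il, const layer_reuse_cb & reuse) {
        if (reuse) {
            const int32_t src = reuse(il);
            if (src >= 0) {
                return src;
            }
        }
        return il;
    }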
virtual ~llama_memory_i() = default; virtual ~llama_memory_i() = default;
// split the input batch into a set of ubatches and verify that they can fit into the cache // split the input batch into a set of ubatches and verify that they can fit into the cache
@ -77,7 +86,7 @@ struct llama_memory_i {
// simulate full cache, used for allocating worst-case compute buffers // simulate full cache, used for allocating worst-case compute buffers
virtual llama_memory_context_ptr init_full() = 0; virtual llama_memory_context_ptr init_full() = 0;
// prepare for any pending memory updates, such as shifts, defrags, etc. // prepare for any pending memory updates, such as shifts, copies, etc.
// status == LLAMA_MEMORY_STATUS_NO_UPDATE if there is nothing to update // status == LLAMA_MEMORY_STATUS_NO_UPDATE if there is nothing to update
virtual llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) = 0; virtual llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) = 0;
@ -100,17 +109,14 @@ struct llama_memory_i {
virtual llama_pos seq_pos_min(llama_seq_id seq_id) const = 0; virtual llama_pos seq_pos_min(llama_seq_id seq_id) const = 0;
virtual llama_pos seq_pos_max(llama_seq_id seq_id) const = 0; virtual llama_pos seq_pos_max(llama_seq_id seq_id) const = 0;
virtual std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const = 0;
// //
// state write/read // state write/read
// //
virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const = 0; virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const = 0;
virtual void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) = 0; virtual void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) = 0;
}; };
using llama_memory_ptr = std::unique_ptr<llama_memory_i>; using llama_memory_ptr = std::unique_ptr<llama_memory_i>;
// TODO: temporary until the llama_kv_cache is removed from the public API
struct llama_kv_cache : public llama_memory_i {
virtual ~llama_kv_cache() = default;
};

View File

@ -789,6 +789,7 @@ const struct ggml_tensor * llama_model_loader::check_tensor_dims(const std::stri
} }
struct ggml_tensor * llama_model_loader::create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags) { struct ggml_tensor * llama_model_loader::create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags) {
LLAMA_LOG_DEBUG("%s: loading tensor %s\n", __func__, name.c_str());
const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED)); const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));
if (cur == NULL) { if (cur == NULL) {

File diff suppressed because it is too large

View File

@ -7,6 +7,7 @@
#include "llama-memory.h" #include "llama-memory.h"
#include "llama-vocab.h" #include "llama-vocab.h"
#include <map>
#include <memory> #include <memory>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
@ -28,6 +29,7 @@ enum llm_type {
LLM_TYPE_80M, LLM_TYPE_80M,
LLM_TYPE_109M, LLM_TYPE_109M,
LLM_TYPE_137M, LLM_TYPE_137M,
LLM_TYPE_140M,
LLM_TYPE_160M, LLM_TYPE_160M,
LLM_TYPE_190M, LLM_TYPE_190M,
LLM_TYPE_220M, LLM_TYPE_220M,
@ -36,12 +38,15 @@ enum llm_type {
LLM_TYPE_270M, LLM_TYPE_270M,
LLM_TYPE_335M, LLM_TYPE_335M,
LLM_TYPE_350M, LLM_TYPE_350M,
LLM_TYPE_360M,
LLM_TYPE_410M, LLM_TYPE_410M,
LLM_TYPE_450M, LLM_TYPE_450M,
LLM_TYPE_475M, LLM_TYPE_475M,
LLM_TYPE_558M,
LLM_TYPE_700M, LLM_TYPE_700M,
LLM_TYPE_770M, LLM_TYPE_770M,
LLM_TYPE_780M, LLM_TYPE_780M,
LLM_TYPE_950M,
LLM_TYPE_0_3B, LLM_TYPE_0_3B,
LLM_TYPE_0_5B, LLM_TYPE_0_5B,
LLM_TYPE_0_6B, LLM_TYPE_0_6B,
@ -54,6 +59,7 @@ enum llm_type {
LLM_TYPE_1_7B, LLM_TYPE_1_7B,
LLM_TYPE_1_8B, LLM_TYPE_1_8B,
LLM_TYPE_2B, LLM_TYPE_2B,
LLM_TYPE_2_6B,
LLM_TYPE_2_8B, LLM_TYPE_2_8B,
LLM_TYPE_2_9B, LLM_TYPE_2_9B,
LLM_TYPE_3B, LLM_TYPE_3B,
@ -76,9 +82,11 @@ enum llm_type {
LLM_TYPE_32B, LLM_TYPE_32B,
LLM_TYPE_34B, LLM_TYPE_34B,
LLM_TYPE_35B, LLM_TYPE_35B,
LLM_TYPE_36B,
LLM_TYPE_40B, LLM_TYPE_40B,
LLM_TYPE_65B, LLM_TYPE_65B,
LLM_TYPE_70B, LLM_TYPE_70B,
LLM_TYPE_120B,
LLM_TYPE_142B, LLM_TYPE_142B,
LLM_TYPE_236B, LLM_TYPE_236B,
LLM_TYPE_290B, LLM_TYPE_290B,
@ -268,6 +276,11 @@ struct llama_layer {
struct ggml_tensor * ffn_down_shexp = nullptr; struct ggml_tensor * ffn_down_shexp = nullptr;
struct ggml_tensor * ffn_up_shexp = nullptr; struct ggml_tensor * ffn_up_shexp = nullptr;
// ff adjugate experts (chexps)
struct ggml_tensor * ffn_gate_chexps = nullptr;
struct ggml_tensor * ffn_down_chexps = nullptr;
struct ggml_tensor * ffn_up_chexps = nullptr;
// ff bias // ff bias
struct ggml_tensor * ffn_gate_b = nullptr; struct ggml_tensor * ffn_gate_b = nullptr;
struct ggml_tensor * ffn_down_b = nullptr; // b2 struct ggml_tensor * ffn_down_b = nullptr; // b2
@ -449,10 +462,12 @@ struct llama_model {
std::string desc() const; std::string desc() const;
size_t size() const; size_t size() const; // file size
size_t n_tensors() const; size_t n_tensors() const;
size_t n_devices() const; size_t n_devices() const;
std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const;
// total number of parameters in the model // total number of parameters in the model
uint64_t n_elements() const; uint64_t n_elements() const;

View File

@ -725,7 +725,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
// attention layers have a non-zero number of kv heads // attention layers have a non-zero number of kv heads
int32_t n_attn_layer = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0); int32_t n_attn_layer = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0);
if (llama_model_has_encoder(&model)) { if (llama_model_has_encoder(&model)) {
n_attn_layer *= 3; // now n_attn_layer is the number of attention layers in the encoder
// for each decoder block, there are 2 attention layers
n_attn_layer += 2 * model.hparams.dec_n_layer;
} }
GGML_ASSERT((qs.n_attention_wv == n_attn_layer - pruned_attention_w) && "n_attention_wv is unexpected"); GGML_ASSERT((qs.n_attention_wv == n_attn_layer - pruned_attention_w) && "n_attention_wv is unexpected");
} }
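For illustration of the corrected count: a hypothetical encoder-decoder model with 12 encoder layers and 6 decoder blocks has 12 encoder self-attention layers plus 2 attention layers per decoder block (self-attention and cross-attention), i.e. 12 + 2*6 = 24; the old n_attn_layer *= 3 shortcut was only correct when encoder and decoder depths matched (12*3 = 36 vs. 12 + 2*12 = 36).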
@ -920,7 +922,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
new_type = tensor->type; new_type = tensor->type;
new_data = tensor->data; new_data = tensor->data;
new_size = ggml_nbytes(tensor); new_size = ggml_nbytes(tensor);
LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0); LLAMA_LOG_INFO("size = %8.3f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0);
} else { } else {
const int64_t nelements = ggml_nelements(tensor); const int64_t nelements = ggml_nelements(tensor);
@ -1037,8 +1039,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
} }
close_ofstream(); close_ofstream();
LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0); LLAMA_LOG_INFO("%s: model size = %8.2f MiB\n", __func__, total_size_org/1024.0/1024.0);
LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0); LLAMA_LOG_INFO("%s: quant size = %8.2f MiB\n", __func__, total_size_new/1024.0/1024.0);
if (qs.n_fallback > 0) { if (qs.n_fallback > 0) {
LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n", LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n",

View File

@ -128,6 +128,89 @@ struct ring_buffer {
std::vector<T> data; std::vector<T> data;
}; };
// writes result in res, does not mutate cur
static void llama_token_data_array_partial_sort(const llama_token_data_array & cur, int npartial, std::vector<llama_token_data> & res) {
static const auto comp = [](const llama_token_data & a, const llama_token_data & b) {
return a.logit > b.logit;
};
constexpr int nbuckets = 128;
constexpr float bucket_low = -10.0f;
constexpr float bucket_high = 10.0f;
constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
constexpr float bucket_inter = -bucket_low * bucket_scale;
std::vector<int> bucket_idx;
std::vector<int> histo(nbuckets, 0);
std::vector<llama_token_data*> bucket_ptrs;
bucket_idx.reserve(cur.size);
for (int i = 0; i < (int)cur.size; ++i) {
const float val = cur.data[i].logit;
int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
ib = std::max(0, std::min(nbuckets - 1, ib));
bucket_idx.push_back(ib);
++histo[ib];
}
int nhave = 0;
int ib = nbuckets - 1;
for ( ; ib >= 0; --ib) {
nhave += histo[ib];
if (nhave >= npartial) {
break;
}
}
res.resize(nhave);
auto * ptr = res.data();
bucket_ptrs.reserve(nbuckets - ib);
for (int j = nbuckets - 1; j >= ib; --j) {
bucket_ptrs.push_back(ptr);
ptr += histo[j];
}
for (int i = 0; i < (int)cur.size; ++i) {
int j = bucket_idx[i];
if (j >= ib) {
*bucket_ptrs[nbuckets - 1 - j]++ = cur.data[i];
}
}
ptr = res.data();
int ndone = 0;
for (int j = nbuckets - 1; j > ib; --j) {
std::sort(ptr, ptr + histo[j], comp);
ptr += histo[j];
ndone += histo[j];
}
std::partial_sort(ptr, ptr + npartial - ndone, ptr + histo[ib], comp);
}
// reduces the size of cur_p to npartial, keeping only the top npartial elements
static void llama_token_data_array_partial_sort_inplace(llama_token_data_array * cur_p, int npartial) {
static const auto comp = [](const llama_token_data & a, const llama_token_data & b) {
return a.logit > b.logit;
};
if (npartial <= 128) {
std::partial_sort(cur_p->data, cur_p->data + npartial, cur_p->data + cur_p->size, comp);
cur_p->size = npartial;
cur_p->sorted = true;
return;
}
std::vector<llama_token_data> tmp;
llama_token_data_array_partial_sort(*cur_p, npartial, tmp);
std::copy(tmp.data(), tmp.data() + npartial, cur_p->data);
cur_p->size = npartial;
cur_p->sorted = true;
}
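These two helpers replace the bucket sort that previously lived inline in llama_sampler_top_k_impl (removed further down). To make the mechanism concrete, here is a self-contained sketch of the same bucket-then-sort idea on plain floats; it mirrors the bucket range and count above and assumes k <= vals.size() (illustrative only):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <functional>
    #include <vector>

    // return the k largest values in descending order: histogram values into coarse
    // buckets, keep only the top buckets holding at least k elements, sort just those
    static std::vector<float> top_k_bucketed(const std::vector<float> & vals, int k) {
        constexpr int   nbuckets = 128;
        constexpr float lo = -10.0f, hi = 10.0f;
        constexpr float scale = nbuckets / (hi - lo);

        std::vector<int> histo(nbuckets, 0);
        std::vector<int> bucket_of(vals.size());
        for (size_t i = 0; i < vals.size(); ++i) {
            int ib = (int)(scale * (vals[i] - lo));
            ib = std::max(0, std::min(nbuckets - 1, ib));
            bucket_of[i] = ib;
            ++histo[ib];
        }

        // walk buckets from the highest values down until at least k candidates are covered
        int nhave = 0, ib = nbuckets - 1;
        for (; ib >= 0; --ib) {
            nhave += histo[ib];
            if (nhave >= k) {
                break;
            }
        }

        std::vector<float> cand;
        cand.reserve(nhave);
        for (size_t i = 0; i < vals.size(); ++i) {
            if (bucket_of[i] >= ib) {
                cand.push_back(vals[i]);
            }
        }

        // only the candidate set (usually far smaller than vals) needs sorting
        std::partial_sort(cand.begin(), cand.begin() + k, cand.end(), std::greater<float>());
        cand.resize(k);
        return cand;
    }

    int main() {
        std::vector<float> logits;
        for (int i = 0; i < 50000; ++i) {
            logits.push_back(9.0f * std::sin(0.1f * i));
        }
        for (float v : top_k_bucketed(logits, 5)) {
            std::printf("%.3f\n", v);
        }
    }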
static int llama_sample_dist(llama_token_data_array * cur_p, std::mt19937 & rng) { static int llama_sample_dist(llama_token_data_array * cur_p, std::mt19937 & rng) {
// iterator for the probabilities // iterator for the probabilities
#ifdef __GNUC__ #ifdef __GNUC__
@ -200,18 +283,21 @@ static void llama_sampler_temp_impl(llama_token_data_array * cur_p, float temp)
} }
} }
static void llama_sampler_softmax_impl(llama_token_data_array * cur_p) { static void llama_sampler_softmax_impl(llama_token_data_array * cur_p, bool do_sort) {
GGML_ASSERT(cur_p->size > 0); GGML_ASSERT(cur_p->size > 0);
// Sort the logits in descending order // Sort the logits in descending order if requested
if (!cur_p->sorted) { if (do_sort && !cur_p->sorted) {
std::sort(cur_p->data, cur_p->data + cur_p->size, [](const llama_token_data & a, const llama_token_data & b) { llama_token_data_array_partial_sort_inplace(cur_p, cur_p->size);
return a.logit > b.logit;
});
cur_p->sorted = true;
} }
float max_l = cur_p->data[0].logit; float max_l = cur_p->data[0].logit;
if (!cur_p->sorted) {
for (size_t i = 1; i < cur_p->size; ++i) {
max_l = std::max(max_l, cur_p->data[i].logit);
}
}
float cum_sum = 0.0f; float cum_sum = 0.0f;
for (size_t i = 0; i < cur_p->size; ++i) { for (size_t i = 0; i < cur_p->size; ++i) {
@ -226,7 +312,6 @@ static void llama_sampler_softmax_impl(llama_token_data_array * cur_p) {
} }
static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) { static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) {
// TODO: move bucket sort to separate function so that top_p/typical/softmax first is equally fast
// if (k >= (int32_t)cur_p->size) { // if (k >= (int32_t)cur_p->size) {
// return; // return;
// } // }
@ -239,64 +324,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
// Sort scores in descending order // Sort scores in descending order
if (!cur_p->sorted) { if (!cur_p->sorted) {
auto comp = [](const llama_token_data & a, const llama_token_data & b) { llama_token_data_array_partial_sort_inplace(cur_p, k);
return a.logit > b.logit;
};
if (k <= 128) {
std::partial_sort(cur_p->data, cur_p->data + k, cur_p->data + cur_p->size, comp);
} else {
constexpr int nbuckets = 128;
constexpr float bucket_low = -10.0f;
constexpr float bucket_high = 10.0f;
constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
constexpr float bucket_inter = -bucket_low * bucket_scale;
std::vector<int> bucket_idx(cur_p->size);
std::vector<int> histo(nbuckets, 0);
for (int i = 0; i < (int)cur_p->size; ++i) {
const float val = cur_p->data[i].logit;
int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
ib = std::max(0, std::min(nbuckets - 1, ib));
bucket_idx[i] = ib;
++histo[ib];
}
int nhave = 0;
int ib = nbuckets - 1;
for ( ; ib >= 0; --ib) {
nhave += histo[ib];
if (nhave >= k) {
break;
}
}
std::vector<llama_token_data> tmp_tokens(nhave);
auto * ptr = tmp_tokens.data();
std::vector<llama_token_data*> bucket_ptrs;
bucket_ptrs.reserve(nbuckets - ib);
for (int j = nbuckets - 1; j >= ib; --j) {
bucket_ptrs.push_back(ptr);
ptr += histo[j];
}
for (int i = 0; i < (int)cur_p->size; ++i) {
int j = bucket_idx[i];
if (j >= ib) {
*bucket_ptrs[nbuckets - 1 - j]++ = cur_p->data[i];
}
}
ptr = tmp_tokens.data();
int ndone = 0;
for (int j = nbuckets - 1; j > ib; --j) {
std::sort(ptr, ptr + histo[j], comp);
ptr += histo[j];
ndone += histo[j];
}
std::partial_sort(ptr, ptr + k - ndone, ptr + histo[ib], comp);
std::memcpy(cur_p->data, tmp_tokens.data(), k*sizeof(llama_token_data));
}
cur_p->sorted = true;
} }
cur_p->size = k; cur_p->size = k;
@ -576,9 +604,73 @@ static const char * llama_sampler_dist_name(const struct llama_sampler * /*smpl*
static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
auto * ctx = (llama_sampler_dist *) smpl->ctx; auto * ctx = (llama_sampler_dist *) smpl->ctx;
llama_sampler_softmax_impl(cur_p); // edge cases
if (cur_p->size == 0) {
cur_p->selected = -1;
return;
}
cur_p->selected = 0;
if (cur_p->size == 1) {
cur_p->data[0].p = 1.0f;
return;
}
// max logit for numerical stability
float max_l = cur_p->data[0].logit;
if (!cur_p->sorted) {
for (size_t i = 1; i < cur_p->size; ++i) {
max_l = std::max(max_l, cur_p->data[i].logit);
}
}
// apply softmax to obtain the probabilities
double sum_cum = 0.0f;
for (size_t i = 0; i < cur_p->size; ++i) {
float p = expf(cur_p->data[i].logit - max_l);
cur_p->data[i].p = p;
sum_cum += p;
}
#if 1
// sample from the obtained probabilities and normalize the probs in a single pass
// this is ~3x faster on Mac with full gpt-oss vocab than the version below
//
std::uniform_real_distribution<double> dist(0.0f, 1.0f);
const double rnd = dist(ctx->rng);
double sum_run = 0.0f;
const double sum_tgt = sum_cum*rnd;
bool found = false;
for (size_t i = 0; i < cur_p->size; ++i) {
if (!found) {
// accumulate probs until we reach the target sum
sum_run += cur_p->data[i].p;
if (sum_run >= sum_tgt) {
cur_p->selected = i;
found = true;
}
}
// normalize probs
cur_p->data[i].p /= sum_cum;
}
// fallback to the last token (don't think this can happen)
assert(found);
if (!found) {
cur_p->selected = cur_p->size - 1;
}
#else
// for clarity, this is the same as above but does one pass for normalization and one extra pass for sampling
for (size_t i = 0; i < cur_p->size; ++i) {
cur_p->data[i].p /= sum_cum;
}
cur_p->selected = llama_sample_dist(cur_p, ctx->rng); cur_p->selected = llama_sample_dist(cur_p, ctx->rng);
#endif
} }
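The enabled #if 1 branch above samples and normalizes in a single loop: draw r in [0, 1), scale it by the unnormalized sum, and pick the first index whose running sum reaches that target while dividing each weight by the sum as it goes. A minimal standalone sketch of the same inverse-CDF trick on plain non-negative weights:

#include <cstdio>
#include <random>
#include <vector>

int main() {
    // unnormalized, non-negative weights (e.g. expf(logit - max_logit))
    std::vector<double> w = {0.5, 2.0, 1.0, 0.25};

    double sum = 0.0;
    for (double x : w) {
        sum += x;
    }

    std::mt19937 rng(42);
    std::uniform_real_distribution<double> dist(0.0, 1.0);
    const double target = sum*dist(rng);

    int    selected = (int) w.size() - 1; // fallback, should not trigger
    double run      = 0.0;
    bool   found    = false;
    for (size_t i = 0; i < w.size(); ++i) {
        if (!found) {
            // accumulate weights until the running sum reaches the target
            run += w[i];
            if (run >= target) {
                selected = (int) i;
                found = true;
            }
        }
        w[i] /= sum; // normalization happens in the same pass
    }
    printf("selected index %d with probability %.3f\n", selected, w[selected]);
    return 0;
}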
static struct llama_sampler * llama_sampler_dist_clone(const struct llama_sampler * smpl) { static struct llama_sampler * llama_sampler_dist_clone(const struct llama_sampler * smpl) {
@ -626,32 +718,6 @@ struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
); );
} }
// softmax
static const char * llama_sampler_softmax_name(const struct llama_sampler * /*smpl*/) {
return "softmax";
}
static void llama_sampler_softmax_apply(struct llama_sampler * /*smpl*/, llama_token_data_array * cur_p) {
llama_sampler_softmax_impl(cur_p);
}
static struct llama_sampler_i llama_sampler_softmax_i = {
/* .name = */ llama_sampler_softmax_name,
/* .accept = */ nullptr,
/* .apply = */ llama_sampler_softmax_apply,
/* .reset = */ nullptr,
/* .clone = */ nullptr,
/* .free = */ nullptr,
};
struct llama_sampler * llama_sampler_init_softmax() {
return llama_sampler_init(
/* .iface = */ &llama_sampler_softmax_i,
/* .ctx = */ nullptr
);
}
// top-k // top-k
struct llama_sampler_top_k { struct llama_sampler_top_k {
@ -663,7 +729,7 @@ static const char * llama_sampler_top_k_name(const struct llama_sampler * /*smpl
} }
static void llama_sampler_top_k_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { static void llama_sampler_top_k_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
const auto * ctx = (llama_sampler_top_k *) smpl->ctx; auto * ctx = (llama_sampler_top_k *) smpl->ctx;
llama_sampler_top_k_impl(cur_p, ctx->k); llama_sampler_top_k_impl(cur_p, ctx->k);
} }
@ -699,6 +765,8 @@ struct llama_sampler * llama_sampler_init_top_k(int32_t k) {
struct llama_sampler_top_p { struct llama_sampler_top_p {
const float p; const float p;
const size_t min_keep; const size_t min_keep;
std::vector<llama_token_data> buf_sort;
}; };
static const char * llama_sampler_top_p_name(const struct llama_sampler * /*smpl*/) { static const char * llama_sampler_top_p_name(const struct llama_sampler * /*smpl*/) {
@ -706,20 +774,35 @@ static const char * llama_sampler_top_p_name(const struct llama_sampler * /*smpl
} }
static void llama_sampler_top_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { static void llama_sampler_top_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
const auto * ctx = (llama_sampler_top_p *) smpl->ctx; auto * ctx = (llama_sampler_top_p *) smpl->ctx;
if (ctx->p >= 1.0f) { if (ctx->p >= 1.0f) {
return; return;
} }
llama_sampler_softmax_impl(cur_p); llama_sampler_softmax_impl(cur_p, false);
size_t k = cur_p->size;
auto * pdata = cur_p->data;
auto & buf_sort = ctx->buf_sort;
// if not sorted, try adaptive top-k sorting
if (!cur_p->sorted && cur_p->size > 1024) {
k = std::min<size_t>(256, cur_p->size);
llama_token_data_array_partial_sort(*cur_p, k, buf_sort);
pdata = buf_sort.data();
} else if (!cur_p->sorted) {
// small candidates -> sort inplace
llama_token_data_array_partial_sort_inplace(cur_p, k);
}
// Compute the cumulative probabilities // Compute the cumulative probabilities
float cum_sum = 0.0f; float cum_sum = 0.0f;
size_t last_idx = cur_p->size; size_t last_idx = cur_p->size;
for (size_t i = 0; i < cur_p->size; ++i) { for (size_t i = 0; i < cur_p->size; ++i) {
cum_sum += cur_p->data[i].p; cum_sum += pdata[i].p;
// Check if the running sum is at least p or if we have kept at least min_keep tokens // Check if the running sum is at least p or if we have kept at least min_keep tokens
// we set the last index to i+1 to indicate that the current iterate should be included in the set // we set the last index to i+1 to indicate that the current iterate should be included in the set
@ -727,9 +810,21 @@ static void llama_sampler_top_p_apply(struct llama_sampler * smpl, llama_token_d
last_idx = i + 1; last_idx = i + 1;
break; break;
} }
// we exceeded the current top-k heuristic -> increase k and continue
if (!cur_p->sorted && i == k - 1) {
k = cur_p->size;
llama_token_data_array_partial_sort(*cur_p, k, buf_sort);
pdata = buf_sort.data();
}
} }
// Resize the output vector to keep only the top-p tokens // Resize the output vector to keep only the top-p tokens
if (!cur_p->sorted) {
std::copy(buf_sort.data(), buf_sort.data() + last_idx, cur_p->data);
cur_p->sorted = true;
}
cur_p->size = last_idx; cur_p->size = last_idx;
} }
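Once the candidates are sorted, the top-p cutoff itself is a simple prefix scan; the adaptive part above only decides how many candidates need to be sorted before that scan runs. An illustrative helper (top_p_cutoff is a made-up name; the sketch assumes the probabilities are already normalized and sorted in descending order):

#include <cstdio>
#include <vector>

// assumes probs are normalized and sorted in descending order
static size_t top_p_cutoff(const std::vector<float> & probs, float p, size_t min_keep) {
    float  cum  = 0.0f;
    size_t last = probs.size();
    for (size_t i = 0; i < probs.size(); ++i) {
        cum += probs[i];
        // keep the smallest prefix that reaches p, but never fewer than min_keep tokens
        if (cum >= p && i + 1 >= min_keep) {
            last = i + 1;
            break;
        }
    }
    return last;
}

int main() {
    const std::vector<float> probs = {0.40f, 0.25f, 0.15f, 0.10f, 0.06f, 0.04f};
    printf("keep %zu of %zu tokens\n", top_p_cutoff(probs, 0.75f, 1), probs.size()); // keep 3 of 6
    return 0;
}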
@ -757,6 +852,7 @@ struct llama_sampler * llama_sampler_init_top_p(float p, size_t min_keep) {
/* .ctx = */ new llama_sampler_top_p { /* .ctx = */ new llama_sampler_top_p {
/* .p = */ p, /* .p = */ p,
/* .min_keep = */ min_keep, /* .min_keep = */ min_keep,
/* .buf_sort = */ {},
} }
); );
} }
@ -773,7 +869,7 @@ static const char * llama_sampler_min_p_name(const struct llama_sampler * /*smpl
} }
static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
const auto * ctx = (llama_sampler_min_p *) smpl->ctx; auto * ctx = (llama_sampler_min_p *) smpl->ctx;
if (ctx->p <= 0.0f || !cur_p->size) { if (ctx->p <= 0.0f || !cur_p->size) {
return; return;
@ -799,7 +895,7 @@ static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_d
// if we have enough values the operation was a success // if we have enough values the operation was a success
if (!filtered_tokens.empty() && filtered_tokens.size() >= ctx->min_keep) { if (!filtered_tokens.empty() && filtered_tokens.size() >= ctx->min_keep) {
memcpy(cur_p->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data)); std::copy(filtered_tokens.begin(), filtered_tokens.end(), cur_p->data);
cur_p->size = filtered_tokens.size(); cur_p->size = filtered_tokens.size();
min_p_applied = true; min_p_applied = true;
} }
@ -809,10 +905,7 @@ static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_d
if (!min_p_applied) { if (!min_p_applied) {
// Sort the logits in descending order // Sort the logits in descending order
if (!cur_p->sorted) { if (!cur_p->sorted) {
std::sort(cur_p->data, cur_p->data + cur_p->size, [](const llama_token_data & a, const llama_token_data & b) { llama_token_data_array_partial_sort_inplace(cur_p, cur_p->size);
return a.logit > b.logit;
});
cur_p->sorted = true;
} }
const float min_logit = cur_p->data[0].logit + logf(ctx->p); // min logit for p_i >= p * p_max const float min_logit = cur_p->data[0].logit + logf(ctx->p); // min logit for p_i >= p * p_max
@ -869,7 +962,7 @@ static const char * llama_sampler_typical_name(const struct llama_sampler * /*sm
} }
static void llama_sampler_typical_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { static void llama_sampler_typical_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
const auto * ctx = (llama_sampler_typical *) smpl->ctx; auto * ctx = (llama_sampler_typical *) smpl->ctx;
// Reference implementation: // Reference implementation:
// https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr
@ -878,7 +971,7 @@ static void llama_sampler_typical_apply(struct llama_sampler * smpl, llama_token
} }
// Compute the softmax of logits and calculate entropy // Compute the softmax of logits and calculate entropy
llama_sampler_softmax_impl(cur_p); llama_sampler_softmax_impl(cur_p, true);
float entropy = 0.0f; float entropy = 0.0f;
for (size_t i = 0; i < cur_p->size; ++i) { for (size_t i = 0; i < cur_p->size; ++i) {
@ -1012,7 +1105,7 @@ static const char * llama_sampler_temp_ext_name(const struct llama_sampler * /*s
} }
static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
const auto * ctx = (llama_sampler_temp_ext *) smpl->ctx; auto * ctx = (llama_sampler_temp_ext *) smpl->ctx;
if (ctx->delta > 0) { if (ctx->delta > 0) {
const float min_temp = std::max(0.0f, ctx->temp - ctx->delta); const float min_temp = std::max(0.0f, ctx->temp - ctx->delta);
const float max_temp = ctx->temp + ctx->delta; const float max_temp = ctx->temp + ctx->delta;
@ -1027,7 +1120,7 @@ static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_toke
// Calculate maximum possible entropy // Calculate maximum possible entropy
float max_entropy = -logf(1.0f / cur_p->size); float max_entropy = -logf(1.0f / cur_p->size);
llama_sampler_softmax_impl(cur_p); llama_sampler_softmax_impl(cur_p, true);
// Calculate entropy of the softmax probabilities // Calculate entropy of the softmax probabilities
float entropy = 0.0f; float entropy = 0.0f;
@ -1121,7 +1214,7 @@ struct llama_sampler_xtc {
const uint32_t seed; const uint32_t seed;
uint32_t seed_cur; uint32_t seed_cur;
std::mt19937 rng; std::mt19937 rng;
}; };
static const char * llama_sampler_xtc_name(const struct llama_sampler * /*smpl*/) { static const char * llama_sampler_xtc_name(const struct llama_sampler * /*smpl*/) {
@ -1139,17 +1232,20 @@ static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data
std::uniform_real_distribution<float> distribution(0.0f, 1.0f); std::uniform_real_distribution<float> distribution(0.0f, 1.0f);
float chance = distribution(ctx->rng); float chance = distribution(ctx->rng);
if (chance > ctx->probability) return; if (chance > ctx->probability) {
return;
}
// in case it's not sorted/recalculated yet llama_sampler_softmax_impl(cur_p, true);
llama_sampler_softmax_impl(cur_p);
int pos_last = 0; int pos_last = 0;
for (size_t i = 0; i < cur_p->size; ++i) { for (size_t i = 0; i < cur_p->size; ++i) {
if (cur_p->data[i].p >= ctx->threshold) { if (cur_p->data[i].p >= ctx->threshold) {
pos_last = i; pos_last = i;
} else break; } else {
break;
}
} }
if (cur_p->size - pos_last >= ctx->min_keep && pos_last > 0) { if (cur_p->size - pos_last >= ctx->min_keep && pos_last > 0) {
@ -1221,7 +1317,7 @@ struct llama_sampler_mirostat {
float mu; float mu;
std::mt19937 rng; std::mt19937 rng;
}; };
static const char * llama_sampler_mirostat_name(const struct llama_sampler * /*smpl*/) { static const char * llama_sampler_mirostat_name(const struct llama_sampler * /*smpl*/) {
@ -1231,7 +1327,7 @@ static const char * llama_sampler_mirostat_name(const struct llama_sampler * /*s
static void llama_sampler_mirostat_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { static void llama_sampler_mirostat_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
auto * ctx = (llama_sampler_mirostat *) smpl->ctx; auto * ctx = (llama_sampler_mirostat *) smpl->ctx;
llama_sampler_softmax_impl(cur_p); llama_sampler_softmax_impl(cur_p, true);
// Estimate s_hat using the most probable m tokens // Estimate s_hat using the most probable m tokens
float s_hat = 0.0; float s_hat = 0.0;
@ -1250,7 +1346,8 @@ static void llama_sampler_mirostat_apply(struct llama_sampler * smpl, llama_toke
float k = powf((epsilon_hat * powf(2, ctx->mu)) / (1 - powf(ctx->n_vocab, -epsilon_hat)), 1 / s_hat); float k = powf((epsilon_hat * powf(2, ctx->mu)) / (1 - powf(ctx->n_vocab, -epsilon_hat)), 1 / s_hat);
llama_sampler_top_k_impl(cur_p, std::max(int(k), 1)); llama_sampler_top_k_impl(cur_p, std::max(int(k), 1));
llama_sampler_softmax_impl(cur_p);
llama_sampler_softmax_impl(cur_p, true);
const int idx = llama_sample_dist(cur_p, ctx->rng); const int idx = llama_sample_dist(cur_p, ctx->rng);
@ -1336,7 +1433,7 @@ static const char * llama_sampler_mirostat_v2_name(const struct llama_sampler *
static void llama_sampler_mirostat_v2_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { static void llama_sampler_mirostat_v2_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
auto * ctx = (llama_sampler_mirostat_v2 *) smpl->ctx; auto * ctx = (llama_sampler_mirostat_v2 *) smpl->ctx;
llama_sampler_softmax_impl(cur_p); llama_sampler_softmax_impl(cur_p, true);
// Truncate the words with surprise values greater than mu // Truncate the words with surprise values greater than mu
cur_p->size = std::distance(cur_p->data, std::find_if(cur_p->data, cur_p->data + cur_p->size, [&](const llama_token_data & candidate) { cur_p->size = std::distance(cur_p->data, std::find_if(cur_p->data, cur_p->data + cur_p->size, [&](const llama_token_data & candidate) {
@ -1348,7 +1445,7 @@ static void llama_sampler_mirostat_v2_apply(struct llama_sampler * smpl, llama_t
} }
// Normalize the probabilities of the remaining words // Normalize the probabilities of the remaining words
llama_sampler_softmax_impl(cur_p); llama_sampler_softmax_impl(cur_p, true);
const int idx = llama_sample_dist(cur_p, ctx->rng); const int idx = llama_sample_dist(cur_p, ctx->rng);
@ -1540,7 +1637,7 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
trigger_pattern += std::regex_replace(trigger_words[i], special_chars, "\\$0"); trigger_pattern += std::regex_replace(trigger_words[i], special_chars, "\\$0");
} }
trigger_pattern += ")[\\s\\S]*"; trigger_pattern += ")[\\s\\S]*";
auto trigger_pattern_c = trigger_pattern.c_str(); const auto * trigger_pattern_c = trigger_pattern.c_str();
trigger_patterns = &trigger_pattern_c; trigger_patterns = &trigger_pattern_c;
num_trigger_patterns = 1; num_trigger_patterns = 1;
} }
@ -1748,7 +1845,7 @@ static const char * llama_sampler_top_n_sigma_name(const struct llama_sampler *
} }
static void llama_sampler_top_n_sigma_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { static void llama_sampler_top_n_sigma_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
const auto * ctx = (llama_sampler_top_n_sigma *) smpl->ctx; auto * ctx = (llama_sampler_top_n_sigma *) smpl->ctx;
if (ctx->n <= 0.0f || cur_p->size <= 1) { if (ctx->n <= 0.0f || cur_p->size <= 1) {
return; return;
@ -1780,13 +1877,14 @@ static void llama_sampler_top_n_sigma_apply(struct llama_sampler * smpl, llama_t
} }
float std = valid_count > 0 ? sqrt(acc/valid_count) : 0; float std = valid_count > 0 ? sqrt(acc/valid_count) : 0;
//apply mask // apply mask
for (size_t i = 0; i < cur_p->size; ++i) { for (size_t i = 0; i < cur_p->size; ++i) {
if (cur_p->data[i].logit < max - (ctx->n * std)) { if (cur_p->data[i].logit < max - (ctx->n * std)) {
cur_p->data[i].logit = -INFINITY; cur_p->data[i].logit = -INFINITY;
} }
} }
llama_sampler_softmax_impl(cur_p);
llama_sampler_softmax_impl(cur_p, true);
} }
static struct llama_sampler * llama_sampler_top_n_sigma_clone(const struct llama_sampler * smpl) { static struct llama_sampler * llama_sampler_top_n_sigma_clone(const struct llama_sampler * smpl) {
@ -1991,7 +2089,9 @@ static void llama_sampler_dry_apply(struct llama_sampler * smpl, llama_token_dat
{ {
const int last = last_n_repeat - 1; const int last = last_n_repeat - 1;
int rt = 0, lt = 0;
int rt = 0;
int lt = 0;
for (int k = 1; k < last_n_repeat; ++k) { for (int k = 1; k < last_n_repeat; ++k) {
if (k > rt) { if (k > rt) {
@ -2135,8 +2235,8 @@ static struct llama_sampler_i llama_sampler_dry_i = {
/* .free = */ llama_sampler_dry_free, /* .free = */ llama_sampler_dry_free,
}; };
struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab, int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) { struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab, int32_t n_ctx_train, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
int32_t effective_dry_penalty_last_n = (dry_penalty_last_n == -1) ? context_size : std::max(dry_penalty_last_n, 0); int32_t effective_dry_penalty_last_n = (dry_penalty_last_n == -1) ? n_ctx_train : std::max(dry_penalty_last_n, 0);
std::unordered_multimap<llama_token, std::vector<llama_token>> processed_breakers; std::unordered_multimap<llama_token, std::vector<llama_token>> processed_breakers;
const int MAX_CHAR_LEN = 40; const int MAX_CHAR_LEN = 40;
const int MAX_SEQ_LEN = 20; const int MAX_SEQ_LEN = 20;
@ -2169,7 +2269,7 @@ struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab,
return llama_sampler_init( return llama_sampler_init(
/* .iface = */ &llama_sampler_dry_i, /* .iface = */ &llama_sampler_dry_i,
/* .ctx = */ new llama_sampler_dry { /* .ctx = */ new llama_sampler_dry {
/* .total_context_size = */ context_size, /* .total_context_size = */ n_ctx_train,
/* .dry_multiplier = */ dry_multiplier, /* .dry_multiplier = */ dry_multiplier,
/* .dry_base = */ dry_base, /* .dry_base = */ dry_base,
/* .dry_allowed_length = */ dry_allowed_length, /* .dry_allowed_length = */ dry_allowed_length,
@ -2308,7 +2408,7 @@ static const char * llama_sampler_infill_name(const struct llama_sampler * /*smp
static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
auto * ctx = (llama_sampler_infill *) smpl->ctx; auto * ctx = (llama_sampler_infill *) smpl->ctx;
llama_sampler_softmax_impl(cur_p); llama_sampler_softmax_impl(cur_p, true);
#if defined(GGML_DEBUG_SAMPLER_INFILL) #if defined(GGML_DEBUG_SAMPLER_INFILL)
#define LOG_DBG_CUR LLAMA_LOG_DEBUG #define LOG_DBG_CUR LLAMA_LOG_DEBUG

View File

@ -434,6 +434,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
}; };
break; break;
case LLAMA_VOCAB_PRE_TYPE_GROK_2:
regex_exprs = {
// original regex from tokenizer.json
// "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
};
break;
default: default:
// default regex for BPE tokenization pre-processing // default regex for BPE tokenization pre-processing
regex_exprs = { regex_exprs = {
@ -1763,7 +1770,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
const size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx); const size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx);
const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx); const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap); precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
#ifdef IS_BIG_ENDIAN #if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
// correct endianness of data in precompiled_charsmap binary blob // correct endianness of data in precompiled_charsmap binary blob
uint32_t * xcda_blob_size = (uint32_t *) &precompiled_charsmap[0]; uint32_t * xcda_blob_size = (uint32_t *) &precompiled_charsmap[0];
*xcda_blob_size = __builtin_bswap32(*xcda_blob_size); *xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
@ -1944,7 +1951,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION; pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION;
clean_spaces = false; clean_spaces = false;
} else if ( } else if (
tokenizer_pre == "bailingmoe") { tokenizer_pre == "bailingmoe" ||
tokenizer_pre == "llada-moe") {
pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE; pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
clean_spaces = false; clean_spaces = false;
} else if ( } else if (
@ -1963,6 +1971,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "kimi-k2") { tokenizer_pre == "kimi-k2") {
pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2; pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;
clean_spaces = false; clean_spaces = false;
} else if (
tokenizer_pre == "grok-2") {
pre_type = LLAMA_VOCAB_PRE_TYPE_GROK_2;
clean_spaces = false;
} else { } else {
LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__); LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT; pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
@ -2331,7 +2343,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
// @ngxson : quick hack for gpt-oss, always render these tokens // @ngxson : quick hack for gpt-oss, always render these tokens
for (const auto & t : token_to_id) { for (const auto & t : token_to_id) {
if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>") { if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") {
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_USER_DEFINED; id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
} }
} }
@ -2378,6 +2390,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
if (has_return && has_call && has_end) { if (has_return && has_call && has_end) {
special_eog_ids.erase(end_id); special_eog_ids.erase(end_id);
id_to_token[end_id].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>' tokens, removing '<|end|>' token from EOG list\n", __func__); LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
} }
} }
@ -2459,7 +2472,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
// set attributes by model/tokenizer/architecture name // set attributes by model/tokenizer/architecture name
if (false if (false
|| _contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"}) || _contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})
|| _contains_any(general_arch, {"nomic-bert-moe"}) || _contains_any(general_arch, {"nomic-bert-moe", "jina-bert-v3"})
) { ) {
if (token_to_id.count("<mask>") == 0) { if (token_to_id.count("<mask>") == 0) {
LLAMA_LOG_WARN("%s: Mask token is missing in vocab, please reconvert model!\n", __func__); LLAMA_LOG_WARN("%s: Mask token is missing in vocab, please reconvert model!\n", __func__);

View File

@ -47,6 +47,7 @@ enum llama_vocab_pre_type {
LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36, LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36,
LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37, LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37,
LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38, LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38,
LLAMA_VOCAB_PRE_TYPE_GROK_2 = 39,
}; };
struct LLM_KV; struct LLM_KV;

View File

@ -25,6 +25,18 @@
// interface implementation // interface implementation
// //
const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type) {
switch (flash_attn_type) {
case LLAMA_FLASH_ATTN_TYPE_AUTO:
return "auto";
case LLAMA_FLASH_ATTN_TYPE_DISABLED:
return "disabled";
case LLAMA_FLASH_ATTN_TYPE_ENABLED:
return "enabled";
}
GGML_ABORT("fatal error");
}
struct llama_sampler_chain_params llama_sampler_chain_default_params() { struct llama_sampler_chain_params llama_sampler_chain_default_params() {
struct llama_sampler_chain_params result = { struct llama_sampler_chain_params result = {
/*.no_perf =*/ true, /*.no_perf =*/ true,
@ -47,6 +59,7 @@ bool llama_supports_mlock(void) {
bool llama_supports_gpu_offload(void) { bool llama_supports_gpu_offload(void) {
return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr || return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU) != nullptr ||
llama_supports_rpc(); llama_supports_rpc();
} }
@ -71,7 +84,9 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
GGML_ASSERT(dev && "CPU backend is not loaded"); GGML_ASSERT(dev && "CPU backend is not loaded");
auto * reg = ggml_backend_dev_backend_reg(dev); auto * reg = ggml_backend_dev_backend_reg(dev);
auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init"); auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init");
numa_init_fn(numa); if (numa_init_fn) {
numa_init_fn(numa);
}
} }
} }
@ -170,8 +185,13 @@ static struct llama_model * llama_model_load_from_file_impl(
model->devices.push_back(*dev); model->devices.push_back(*dev);
} }
} else { } else {
// default device selection
// build list of available devices
std::vector<ggml_backend_dev_t> gpus;
std::vector<ggml_backend_dev_t> igpus;
std::vector<ggml_backend_dev_t> rpc_servers; std::vector<ggml_backend_dev_t> rpc_servers;
// use all available devices
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i); ggml_backend_dev_t dev = ggml_backend_dev_get(i);
switch (ggml_backend_dev_type(dev)) { switch (ggml_backend_dev_type(dev)) {
@ -180,19 +200,51 @@ static struct llama_model * llama_model_load_from_file_impl(
// skip CPU backends since they are handled separately // skip CPU backends since they are handled separately
break; break;
case GGML_BACKEND_DEVICE_TYPE_GPU: case GGML_BACKEND_DEVICE_TYPE_GPU: {
ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
if (ggml_backend_reg_name(reg) == std::string("RPC")) { if (ggml_backend_reg_name(reg) == std::string("RPC")) {
rpc_servers.push_back(dev); rpc_servers.push_back(dev);
} else { } else {
model->devices.push_back(dev); // check if there is already a GPU with the same device id
ggml_backend_dev_props props;
ggml_backend_dev_get_props(dev, &props);
auto it = std::find_if(gpus.begin(), gpus.end(), [&props](ggml_backend_dev_t d) {
ggml_backend_dev_props d_props;
ggml_backend_dev_get_props(d, &d_props);
if (props.device_id && d_props.device_id) {
return strcmp(props.device_id, d_props.device_id) == 0;
}
return false;
});
if (it != gpus.end()) {
LLAMA_LOG_INFO("%s: skipping device %s (%s) with id %s - already using device %s (%s) with the same id\n",
__func__,
ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
props.device_id ? props.device_id : "unknown id",
ggml_backend_dev_name(*it), ggml_backend_dev_description(*it));
} else {
gpus.push_back(dev);
}
} }
break; break;
}
case GGML_BACKEND_DEVICE_TYPE_IGPU:
igpus.push_back(dev);
break;
} }
} }
// add RPC servers at the front of the list
if (!rpc_servers.empty()) { // add RPC servers at the front of the list to minimize network transfers
model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end()); model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
// add GPUs
model->devices.insert(model->devices.end(), gpus.begin(), gpus.end());
// add integrated GPUs only if no other devices were found
if (model->devices.empty()) {
model->devices.insert(model->devices.end(), igpus.begin(), igpus.end());
} }
} }
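A toy sketch of the device-id deduplication rule used above; dev_info and the device list are made up for illustration and are not the ggml backend API. Two devices collide only when both report an id and the ids compare equal, so backends that expose no stable id are always kept.

#include <algorithm>
#include <cstdio>
#include <cstring>
#include <vector>

struct dev_info {
    const char * name;
    const char * device_id; // may be null when the backend reports no stable id
};

int main() {
    const std::vector<dev_info> found = {
        { "CUDA0",   "GPU-1234" },
        { "Vulkan0", "GPU-1234" }, // same physical GPU exposed by a second backend
        { "Vulkan1", nullptr    },
    };

    std::vector<dev_info> gpus;
    for (const auto & dev : found) {
        auto it = std::find_if(gpus.begin(), gpus.end(), [&dev](const dev_info & d) {
            if (dev.device_id && d.device_id) {
                return strcmp(dev.device_id, d.device_id) == 0;
            }
            return false; // devices without ids never collide
        });
        if (it != gpus.end()) {
            printf("skipping %s - already using %s with the same id\n", dev.name, it->name);
        } else {
            gpus.push_back(dev);
        }
    }
    printf("using %zu device(s)\n", gpus.size());
    return 0;
}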
@ -213,9 +265,12 @@ static struct llama_model * llama_model_load_from_file_impl(
} }
for (auto * dev : model->devices) { for (auto * dev : model->devices) {
size_t free, total; // NOLINT ggml_backend_dev_props props;
ggml_backend_dev_memory(dev, &free, &total); ggml_backend_dev_get_props(dev, &props);
LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024); LLAMA_LOG_INFO("%s: using device %s (%s) (%s) - %zu MiB free\n", __func__,
ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
props.device_id ? props.device_id : "unknown id",
props.memory_free/1024/1024);
} }
const int status = llama_model_load(path_model, splits, *model, params); const int status = llama_model_load(path_model, splits, *model, params);

View File

@ -4,6 +4,7 @@
#include <string> #include <string>
#include <vector> #include <vector>
// TODO: reimplement this structure in endian-independent way
struct unicode_cpt_flags { struct unicode_cpt_flags {
enum { enum {
UNDEFINED = 0x0001, UNDEFINED = 0x0001,
@ -15,6 +16,10 @@ struct unicode_cpt_flags {
SYMBOL = 0x0040, // regex: \p{S} SYMBOL = 0x0040, // regex: \p{S}
CONTROL = 0x0080, // regex: \p{C} CONTROL = 0x0080, // regex: \p{C}
MASK_CATEGORIES = 0x00FF, MASK_CATEGORIES = 0x00FF,
WHITESPACE = 0x0100,
LOWERCASE = 0x0200,
UPPERCASE = 0x0400,
NFD = 0x0800,
}; };
// codepoint type // codepoint type
@ -34,11 +39,49 @@ struct unicode_cpt_flags {
// decode from uint16 // decode from uint16
inline unicode_cpt_flags(const uint16_t flags = 0) { inline unicode_cpt_flags(const uint16_t flags = 0) {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
*reinterpret_cast<uint16_t*>(this) = flags; *reinterpret_cast<uint16_t*>(this) = flags;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
is_undefined = (flags & UNDEFINED) ? 1 : 0;
is_number = (flags & NUMBER) ? 1 : 0;
is_letter = (flags & LETTER) ? 1 : 0;
is_separator = (flags & SEPARATOR) ? 1 : 0;
is_accent_mark = (flags & ACCENT_MARK) ? 1 : 0;
is_punctuation = (flags & PUNCTUATION) ? 1 : 0;
is_symbol = (flags & SYMBOL) ? 1 : 0;
is_control = (flags & CONTROL) ? 1 : 0;
is_whitespace = (flags & WHITESPACE) ? 1 : 0;
is_lowercase = (flags & LOWERCASE) ? 1 : 0;
is_uppercase = (flags & UPPERCASE) ? 1 : 0;
is_nfd = (flags & NFD) ? 1 : 0;
#else
#error Unexpected or undefined __BYTE_ORDER__
#endif
} }
inline uint16_t as_uint() const { inline uint16_t as_uint() const {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
return *reinterpret_cast<const uint16_t*>(this); return *reinterpret_cast<const uint16_t*>(this);
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
uint16_t result =
is_undefined * UNDEFINED
+ is_number * NUMBER
+ is_letter * LETTER
+ is_separator * SEPARATOR
+ is_accent_mark * ACCENT_MARK
+ is_punctuation * PUNCTUATION
+ is_symbol * SYMBOL
+ is_control * CONTROL
+ is_whitespace * WHITESPACE
+ is_lowercase * LOWERCASE
+ is_uppercase * UPPERCASE
+ is_nfd * NFD
;
return result;
#else
#error Unexpected or undefined __BYTE_ORDER__
#endif
} }
inline uint16_t category_flag() const { inline uint16_t category_flag() const {

View File

@ -44,6 +44,7 @@
#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern" #define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern"
#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size" #define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size"
#define KEY_MINICPMV_VERSION "clip.minicpmv_version" #define KEY_MINICPMV_VERSION "clip.minicpmv_version"
#define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num"
// audio-specific // audio-specific
#define KEY_A_NUM_MEL_BINS "clip.audio.num_mel_bins" #define KEY_A_NUM_MEL_BINS "clip.audio.num_mel_bins"
@ -81,6 +82,7 @@
#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s" #define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
#define TN_IMAGE_NEWLINE "model.image_newline" #define TN_IMAGE_NEWLINE "model.image_newline"
#define TN_MM_INP_NORM "mm.input_norm.weight" #define TN_MM_INP_NORM "mm.input_norm.weight"
#define TN_MM_INP_NORM_B "mm.input_norm.bias"
#define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3 #define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3
#define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3 #define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3
#define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3 #define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3
@ -132,6 +134,8 @@ enum projector_type {
PROJECTOR_TYPE_QWEN2A, PROJECTOR_TYPE_QWEN2A,
PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
PROJECTOR_TYPE_VOXTRAL, PROJECTOR_TYPE_VOXTRAL,
PROJECTOR_TYPE_LFM2,
PROJECTOR_TYPE_KIMIVL,
PROJECTOR_TYPE_UNKNOWN, PROJECTOR_TYPE_UNKNOWN,
}; };
@ -152,6 +156,8 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_QWEN2A, "qwen2a"}, { PROJECTOR_TYPE_QWEN2A, "qwen2a"},
{ PROJECTOR_TYPE_QWEN25O, "qwen2.5o"}, { PROJECTOR_TYPE_QWEN25O, "qwen2.5o"},
{ PROJECTOR_TYPE_VOXTRAL, "voxtral"}, { PROJECTOR_TYPE_VOXTRAL, "voxtral"},
{ PROJECTOR_TYPE_LFM2, "lfm2"},
{ PROJECTOR_TYPE_KIMIVL, "kimivl"},
}; };
static projector_type clip_projector_type_from_string(const std::string & str) { static projector_type clip_projector_type_from_string(const std::string & str) {

View File

@ -214,6 +214,7 @@ struct clip_hparams {
// legacy // legacy
bool has_llava_projector = false; bool has_llava_projector = false;
int minicpmv_version = 0; int minicpmv_version = 0;
int32_t minicpmv_query_num = 0; // MiniCPM-V query number
}; };
struct clip_layer { struct clip_layer {
@ -277,6 +278,7 @@ struct clip_model {
// LLaVA projection // LLaVA projection
ggml_tensor * mm_input_norm_w = nullptr; ggml_tensor * mm_input_norm_w = nullptr;
ggml_tensor * mm_input_norm_b = nullptr;
ggml_tensor * mm_0_w = nullptr; ggml_tensor * mm_0_w = nullptr;
ggml_tensor * mm_0_b = nullptr; ggml_tensor * mm_0_b = nullptr;
ggml_tensor * mm_2_w = nullptr; ggml_tensor * mm_2_w = nullptr;
@ -417,6 +419,7 @@ struct clip_ctx {
} }
if (!backend) { if (!backend) {
backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr); backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr);
backend = backend ? backend : ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU, nullptr);
} }
} }
@ -500,11 +503,17 @@ struct clip_graph {
ggml_cgraph * build_siglip() { ggml_cgraph * build_siglip() {
ggml_tensor * inp = build_inp(); ggml_tensor * inp = build_inp();
ggml_tensor * learned_pos_embd = model.position_embeddings;
if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) {
learned_pos_embd = resize_position_embeddings();
}
ggml_tensor * cur = build_vit( ggml_tensor * cur = build_vit(
inp, n_patches, inp, n_patches,
NORM_TYPE_NORMAL, NORM_TYPE_NORMAL,
hparams.ffn_op, hparams.ffn_op,
model.position_embeddings, learned_pos_embd,
nullptr); nullptr);
if (ctx->proj_type() == PROJECTOR_TYPE_GEMMA3) { if (ctx->proj_type() == PROJECTOR_TYPE_GEMMA3) {
@ -513,8 +522,8 @@ struct clip_graph {
const int patches_per_image = n_patches_x; const int patches_per_image = n_patches_x;
const int kernel_size = hparams.proj_scale_factor; const int kernel_size = hparams.proj_scale_factor;
cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); cur = ggml_transpose(ctx0, cur);
cur = ggml_reshape_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size); cur = ggml_cont_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size);
// doing a pool2d to reduce the number of output tokens // doing a pool2d to reduce the number of output tokens
cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0); cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0);
@ -531,29 +540,27 @@ struct clip_graph {
cur); cur);
} else if (ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3) { } else if (ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3) {
// pixel_shuffle
// https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578 // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
const int scale_factor = model.hparams.proj_scale_factor; const int scale_factor = model.hparams.proj_scale_factor;
const int n_embd = cur->ne[0]; cur = build_patch_merge_permute(cur, scale_factor);
const int seq = cur->ne[1];
const int bsz = 1; // batch size, always 1 for now since we don't support batching
const int height = std::sqrt(seq);
const int width = std::sqrt(seq);
GGML_ASSERT(scale_factor != 0);
cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height, bsz);
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur),
n_embd * scale_factor * scale_factor,
height / scale_factor,
width / scale_factor,
bsz);
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
cur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, cur),
n_embd * scale_factor * scale_factor,
seq / (scale_factor * scale_factor),
bsz);
cur = ggml_mul_mat(ctx0, model.projection, cur); cur = ggml_mul_mat(ctx0, model.projection, cur);
} else if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) {
// pixel unshuffle block
const int scale_factor = model.hparams.proj_scale_factor;
cur = build_patch_merge_permute(cur, scale_factor);
// projection
cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
cur = ggml_add(ctx0, cur, model.mm_1_b);
cur = ggml_gelu(ctx0, cur);
cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
cur = ggml_add(ctx0, cur, model.mm_2_b);
} else { } else {
GGML_ABORT("SigLIP: Unsupported projector type"); GGML_ABORT("SigLIP: Unsupported projector type");
} }
@ -681,15 +688,15 @@ struct clip_graph {
auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
inp = ggml_add(ctx0, inp, inp_1); inp = ggml_add(ctx0, inp, inp_1);
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b] inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b]
inp = ggml_reshape_4d( inp = ggml_cont_4d(
ctx0, inp, ctx0, inp,
n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
inp = ggml_reshape_4d( inp = ggml_reshape_4d(
ctx0, inp, ctx0, inp,
n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2)); n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3)); inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
inp = ggml_reshape_3d( inp = ggml_cont_3d(
ctx0, inp, ctx0, inp,
n_embd, n_patches_x * n_patches_y, batch_size); n_embd, n_patches_x * n_patches_y, batch_size);
} }
@ -879,21 +886,8 @@ struct clip_graph {
int n_embd = clip_n_mmproj_embd(ctx); int n_embd = clip_n_mmproj_embd(ctx);
const int d_head = 128; const int d_head = 128;
int n_head = n_embd/d_head; int n_head = n_embd/d_head;
int num_query = 96; // Use actual config value if available, otherwise fall back to hardcoded values
if (ctx->model.hparams.minicpmv_version == 2) { int num_query = ctx->model.hparams.minicpmv_query_num;
// MiniCPM-V 2.5
num_query = 96;
} else if (ctx->model.hparams.minicpmv_version == 3) {
// MiniCPM-V 2.6
num_query = 64;
} else if (ctx->model.hparams.minicpmv_version == 4) {
// MiniCPM-o 2.6
num_query = 64;
} else if (ctx->model.hparams.minicpmv_version == 5) {
// MiniCPM-V 4.0
num_query = 64;
}
ggml_tensor * Q = ggml_add(ctx0, ggml_tensor * Q = ggml_add(ctx0,
ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q),
model.mm_model_attn_q_b); model.mm_model_attn_q_b);
@ -967,14 +961,14 @@ struct clip_graph {
GGML_ASSERT(scale_factor > 0); GGML_ASSERT(scale_factor > 0);
cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, height / scale_factor, width, bsz); cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, height / scale_factor, width, bsz);
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur), cur = ggml_cont_4d(ctx0, cur,
n_embd * scale_factor * scale_factor, n_embd * scale_factor * scale_factor,
height / scale_factor, height / scale_factor,
width / scale_factor, width / scale_factor,
bsz); bsz);
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
// flatten to 2D // flatten to 2D
cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, cur), cur = ggml_cont_2d(ctx0, cur,
n_embd * scale_factor * scale_factor, n_embd * scale_factor * scale_factor,
cur->ne[1] * cur->ne[2]); cur->ne[1] * cur->ne[2]);
} }
@ -1060,14 +1054,14 @@ struct clip_graph {
n_patches_y, n_patches_y,
bsz); bsz);
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur), cur = ggml_cont_4d(ctx0, cur,
n_embd * scale_factor * scale_factor, n_embd * scale_factor * scale_factor,
n_patches_x / scale_factor, n_patches_x / scale_factor,
n_patches_y / scale_factor, n_patches_y / scale_factor,
bsz); bsz);
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); //cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
// flatten to 2D // flatten to 2D
cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, cur), cur = ggml_cont_2d(ctx0, cur,
n_embd * scale_factor * scale_factor, n_embd * scale_factor * scale_factor,
n_patches / scale_factor / scale_factor); n_patches / scale_factor / scale_factor);
cb(cur, "pixel_shuffle", -1); cb(cur, "pixel_shuffle", -1);
@ -1092,6 +1086,67 @@ struct clip_graph {
return gf; return gf;
} }
ggml_cgraph * build_kimivl() {
// 2D input positions
ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
ggml_set_name(pos_h, "pos_h");
ggml_set_input(pos_h);
ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
ggml_set_name(pos_w, "pos_w");
ggml_set_input(pos_w);
ggml_tensor * learned_pos_embd = resize_position_embeddings();
// build ViT with 2D position embeddings
auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
// first half is X axis and second half is Y axis
return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false);
};
ggml_tensor * inp = build_inp();
ggml_tensor * cur = build_vit(
inp, n_patches,
NORM_TYPE_NORMAL,
hparams.ffn_op,
learned_pos_embd,
add_pos);
cb(cur, "vit_out", -1);
{
// patch_merger
const int scale_factor = model.hparams.proj_scale_factor;
cur = build_patch_merge_permute(cur, scale_factor);
// projection norm
int proj_inp_dim = cur->ne[0];
cur = ggml_view_2d(ctx0, cur,
n_embd, cur->ne[1] * scale_factor * scale_factor,
ggml_row_size(cur->type, n_embd), 0);
cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
cur = ggml_view_2d(ctx0, cur,
proj_inp_dim, cur->ne[1] / scale_factor / scale_factor,
ggml_row_size(cur->type, proj_inp_dim), 0);
cb(cur, "proj_inp_normed", -1);
// projection mlp
cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
cur = ggml_add(ctx0, cur, model.mm_1_b);
cur = ggml_gelu(ctx0, cur);
cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
cur = ggml_add(ctx0, cur, model.mm_2_b);
cb(cur, "proj_out", -1);
}
// build the graph
ggml_build_forward_expand(gf, cur);
return gf;
}
// this graph is used by llava, granite and glm // this graph is used by llava, granite and glm
// due to having embedding_stack (used by granite), we cannot reuse build_vit // due to having embedding_stack (used by granite), we cannot reuse build_vit
ggml_cgraph * build_llava() { ggml_cgraph * build_llava() {
@ -1300,8 +1355,8 @@ struct clip_graph {
ggml_tensor * block_1 = nullptr; ggml_tensor * block_1 = nullptr;
{ {
// transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24] // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24]
mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3)); mlp_3 = ggml_permute(ctx0, mlp_3, 1, 0, 2, 3);
mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]); mlp_3 = ggml_cont_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
// stride = 1, padding = 1, bias is nullptr // stride = 1, padding = 1, bias is nullptr
block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1); block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
@ -1406,9 +1461,9 @@ struct clip_graph {
mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b); mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b);
// mlp_2 ne = [2048, 576, 1, 1] // mlp_2 ne = [2048, 576, 1, 1]
// // AVG Pool Layer 2*2, strides = 2 // // AVG Pool Layer 2*2, strides = 2
mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 0, 2, 3)); mlp_2 = ggml_permute(ctx0, mlp_2, 1, 0, 2, 3);
// mlp_2 ne = [576, 2048, 1, 1] // mlp_2 ne = [576, 2048, 1, 1]
mlp_2 = ggml_reshape_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]); mlp_2 = ggml_cont_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]);
// mlp_2 ne [24, 24, 2048, 1] // mlp_2 ne [24, 24, 2048, 1]
mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0); mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
// weight ne = [3, 3, 2048, 1] // weight ne = [3, 3, 2048, 1]
@ -1428,8 +1483,8 @@ struct clip_graph {
// glm projector // glm projector
else if (ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE) { else if (ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE) {
size_t gridsz = (size_t)sqrt(embeddings->ne[1]); size_t gridsz = (size_t)sqrt(embeddings->ne[1]);
embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings,1,0,2,3)); embeddings = ggml_permute(ctx0,embeddings,1,0,2,3);
embeddings = ggml_reshape_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]); embeddings = ggml_cont_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]);
embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1); embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1);
embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size); embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size);
embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3)); embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3));
@ -1585,6 +1640,29 @@ private:
} }
} }
// siglip2 naflex
ggml_tensor * resize_position_embeddings() {
ggml_tensor * pos_embd = model.position_embeddings;
const int height = img.ny / patch_size;
const int width = img.nx / patch_size;
const uint32_t mode = GGML_SCALE_MODE_BILINEAR;
const int n_per_side = (int)std::sqrt(pos_embd->ne[1]);
GGML_ASSERT(pos_embd);
if (height == n_per_side && width == n_per_side) {
return pos_embd;
}
pos_embd = ggml_reshape_3d(ctx0, pos_embd, n_embd, n_per_side, n_per_side); // -> (n_embd, n_per_side, n_per_side)
pos_embd = ggml_permute(ctx0, pos_embd, 2, 0, 1, 3); // -> (n_per_side, n_per_side, n_embd)
pos_embd = ggml_interpolate(ctx0, pos_embd, width, height, n_embd, 1, mode); // -> (width, height, n_embd)
pos_embd = ggml_permute(ctx0, pos_embd, 1, 2, 0, 3); // -> (n_embd, width, height)
pos_embd = ggml_cont_2d(ctx0, pos_embd, n_embd, width * height); // -> (n_embd, width * height)
return pos_embd;
}
// build vision transformer (ViT) cgraph // build vision transformer (ViT) cgraph
// this function should cover most of the models // this function should cover most of the models
// if your model has specific features, you should probably duplicate this function // if your model has specific features, you should probably duplicate this function
@ -1963,7 +2041,6 @@ private:
ggml_row_size(cur->type, n_dim), ggml_row_size(cur->type, n_dim),
ggml_row_size(cur->type, n_dim*n_head), ggml_row_size(cur->type, n_dim*n_head),
n_dim/2 * ggml_element_size(cur)); n_dim/2 * ggml_element_size(cur));
second = ggml_cont(ctx0, second); // copy, because ggml_rope don't play well with non-contiguous tensors
second = ggml_rope_ext( second = ggml_rope_ext(
ctx0, ctx0,
second, second,
@ -1980,6 +2057,39 @@ private:
return cur; return cur;
} }
// aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL)
// support dynamic resolution
ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int scale_factor) {
GGML_ASSERT(scale_factor > 1);
const int n_embd = cur->ne[0];
int width = img.nx / patch_size;
int height = img.ny / patch_size;
// pad width and height to factor
const int64_t pad_width = CLIP_ALIGN(width, scale_factor) - width;
const int64_t pad_height = CLIP_ALIGN(height, scale_factor) - height;
cur = ggml_reshape_3d(ctx0, cur, n_embd, width, height);
if (pad_width || pad_height) {
cur = ggml_pad(ctx0, cur, 0, pad_width, pad_height, 0);
width += pad_width;
height += pad_height;
}
// unshuffle h
cur = ggml_reshape_3d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height);
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
// unshuffle w
cur = ggml_cont_3d(ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor, width / scale_factor);
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
cur = ggml_cont_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]);
cb(cur, "pixel_shuffle", -1);
return cur;
}
}; };
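build_patch_merge_permute expresses the pixel-unshuffle as ggml reshapes and permutes; the underlying idea is a space-to-depth step that merges each s x s block of patch embeddings into one token whose channel dimension grows by s*s. A plain-loop sketch of that idea (the layout and names are illustrative, and the exact channel ordering of the ggml version may differ):

#include <cstdio>
#include <vector>

int main() {
    const int n_embd = 2, width = 4, height = 4, s = 2; // s = proj_scale_factor

    // input: height x width patch embeddings, n_embd channels each, row-major
    std::vector<float> in(height*width*n_embd);
    for (size_t i = 0; i < in.size(); ++i) {
        in[i] = (float) i;
    }

    // output: (height/s) x (width/s) merged patches, n_embd*s*s channels each
    std::vector<float> out((height/s)*(width/s)*n_embd*s*s);
    for (int y = 0; y < height/s; ++y) {
        for (int x = 0; x < width/s; ++x) {
            float * dst = &out[(y*(width/s) + x)*n_embd*s*s];
            for (int dy = 0; dy < s; ++dy) {
                for (int dx = 0; dx < s; ++dx) {
                    // copy the channels of one patch inside the s x s block
                    const float * src = &in[((y*s + dy)*width + (x*s + dx))*n_embd];
                    for (int c = 0; c < n_embd; ++c) {
                        *dst++ = src[c];
                    }
                }
            }
        }
    }
    printf("merged %d patches into %d tokens of dim %d\n",
           width*height, (width/s)*(height/s), n_embd*s*s);
    return 0;
}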
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) { static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
@ -1991,6 +2101,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
switch (ctx->proj_type()) { switch (ctx->proj_type()) {
case PROJECTOR_TYPE_GEMMA3: case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_IDEFICS3: case PROJECTOR_TYPE_IDEFICS3:
case PROJECTOR_TYPE_LFM2:
{ {
res = graph.build_siglip(); res = graph.build_siglip();
} break; } break;
@ -2021,6 +2132,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
{ {
res = graph.build_whisper_enc(); res = graph.build_whisper_enc();
} break; } break;
case PROJECTOR_TYPE_KIMIVL:
{
res = graph.build_kimivl();
} break;
default: default:
{ {
res = graph.build_llava(); res = graph.build_llava();
@ -2151,7 +2266,21 @@ struct clip_model_loader {
get_u32(KEY_PATCH_SIZE, hparams.patch_size); get_u32(KEY_PATCH_SIZE, hparams.patch_size);
get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false); get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy
get_u32(KEY_MINICPMV_QUERY_NUM, hparams.minicpmv_query_num, false);
if (hparams.minicpmv_query_num == 0) {
// Fallback to hardcoded values for legacy models
if (hparams.minicpmv_version == 3) {
hparams.minicpmv_query_num = 64;
} else if (hparams.minicpmv_version == 4) {
hparams.minicpmv_query_num = 64;
} else if (hparams.minicpmv_version == 5) {
hparams.minicpmv_query_num = 64;
} else if (hparams.minicpmv_version == 6) {
hparams.minicpmv_query_num = 64;
} else {
hparams.minicpmv_query_num = 96;
}
}
} else if (is_audio) { } else if (is_audio) {
get_u32(KEY_A_NUM_MEL_BINS, hparams.n_mel_bins); get_u32(KEY_A_NUM_MEL_BINS, hparams.n_mel_bins);
@ -2243,6 +2372,7 @@ struct clip_model_loader {
} }
} break; } break;
case PROJECTOR_TYPE_IDEFICS3: case PROJECTOR_TYPE_IDEFICS3:
case PROJECTOR_TYPE_LFM2:
case PROJECTOR_TYPE_INTERNVL: case PROJECTOR_TYPE_INTERNVL:
{ {
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false); get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
@ -2256,6 +2386,12 @@ struct clip_model_loader {
hparams.image_size = 1024; hparams.image_size = 1024;
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size, false); get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size, false);
} break; } break;
case PROJECTOR_TYPE_KIMIVL:
{
hparams.rope_theta = 10000.0f;
hparams.warmup_image_size = hparams.patch_size * 8;
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
} break;
case PROJECTOR_TYPE_GEMMA3: case PROJECTOR_TYPE_GEMMA3:
{ {
// default value (used by all model sizes in gemma 3 family) // default value (used by all model sizes in gemma 3 family)
@ -2420,7 +2556,20 @@ struct clip_model_loader {
// some models already exported with legacy (incorrect) naming which is quite messy, let's fix it here // some models already exported with legacy (incorrect) naming which is quite messy, let's fix it here
// note: Qwen model converted from the old surgery script has n_ff = 0, so we cannot use n_ff to check! // note: Qwen model converted from the old surgery script has n_ff = 0, so we cannot use n_ff to check!
if (layer.ff_up_w && layer.ff_down_w && layer.ff_down_w->ne[0] == hparams.n_embd) {
bool is_ffn_swapped = (
// only old models need this fix
model.proj_type == PROJECTOR_TYPE_MLP
|| model.proj_type == PROJECTOR_TYPE_MLP_NORM
|| model.proj_type == PROJECTOR_TYPE_LDP
|| model.proj_type == PROJECTOR_TYPE_LDPV2
|| model.proj_type == PROJECTOR_TYPE_QWEN2VL
|| model.proj_type == PROJECTOR_TYPE_QWEN25VL
|| model.proj_type == PROJECTOR_TYPE_GLM_EDGE
|| model.proj_type == PROJECTOR_TYPE_GEMMA3
|| model.proj_type == PROJECTOR_TYPE_IDEFICS3
|| model.proj_type == PROJECTOR_TYPE_MINICPMV
) && layer.ff_up_w && layer.ff_down_w && layer.ff_down_w->ne[0] == hparams.n_embd;
if (is_ffn_swapped) {
// swap up and down weights // swap up and down weights
ggml_tensor * tmp = layer.ff_up_w; ggml_tensor * tmp = layer.ff_up_w;
layer.ff_up_w = layer.ff_down_w; layer.ff_up_w = layer.ff_down_w;
@ -2429,6 +2578,9 @@ struct clip_model_loader {
tmp = layer.ff_up_b; tmp = layer.ff_up_b;
layer.ff_up_b = layer.ff_down_b; layer.ff_up_b = layer.ff_down_b;
layer.ff_down_b = tmp; layer.ff_down_b = tmp;
if (il == 0) {
LOG_WRN("%s: ffn up/down are swapped\n", __func__);
}
} }
} }
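The swap detection above relies on tensor shapes: a correctly exported projector FFN has ff_up mapping n_embd to n_ff and ff_down mapping n_ff back to n_embd, so ff_down_w->ne[0] should equal n_ff; when it equals n_embd instead, the converter wrote the two tensors in the opposite order. A toy restatement of that check, with made-up dimensions:

    #include <cstdint>

    // a correct export has ff_up: [n_embd, n_ff] and ff_down: [n_ff, n_embd];
    // seeing n_embd as ff_down's first dimension means the export is swapped
    static bool ffn_looks_swapped(int64_t ff_down_ne0, int64_t n_embd) {
        return ff_down_ne0 == n_embd;
    }
    // e.g. ffn_looks_swapped(/*ff_down_ne0=*/1152, /*n_embd=*/1152) == true -> swap up/down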
@ -2546,6 +2698,16 @@ struct clip_model_loader {
{ {
model.projection = get_tensor(TN_MM_PROJECTOR); model.projection = get_tensor(TN_MM_PROJECTOR);
} break; } break;
case PROJECTOR_TYPE_LFM2:
case PROJECTOR_TYPE_KIMIVL:
{
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM);
model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B);
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
} break;
case PROJECTOR_TYPE_PIXTRAL: case PROJECTOR_TYPE_PIXTRAL:
{ {
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight")); model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
@ -2944,7 +3106,7 @@ struct image_manipulation {
dst.buf.resize(3 * target_width * target_height); dst.buf.resize(3 * target_width * target_height);
float Cc; float Cc;
float C[5];
float C[5] = {};
float d0, d2, d3, a0, a1, a2, a3; float d0, d2, d3, a0, a1, a2, a3;
int i, j, k, jj; int i, j, k, jj;
int x, y; int x, y;
@ -3467,6 +3629,45 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
res_imgs->grid_y = inst.grid_size.height; res_imgs->grid_y = inst.grid_size.height;
return true; return true;
} else if ( ctx->proj_type() == PROJECTOR_TYPE_LFM2
|| ctx->proj_type() == PROJECTOR_TYPE_KIMIVL
) {
GGML_ASSERT(params.proj_scale_factor);
// smart resize
const int width = img->nx;
const int height = img->ny;
const int total_factor = params.patch_size * params.proj_scale_factor;
constexpr int min_image_tokens = 64;
constexpr int max_image_tokens = 1024;
const float min_pixels = min_image_tokens * total_factor * total_factor;
const float max_pixels = max_image_tokens * total_factor * total_factor;
auto round_by_factor = [f = total_factor](float x) { return static_cast<int>(std::nearbyintf(x / static_cast<float>(f))) * f; };
auto ceil_by_factor = [f = total_factor](float x) { return static_cast<int>(std::ceil(x / static_cast<float>(f))) * f; };
auto floor_by_factor = [f = total_factor](float x) { return static_cast<int>(std::floor(x / static_cast<float>(f))) * f; };
int h_bar = std::max(total_factor, round_by_factor(height));
int w_bar = std::max(total_factor, round_by_factor(width));
if (h_bar * w_bar > max_pixels) {
const auto beta = std::sqrt((height * width) / max_pixels);
h_bar = std::max(total_factor, floor_by_factor(height / beta));
w_bar = std::max(total_factor, floor_by_factor(width / beta));
} else if (h_bar * w_bar < min_pixels) {
const auto beta = std::sqrt(min_pixels / (height * width));
h_bar = ceil_by_factor(height * beta);
w_bar = ceil_by_factor(width * beta);
}
const std::array<uint8_t, 3> pad_color = {122, 116, 104};
clip_image_u8 resized_img;
image_manipulation::resize_and_pad_image(*img, resized_img, clip_image_size{w_bar, h_bar}, pad_color);
clip_image_f32_ptr res(clip_image_f32_init());
normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std);
res_imgs->entries.push_back(std::move(res));
return true;
} }
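To make the smart-resize branch above concrete, here is a worked example under assumed hyperparameters (patch_size = 14 and proj_scale_factor = 2, i.e. total_factor = 28, are illustrative rather than taken from a specific checkpoint): a 1000x700 input rounds each side to the nearest multiple of 28, lands inside the 64-to-1024-token pixel budget, and therefore skips both beta rescaling branches.

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    int main() {
        const int f = 28; // assumed patch_size * proj_scale_factor
        auto round_by_factor = [f](float x) { return (int) std::nearbyintf(x / (float) f) * f; };
        int width = 1000, height = 700; // hypothetical input image
        int w_bar = std::max(f, round_by_factor((float) width));  // 1008
        int h_bar = std::max(f, round_by_factor((float) height)); //  700
        // 1008 * 700 = 705600 pixels, inside [64*28*28, 1024*28*28] = [50176, 802816]
        printf("resize to %dx%d -> %d image tokens\n",
               w_bar, h_bar, (w_bar / f) * (h_bar / f)); // 36 * 25 = 900
        return 0;
    }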
// the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104) // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
@ -3506,10 +3707,10 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
} }
return true; return true;
} else {
GGML_ABORT("Unknown image preprocessing type");
} }
GGML_ASSERT(false && "Unknown image preprocessing type");
} }
ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) { ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
@ -3573,8 +3774,9 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) { int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
const auto & params = ctx->model.hparams; const auto & params = ctx->model.hparams;
// only for models using fixed size square images
int n_patches_sq = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
// for models with fixed size image, the input image is already pre-processed and resized to square
int patch_size = params.patch_size;
int n_patches = (img->nx / patch_size) * (img->ny / patch_size);
projector_type proj = ctx->proj_type(); projector_type proj = ctx->proj_type();
@ -3588,89 +3790,97 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
case PROJECTOR_TYPE_LDPV2: case PROJECTOR_TYPE_LDPV2:
case PROJECTOR_TYPE_GLM_EDGE: case PROJECTOR_TYPE_GLM_EDGE:
{ {
n_patches_sq /= 4;
n_patches /= 4;
if (ctx->model.mm_glm_tok_boi) { if (ctx->model.mm_glm_tok_boi) {
n_patches_sq += 2; // for BOI and EOI token embeddings
n_patches += 2; // for BOI and EOI token embeddings
} }
} break; } break;
case PROJECTOR_TYPE_MINICPMV: case PROJECTOR_TYPE_MINICPMV:
{ {
if (params.minicpmv_version == 2) {
// MiniCPM-V 2.5
n_patches_sq = 96;
// Use actual config value if available, otherwise fall back to hardcoded values
if (params.minicpmv_query_num > 0) {
n_patches = params.minicpmv_query_num;
} else if (params.minicpmv_version == 3) {
// MiniCPM-V 2.6
n_patches_sq = 64;
} else if (params.minicpmv_version == 4) {
// MiniCPM-o 2.6
n_patches_sq = 64;
} else if (params.minicpmv_version == 5) {
// MiniCPM-V 4.0
n_patches_sq = 64;
} else { } else {
GGML_ABORT("Unknown minicpmv version"); // Fallback to hardcoded values for legacy models
if (params.minicpmv_version == 2) {
n_patches = 96;
} else if (params.minicpmv_version == 3) {
n_patches = 64;
} else if (params.minicpmv_version == 4) {
n_patches = 64;
} else if (params.minicpmv_version == 5) {
// MiniCPM-V 4.0
n_patches = 64;
} else if (params.minicpmv_version == 6) {
// MiniCPM-V 4.5
n_patches = 64;
} else {
GGML_ABORT("Unknown minicpmv version");
}
} }
} break; } break;
case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL: case PROJECTOR_TYPE_QWEN25VL:
{ {
// dynamic size
// dynamic size (2 conv, so double patch size)
int patch_size = params.patch_size * 2; int patch_size = params.patch_size * 2;
int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0); int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0); int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
n_patches_sq = x_patch * y_patch;
n_patches = x_patch * y_patch;
} break; } break;
case PROJECTOR_TYPE_GEMMA3: case PROJECTOR_TYPE_GEMMA3:
{
int n_per_side = params.image_size / params.patch_size;
int n_per_side_2d_pool = n_per_side / params.proj_scale_factor;
n_patches_sq = n_per_side_2d_pool * n_per_side_2d_pool;
} break;
case PROJECTOR_TYPE_IDEFICS3: case PROJECTOR_TYPE_IDEFICS3:
case PROJECTOR_TYPE_INTERNVL: case PROJECTOR_TYPE_INTERNVL:
case PROJECTOR_TYPE_LLAMA4:
{ {
// both W and H are divided by proj_scale_factor
n_patches_sq /= (params.proj_scale_factor * params.proj_scale_factor);
// both X and Y are downscaled by the scale factor
int scale_factor = ctx->model.hparams.proj_scale_factor;
n_patches /= (scale_factor * scale_factor);
} break;
case PROJECTOR_TYPE_LFM2:
case PROJECTOR_TYPE_KIMIVL:
{
// dynamic size
int scale_factor = ctx->model.hparams.proj_scale_factor;
int out_patch_size = params.patch_size * scale_factor;
int x_patch = CLIP_ALIGN(img->nx, out_patch_size) / out_patch_size;
int y_patch = CLIP_ALIGN(img->ny, out_patch_size) / out_patch_size;
n_patches = x_patch * y_patch;
} break; } break;
case PROJECTOR_TYPE_PIXTRAL: case PROJECTOR_TYPE_PIXTRAL:
{ {
// dynamic size // dynamic size
int n_merge = params.spatial_merge_size; int n_merge = params.spatial_merge_size;
int n_patches_x = img->nx / params.patch_size / (n_merge > 0 ? n_merge : 1);
int n_patches_y = img->ny / params.patch_size / (n_merge > 0 ? n_merge : 1);
n_patches_sq = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1);
int n_patches_y = img->ny / patch_size / (n_merge > 0 ? n_merge : 1);
n_patches = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
} break;
case PROJECTOR_TYPE_LLAMA4:
{
int scale_factor = ctx->model.hparams.proj_scale_factor;
n_patches_sq /= (scale_factor * scale_factor);
} break; } break;
case PROJECTOR_TYPE_VOXTRAL: case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_ULTRAVOX: case PROJECTOR_TYPE_ULTRAVOX:
case PROJECTOR_TYPE_QWEN2A: case PROJECTOR_TYPE_QWEN2A:
{ {
n_patches_sq = img->nx;
n_patches = img->nx;
const int proj_stack_factor = ctx->model.hparams.proj_stack_factor; const int proj_stack_factor = ctx->model.hparams.proj_stack_factor;
if (ctx->model.audio_has_stack_frames()) { if (ctx->model.audio_has_stack_frames()) {
GGML_ASSERT(proj_stack_factor > 0); GGML_ASSERT(proj_stack_factor > 0);
const int n_len = CLIP_ALIGN(n_patches_sq, proj_stack_factor);
n_patches_sq = n_len / proj_stack_factor;
const int n_len = CLIP_ALIGN(n_patches, proj_stack_factor);
n_patches = n_len / proj_stack_factor;
} }
// whisper downscales input token by half after conv1d // whisper downscales input token by half after conv1d
n_patches_sq /= 2;
n_patches /= 2;
if (ctx->model.audio_has_avgpool()) { if (ctx->model.audio_has_avgpool()) {
// divide by 2 because of nn.AvgPool1d(2, stride=2) // divide by 2 because of nn.AvgPool1d(2, stride=2)
n_patches_sq /= 2;
n_patches /= 2;
} }
} break; } break;
default: default:
GGML_ABORT("unsupported projector type"); GGML_ABORT("unsupported projector type");
} }
return n_patches_sq;
return n_patches;
} }
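For the dynamic-size projectors (LFM2, KIMIVL) the count returned above follows directly from the preprocessing: with the assumed patch_size = 14 and proj_scale_factor = 2 from the earlier worked example, out_patch_size is 28 and the 1008x700 resized image yields 36 x 25 = 900 tokens. A one-line sketch of the round-up alignment involved (mirrors CLIP_ALIGN):

    static inline int align_up(int x, int a) { return (x + a - 1) / a * a; }
    // align_up(1008, 28) / 28 = 36, align_up(700, 28) / 28 = 25 -> 36 * 25 = 900 tokens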
static std::vector<std::vector<std::vector<float>>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector<std::vector<float>> & pos) { static std::vector<std::vector<std::vector<float>>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector<std::vector<float>> & pos) {
@ -4019,6 +4229,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
set_input_i32("positions", positions); set_input_i32("positions", positions);
} break; } break;
case PROJECTOR_TYPE_PIXTRAL: case PROJECTOR_TYPE_PIXTRAL:
case PROJECTOR_TYPE_KIMIVL:
{ {
// set the 2D positions // set the 2D positions
int n_patches_per_col = image_size_width / patch_size; int n_patches_per_col = image_size_width / patch_size;
@ -4070,6 +4281,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
case PROJECTOR_TYPE_INTERNVL: case PROJECTOR_TYPE_INTERNVL:
case PROJECTOR_TYPE_QWEN2A: case PROJECTOR_TYPE_QWEN2A:
case PROJECTOR_TYPE_ULTRAVOX: case PROJECTOR_TYPE_ULTRAVOX:
case PROJECTOR_TYPE_LFM2:
case PROJECTOR_TYPE_VOXTRAL: case PROJECTOR_TYPE_VOXTRAL:
{ {
// do nothing // do nothing
@ -4141,7 +4353,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
} }
int clip_n_mmproj_embd(const struct clip_ctx * ctx) { int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
const auto & hparams = ctx->model.hparams;
switch (ctx->model.proj_type) { switch (ctx->model.proj_type) {
case PROJECTOR_TYPE_LDP: case PROJECTOR_TYPE_LDP:
return ctx->model.mm_model_block_1_block_2_1_b->ne[0]; return ctx->model.mm_model_block_1_block_2_1_b->ne[0];
@ -4153,20 +4364,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
case PROJECTOR_TYPE_MLP_NORM: case PROJECTOR_TYPE_MLP_NORM:
return ctx->model.mm_3_b->ne[0]; return ctx->model.mm_3_b->ne[0];
case PROJECTOR_TYPE_MINICPMV: case PROJECTOR_TYPE_MINICPMV:
if (hparams.minicpmv_version == 2) {
return ctx->model.mm_model_proj->ne[0];
// MiniCPM-V 2.5
return 4096;
} else if (hparams.minicpmv_version == 3) {
// MiniCPM-V 2.6
return 3584;
} else if (hparams.minicpmv_version == 4) {
// MiniCPM-o 2.6
return 3584;
} else if (hparams.minicpmv_version == 5) {
// MiniCPM-V 4.0
return 2560;
}
GGML_ABORT("Unknown minicpmv version");
case PROJECTOR_TYPE_GLM_EDGE: case PROJECTOR_TYPE_GLM_EDGE:
return ctx->model.mm_model_mlp_3_w->ne[1]; return ctx->model.mm_model_mlp_3_w->ne[1];
case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN2VL:
@ -4185,6 +4383,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
return ctx->model.mm_model_proj->ne[1]; return ctx->model.mm_model_proj->ne[1];
case PROJECTOR_TYPE_QWEN2A: case PROJECTOR_TYPE_QWEN2A:
return ctx->model.mm_fc_w->ne[1]; return ctx->model.mm_fc_w->ne[1];
case PROJECTOR_TYPE_LFM2:
case PROJECTOR_TYPE_KIMIVL:
return ctx->model.mm_2_w->ne[1];
default: default:
GGML_ABORT("Unknown projector type"); GGML_ABORT("Unknown projector type");
} }

View File

@ -82,11 +82,6 @@ struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch
*/ */
void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img); void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);
bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
/** interpret bytes as an image file with length bytes_length, and use the result to populate img */
bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */ /** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs ); bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );

View File

@ -217,7 +217,7 @@ struct mtmd_context {
tok_row_end_trail = false; // no trailing end-of-row token tok_row_end_trail = false; // no trailing end-of-row token
ov_img_first = true; ov_img_first = true;
} else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5) {
} else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5 || minicpmv_version == 6) {
// minicpmv 2.6 format: // minicpmv 2.6 format:
// <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ... // <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_6; slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_6;

File diff suppressed because it is too large

View File

@ -116,7 +116,11 @@ func NewContextParams(numCtx int, batchSize int, numSeqMax int, threads int, fla
params.n_threads = C.int(threads) params.n_threads = C.int(threads)
params.n_threads_batch = params.n_threads params.n_threads_batch = params.n_threads
params.embeddings = C.bool(true) params.embeddings = C.bool(true)
params.flash_attn = C.bool(flashAttention)
if flashAttention {
params.flash_attn_type = C.LLAMA_FLASH_ATTN_TYPE_ENABLED
} else {
params.flash_attn_type = C.LLAMA_FLASH_ATTN_TYPE_DISABLED
}
params.type_k = kvCacheTypeFromStr(strings.ToLower(kvCacheType)) params.type_k = kvCacheTypeFromStr(strings.ToLower(kvCacheType))
params.type_v = kvCacheTypeFromStr(strings.ToLower(kvCacheType)) params.type_v = kvCacheTypeFromStr(strings.ToLower(kvCacheType))

View File

@ -15,18 +15,18 @@ problem.
ggml/src/ggml-backend.cpp | 9 +++++++-- ggml/src/ggml-backend.cpp | 9 +++++++--
ggml/src/ggml-cann/ggml-cann.cpp | 2 ++ ggml/src/ggml-cann/ggml-cann.cpp | 2 ++
ggml/src/ggml-cuda/ggml-cuda.cu | 3 +++ ggml/src/ggml-cuda/ggml-cuda.cu | 3 +++
ggml/src/ggml-metal/ggml-metal.m | 1 + ggml/src/ggml-metal/ggml-metal.cpp | 2 ++
ggml/src/ggml-opencl/ggml-opencl.cpp | 1 + ggml/src/ggml-opencl/ggml-opencl.cpp | 1 +
ggml/src/ggml-rpc/ggml-rpc.cpp | 1 + ggml/src/ggml-rpc/ggml-rpc.cpp | 1 +
ggml/src/ggml-sycl/ggml-sycl.cpp | 3 +++ ggml/src/ggml-sycl/ggml-sycl.cpp | 3 +++
ggml/src/ggml-vulkan/ggml-vulkan.cpp | 2 ++ ggml/src/ggml-vulkan/ggml-vulkan.cpp | 2 ++
8 files changed, 20 insertions(+), 2 deletions(-) 8 files changed, 21 insertions(+), 2 deletions(-)
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 1b9d29e9..97f47abd 100644 index ff9135fe..8ba86f82 100644
--- a/ggml/src/ggml-backend.cpp --- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp
@@ -107,7 +107,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) { @@ -113,7 +113,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
if (buffer->iface.free_buffer != NULL) { if (buffer->iface.free_buffer != NULL) {
buffer->iface.free_buffer(buffer); buffer->iface.free_buffer(buffer);
} }
@ -34,7 +34,7 @@ index 1b9d29e9..97f47abd 100644
} }
size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) { size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
@@ -529,6 +528,7 @@ static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) @@ -586,6 +585,7 @@ static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer)
free(ctx->buffers); free(ctx->buffers);
free(ctx); free(ctx);
@ -42,9 +42,9 @@ index 1b9d29e9..97f47abd 100644
} }
static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@@ -1890,6 +1890,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { @@ -2075,6 +2075,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
GGML_ASSERT(buffer);
ggml_aligned_free(buffer->context, buffer->size); ggml_aligned_free(buffer->context, buffer->size);
+ delete buffer; + delete buffer;
+} +}
@ -54,7 +54,7 @@ index 1b9d29e9..97f47abd 100644
} }
static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
@@ -1937,7 +1942,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = { @@ -2127,7 +2132,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
}; };
static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = { static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
@ -64,10 +64,10 @@ index 1b9d29e9..97f47abd 100644
/* .init_tensor = */ NULL, // no initialization required /* .init_tensor = */ NULL, // no initialization required
/* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor, /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index cf575b36..ca1addfa 100755 index b51b554e..3ba0f5a6 100755
--- a/ggml/src/ggml-cann/ggml-cann.cpp --- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -826,6 +826,7 @@ static void ggml_backend_cann_buffer_free_buffer( @@ -843,6 +843,7 @@ static void ggml_backend_cann_buffer_free_buffer(
ggml_backend_cann_buffer_context* ctx = ggml_backend_cann_buffer_context* ctx =
(ggml_backend_cann_buffer_context*)buffer->context; (ggml_backend_cann_buffer_context*)buffer->context;
delete ctx; delete ctx;
@ -75,7 +75,7 @@ index cf575b36..ca1addfa 100755
} }
/** /**
@@ -1572,6 +1573,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf @@ -1630,6 +1631,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf
*/ */
static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) { static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
ACL_CHECK(aclrtFreeHost(buffer->context)); ACL_CHECK(aclrtFreeHost(buffer->context));
@ -84,7 +84,7 @@ index cf575b36..ca1addfa 100755
/** /**
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index d9110491..37ee2a6d 100644 index b7e81b21..fdf8c63d 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu --- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -567,6 +567,7 @@ struct ggml_backend_cuda_buffer_context { @@ -567,6 +567,7 @@ struct ggml_backend_cuda_buffer_context {
@ -111,23 +111,31 @@ index d9110491..37ee2a6d 100644
} }
static void * ggml_cuda_host_malloc(size_t size) { static void * ggml_cuda_host_malloc(size_t size) {
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
index cb8eff4a..7bccc7bf 100644 index e11555a7..909e17de 100644
--- a/ggml/src/ggml-metal/ggml-metal.m --- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -6032,6 +6032,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) @@ -25,6 +25,7 @@ static void ggml_backend_metal_buffer_shared_free_buffer(ggml_backend_buffer_t b
}
free(ctx);
+ free(buffer);
GGML_ASSERT(ggml_metal_buffer_is_shared(ctx));
ggml_metal_buffer_free(ctx);
+ delete buffer;
} }
static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
static void * ggml_backend_metal_buffer_shared_get_base(ggml_backend_buffer_t buffer) {
@@ -99,6 +100,7 @@ static void ggml_backend_metal_buffer_private_free_buffer(ggml_backend_buffer_t
GGML_ASSERT(!ggml_metal_buffer_is_shared(ctx));
ggml_metal_buffer_free(ctx);
+ delete buffer;
}
static void * ggml_backend_metal_buffer_private_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index 8ba1e00d..8163e8dc 100644 index 0cf3b924..09d706b5 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp --- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -2745,6 +2745,7 @@ struct ggml_backend_opencl_buffer_context { @@ -3215,6 +3215,7 @@ struct ggml_backend_opencl_buffer_context {
static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
delete ctx; delete ctx;
@ -136,10 +144,10 @@ index 8ba1e00d..8163e8dc 100644
static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) { static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
index df6ba540..2e395968 100644 index f99681c8..59591770 100644
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp --- a/ggml/src/ggml-rpc/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
@@ -486,6 +486,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) { @@ -505,6 +505,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0); bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
RPC_STATUS_ASSERT(status); RPC_STATUS_ASSERT(status);
delete ctx; delete ctx;
@ -148,7 +156,7 @@ index df6ba540..2e395968 100644
static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) { static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 3992dad0..67503951 100644 index 4ac919ea..447ea3c4 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp --- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -331,6 +331,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try { @@ -331,6 +331,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
@ -176,10 +184,10 @@ index 3992dad0..67503951 100644
static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 4070e248..394a2839 100644 index 2608cbd0..061cd078 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -10209,6 +10209,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) { @@ -11603,6 +11603,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
ggml_vk_destroy_buffer(ctx->dev_buffer); ggml_vk_destroy_buffer(ctx->dev_buffer);
delete ctx; delete ctx;
@ -187,7 +195,7 @@ index 4070e248..394a2839 100644
} }
static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) { static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -10352,6 +10353,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe @@ -11746,6 +11747,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()"); VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
ggml_vk_host_free(vk_instance.devices[0], buffer->context); ggml_vk_host_free(vk_instance.devices[0], buffer->context);
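The change repeated across every backend in this patch is the same ownership fix: each free_buffer callback now also releases the ggml_backend_buffer_t wrapper, because the generic ggml_backend_buffer_free() no longer does. A minimal sketch of the resulting callback shape (my_buffer_context and release_device_memory are placeholders, not real ggml symbols, and the internal ggml-backend-impl.h header is assumed for the buffer definition):

    #include "ggml-backend-impl.h" // internal header that defines struct ggml_backend_buffer

    struct my_buffer_context { void * dev_ptr; };                // placeholder context
    static void release_device_memory(my_buffer_context * ctx) { // placeholder teardown
        (void) ctx;
    }

    static void example_backend_buffer_free_buffer(ggml_backend_buffer_t buffer) {
        my_buffer_context * ctx = (my_buffer_context *) buffer->context;
        release_device_memory(ctx); // backend-specific cleanup
        delete ctx;
        delete buffer;              // new in this patch: the wrapper itself is freed here
    }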

View File

@ -10,10 +10,10 @@ logs instead of throwing an error
1 file changed, 3 insertions(+), 11 deletions(-) 1 file changed, 3 insertions(+), 11 deletions(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index f7e03e70..8ebe11cf 100644 index da938af0..2a38abf4 100644
--- a/src/llama-vocab.cpp --- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp
@@ -1804,16 +1804,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { @@ -1811,16 +1811,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
if (type == LLAMA_VOCAB_TYPE_BPE) { if (type == LLAMA_VOCAB_TYPE_BPE) {
add_space_prefix = false; add_space_prefix = false;
clean_spaces = true; clean_spaces = true;
@ -31,8 +31,8 @@ index f7e03e70..8ebe11cf 100644
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT; pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if ( } else if (
tokenizer_pre == "llama3" || tokenizer_pre == "llama3" ||
@@ -1975,7 +1966,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { @@ -1987,7 +1978,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;
pre_type = LLAMA_VOCAB_PRE_TYPE_GROK_2;
clean_spaces = false; clean_spaces = false;
} else { } else {
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); - throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));

View File

@ -10,7 +10,7 @@ filesystems for paths that include wide characters
1 file changed, 39 insertions(+) 1 file changed, 39 insertions(+)
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 20c21733..f4f69cfc 100644 index 210ecc88..355219a9 100644
--- a/tools/mtmd/clip.cpp --- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp
@@ -28,6 +28,19 @@ @@ -28,6 +28,19 @@
@ -33,7 +33,7 @@ index 20c21733..f4f69cfc 100644
struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL}; struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
enum ffn_op_type { enum ffn_op_type {
@@ -2597,7 +2610,29 @@ struct clip_model_loader { @@ -2759,7 +2772,29 @@ struct clip_model_loader {
{ {
std::vector<uint8_t> read_buf; std::vector<uint8_t> read_buf;
@ -63,7 +63,7 @@ index 20c21733..f4f69cfc 100644
if (!fin) { if (!fin) {
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str())); throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
} }
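The body of the added Windows branch is collapsed above; the usual shape of such a helper converts the UTF-8 path to UTF-16 with MultiByteToWideChar before opening it. A sketch under that assumption, not the exact patch body:

    #ifdef _WIN32
    #include <windows.h>
    #include <cstdio>
    #include <string>

    static FILE * fopen_utf8(const char * path, const wchar_t * mode) {
        int n = MultiByteToWideChar(CP_UTF8, 0, path, -1, nullptr, 0);
        if (n <= 0) {
            return nullptr;
        }
        std::wstring wpath((size_t) n, L'\0');
        MultiByteToWideChar(CP_UTF8, 0, path, -1, &wpath[0], n);
        return _wfopen(wpath.c_str(), mode); // e.g. fopen_utf8(fname.c_str(), L"rb")
    }
    #endif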
@@ -2624,7 +2659,11 @@ struct clip_model_loader { @@ -2786,7 +2821,11 @@ struct clip_model_loader {
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
} }
} }

View File

@ -15,10 +15,10 @@ adds support for the Solar Pro architecture
7 files changed, 248 insertions(+) 7 files changed, 248 insertions(+)
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 18dcc6dd..4b285646 100644 index 4e8d54c4..f98a3574 100644
--- a/src/llama-arch.cpp --- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp +++ b/src/llama-arch.cpp
@@ -78,6 +78,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = { @@ -81,6 +81,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE_MOE, "granitemoe" }, { LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_GRANITE_HYBRID, "granitehybrid" }, { LLM_ARCH_GRANITE_HYBRID, "granitehybrid" },
{ LLM_ARCH_CHAMELEON, "chameleon" }, { LLM_ARCH_CHAMELEON, "chameleon" },
@ -26,15 +26,15 @@ index 18dcc6dd..4b285646 100644
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" }, { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_PLM, "plm" }, { LLM_ARCH_PLM, "plm" },
{ LLM_ARCH_BAILINGMOE, "bailingmoe" }, { LLM_ARCH_BAILINGMOE, "bailingmoe" },
@@ -164,6 +165,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = { @@ -177,6 +178,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
{ LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
+ { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" }, + { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" }, { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" }, { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
@@ -1794,6 +1796,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N @@ -1879,6 +1881,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
}, },
}, },
@ -59,7 +59,7 @@ index 18dcc6dd..4b285646 100644
{ {
LLM_ARCH_WAVTOKENIZER_DEC, LLM_ARCH_WAVTOKENIZER_DEC,
{ {
@@ -2219,6 +2239,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = { @@ -2368,6 +2388,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_LAUREL_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_LAUREL_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
// this tensor is loaded for T5, but never used // this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}}, {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
@ -68,10 +68,10 @@ index 18dcc6dd..4b285646 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h diff --git a/src/llama-arch.h b/src/llama-arch.h
index 7af587e7..3ea994c7 100644 index b5c6f3d7..aa8e0e7b 100644
--- a/src/llama-arch.h --- a/src/llama-arch.h
+++ b/src/llama-arch.h +++ b/src/llama-arch.h
@@ -82,6 +82,7 @@ enum llm_arch { @@ -85,6 +85,7 @@ enum llm_arch {
LLM_ARCH_GRANITE_MOE, LLM_ARCH_GRANITE_MOE,
LLM_ARCH_GRANITE_HYBRID, LLM_ARCH_GRANITE_HYBRID,
LLM_ARCH_CHAMELEON, LLM_ARCH_CHAMELEON,
@ -79,15 +79,15 @@ index 7af587e7..3ea994c7 100644
LLM_ARCH_WAVTOKENIZER_DEC, LLM_ARCH_WAVTOKENIZER_DEC,
LLM_ARCH_PLM, LLM_ARCH_PLM,
LLM_ARCH_BAILINGMOE, LLM_ARCH_BAILINGMOE,
@@ -168,6 +169,7 @@ enum llm_kv { @@ -181,6 +182,7 @@ enum llm_kv {
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE, LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_OUTPUT_SCALE,
LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
+ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, + LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
LLM_KV_ATTENTION_KEY_LENGTH_MLA, LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA, LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
@@ -394,6 +396,7 @@ enum llm_tensor { @@ -417,6 +419,7 @@ enum llm_tensor {
LLM_TENSOR_ENC_OUTPUT_NORM, LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS, LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT, LLM_TENSOR_CLS_OUT,
@ -96,10 +96,10 @@ index 7af587e7..3ea994c7 100644
LLM_TENSOR_CONVNEXT_DW, LLM_TENSOR_CONVNEXT_DW,
LLM_TENSOR_CONVNEXT_NORM, LLM_TENSOR_CONVNEXT_NORM,
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
index 7a06368d..35fc054f 100644 index c04ac58f..24a515a0 100644
--- a/src/llama-hparams.cpp --- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp
@@ -146,6 +146,14 @@ uint32_t llama_hparams::n_pos_per_embd() const { @@ -147,6 +147,14 @@ uint32_t llama_hparams::n_pos_per_embd() const {
return rope_type == LLAMA_ROPE_TYPE_MROPE ? 4 : 1; return rope_type == LLAMA_ROPE_TYPE_MROPE ? 4 : 1;
} }
@ -115,10 +115,10 @@ index 7a06368d..35fc054f 100644
if (il < n_layer) { if (il < n_layer) {
return swa_layers[il]; return swa_layers[il];
diff --git a/src/llama-hparams.h b/src/llama-hparams.h diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index bd231224..29bd9056 100644 index 0fe4b569..eb13709f 100644
--- a/src/llama-hparams.h --- a/src/llama-hparams.h
+++ b/src/llama-hparams.h +++ b/src/llama-hparams.h
@@ -62,6 +62,8 @@ struct llama_hparams { @@ -64,6 +64,8 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr; std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr; std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
@ -127,7 +127,7 @@ index bd231224..29bd9056 100644
uint32_t n_layer_dense_lead = 0; uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0; uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0; uint32_t n_lora_kv = 0;
@@ -220,6 +222,9 @@ struct llama_hparams { @@ -236,6 +238,9 @@ struct llama_hparams {
uint32_t n_pos_per_embd() const; uint32_t n_pos_per_embd() const;
@ -135,10 +135,10 @@ index bd231224..29bd9056 100644
+ bool n_bskcn(uint32_t n, uint32_t il) const; + bool n_bskcn(uint32_t n, uint32_t il) const;
+ +
bool is_swa(uint32_t il) const; bool is_swa(uint32_t il) const;
};
bool has_kv(uint32_t il) const;
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index f71c40f8..7eab9b68 100644 index 8182a9ad..daef900c 100644
--- a/src/llama-model-loader.cpp --- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp
@@ -465,6 +465,7 @@ namespace GGUFMeta { @@ -465,6 +465,7 @@ namespace GGUFMeta {
@ -150,10 +150,10 @@ index f71c40f8..7eab9b68 100644
llama_model_loader::llama_model_loader( llama_model_loader::llama_model_loader(
const std::string & fname, const std::string & fname,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 58ca7df7..280129e1 100644 index 2470f878..0398b553 100644
--- a/src/llama-model.cpp --- a/src/llama-model.cpp
+++ b/src/llama-model.cpp +++ b/src/llama-model.cpp
@@ -1706,6 +1706,21 @@ void llama_model::load_hparams(llama_model_loader & ml) { @@ -1845,6 +1845,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN; default: type = LLM_TYPE_UNKNOWN;
} }
} break; } break;
@ -175,7 +175,7 @@ index 58ca7df7..280129e1 100644
case LLM_ARCH_WAVTOKENIZER_DEC: case LLM_ARCH_WAVTOKENIZER_DEC:
{ {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -4793,6 +4808,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) { @@ -5113,6 +5128,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
@ -210,7 +210,7 @@ index 58ca7df7..280129e1 100644
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -15495,6 +15538,165 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba { @@ -16273,6 +16316,165 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
} }
}; };
@ -229,7 +229,7 @@ index 58ca7df7..280129e1 100644
+ struct ggml_tensor * inp_pos = build_inp_pos(); + struct ggml_tensor * inp_pos = build_inp_pos();
+ +
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();
+ +
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; + const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+ +
@ -316,7 +316,7 @@ index 58ca7df7..280129e1 100644
+ +
+ cur = build_attn(inp_attn, + cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo, + model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+ cb(cur, "attn_out", il); + cb(cur, "attn_out", il);
+ } + }
+ +
@ -376,7 +376,7 @@ index 58ca7df7..280129e1 100644
// ref: https://github.com/facebookresearch/chameleon // ref: https://github.com/facebookresearch/chameleon
// based on the original build_llama() function, changes: // based on the original build_llama() function, changes:
// * qk-norm // * qk-norm
@@ -18439,6 +18641,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { @@ -19552,6 +19754,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
{ {
llm = std::make_unique<llm_build_chameleon>(*this, params); llm = std::make_unique<llm_build_chameleon>(*this, params);
} break; } break;
@ -387,7 +387,7 @@ index 58ca7df7..280129e1 100644
case LLM_ARCH_WAVTOKENIZER_DEC: case LLM_ARCH_WAVTOKENIZER_DEC:
{ {
llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params); llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
@@ -18652,6 +18858,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { @@ -19770,6 +19976,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_GRANITE_MOE: case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_GRANITE_HYBRID: case LLM_ARCH_GRANITE_HYBRID:
case LLM_ARCH_CHAMELEON: case LLM_ARCH_CHAMELEON:
@ -396,10 +396,10 @@ index 58ca7df7..280129e1 100644
case LLM_ARCH_NEO_BERT: case LLM_ARCH_NEO_BERT:
case LLM_ARCH_SMOLLM3: case LLM_ARCH_SMOLLM3:
diff --git a/src/llama-model.h b/src/llama-model.h diff --git a/src/llama-model.h b/src/llama-model.h
index 6fcd74d5..09964533 100644 index d73ce969..c086f94e 100644
--- a/src/llama-model.h --- a/src/llama-model.h
+++ b/src/llama-model.h +++ b/src/llama-model.h
@@ -70,6 +70,7 @@ enum llm_type { @@ -76,6 +76,7 @@ enum llm_type {
LLM_TYPE_15B, LLM_TYPE_15B,
LLM_TYPE_16B, LLM_TYPE_16B,
LLM_TYPE_20B, LLM_TYPE_20B,
@ -407,7 +407,7 @@ index 6fcd74d5..09964533 100644
LLM_TYPE_27B, LLM_TYPE_27B,
LLM_TYPE_30B, LLM_TYPE_30B,
LLM_TYPE_32B, LLM_TYPE_32B,
@@ -367,6 +368,8 @@ struct llama_layer { @@ -380,6 +381,8 @@ struct llama_layer {
// openai-moe // openai-moe
struct ggml_tensor * attn_sinks = nullptr; struct ggml_tensor * attn_sinks = nullptr;

View File

@ -12,7 +12,7 @@ regex
2 files changed, 22 insertions(+), 1 deletion(-) 2 files changed, 22 insertions(+), 1 deletion(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 8ebe11cf..c011008f 100644 index 2a38abf4..26fa9fad 100644
--- a/src/llama-vocab.cpp --- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp
@@ -299,7 +299,7 @@ struct llm_tokenizer_bpe : llm_tokenizer { @@ -299,7 +299,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {

View File

@ -8,10 +8,10 @@ Subject: [PATCH] maintain ordering for rules for grammar
1 file changed, 1 insertion(+), 1 deletion(-) 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index 637891f5..98b8280f 100644 index db1f0b23..f4de7e34 100644
--- a/common/json-schema-to-grammar.cpp --- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp
@@ -307,7 +307,7 @@ private: @@ -308,7 +308,7 @@ private:
friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options); friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
std::function<json(const std::string &)> _fetch_json; std::function<json(const std::string &)> _fetch_json;
bool _dotall; bool _dotall;

View File

@ -11,10 +11,10 @@ with the fastest acceleration is loaded
1 file changed, 13 insertions(+), 8 deletions(-) 1 file changed, 13 insertions(+), 8 deletions(-)
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 6c315137..3040b2aa 100644 index 136afec7..f794d9cf 100644
--- a/ggml/src/ggml-backend-reg.cpp --- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp
@@ -162,7 +162,7 @@ struct ggml_backend_reg_entry { @@ -175,7 +175,7 @@ struct ggml_backend_reg_entry {
struct ggml_backend_registry { struct ggml_backend_registry {
std::vector<ggml_backend_reg_entry> backends; std::vector<ggml_backend_reg_entry> backends;
@ -23,7 +23,7 @@ index 6c315137..3040b2aa 100644
ggml_backend_registry() { ggml_backend_registry() {
#ifdef GGML_USE_CUDA #ifdef GGML_USE_CUDA
@@ -207,7 +207,7 @@ struct ggml_backend_registry { @@ -223,7 +223,7 @@ struct ggml_backend_registry {
} }
} }
@ -32,7 +32,7 @@ index 6c315137..3040b2aa 100644
if (!reg) { if (!reg) {
return; return;
} }
@@ -218,15 +218,20 @@ struct ggml_backend_registry { @@ -234,15 +234,20 @@ struct ggml_backend_registry {
#endif #endif
backends.push_back({ reg, std::move(handle) }); backends.push_back({ reg, std::move(handle) });
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) { for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
@ -56,7 +56,7 @@ index 6c315137..3040b2aa 100644
} }
ggml_backend_reg_t load_backend(const fs::path & path, bool silent) { ggml_backend_reg_t load_backend(const fs::path & path, bool silent) {
@@ -270,7 +275,7 @@ struct ggml_backend_registry { @@ -286,7 +291,7 @@ struct ggml_backend_registry {
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path_str(path).c_str()); GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path_str(path).c_str());
@ -65,7 +65,7 @@ index 6c315137..3040b2aa 100644
return reg; return reg;
} }
@@ -293,7 +298,7 @@ struct ggml_backend_registry { @@ -309,7 +314,7 @@ struct ggml_backend_registry {
// remove devices // remove devices
devices.erase( devices.erase(
std::remove_if(devices.begin(), devices.end(), std::remove_if(devices.begin(), devices.end(),
@ -74,7 +74,7 @@ index 6c315137..3040b2aa 100644
devices.end()); devices.end());
// remove backend // remove backend
@@ -351,7 +356,7 @@ size_t ggml_backend_dev_count() { @@ -367,7 +372,7 @@ size_t ggml_backend_dev_count() {
ggml_backend_dev_t ggml_backend_dev_get(size_t index) { ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
GGML_ASSERT(index < ggml_backend_dev_count()); GGML_ASSERT(index < ggml_backend_dev_count());
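Only fragments of this patch are visible above; the idea is that the registry stores a score next to each registered device and keeps the list ordered so devices from the fastest available backend variant are enumerated first. A hedged sketch of that ordering (dev_entry and register_device_scored are illustrative names, not the actual symbols):

    #include <algorithm>
    #include <vector>

    struct dev_entry { void * dev; int score; }; // stand-in for (ggml_backend_dev_t, score)

    static void register_device_scored(std::vector<dev_entry> & devices, void * dev, int score) {
        devices.push_back({dev, score});
        // highest-scoring (fastest) backend's devices go to the front
        std::stable_sort(devices.begin(), devices.end(),
                         [](const dev_entry & a, const dev_entry & b) { return a.score > b.score; });
    }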

View File

@ -8,10 +8,10 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants
1 file changed, 2 insertions(+) 1 file changed, 2 insertions(+)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 177fb282..f5a5079a 100644 index c8f3d859..ff6229a0 100644
--- a/ggml/src/CMakeLists.txt --- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt
@@ -304,6 +304,7 @@ function(ggml_add_cpu_backend_variant tag_name) @@ -307,6 +307,7 @@ function(ggml_add_cpu_backend_variant tag_name)
endif() endif()
ggml_add_cpu_backend_variant_impl(${tag_name}) ggml_add_cpu_backend_variant_impl(${tag_name})
@ -19,7 +19,7 @@ index 177fb282..f5a5079a 100644
endfunction() endfunction()
ggml_add_backend(CPU) ggml_add_backend(CPU)
@@ -314,6 +315,7 @@ if (GGML_CPU_ALL_VARIANTS) @@ -317,6 +318,7 @@ if (GGML_CPU_ALL_VARIANTS)
elseif (GGML_CPU_ARM_ARCH) elseif (GGML_CPU_ARM_ARCH)
message(FATAL_ERROR "Cannot use both GGML_CPU_ARM_ARCH and GGML_CPU_ALL_VARIANTS") message(FATAL_ERROR "Cannot use both GGML_CPU_ARM_ARCH and GGML_CPU_ALL_VARIANTS")
endif() endif()

View File

@ -9,10 +9,10 @@ disable amx as it reduces performance on some systems
1 file changed, 4 deletions(-) 1 file changed, 4 deletions(-)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index f5a5079a..5158acd6 100644 index ff6229a0..33b3a15f 100644
--- a/ggml/src/CMakeLists.txt --- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt
@@ -324,10 +324,6 @@ if (GGML_CPU_ALL_VARIANTS) @@ -327,10 +327,6 @@ if (GGML_CPU_ALL_VARIANTS)
ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512) ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI) ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI) ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)

View File

@ -25,7 +25,7 @@ index 79ee2020..3efb22f0 100644
// get ith C string from array with given key_id // get ith C string from array with given key_id
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int64_t key_id, size_t i); GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int64_t key_id, size_t i);
diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp
index 53504399..0f71d5f3 100644 index 8cc4ef1c..d950dbdf 100644
--- a/ggml/src/gguf.cpp --- a/ggml/src/gguf.cpp
+++ b/ggml/src/gguf.cpp +++ b/ggml/src/gguf.cpp
@@ -805,10 +805,14 @@ enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int64_t key_id @@ -805,10 +805,14 @@ enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int64_t key_id
@ -53,10 +53,10 @@ index 53504399..0f71d5f3 100644
} }
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index c011008f..fa388b03 100644 index 26fa9fad..64c78a16 100644
--- a/src/llama-vocab.cpp --- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp
@@ -1760,9 +1760,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { @@ -1767,9 +1767,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str()); const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
if (precompiled_charsmap_keyidx != -1) { if (precompiled_charsmap_keyidx != -1) {
const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx); const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);
@ -66,4 +66,4 @@ index c011008f..fa388b03 100644
+ const size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx); + const size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx);
const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx); const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap); precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
#ifdef IS_BIG_ENDIAN
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__

View File

@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor
1 file changed, 6 insertions(+) 1 file changed, 6 insertions(+)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index d89cd8f4..a5689c18 100644 index dbc07301..f8574d01 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c --- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -15,6 +15,8 @@ @@ -15,6 +15,8 @@
@ -20,7 +20,7 @@ index d89cd8f4..a5689c18 100644
#if defined(_MSC_VER) || defined(__MINGW32__) #if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW #include <malloc.h> // using malloc.h with MSC/MINGW
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -2858,6 +2860,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { @@ -2881,6 +2883,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
ggml_compute_forward(&params, node); ggml_compute_forward(&params, node);
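The single call this patch adds after ggml_compute_forward is collapsed above; conceptually it hands the freshly computed node to an ollama-specific debug hook. A sketch of what such a hook can look like (the OLLAMA_DEBUG_TENSOR gate and the dump format are assumptions, not the actual implementation):

    #include "ggml.h"
    #include <stdio.h>
    #include <stdlib.h>

    static void maybe_dump_tensor(const struct ggml_tensor * node) {
        if (getenv("OLLAMA_DEBUG_TENSOR") == NULL) { // hypothetical gate
            return;
        }
        fprintf(stderr, "%s: %s [%lld, %lld, %lld, %lld]\n",
                node->name, ggml_type_name(node->type),
                (long long) node->ne[0], (long long) node->ne[1],
                (long long) node->ne[2], (long long) node->ne[3]);
    }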

View File

@ -184,10 +184,10 @@ index f8c291de..2a3a62db 100644
const char * grammar_root, const char * grammar_root,
bool lazy, bool lazy,
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index bfbf5fa2..11f93f42 100644 index 2186f827..8fb86009 100644
--- a/src/llama-sampling.cpp --- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp
@@ -1466,7 +1466,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) { @@ -1563,7 +1563,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
trigger_patterns_c.push_back(trigger_pattern.pattern.c_str()); trigger_patterns_c.push_back(trigger_pattern.pattern.c_str());
} }
@ -196,7 +196,7 @@ index bfbf5fa2..11f93f42 100644
ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(), ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(),
ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size()); ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size());
@@ -1548,7 +1548,7 @@ static struct llama_sampler * llama_sampler_init_grammar_impl( @@ -1645,7 +1645,7 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
/* .vocab = */ vocab, /* .vocab = */ vocab,
/* .grammar_str = */ grammar_str, /* .grammar_str = */ grammar_str,
/* .grammar_root = */ grammar_root, /* .grammar_root = */ grammar_root,

View File

@ -4,17 +4,18 @@ Date: Thu, 1 May 2025 13:45:12 -0700
Subject: [PATCH] add argsort and cuda copy for i32 Subject: [PATCH] add argsort and cuda copy for i32
--- ---
ggml/src/ggml-cpu/ops.cpp | 43 +++++++++++++ ggml/src/ggml-cpu/ops.cpp | 43 +++++++++++
ggml/src/ggml-cuda/argsort.cu | 102 ++++++++++++++++++++++++++++++- ggml/src/ggml-cuda/argsort.cu | 102 ++++++++++++++++++++++++++-
ggml/src/ggml-cuda/cpy-utils.cuh | 6 ++ ggml/src/ggml-cuda/cpy-utils.cuh | 6 ++
ggml/src/ggml-cuda/cpy.cu | 43 +++++++++++++ ggml/src/ggml-cuda/cpy.cu | 43 +++++++++++
4 files changed, 192 insertions(+), 2 deletions(-) ggml/src/ggml-metal/ggml-metal.metal | 64 +++++++++++++++++
5 files changed, 256 insertions(+), 2 deletions(-)
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 854f1c2b..a2924757 100644 index 14f7dcf4..f7f8da35 100644
--- a/ggml/src/ggml-cpu/ops.cpp --- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp
@@ -8146,6 +8146,45 @@ static void ggml_compute_forward_argsort_f32( @@ -7893,6 +7893,45 @@ static void ggml_compute_forward_argsort_f32(
} }
} }
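The added i32 path is collapsed above; as an illustration rather than the exact patch body, a minimal int32 argsort row kernel under the usual ggml conventions (dst receives int32 indices that order each source row) can look like this:

    #include <algorithm>
    #include <cstdint>
    #include <numeric>

    static void argsort_row_i32(const int32_t * row, int32_t * idx, int n, bool ascending) {
        std::iota(idx, idx + n, 0); // start from the identity permutation 0, 1, ..., n-1
        std::sort(idx, idx + n, [row, ascending](int32_t a, int32_t b) {
            return ascending ? row[a] < row[b] : row[a] > row[b];
        });
    }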
@ -60,7 +61,7 @@ index 854f1c2b..a2924757 100644
void ggml_compute_forward_argsort( void ggml_compute_forward_argsort(
const ggml_compute_params * params, const ggml_compute_params * params,
ggml_tensor * dst) { ggml_tensor * dst) {
@@ -8157,6 +8196,10 @@ void ggml_compute_forward_argsort( @@ -7904,6 +7943,10 @@ void ggml_compute_forward_argsort(
{ {
ggml_compute_forward_argsort_f32(params, dst); ggml_compute_forward_argsort_f32(params, dst);
} break; } break;
@ -196,12 +197,12 @@ index 607ded85..53b02634 100644
+ } + }
} }
diff --git a/ggml/src/ggml-cuda/cpy-utils.cuh b/ggml/src/ggml-cuda/cpy-utils.cuh diff --git a/ggml/src/ggml-cuda/cpy-utils.cuh b/ggml/src/ggml-cuda/cpy-utils.cuh
index 410c12b7..b8e9e107 100644 index e621cb98..597c0c8b 100644
--- a/ggml/src/ggml-cuda/cpy-utils.cuh --- a/ggml/src/ggml-cuda/cpy-utils.cuh
+++ b/ggml/src/ggml-cuda/cpy-utils.cuh +++ b/ggml/src/ggml-cuda/cpy-utils.cuh
@@ -223,3 +223,9 @@ template<typename src_t, typename dst_t> @@ -215,3 +215,9 @@ template<typename src_t, typename dst_t>
static __device__ void cpy_1_flt(const char * cxi, char * cdsti) { static __device__ void cpy_1_flt(const char * cxi, char * cdsti) {
convert_flt((const src_t *)cxi, (dst_t *)cdsti); *(dst_t *) cdsti = ggml_cuda_cast<dst_t>(*(const src_t *) cxi);
} }
+ +
+static __device__ void cpy_1_i32_i32(const char * cxi, char * cdsti) { +static __device__ void cpy_1_i32_i32(const char * cxi, char * cdsti) {
@ -210,10 +211,10 @@ index 410c12b7..b8e9e107 100644
+ *dst = *src; + *dst = *src;
+} +}
diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
index f9bb0256..9c3774e5 100644 index 746f4396..911220e9 100644
--- a/ggml/src/ggml-cuda/cpy.cu --- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu +++ b/ggml/src/ggml-cuda/cpy.cu
@@ -278,6 +278,47 @@ static void ggml_cpy_f32_iq4_nl_cuda( @@ -277,6 +277,47 @@ static void ggml_cpy_f32_iq4_nl_cuda(
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++); (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
} }
@ -261,7 +262,7 @@ index f9bb0256..9c3774e5 100644
void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1, bool disable_indirection_for_this_node) { void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1, bool disable_indirection_for_this_node) {
const int64_t ne = ggml_nelements(src0); const int64_t ne = ggml_nelements(src0);
GGML_ASSERT(ne == ggml_nelements(src1)); GGML_ASSERT(ne == ggml_nelements(src1));
@@ -369,6 +410,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg @@ -372,6 +413,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
ggml_cpy_flt_cuda<half, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); ggml_cpy_flt_cuda<half, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) { } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
ggml_cpy_flt_cuda<half, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); ggml_cpy_flt_cuda<half, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
@ -270,3 +271,80 @@ index f9bb0256..9c3774e5 100644
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16) { } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16) {
ggml_cpy_flt_cuda<nv_bfloat16, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); ggml_cpy_flt_cuda<nv_bfloat16, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F16) { } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F16) {
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 96df6f0c..44dc31c0 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -4428,8 +4428,72 @@ kernel void kernel_argsort_f32_i32(
}
}
+typedef void (i32_argsort_t)(
+ constant ggml_metal_kargs_argsort & args,
+ device const int32_t * x,
+ device int32_t * dst,
+ threadgroup int32_t * shared_values [[threadgroup(0)]],
+ uint3 tgpig[[threadgroup_position_in_grid]],
+ uint3 tpitg[[thread_position_in_threadgroup]]);
+
+template<ggml_sort_order order>
+kernel void kernel_argsort_i32_i32(
+ constant ggml_metal_kargs_argsort & args,
+ device const int32_t * x,
+ device int32_t * dst,
+ threadgroup int32_t * shared_values [[threadgroup(0)]],
+ uint3 tgpig[[threadgroup_position_in_grid]],
+ uint3 tpitg[[thread_position_in_threadgroup]]) {
+ // bitonic sort
+ int col = tpitg[0];
+ int row = tgpig[1];
+
+ if (col >= args.ncols_pad) return;
+
+ device const int32_t * x_row = x + row * args.ncols;
+ threadgroup int32_t * dst_row = shared_values;
+
+ // initialize indices
+ dst_row[col] = col;
+
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+
+ for (int k = 2; k <= args.ncols_pad; k *= 2) {
+ for (int j = k / 2; j > 0; j /= 2) {
+ int ixj = col ^ j;
+ if (ixj > col) {
+ if ((col & k) == 0) {
+ if (dst_row[col] >= args.ncols ||
+ (dst_row[ixj] < args.ncols && (order == GGML_SORT_ORDER_ASC ?
+ x_row[dst_row[col]] > x_row[dst_row[ixj]] :
+ x_row[dst_row[col]] < x_row[dst_row[ixj]]))
+ ) {
+ SWAP(dst_row[col], dst_row[ixj]);
+ }
+ } else {
+ if (dst_row[ixj] >= args.ncols ||
+ (dst_row[col] < args.ncols && (order == GGML_SORT_ORDER_ASC ?
+ x_row[dst_row[col]] < x_row[dst_row[ixj]] :
+ x_row[dst_row[col]] > x_row[dst_row[ixj]]))
+ ) {
+ SWAP(dst_row[col], dst_row[ixj]);
+ }
+ }
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ }
+ }
+
+ // copy the result to dst without the padding
+ if (col < args.ncols) {
+ dst[row * args.ncols + col] = dst_row[col];
+ }
+}
+
template [[host_name("kernel_argsort_f32_i32_asc")]] kernel argsort_t kernel_argsort_f32_i32<GGML_SORT_ORDER_ASC>;
template [[host_name("kernel_argsort_f32_i32_desc")]] kernel argsort_t kernel_argsort_f32_i32<GGML_SORT_ORDER_DESC>;
+template [[host_name("kernel_argsort_i32_i32_asc")]] kernel i32_argsort_t kernel_argsort_i32_i32<GGML_SORT_ORDER_ASC>;
+template [[host_name("kernel_argsort_i32_i32_desc")]] kernel i32_argsort_t kernel_argsort_i32_i32<GGML_SORT_ORDER_DESC>;
kernel void kernel_leaky_relu_f32(
constant ggml_metal_kargs_leaky_relu & args,
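
A note on the new i32 argsort path added above (CPU, CUDA and Metal variants): the kernels all implement the same contract, namely "return the indices of a row, ordered by the values they point at". A minimal host-side reference sketch in C++ can make that contract concrete; argsort_i32_row is an illustrative name, not ggml API. The GPU kernels reach the same ordering with a bitonic network so every thread follows an identical compare-and-swap schedule, padding each row to a power of two and discarding the padding on write-back.

    #include <algorithm>
    #include <cstdint>
    #include <numeric>
    #include <vector>

    // Reference argsort for one row of int32 values.
    // ascending = true mirrors GGML_SORT_ORDER_ASC, false mirrors GGML_SORT_ORDER_DESC.
    static std::vector<int32_t> argsort_i32_row(const std::vector<int32_t> & x, bool ascending) {
        std::vector<int32_t> idx(x.size());
        std::iota(idx.begin(), idx.end(), 0);             // start with indices 0..n-1
        std::stable_sort(idx.begin(), idx.end(), [&](int32_t a, int32_t b) {
            return ascending ? x[a] < x[b] : x[a] > x[b]; // order indices by the values they reference
        });
        return idx;                                       // idx[i] = position of the i-th smallest (or largest) value
    }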


@ -6,12 +6,12 @@ Subject: [PATCH] graph memory reporting on failure
--- ---
ggml/include/ggml-alloc.h | 1 + ggml/include/ggml-alloc.h | 1 +
ggml/include/ggml-backend.h | 1 + ggml/include/ggml-backend.h | 1 +
ggml/src/ggml-alloc.c | 36 ++++++++++++++++++++++++++++++++---- ggml/src/ggml-alloc.c | 34 +++++++++++++++++++++++++++++++---
ggml/src/ggml-backend.cpp | 7 +++++++ ggml/src/ggml-backend.cpp | 7 +++++++
4 files changed, 41 insertions(+), 4 deletions(-) 4 files changed, 40 insertions(+), 3 deletions(-)
diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h
index 2cb150fd2..7ab3f0192 100644 index 2cb150fd..7ab3f019 100644
--- a/ggml/include/ggml-alloc.h --- a/ggml/include/ggml-alloc.h
+++ b/ggml/include/ggml-alloc.h +++ b/ggml/include/ggml-alloc.h
@@ -65,6 +65,7 @@ GGML_API bool ggml_gallocr_reserve_n( @@ -65,6 +65,7 @@ GGML_API bool ggml_gallocr_reserve_n(
@ -23,31 +23,31 @@ index 2cb150fd2..7ab3f0192 100644
// Utils // Utils
// Create a buffer and allocate all the tensors in a ggml_context // Create a buffer and allocate all the tensors in a ggml_context
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index a2977ea2e..e8cf30841 100644 index 62b6d65e..fe20dca3 100644
--- a/ggml/include/ggml-backend.h --- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h
@@ -303,6 +303,7 @@ extern "C" { @@ -316,6 +316,7 @@ extern "C" {
GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend); GGML_API ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend);
+ GGML_API size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend); GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
+ GGML_API size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend); GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node); GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index 8b6e60283..b58bd671d 100644 index fa46f3b4..421ff7c7 100644
--- a/ggml/src/ggml-alloc.c --- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c +++ b/ggml/src/ggml-alloc.c
@@ -350,6 +350,7 @@ struct node_alloc { @@ -492,6 +492,7 @@ struct node_alloc {
struct ggml_gallocr { struct ggml_gallocr {
ggml_backend_buffer_type_t * bufts; // [n_buffers] ggml_backend_buffer_type_t * bufts; // [n_buffers]
ggml_backend_buffer_t * buffers; // [n_buffers] struct vbuffer ** buffers; // [n_buffers]
+ size_t *buffer_sizes; // [n_buffers] + size_t *buffer_sizes; // [n_buffers]
struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers] struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
int n_buffers; int n_buffers;
@@ -373,6 +374,9 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs @@ -515,6 +516,9 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t)); galloc->buffers = calloc(n_bufs, sizeof(struct vbuffer *));
GGML_ASSERT(galloc->buffers != NULL); GGML_ASSERT(galloc->buffers != NULL);
+ galloc->buffer_sizes = calloc(n_bufs, sizeof(size_t)); + galloc->buffer_sizes = calloc(n_bufs, sizeof(size_t));
@ -56,7 +56,7 @@ index 8b6e60283..b58bd671d 100644
galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *)); galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
GGML_ASSERT(galloc->buf_tallocs != NULL); GGML_ASSERT(galloc->buf_tallocs != NULL);
@@ -439,6 +443,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) { @@ -582,6 +586,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
ggml_hash_set_free(&galloc->hash_set); ggml_hash_set_free(&galloc->hash_set);
free(galloc->hash_values); free(galloc->hash_values);
free(galloc->bufts); free(galloc->bufts);
@ -64,7 +64,7 @@ index 8b6e60283..b58bd671d 100644
free(galloc->buffers); free(galloc->buffers);
free(galloc->buf_tallocs); free(galloc->buf_tallocs);
free(galloc->node_allocs); free(galloc->node_allocs);
@@ -734,6 +739,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c @@ -875,6 +880,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
} }
} }
@ -73,23 +73,21 @@ index 8b6e60283..b58bd671d 100644
// reallocate buffers if needed // reallocate buffers if needed
for (int i = 0; i < galloc->n_buffers; i++) { for (int i = 0; i < galloc->n_buffers; i++) {
// if the buffer type is used multiple times, we reuse the same buffer // if the buffer type is used multiple times, we reuse the same buffer
@@ -755,15 +762,20 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c @@ -896,14 +903,19 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
ggml_backend_buffer_free(galloc->buffers[i]); ggml_vbuffer_free(galloc->buffers[i]);
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size); galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
- if (galloc->buffers[i] == NULL) { - if (galloc->buffers[i] == NULL) {
+ if (galloc->buffers[i]) { + if (galloc->buffers[i]) {
+ galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]); + galloc->buffer_sizes[i] = ggml_vbuffer_size(galloc->buffers[i]);
+ ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
+ } else { + } else {
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size); GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
- return false; - return false;
+ galloc->buffer_sizes[i] = new_size; + galloc->buffer_sizes[i] = new_size;
+ success = false; + success = false;
} }
- ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
+ } else { + } else {
+ galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]); + galloc->buffer_sizes[i] = ggml_vbuffer_size(galloc->buffers[i]);
} }
} }
@ -98,8 +96,8 @@ index 8b6e60283..b58bd671d 100644
} }
bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) { bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
@@ -920,6 +932,22 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) { @@ -1058,6 +1070,22 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]); return ggml_vbuffer_size(galloc->buffers[buffer_id]);
} }
+size_t ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) { +size_t ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
@ -122,10 +120,10 @@ index 8b6e60283..b58bd671d 100644
static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) { static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 97f47abd2..d02a40e60 100644 index 8ba86f82..cb2b9956 100644
--- a/ggml/src/ggml-backend.cpp --- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp
@@ -1631,6 +1631,13 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe @@ -1809,6 +1809,13 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
return ggml_gallocr_get_buffer_size(sched->galloc, backend_index); return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
} }
@ -137,5 +135,5 @@ index 97f47abd2..d02a40e60 100644
+} +}
+ +
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) { void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
GGML_ASSERT(sched);
int backend_index = ggml_backend_sched_backend_id(sched, backend); int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
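
The point of this patch is that a failed reservation no longer hides its size: the allocator records the attempted size for each buffer and the scheduler exposes it through the new accessor. A hedged sketch of how a caller might use it; try_reserve is a hypothetical wrapper and the reporting policy is illustrative, not part of the patch.

    #include "ggml-backend.h"
    #include <stdio.h>

    // Try to reserve compute buffers for a measured graph; on failure, report how much
    // each backend would have needed so the caller can shrink the batch or offload less.
    static bool try_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * graph,
                            ggml_backend_t * backends, int n_backends) {
        if (ggml_backend_sched_reserve(sched, graph)) {
            return true;
        }
        for (int i = 0; i < n_backends; i++) {
            size_t attempted = ggml_backend_sched_get_attempted_buffer_size(sched, backends[i]);
            fprintf(stderr, "reserve failed: %s would have needed %zu bytes\n",
                    ggml_backend_name(backends[i]), attempted);
        }
        return false;
    }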


@ -6,28 +6,28 @@ Subject: [PATCH] ggml: Export GPU UUIDs
This enables matching up devices and information reported by the backend This enables matching up devices and information reported by the backend
with tools (e.g. nvidia-smi) and system management libraries (e.g. nvml). with tools (e.g. nvidia-smi) and system management libraries (e.g. nvml).
--- ---
ggml/include/ggml-backend.h | 1 + ggml/include/ggml-backend.h | 1 +
ggml/src/ggml-cuda/ggml-cuda.cu | 67 +++++++++++++++++++++++++++++--- ggml/src/ggml-cuda/ggml-cuda.cu | 67 +++++++++++++++++++++++++++---
ggml/src/ggml-metal/ggml-metal.m | 1 + ggml/src/ggml-metal/ggml-metal.cpp | 1 +
3 files changed, 63 insertions(+), 6 deletions(-) 3 files changed, 63 insertions(+), 6 deletions(-)
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 8a91b381..9424394e 100644 index fe20dca3..48777212 100644
--- a/ggml/include/ggml-backend.h --- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h
@@ -152,6 +152,7 @@ extern "C" { @@ -158,6 +158,7 @@ extern "C" {
struct ggml_backend_dev_props {
const char * name;
const char * description; const char * description;
+ const char * id; // device free memory in bytes
size_t memory_free; size_t memory_free;
+ const char * id;
// device total memory in bytes
size_t memory_total; size_t memory_total;
enum ggml_backend_dev_type type; // device type
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 37ee2a6d..57eae461 100644 index fdf8c63d..ad389ece 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu --- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -179,6 +179,51 @@ static int ggml_cuda_parse_id(char devName[]) { @@ -183,6 +183,51 @@ static int ggml_cuda_parse_id(char devName[]) {
} }
#endif // defined(GGML_USE_HIP) #endif // defined(GGML_USE_HIP)
@ -77,9 +77,9 @@ index 37ee2a6d..57eae461 100644
+} +}
+ +
static ggml_cuda_device_info ggml_cuda_init() { static ggml_cuda_device_info ggml_cuda_init() {
#if defined(GGML_USE_HIP) ggml_cuda_device_info info = {};
// Workaround for a rocBLAS bug when using multiple graphics cards:
@@ -267,22 +312,24 @@ static ggml_cuda_device_info ggml_cuda_init() { @@ -249,22 +294,24 @@ static ggml_cuda_device_info ggml_cuda_init() {
info.devices[id].cc += prop.minor * 0x10; info.devices[id].cc += prop.minor * 0x10;
} }
} }
@ -107,18 +107,18 @@ index 37ee2a6d..57eae461 100644
+ GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n", + GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
+ id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no", + id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
+ ggml_cuda_parse_uuid(prop, id).c_str()); + ggml_cuda_parse_uuid(prop, id).c_str());
#endif // defined(GGML_USE_HIP) std::string device_name(prop.name);
} if (device_name == "NVIDIA GeForce MX450") {
turing_devices_without_mma.push_back({ id, device_name });
@@ -3144,6 +3191,7 @@ struct ggml_backend_cuda_device_context { @@ -3273,6 +3320,7 @@ struct ggml_backend_cuda_device_context {
int device;
std::string name; std::string name;
std::string description; std::string description;
std::string pci_bus_id;
+ std::string id; + std::string id;
}; };
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) { static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -3156,6 +3204,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t @@ -3285,6 +3333,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
return ctx->description.c_str(); return ctx->description.c_str();
} }
@ -130,31 +130,31 @@ index 37ee2a6d..57eae461 100644
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
ggml_cuda_set_device(ctx->device); ggml_cuda_set_device(ctx->device);
@@ -3170,6 +3223,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend @@ -3301,6 +3354,7 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
props->name = ggml_backend_cuda_device_get_name(dev); props->name = ggml_backend_cuda_device_get_name(dev);
props->description = ggml_backend_cuda_device_get_description(dev); props->description = ggml_backend_cuda_device_get_description(dev);
+ props->id = ggml_backend_cuda_device_get_id(dev); + props->id = ggml_backend_cuda_device_get_id(dev);
props->type = ggml_backend_cuda_device_get_type(dev); props->type = ggml_backend_cuda_device_get_type(dev);
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total); ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
@@ -3871,6 +3925,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
@@ -3767,6 +3821,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
cudaDeviceProp prop; cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, i)); CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
dev_ctx->description = prop.name; dev_ctx->description = prop.name;
+ dev_ctx->id = ggml_cuda_parse_uuid(prop, i); + dev_ctx->id = ggml_cuda_parse_uuid(prop, i);
ggml_backend_dev_t dev = new ggml_backend_device { char pci_bus_id[16] = {};
/* .iface = */ ggml_backend_cuda_device_interface, snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
index 7bccc7bf..fe7b2f0a 100644 index 909e17de..08ab4fc9 100644
--- a/ggml/src/ggml-metal/ggml-metal.m --- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -6522,6 +6522,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen @@ -538,6 +538,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
props->name = ggml_backend_metal_device_get_name(dev); props->name = ggml_backend_metal_device_get_name(dev);
props->description = ggml_backend_metal_device_get_description(dev); props->description = ggml_backend_metal_device_get_description(dev);
+ props->id = "0"; + props->id = "0";
props->type = ggml_backend_metal_device_get_type(dev); props->type = ggml_backend_metal_device_get_type(dev);
ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total); ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
props->caps = (struct ggml_backend_dev_caps) {
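
The id string attached to the device properties is what lets a client correlate a ggml device with the entry that nvidia-smi or NVML reports. A sketch of the kind of formatting involved, assuming the usual "GPU-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" convention; format_cuda_uuid is an illustrative helper, and the vendored ggml_cuda_parse_uuid may differ in detail, in particular on HIP where the identifier comes from different fields.

    #include <cuda_runtime.h>
    #include <cstdio>
    #include <string>

    // Format cudaDeviceProp::uuid the way nvidia-smi prints it ("GPU-" + hex groups).
    static std::string format_cuda_uuid(const cudaDeviceProp & prop) {
        const unsigned char * b = (const unsigned char *) prop.uuid.bytes;
        char buf[64];
        snprintf(buf, sizeof(buf),
                 "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
                 b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7],
                 b[8], b[9], b[10], b[11], b[12], b[13], b[14], b[15]);
        return std::string(buf);
    }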


@ -10,7 +10,7 @@ Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
2 files changed, 13 insertions(+) 2 files changed, 13 insertions(+)
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index a05373d5..6f70f7f4 100644 index cd022c5e..3d680945 100644
--- a/tools/mtmd/mtmd.cpp --- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp
@@ -79,6 +79,16 @@ enum mtmd_slice_tmpl { @@ -79,6 +79,16 @@ enum mtmd_slice_tmpl {


@ -8,10 +8,10 @@ Subject: [PATCH] no power throttling win32 with gnuc
1 file changed, 1 insertion(+), 1 deletion(-) 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index a5689c18..85af19a3 100644 index f8574d01..530efce0 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c --- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -2412,7 +2412,7 @@ static bool ggml_thread_apply_priority(int32_t prio) { @@ -2431,7 +2431,7 @@ static bool ggml_thread_apply_priority(int32_t prio) {
// Newer Windows 11 versions aggresively park (offline) CPU cores and often place // Newer Windows 11 versions aggresively park (offline) CPU cores and often place
// all our threads onto the first 4 cores which results in terrible performance with // all our threads onto the first 4 cores which results in terrible performance with
// n_threads > 4 // n_threads > 4
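
The comment kept as context here sits above ggml's opt-out from Windows power throttling, and the one-line change appears to widen the compiler guard so that MinGW/GCC builds on Windows take the same path as MSVC. For orientation only, a sketch of the Win32 call such a block is built around; disable_thread_power_throttling is a hypothetical helper using the documented Windows 10 SDK API, not a copy of the ggml code.

    #include <windows.h>

    // Opt the current thread out of EcoQoS power throttling so Windows 11 is less
    // inclined to park it onto efficiency cores. StateMask = 0 with the
    // EXECUTION_SPEED bit set in ControlMask means "never throttle this thread".
    static bool disable_thread_power_throttling(void) {
        THREAD_POWER_THROTTLING_STATE state;
        ZeroMemory(&state, sizeof(state));
        state.Version     = THREAD_POWER_THROTTLING_CURRENT_VERSION;
        state.ControlMask = THREAD_POWER_THROTTLING_EXECUTION_SPEED;
        state.StateMask   = 0;
        return SetThreadInformation(GetCurrentThread(), ThreadPowerThrottling,
                                    &state, sizeof(state)) != 0;
    }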


@ -5,23 +5,24 @@ Subject: [PATCH] BF16 macos version guard
Only enable BF16 on supported MacOS versions (v14+) Only enable BF16 on supported MacOS versions (v14+)
--- ---
ggml/src/ggml-metal/ggml-metal.m | 6 +++++- ggml/src/ggml-metal/ggml-metal-context.m | 7 ++++++-
1 file changed, 5 insertions(+), 1 deletion(-) 1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m diff --git a/ggml/src/ggml-metal/ggml-metal-context.m b/ggml/src/ggml-metal/ggml-metal-context.m
index fe7b2f0a..e4c31268 100644 index 052efb7a..b47dc787 100644
--- a/ggml/src/ggml-metal/ggml-metal.m --- a/ggml/src/ggml-metal/ggml-metal-context.m
+++ b/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal-context.m
@@ -106,7 +106,11 @@ static id<MTLDevice> ggml_backend_metal_device_acq(struct ggml_backend_metal_dev @@ -125,7 +125,12 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
ctx->has_bfloat |= [ctx->mtl_device supportsFamily:MTLGPUFamilyApple6];
res->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
- res->use_bfloat = props_dev->has_bfloat;
+ if (@available(macOS 14.0, *)) {
+ res->use_bfloat = props_dev->has_bfloat;
+ } else {
+ res->use_bfloat = false;
+ }
+
res->use_fusion = getenv("GGML_METAL_FUSION_DISABLE") == nil;
res->use_concurrency = getenv("GGML_METAL_CONCURRENCY_DISABLE") == nil;
#if defined(GGML_METAL_USE_BF16)
- ctx->use_bfloat = ctx->has_bfloat;
+ if (@available(macOS 14.0, *)) {
+ ctx->use_bfloat = ctx->has_bfloat;
+ } else {
+ ctx->use_bfloat = false;
+ }
#else
ctx->use_bfloat = false;
#endif


@ -13,10 +13,10 @@ checks.
1 file changed, 18 insertions(+) 1 file changed, 18 insertions(+)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 57eae461..c7f9dc3a 100644 index ad389ece..e51c5035 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu --- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2671,12 +2671,24 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud @@ -2686,14 +2686,26 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
// Loop over nodes in GGML graph to obtain info needed for CUDA graph // Loop over nodes in GGML graph to obtain info needed for CUDA graph
cuda_ctx->cuda_graph->cpy_dest_ptrs.clear(); cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
@ -36,12 +36,14 @@ index 57eae461..c7f9dc3a 100644
const std::string ffn_moe_gate_bias_prefix = "ffn_moe_gate_biased"; const std::string ffn_moe_gate_bias_prefix = "ffn_moe_gate_biased";
const std::string ffn_moe_up_bias_prefix = "ffn_moe_up_biased"; const std::string ffn_moe_up_bias_prefix = "ffn_moe_up_biased";
const std::string ffn_moe_down_bias_prefix = "ffn_moe_down_biased"; const std::string ffn_moe_down_bias_prefix = "ffn_moe_down_biased";
const std::string nemotron_h_block_out_prefix = "nemotron_h_block_out";
const std::string mamba2_y_add_d_prefix = "mamba2_y_add_d";
+ +
for (int i = 0; i < cgraph->n_nodes; i++) { for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i]; ggml_tensor * node = cgraph->nodes[i];
@@ -2700,6 +2712,12 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud @@ -2717,6 +2729,12 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
if (node->op == GGML_OP_ADD && if (node->op == GGML_OP_ADD &&
node->src[1] && node->src[1]->ne[1] > 1 && node->src[1] && node->src[1]->ne[1] > 1 &&
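
The insertions in this file extend the node-name and operator checks that decide whether a graph can keep using CUDA graph capture; the string prefixes above mark fused-op outputs that are safe to accept. A small helper in the same spirit; node_name_has_prefix is illustrative, as the patch itself inlines these comparisons.

    #include <string>
    #include "ggml.h"

    // Does a node's name start with one of the known fused-op prefixes?
    static bool node_name_has_prefix(const ggml_tensor * node, const std::string & prefix) {
        return std::string(node->name).rfind(prefix, 0) == 0; // rfind(p, 0) == 0 <=> starts_with
    }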


@ -8,10 +8,10 @@ Subject: [PATCH] Disable ggml-blas on macos v13 and older
1 file changed, 5 insertions(+) 1 file changed, 5 insertions(+)
diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp
index aeac2e57..40738d5b 100644 index 5b888cdd..2a9ff7f6 100644
--- a/ggml/src/ggml-blas/ggml-blas.cpp --- a/ggml/src/ggml-blas/ggml-blas.cpp
+++ b/ggml/src/ggml-blas/ggml-blas.cpp +++ b/ggml/src/ggml-blas/ggml-blas.cpp
@@ -505,6 +505,11 @@ static const struct ggml_backend_reg_i ggml_backend_blas_reg_i = { @@ -506,6 +506,11 @@ static const struct ggml_backend_reg_i ggml_backend_blas_reg_i = {
}; };
ggml_backend_reg_t ggml_backend_blas_reg(void) { ggml_backend_reg_t ggml_backend_blas_reg(void) {
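
The five inserted lines (collapsed in this view) gate BLAS registration by OS version, in line with the macOS 14+ requirement stated at the top of the commit. One way such a guard can be written with clang's availability builtin; ggml_blas_supported_os is a hypothetical helper and an assumption about the mechanism, not necessarily the code the patch uses.

    // Returns true when the BLAS backend may be registered on this system.
    static bool ggml_blas_supported_os(void) {
    #if defined(__APPLE__)
        if (__builtin_available(macOS 14.0, *)) {
            return true;
        }
        return false;
    #else
        return true;
    #endif
    }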


@ -16,10 +16,10 @@ must be recreated with no-alloc set to false before loading data.
5 files changed, 310 insertions(+), 44 deletions(-) 5 files changed, 310 insertions(+), 44 deletions(-)
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 2773cc310..ae94887dd 100644 index 48777212..d4352663 100644
--- a/ggml/include/ggml-backend.h --- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h
@@ -291,6 +291,7 @@ extern "C" { @@ -303,6 +303,7 @@ extern "C" {
// Initialize a backend scheduler, backends with low index are given priority over backends with high index // Initialize a backend scheduler, backends with low index are given priority over backends with high index
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload); GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload);
@ -28,7 +28,7 @@ index 2773cc310..ae94887dd 100644
// Initialize backend buffers from a measure graph // Initialize backend buffers from a measure graph
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
index c36c12d65..369e9e25a 100644 index 07784d6f..869dc07d 100644
--- a/ggml/src/ggml-backend-impl.h --- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h +++ b/ggml/src/ggml-backend-impl.h
@@ -26,12 +26,17 @@ extern "C" { @@ -26,12 +26,17 @@ extern "C" {
@ -57,10 +57,10 @@ index c36c12d65..369e9e25a 100644
}; };
GGML_API ggml_backend_buffer_t ggml_backend_buffer_init( GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
@@ -114,6 +120,16 @@ extern "C" { @@ -117,6 +123,16 @@ extern "C" {
void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);
// wait for an event on on a different stream // (optional) sort/optimize the nodes in the graph
void (*event_wait) (ggml_backend_t backend, ggml_backend_event_t event); void (*graph_optimize) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+ +
+ // (optional) reserves intermediate buffers needed for the compution + // (optional) reserves intermediate buffers needed for the compution
+ // if alloc is true, memory is actually allocated, otherwise the required amount is just returned by buffer_size + // if alloc is true, memory is actually allocated, otherwise the required amount is just returned by buffer_size
@ -75,7 +75,7 @@ index c36c12d65..369e9e25a 100644
struct ggml_backend { struct ggml_backend {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index d02a40e60..6b4dee4c7 100644 index cb2b9956..6ef5eeaf 100644
--- a/ggml/src/ggml-backend.cpp --- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp
@@ -41,6 +41,19 @@ ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t @@ -41,6 +41,19 @@ ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t
@ -95,10 +95,10 @@ index d02a40e60..6b4dee4c7 100644
+ return buf; + return buf;
+ } + }
+ +
GGML_ASSERT(buft);
return buft->iface.alloc_buffer(buft, size); return buft->iface.alloc_buffer(buft, size);
} }
@@ -95,7 +108,8 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
@@ -89,7 +102,8 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
/* .buft = */ buft, /* .buft = */ buft,
/* .context = */ context, /* .context = */ context,
/* .size = */ size, /* .size = */ size,
@ -108,7 +108,7 @@ index d02a40e60..6b4dee4c7 100644
}; };
return buffer; return buffer;
@@ -119,6 +133,12 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) { @@ -127,6 +141,12 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
return NULL; return NULL;
} }
@ -121,7 +121,7 @@ index d02a40e60..6b4dee4c7 100644
void * base = buffer->iface.get_base(buffer); void * base = buffer->iface.get_base(buffer);
GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL"); GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
@@ -663,6 +683,12 @@ struct ggml_backend_sched { @@ -723,6 +743,12 @@ struct ggml_backend_sched {
bool op_offload; bool op_offload;
int debug; int debug;
@ -134,7 +134,7 @@ index d02a40e60..6b4dee4c7 100644
}; };
#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor) #define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
@@ -1449,6 +1475,17 @@ ggml_backend_sched_t ggml_backend_sched_new( @@ -1606,6 +1632,17 @@ ggml_backend_sched_t ggml_backend_sched_new(
size_t graph_size, size_t graph_size,
bool parallel, bool parallel,
bool op_offload) { bool op_offload) {
@ -152,7 +152,7 @@ index d02a40e60..6b4dee4c7 100644
GGML_ASSERT(n_backends > 0); GGML_ASSERT(n_backends > 0);
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS); GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU); GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
@@ -1490,10 +1527,13 @@ ggml_backend_sched_t ggml_backend_sched_new( @@ -1647,10 +1684,13 @@ ggml_backend_sched_t ggml_backend_sched_new(
sched->events[b][c] = ggml_backend_event_new(backends[b]->device); sched->events[b][c] = ggml_backend_event_new(backends[b]->device);
} }
} }
@ -166,7 +166,7 @@ index d02a40e60..6b4dee4c7 100644
ggml_backend_sched_reset(sched); ggml_backend_sched_reset(sched);
@@ -1508,6 +1548,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { @@ -1665,6 +1705,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
for (int c = 0; c < sched->n_copies; c++) { for (int c = 0; c < sched->n_copies; c++) {
ggml_backend_event_free(sched->events[b][c]); ggml_backend_event_free(sched->events[b][c]);
} }
@ -177,7 +177,7 @@ index d02a40e60..6b4dee4c7 100644
} }
ggml_gallocr_free(sched->galloc); ggml_gallocr_free(sched->galloc);
ggml_free(sched->ctx); ggml_free(sched->ctx);
@@ -1547,6 +1591,24 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * @@ -1708,6 +1752,24 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
return false; return false;
} }
@ -202,7 +202,7 @@ index d02a40e60..6b4dee4c7 100644
ggml_backend_sched_reset(sched); ggml_backend_sched_reset(sched);
return true; return true;
@@ -1635,7 +1697,13 @@ size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, @@ -1813,7 +1875,13 @@ size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched,
int backend_index = ggml_backend_sched_backend_id(sched, backend); int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
@ -218,7 +218,7 @@ index d02a40e60..6b4dee4c7 100644
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) { void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 2e5d48797..b915ee1b8 100644 index c4246b65..448badf0 100644
--- a/ggml/src/ggml-cuda/common.cuh --- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh
@@ -35,6 +35,31 @@ @@ -35,6 +35,31 @@
@ -253,7 +253,7 @@ index 2e5d48797..b915ee1b8 100644
#define STRINGIZE_IMPL(...) #__VA_ARGS__ #define STRINGIZE_IMPL(...) #__VA_ARGS__
#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__) #define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
@@ -771,6 +796,9 @@ struct ggml_cuda_pool { @@ -880,6 +905,9 @@ struct ggml_cuda_pool {
virtual void * alloc(size_t size, size_t * actual_size) = 0; virtual void * alloc(size_t size, size_t * actual_size) = 0;
virtual void free(void * ptr, size_t size) = 0; virtual void free(void * ptr, size_t size) = 0;
@ -263,7 +263,7 @@ index 2e5d48797..b915ee1b8 100644
}; };
template<typename T> template<typename T>
@@ -914,11 +942,11 @@ struct ggml_backend_cuda_context { @@ -1023,11 +1051,11 @@ struct ggml_backend_cuda_context {
// pool // pool
std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES]; std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES];
@ -277,7 +277,7 @@ index 2e5d48797..b915ee1b8 100644
} }
return *pools[device]; return *pools[device];
} }
@@ -926,4 +954,20 @@ struct ggml_backend_cuda_context { @@ -1035,4 +1063,20 @@ struct ggml_backend_cuda_context {
ggml_cuda_pool & pool() { ggml_cuda_pool & pool() {
return pool(device); return pool(device);
} }
@ -299,7 +299,7 @@ index 2e5d48797..b915ee1b8 100644
+ } + }
}; };
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index c7f9dc3a5..d5abe09e0 100644 index e51c5035..d324bc68 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu --- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -350,6 +350,8 @@ const ggml_cuda_device_info & ggml_cuda_info() { @@ -350,6 +350,8 @@ const ggml_cuda_device_info & ggml_cuda_info() {
@ -540,7 +540,7 @@ index c7f9dc3a5..d5abe09e0 100644
}; };
ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
@@ -2936,6 +2998,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, @@ -3008,6 +3070,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) { bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
@ -548,7 +548,7 @@ index c7f9dc3a5..d5abe09e0 100644
// flag used to determine whether it is an integrated_gpu // flag used to determine whether it is an integrated_gpu
const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated; const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;
@@ -2951,6 +3014,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx @@ -3023,6 +3086,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
continue; continue;
} }
@ -559,8 +559,8 @@ index c7f9dc3a5..d5abe09e0 100644
+ +
static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr); static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
if (!disable_fusion) { if (!disable_fusion) {
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL }, {})) {
@@ -3022,6 +3090,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx @@ -3149,6 +3217,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
@ -568,7 +568,7 @@ index c7f9dc3a5..d5abe09e0 100644
ggml_cuda_set_device(cuda_ctx->device); ggml_cuda_set_device(cuda_ctx->device);
@@ -3101,6 +3170,71 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, @@ -3228,6 +3297,71 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
return GGML_STATUS_SUCCESS; return GGML_STATUS_SUCCESS;
} }
@ -640,10 +640,10 @@ index c7f9dc3a5..d5abe09e0 100644
static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) { static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
@@ -3140,6 +3274,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = { @@ -3268,6 +3402,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
/* .graph_compute = */ ggml_backend_cuda_graph_compute,
/* .event_record = */ ggml_backend_cuda_event_record, /* .event_record = */ ggml_backend_cuda_event_record,
/* .event_wait = */ ggml_backend_cuda_event_wait, /* .event_wait = */ ggml_backend_cuda_event_wait,
/* .graph_optimize = */ NULL,
+ /* .graph_reserve = */ ggml_backend_cuda_graph_reserve, + /* .graph_reserve = */ ggml_backend_cuda_graph_reserve,
+ /* .buffer_size = */ ggml_backend_cuda_buffer_size, + /* .buffer_size = */ ggml_backend_cuda_buffer_size,
+ /* .reset = */ ggml_backend_cuda_reset, + /* .reset = */ ggml_backend_cuda_reset,

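Taken together, the graph_reserve, buffer_size and reset hooks let the CUDA backend do a dry run of graph allocation: size its pools and compute buffers without committing device memory, report what would be needed, and tear the state down again if the caller decides not to proceed. A sketch of the measuring-allocator idea behind it; dry_run_pool and its members are illustrative names, not the vendored ggml_cuda_pool implementation.

    #include <algorithm>
    #include <cstddef>

    // Dry-run allocator: never touches the device, only records the high-water mark,
    // so the backend can answer "how big would this buffer have to be?" up front.
    struct dry_run_pool {
        size_t used = 0;   // bytes currently "allocated"
        size_t peak = 0;   // high-water mark = required buffer size

        void * alloc(size_t size, size_t * actual_size) {
            used += size;
            peak  = std::max(peak, used);
            *actual_size = size;
            return nullptr;  // no real memory behind it, only sizing information
        }
        void release(size_t size)   { used -= size; }
        size_t required_size() const { return peak; }  // what buffer_size() would report
    };
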

@ -8,10 +8,10 @@ Subject: [PATCH] decode: disable output_all
1 file changed, 1 insertion(+), 2 deletions(-) 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 26a5cf9c..6ece5263 100644 index d8a8b5e6..09247cef 100644
--- a/src/llama-context.cpp --- a/src/llama-context.cpp
+++ b/src/llama-context.cpp +++ b/src/llama-context.cpp
@@ -962,8 +962,7 @@ int llama_context::decode(const llama_batch & batch_inp) { @@ -974,8 +974,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
const int64_t n_vocab = vocab.n_tokens(); const int64_t n_vocab = vocab.n_tokens();
const int64_t n_embd = hparams.n_embd; const int64_t n_embd = hparams.n_embd;


@ -15,10 +15,10 @@ unused then it can be reset to free these data structures.
5 files changed, 29 insertions(+), 2 deletions(-) 5 files changed, 29 insertions(+), 2 deletions(-)
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index b602a7c78..fda5ceb24 100644 index d4352663..0a2dae26 100644
--- a/ggml/include/ggml-backend.h --- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h
@@ -167,6 +167,7 @@ extern "C" { @@ -178,6 +178,7 @@ extern "C" {
GGML_API void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props); GGML_API void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props);
GGML_API ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device); GGML_API ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device);
GGML_API ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params); GGML_API ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params);
@ -27,10 +27,10 @@ index b602a7c78..fda5ceb24 100644
GGML_API ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device); GGML_API ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device);
GGML_API ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size); GGML_API ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size);
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
index 81749a5a3..6f10c353b 100644 index 869dc07d..4889df79 100644
--- a/ggml/src/ggml-backend-impl.h --- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h +++ b/ggml/src/ggml-backend-impl.h
@@ -178,6 +178,10 @@ extern "C" { @@ -195,6 +195,10 @@ extern "C" {
ggml_backend_event_t (*event_new) (ggml_backend_dev_t dev); ggml_backend_event_t (*event_new) (ggml_backend_dev_t dev);
void (*event_free) (ggml_backend_dev_t dev, ggml_backend_event_t event); void (*event_free) (ggml_backend_dev_t dev, ggml_backend_event_t event);
void (*event_synchronize) (ggml_backend_dev_t dev, ggml_backend_event_t event); void (*event_synchronize) (ggml_backend_dev_t dev, ggml_backend_event_t event);
@ -42,10 +42,10 @@ index 81749a5a3..6f10c353b 100644
struct ggml_backend_device { struct ggml_backend_device {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 05a842ed5..6556943b0 100644 index 6ef5eeaf..0b757af5 100644
--- a/ggml/src/ggml-backend.cpp --- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp
@@ -477,6 +477,14 @@ ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * par @@ -526,6 +526,14 @@ ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * par
return device->iface.init_backend(device, params); return device->iface.init_backend(device, params);
} }
@ -58,13 +58,13 @@ index 05a842ed5..6556943b0 100644
+} +}
+ +
ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) { ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) {
GGML_ASSERT(device);
return device->iface.get_buffer_type(device); return device->iface.get_buffer_type(device);
}
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index c7f9dc3a5..e43fde523 100644 index d324bc68..531d6e27 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu --- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -103,6 +103,11 @@ int ggml_cuda_get_device() { @@ -107,6 +107,11 @@ int ggml_cuda_get_device() {
return id; return id;
} }
@ -76,10 +76,10 @@ index c7f9dc3a5..e43fde523 100644
static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) { static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
ggml_cuda_set_device(device); ggml_cuda_set_device(device);
cudaError_t err; cudaError_t err;
@@ -3243,7 +3248,10 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back @@ -3512,7 +3517,10 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
props->description = ggml_backend_cuda_device_get_description(dev);
props->id = ggml_backend_cuda_device_get_id(dev); props->id = ggml_backend_cuda_device_get_id(dev);
props->type = ggml_backend_cuda_device_get_type(dev); props->type = ggml_backend_cuda_device_get_type(dev);
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
- ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total); - ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
+ +
+ // Memory reporting is disabled to avoid allocation of a CUDA primary context (~300 MB per device). + // Memory reporting is disabled to avoid allocation of a CUDA primary context (~300 MB per device).
@ -88,7 +88,7 @@ index c7f9dc3a5..e43fde523 100644
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr; bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
#ifdef GGML_CUDA_NO_PEER_COPY #ifdef GGML_CUDA_NO_PEER_COPY
@@ -3700,6 +3708,11 @@ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g @@ -3945,6 +3953,11 @@ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g
CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context)); CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
} }
@ -100,7 +100,7 @@ index c7f9dc3a5..e43fde523 100644
static const ggml_backend_device_i ggml_backend_cuda_device_interface = { static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
/* .get_name = */ ggml_backend_cuda_device_get_name, /* .get_name = */ ggml_backend_cuda_device_get_name,
/* .get_description = */ ggml_backend_cuda_device_get_description, /* .get_description = */ ggml_backend_cuda_device_get_description,
@@ -3716,6 +3729,7 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = { @@ -3961,6 +3974,7 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
/* .event_new = */ ggml_backend_cuda_device_event_new, /* .event_new = */ ggml_backend_cuda_device_event_new,
/* .event_free = */ ggml_backend_cuda_device_event_free, /* .event_free = */ ggml_backend_cuda_device_event_free,
/* .event_synchronize = */ ggml_backend_cuda_device_event_synchronize, /* .event_synchronize = */ ggml_backend_cuda_device_event_synchronize,
@ -108,7 +108,7 @@ index c7f9dc3a5..e43fde523 100644
}; };
// backend reg // backend reg
@@ -3835,7 +3849,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { @@ -4076,7 +4090,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
dev_ctx->device = i; dev_ctx->device = i;
dev_ctx->name = GGML_CUDA_NAME + std::to_string(i); dev_ctx->name = GGML_CUDA_NAME + std::to_string(i);
@ -117,10 +117,10 @@ index c7f9dc3a5..e43fde523 100644
CUDA_CHECK(cudaGetDeviceProperties(&prop, i)); CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
dev_ctx->description = prop.name; dev_ctx->description = prop.name;
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
index c31f31923..cf22e60d2 100644 index 37386afc..06f9e7c1 100644
--- a/ggml/src/ggml-cuda/vendors/hip.h --- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h +++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -40,6 +40,7 @@ @@ -41,6 +41,7 @@
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
#define cudaDeviceProp hipDeviceProp_t #define cudaDeviceProp hipDeviceProp_t
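
With the props query no longer touching memory, a client that actually needs VRAM numbers asks for them explicitly, and only for the device it is about to use, so idle GPUs never pay the roughly 300 MB primary-context cost. A sketch of that call pattern using the standard ggml-backend API; report_selected_device is a hypothetical caller.

    #include "ggml-backend.h"
    #include <stdio.h>

    // Enumerating devices and reading props stays cheap (memory fields are zero);
    // the explicit memory query below is the only call that may create a context.
    static void report_selected_device(size_t index) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(index);

        struct ggml_backend_dev_props props;
        ggml_backend_dev_get_props(dev, &props);

        size_t free_b = 0, total_b = 0;
        ggml_backend_dev_memory(dev, &free_b, &total_b);
        printf("%s (%s): %zu / %zu bytes free\n", props.name, props.description, free_b, total_b);
    }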


@ -6,25 +6,25 @@ Subject: [PATCH] GPU discovery enhancements
Expose more information about the devices through backend props, and leverage Expose more information about the devices through backend props, and leverage
management libraries for more accurate VRAM usage reporting if available. management libraries for more accurate VRAM usage reporting if available.
--- ---
ggml/include/ggml-backend.h | 9 + ggml/include/ggml-backend.h | 9 +
ggml/src/CMakeLists.txt | 2 + ggml/src/CMakeLists.txt | 2 +
ggml/src/ggml-cuda/ggml-cuda.cu | 75 +++++- ggml/src/ggml-cuda/ggml-cuda.cu | 72 +++++
ggml/src/ggml-cuda/vendors/hip.h | 1 + ggml/src/ggml-cuda/vendors/hip.h | 4 +
ggml/src/ggml-impl.h | 8 + ggml/src/ggml-impl.h | 8 +
ggml/src/ggml-metal/ggml-metal.m | 2 + ggml/src/ggml-metal/ggml-metal.cpp | 3 +-
ggml/src/mem_hip.cpp | 449 +++++++++++++++++++++++++++++++ ggml/src/mem_hip.cpp | 449 +++++++++++++++++++++++++++++
ggml/src/mem_nvml.cpp | 172 ++++++++++++ ggml/src/mem_nvml.cpp | 172 +++++++++++
8 files changed, 717 insertions(+), 1 deletion(-) 8 files changed, 718 insertions(+), 1 deletion(-)
create mode 100644 ggml/src/mem_hip.cpp create mode 100644 ggml/src/mem_hip.cpp
create mode 100644 ggml/src/mem_nvml.cpp create mode 100644 ggml/src/mem_nvml.cpp
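
The new mem_nvml.cpp and mem_hip.cpp helpers (their bodies are not reproduced in this excerpt) query the vendor management libraries, so free VRAM reflects what every process on the machine is using rather than what the local CUDA/HIP runtime thinks. A sketch of the NVML side of that idea; get_vram_by_uuid is an illustrative name, the sketch links NVML directly where the vendored code would be expected to load the library dynamically, and error handling is trimmed.

    #include <nvml.h>
    #include <cstddef>

    // Ask NVML (the library behind nvidia-smi) for device memory by UUID so the
    // numbers line up with what system management tools report.
    static int get_vram_by_uuid(const char * uuid, size_t * free_b, size_t * total_b) {
        if (nvmlInit_v2() != NVML_SUCCESS) {
            return -1;
        }
        nvmlDevice_t dev;
        nvmlMemory_t mem;
        int ret = -1;
        if (nvmlDeviceGetHandleByUUID(uuid, &dev) == NVML_SUCCESS &&
            nvmlDeviceGetMemoryInfo(dev, &mem) == NVML_SUCCESS) {
            *free_b  = (size_t) mem.free;
            *total_b = (size_t) mem.total;
            ret = 0;
        }
        nvmlShutdown();
        return ret;
    }
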
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index fda5ceb24..7c2d86703 100644 index 0a2dae26..a6bf3378 100644
--- a/ggml/include/ggml-backend.h --- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h
@@ -158,6 +158,15 @@ extern "C" { @@ -169,6 +169,15 @@ extern "C" {
size_t memory_total; const char * device_id;
enum ggml_backend_dev_type type; // device capabilities
struct ggml_backend_dev_caps caps; struct ggml_backend_dev_caps caps;
+ int driver_major; + int driver_major;
+ int driver_minor; + int driver_minor;
@ -39,10 +39,10 @@ index fda5ceb24..7c2d86703 100644
GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device); GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device);
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 5158acd6a..3a428a22d 100644 index 33b3a15f..86191ef2 100644
--- a/ggml/src/CMakeLists.txt --- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt
@@ -203,6 +203,8 @@ add_library(ggml-base @@ -206,6 +206,8 @@ add_library(ggml-base
ggml-threading.h ggml-threading.h
ggml-quants.c ggml-quants.c
ggml-quants.h ggml-quants.h
@ -52,10 +52,10 @@ index 5158acd6a..3a428a22d 100644
target_include_directories(ggml-base PRIVATE .) target_include_directories(ggml-base PRIVATE .)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index e43fde523..14baf0fb1 100644 index 531d6e27..3fa3a057 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu --- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -279,6 +279,16 @@ static ggml_cuda_device_info ggml_cuda_init() { @@ -261,6 +261,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
for (int id = 0; id < info.device_count; ++id) { for (int id = 0; id < info.device_count; ++id) {
int device_vmm = 0; int device_vmm = 0;
@ -72,7 +72,7 @@ index e43fde523..14baf0fb1 100644
#if defined(GGML_USE_VMM) #if defined(GGML_USE_VMM)
CUdevice device; CUdevice device;
CU_CHECK(cuDeviceGet(&device, id)); CU_CHECK(cuDeviceGet(&device, id));
@@ -332,9 +342,15 @@ static ggml_cuda_device_info ggml_cuda_init() { @@ -314,6 +324,11 @@ static ggml_cuda_device_info ggml_cuda_init() {
#else #else
info.devices[id].smpbo = prop.sharedMemPerBlockOptin; info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
info.devices[id].cc = 100*prop.major + 10*prop.minor; info.devices[id].cc = 100*prop.major + 10*prop.minor;
@ -84,33 +84,29 @@ index e43fde523..14baf0fb1 100644
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n", GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
ggml_cuda_parse_uuid(prop, id).c_str()); ggml_cuda_parse_uuid(prop, id).c_str());
+ @@ -3481,6 +3496,14 @@ struct ggml_backend_cuda_device_context {
#endif // defined(GGML_USE_HIP)
}
@@ -3215,6 +3231,14 @@ struct ggml_backend_cuda_device_context {
std::string name;
std::string description; std::string description;
std::string pci_bus_id;
std::string id; std::string id;
+ int major; + int major;
+ int minor; + int minor;
+ int driver_major; + int driver_major;
+ int driver_minor; + int driver_minor;
+ int integrated; + int integrated;
+ int pci_bus_id; + int pciBusID;
+ int pci_device_id; + int pciDeviceID;
+ int pci_domain_id; + int pciDomainID;
}; };
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) { static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -3235,6 +3259,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) { @@ -3501,6 +3524,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
ggml_cuda_set_device(ctx->device); ggml_cuda_set_device(ctx->device);
+ +
+#if defined(GGML_USE_HIP) +#if defined(GGML_USE_HIP)
+ if (ggml_hip_mgmt_init() == 0) { + if (ggml_hip_mgmt_init() == 0) {
+ int status = ggml_hip_get_device_memory(ctx->pci_bus_id, ctx->pci_device_id, free, total); + int status = ggml_hip_get_device_memory(ctx->pciBusID, ctx->pciDeviceID, free, total);
+ if (status == 0) { + if (status == 0) {
+ GGML_LOG_DEBUG("%s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, *free, *total); + GGML_LOG_DEBUG("%s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, *free, *total);
+ ggml_hip_mgmt_release(); + ggml_hip_mgmt_release();
@ -132,19 +128,18 @@ index e43fde523..14baf0fb1 100644
CUDA_CHECK(cudaMemGetInfo(free, total)); CUDA_CHECK(cudaMemGetInfo(free, total));
} }
@@ -3243,6 +3289,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend @@ -3509,6 +3554,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
return GGML_BACKEND_DEVICE_TYPE_GPU; return GGML_BACKEND_DEVICE_TYPE_GPU;
} }
+#define GGML_HIP_NAME "HIP" +#define GGML_HIP_NAME "HIP"
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) { static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
props->name = ggml_backend_cuda_device_get_name(dev); ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
props->description = ggml_backend_cuda_device_get_description(dev);
@@ -3253,6 +3300,23 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back @@ -3522,6 +3568,22 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
// If you need the memory data, call ggml_backend_dev_memory() explicitly. // If you need the memory data, call ggml_backend_dev_memory() explicitly.
props->memory_total = props->memory_free = 0; props->memory_total = props->memory_free = 0;
+ ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
+#if defined(GGML_USE_HIP) +#if defined(GGML_USE_HIP)
+ int cc = ggml_cuda_info().devices[ctx->device].cc - GGML_CUDA_CC_OFFSET_AMD; + int cc = ggml_cuda_info().devices[ctx->device].cc - GGML_CUDA_CC_OFFSET_AMD;
+ props->compute_major = cc / 0x100; + props->compute_major = cc / 0x100;
@ -156,15 +151,15 @@ index e43fde523..14baf0fb1 100644
+ props->driver_major = ctx->driver_major; + props->driver_major = ctx->driver_major;
+ props->driver_minor = ctx->driver_minor; + props->driver_minor = ctx->driver_minor;
+ props->integrated = ctx->integrated; + props->integrated = ctx->integrated;
+ props->pci_bus_id = ctx->pci_bus_id; + props->pci_bus_id = ctx->pciBusID;
+ props->pci_device_id = ctx->pci_device_id; + props->pci_device_id = ctx->pciDeviceID;
+ props->pci_domain_id = ctx->pci_domain_id; + props->pci_domain_id = ctx->pciDomainID;
+ props->library = GGML_CUDA_NAME; + props->library = GGML_CUDA_NAME;
+ +
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr; bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
#ifdef GGML_CUDA_NO_PEER_COPY #ifdef GGML_CUDA_NO_PEER_COPY
bool events = false; bool events = false;
@@ -3843,6 +3907,8 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { @@ -4084,6 +4146,8 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
std::lock_guard<std::mutex> lock(mutex); std::lock_guard<std::mutex> lock(mutex);
if (!initialized) { if (!initialized) {
ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context; ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
@ -173,27 +168,36 @@ index e43fde523..14baf0fb1 100644
for (int i = 0; i < ggml_cuda_info().device_count; i++) { for (int i = 0; i < ggml_cuda_info().device_count; i++) {
ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context; ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
@@ -3853,7 +3919,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { @@ -4099,6 +4163,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
CUDA_CHECK(cudaGetDeviceProperties(&prop, i)); snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
dev_ctx->description = prop.name; dev_ctx->pci_bus_id = pci_bus_id;
dev_ctx->id = ggml_cuda_parse_uuid(prop, i);
-
+ dev_ctx->major = prop.major; + dev_ctx->major = prop.major;
+ dev_ctx->minor = prop.minor; + dev_ctx->minor = prop.minor;
+ dev_ctx->driver_major = driverVersion / 1000; + dev_ctx->driver_major = driverVersion / 1000;
+ dev_ctx->driver_minor = (driverVersion - (dev_ctx->driver_major * 1000)) / 10; + dev_ctx->driver_minor = (driverVersion - (dev_ctx->driver_major * 1000)) / 10;
+ dev_ctx->integrated = prop.integrated; + dev_ctx->integrated = prop.integrated;
+ dev_ctx->pci_bus_id = prop.pciBusID; + dev_ctx->pciBusID = prop.pciBusID;
+ dev_ctx->pci_device_id = prop.pciDeviceID; + dev_ctx->pciDeviceID = prop.pciDeviceID;
+ dev_ctx->pci_domain_id = prop.pciDomainID; + dev_ctx->pciDomainID = prop.pciDomainID;
ggml_backend_dev_t dev = new ggml_backend_device { ggml_backend_dev_t dev = new ggml_backend_device {
/* .iface = */ ggml_backend_cuda_device_interface, /* .iface = */ ggml_backend_cuda_device_interface,
/* .reg = */ &reg, /* .reg = */ &reg,
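
An aside on the driver version split added above: cudaDriverGetVersion() (the presumed source of driverVersion here) reports the version as a single integer encoded as 1000*major + 10*minor, which the device context then decomposes. A minimal, self-contained sketch of that arithmetic, using an assumed example value:

// Illustration only; mirrors the driver_major/driver_minor computation in the hunk above.
#include <cstdio>

int main() {
    int driverVersion = 12040;                                       // assumed example: CUDA 12.4
    int driver_major  = driverVersion / 1000;                        // 12
    int driver_minor  = (driverVersion - driver_major * 1000) / 10;  // 4
    printf("driver %d.%d\n", driver_major, driver_minor);
    return 0;
}
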
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
index cf22e60d2..957a795f2 100644 index 06f9e7c1..eb8f66cb 100644
--- a/ggml/src/ggml-cuda/vendors/hip.h --- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h +++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -42,6 +42,7 @@ @@ -5,6 +5,9 @@
#include <hipblas/hipblas.h>
#include <hip/hip_fp16.h>
#include <hip/hip_bf16.h>
+// for rocblas_initialize()
+#include "rocblas/rocblas.h"
+
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
@@ -43,6 +46,7 @@
#define cudaDeviceProp hipDeviceProp_t #define cudaDeviceProp hipDeviceProp_t
#define cudaDeviceReset hipDeviceReset #define cudaDeviceReset hipDeviceReset
#define cudaDeviceSynchronize hipDeviceSynchronize #define cudaDeviceSynchronize hipDeviceSynchronize
@ -202,11 +206,11 @@ index cf22e60d2..957a795f2 100644
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled #define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled #define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index 19a7adb2d..b9b102a5e 100644 index 86a1ebf6..9fc9fbfc 100644
--- a/ggml/src/ggml-impl.h --- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h
@@ -602,6 +602,14 @@ static inline bool ggml_can_fuse(const struct ggml_cgraph * cgraph, int node_idx @@ -635,6 +635,14 @@ static inline bool ggml_can_fuse(const struct ggml_cgraph * cgraph, int node_idx
return true; return ggml_can_fuse_ext(cgraph, idxs, ops, num_ops);
} }
+// Management libraries for fetching more accurate free VRAM data +// Management libraries for fetching more accurate free VRAM data
@ -220,28 +224,30 @@ index 19a7adb2d..b9b102a5e 100644
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
index e4c31268f..ec6b385ba 100644 index 08ab4fc9..17999a61 100644
--- a/ggml/src/ggml-metal/ggml-metal.m --- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -6523,12 +6523,14 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen @@ -535,6 +535,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
GGML_UNUSED(dev); GGML_UNUSED(dev);
} }
+#define GGML_METAL_NAME "Metal" +#define GGML_METAL_NAME "Metal"
static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
props->name = ggml_backend_metal_device_get_name(dev); props->name = ggml_backend_metal_device_get_name(dev);
props->description = ggml_backend_metal_device_get_description(dev); props->description = ggml_backend_metal_device_get_description(dev);
props->id = "0"; @@ -542,7 +543,7 @@ static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, ggml_bac
props->type = ggml_backend_metal_device_get_type(dev); props->type = ggml_backend_metal_device_get_type(dev);
ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total); ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
-
+ props->library = GGML_METAL_NAME; + props->library = GGML_METAL_NAME;
props->caps = (struct ggml_backend_dev_caps) { props->caps = {
/* .async = */ false, /* .async = */ true,
/* .host_buffer = */ false, /* .host_buffer = */ false,
diff --git a/ggml/src/mem_hip.cpp b/ggml/src/mem_hip.cpp diff --git a/ggml/src/mem_hip.cpp b/ggml/src/mem_hip.cpp
new file mode 100644 new file mode 100644
index 000000000..8ef19b8cf index 00000000..8ef19b8c
--- /dev/null --- /dev/null
+++ b/ggml/src/mem_hip.cpp +++ b/ggml/src/mem_hip.cpp
@@ -0,0 +1,449 @@ @@ -0,0 +1,449 @@
@ -697,7 +703,7 @@ index 000000000..8ef19b8cf
\ No newline at end of file \ No newline at end of file
diff --git a/ggml/src/mem_nvml.cpp b/ggml/src/mem_nvml.cpp diff --git a/ggml/src/mem_nvml.cpp b/ggml/src/mem_nvml.cpp
new file mode 100644 new file mode 100644
index 000000000..aa05e9dc1 index 00000000..aa05e9dc
--- /dev/null --- /dev/null
+++ b/ggml/src/mem_nvml.cpp +++ b/ggml/src/mem_nvml.cpp
@@ -0,0 +1,172 @@ @@ -0,0 +1,172 @@

@ -1,57 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@ollama.com>
Date: Tue, 23 Sep 2025 15:41:58 -0700
Subject: [PATCH] ggml: Backport scale kernel fixes
The GGML scale kernel uses signed 32-bit ints to represent
the number of elements in the tensor. For large images,
mistral-small3.2 overflows this, triggering CUDA errors due
to negative arguments.
Currently, this can happen when the user passes a large image
to mistral-small3.2. However, with upcoming changes to reserve
CUDA memory, it happens every time mistral-small is loaded as
we reserve using a worst case batch.
This patch is part of an upstream GGML commit and should be removed
after GGML is updated past 0a1b398 "ggml: add ops for WAN video model
(cuda && cpu) (#15669)".
Fixes #10388
---
ggml/src/ggml-cuda/scale.cu | 19 ++++++++++---------
1 file changed, 10 insertions(+), 9 deletions(-)
diff --git a/ggml/src/ggml-cuda/scale.cu b/ggml/src/ggml-cuda/scale.cu
index 2ee9e5889..0ddeff6a1 100644
--- a/ggml/src/ggml-cuda/scale.cu
+++ b/ggml/src/ggml-cuda/scale.cu
@@ -1,18 +1,19 @@
#include "scale.cuh"
-static __global__ void scale_f32(const float * x, float * dst, const float scale, const float bias, const int k) {
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
+#define MAX_GRIDDIM_X 0x7FFFFFFF
- if (i >= k) {
- return;
- }
+static __global__ void scale_f32(const float * x, float * dst, const float scale, const float bias, const int64_t nelements) {
+ int64_t tid = (int64_t)blockIdx.x * (int64_t)blockDim.x + (int64_t)threadIdx.x;
+ int64_t stride = (int64_t)blockDim.x * (int64_t)gridDim.x;
- dst[i] = scale * x[i] + bias;
+ for (int64_t i = tid; i < nelements; i += stride) {
+ dst[i] = scale * x[i] + bias;
+ }
}
-static void scale_f32_cuda(const float * x, float * dst, const float scale, const float bias, const int k, cudaStream_t stream) {
- const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
- scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, bias, k);
+static void scale_f32_cuda(const float * x, float * dst, const float scale, const float bias, const int64_t nelements, cudaStream_t stream) {
+ const int64_t num_blocks = (nelements + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
+ scale_f32<<<MIN(MAX_GRIDDIM_X, num_blocks), CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, bias, nelements);
}
void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
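
To make the overflow described in the patch message above concrete, here is a small standalone sketch (the element count is an assumed worst-case figure, not taken from the model) of what happens when a 64-bit element count is truncated to the old int parameter:

// Illustration of the signed 32-bit overflow the removed patch worked around; not part of any patch.
#include <climits>
#include <cstdint>
#include <cstdio>

int main() {
    int64_t nelements = 2500000000LL;               // assumed: ~2.5e9 elements in one tensor
    printf("INT_MAX  = %d\n", INT_MAX);             // 2147483647
    printf("as (int) = %d\n", (int) nelements);     // wraps negative on typical targets
    // A negative count is what produced the CUDA errors; the upstream fix keeps the
    // count in int64_t and iterates with a grid-stride loop, so this patch could be dropped.
    return 0;
}
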

@ -132,6 +132,8 @@ extern "C" {
GGML_BACKEND_DEVICE_TYPE_CPU, GGML_BACKEND_DEVICE_TYPE_CPU,
// GPU device using dedicated memory // GPU device using dedicated memory
GGML_BACKEND_DEVICE_TYPE_GPU, GGML_BACKEND_DEVICE_TYPE_GPU,
// integrated GPU device using host memory
GGML_BACKEND_DEVICE_TYPE_IGPU,
// accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX) // accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
GGML_BACKEND_DEVICE_TYPE_ACCEL GGML_BACKEND_DEVICE_TYPE_ACCEL
}; };
@ -150,12 +152,22 @@ extern "C" {
// all the device properties // all the device properties
struct ggml_backend_dev_props { struct ggml_backend_dev_props {
// device name
const char * name; const char * name;
// device description
const char * description; const char * description;
const char * id; // device free memory in bytes
size_t memory_free; size_t memory_free;
const char * id;
// device total memory in bytes
size_t memory_total; size_t memory_total;
// device type
enum ggml_backend_dev_type type; enum ggml_backend_dev_type type;
// device id
// for PCI devices, this should be the PCI bus id formatted as "domain:bus:device.function" (e.g. "0000:01:00.0")
// if the id is unknown, this should be NULL
const char * device_id;
// device capabilities
struct ggml_backend_dev_caps caps; struct ggml_backend_dev_caps caps;
int driver_major; int driver_major;
int driver_minor; int driver_minor;
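
As a small illustration of the device_id convention documented in this header (example values assumed), the "domain:bus:device.function" string can be produced with the same format the CUDA backend uses elsewhere in this change:

// Sketch only: formats a PCI id such as "0000:01:00.0" from its numeric parts.
#include <cstdio>

int main() {
    int pciDomainID = 0x0000, pciBusID = 0x01, pciDeviceID = 0x00;  // assumed example values
    char device_id[16];
    snprintf(device_id, sizeof(device_id), "%04x:%02x:%02x.0", pciDomainID, pciBusID, pciDeviceID);
    printf("%s\n", device_id);  // prints 0000:01:00.0
    return 0;
}
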
@ -314,12 +326,16 @@ extern "C" {
GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched); GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched); GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend); GGML_API ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend);
GGML_API size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend); GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
GGML_API size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend); GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node); GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
// Split graph without allocating it
GGML_API void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
// Allocate and compute graph on the backend scheduler // Allocate and compute graph on the backend scheduler
GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); // returns success GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); // returns success
GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph); GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);

@ -101,7 +101,6 @@ extern "C" {
GGML_BACKEND_API int ggml_cpu_has_riscv_v (void); GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
GGML_BACKEND_API int ggml_cpu_has_vsx (void); GGML_BACKEND_API int ggml_cpu_has_vsx (void);
GGML_BACKEND_API int ggml_cpu_has_vxe (void); GGML_BACKEND_API int ggml_cpu_has_vxe (void);
GGML_BACKEND_API int ggml_cpu_has_nnpa (void);
GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void); GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
GGML_BACKEND_API int ggml_cpu_has_llamafile (void); GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
@ -135,6 +134,7 @@ extern "C" {
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void); GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
GGML_BACKEND_API void ggml_cpu_fp32_to_fp32(const float *, float *, int64_t); GGML_BACKEND_API void ggml_cpu_fp32_to_fp32(const float *, float *, int64_t);
GGML_BACKEND_API void ggml_cpu_fp32_to_i32 (const float *, int32_t *, int64_t);
GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t); GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t); GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t); GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);

@ -39,18 +39,13 @@ extern "C" {
// user-code should use only these functions // user-code should use only these functions
// //
// TODO: remove in the future
GGML_BACKEND_API ggml_backend_t ggml_backend_metal_init(void); GGML_BACKEND_API ggml_backend_t ggml_backend_metal_init(void);
GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend); GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend);
GGML_DEPRECATED(
GGML_BACKEND_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size),
"obsoleted by the new device interface - https://github.com/ggml-org/llama.cpp/pull/9713");
GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data); GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
// helper to check if the device supports a specific family // helper to check if the device supports a specific family
// ideally, the user code should be doing these checks // ideally, the user code should be doing these checks
// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf // ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf

@ -74,16 +74,26 @@ extern "C" {
GGML_OPT_BUILD_TYPE_OPT = 30, GGML_OPT_BUILD_TYPE_OPT = 30,
}; };
enum ggml_opt_optimizer_type {
GGML_OPT_OPTIMIZER_TYPE_ADAMW,
GGML_OPT_OPTIMIZER_TYPE_SGD,
GGML_OPT_OPTIMIZER_TYPE_COUNT
};
// parameters that control which optimizer is used and how said optimizer tries to find the minimal loss // parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
struct ggml_opt_optimizer_params { struct ggml_opt_optimizer_params {
// AdamW optimizer parameters
struct { struct {
float alpha; // learning rate float alpha; // learning rate
float beta1; float beta1; // first AdamW momentum
float beta2; float beta2; // second AdamW momentum
float eps; // epsilon for numerical stability float eps; // epsilon for numerical stability
float wd; // weight decay for AdamW, use 0.0f to disable float wd; // weight decay - 0.0f to disable
} adamw; } adamw;
struct {
float alpha; // learning rate
float wd; // weight decay
} sgd;
}; };
// callback to calculate optimizer parameters prior to a backward pass // callback to calculate optimizer parameters prior to a backward pass
@ -112,8 +122,11 @@ extern "C" {
int32_t opt_period; // after how many gradient accumulation steps an optimizer step should be done int32_t opt_period; // after how many gradient accumulation steps an optimizer step should be done
ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
void * get_opt_pars_ud; // userdata for calculating optimizer parameters void * get_opt_pars_ud; // userdata for calculating optimizer parameters
// only GGML_OPT_OPTIMIZER_TYPE_ADAMW needs m, v momenta per parameter tensor
enum ggml_opt_optimizer_type optimizer;
}; };
// get parameters for an optimization context with defaults set where possible // get parameters for an optimization context with defaults set where possible
@ -142,6 +155,10 @@ extern "C" {
// get the gradient accumulator for a node from the forward graph // get the gradient accumulator for a node from the forward graph
GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node); GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node);
GGML_API enum ggml_opt_optimizer_type ggml_opt_context_optimizer_type(ggml_opt_context_t); //TODO consistent naming scheme
GGML_API const char * ggml_opt_optimizer_name(enum ggml_opt_optimizer_type);
// ====== Optimization Result ====== // ====== Optimization Result ======
GGML_API ggml_opt_result_t ggml_opt_result_init(void); GGML_API ggml_opt_result_t ggml_opt_result_init(void);
@ -226,12 +243,14 @@ extern "C" {
struct ggml_tensor * outputs, // output tensor, must have shape [ne_label, ndata_batch] if labels are used struct ggml_tensor * outputs, // output tensor, must have shape [ne_label, ndata_batch] if labels are used
ggml_opt_dataset_t dataset, // dataset with data and optionally also labels ggml_opt_dataset_t dataset, // dataset with data and optionally also labels
enum ggml_opt_loss_type loss_type, // loss to minimize enum ggml_opt_loss_type loss_type, // loss to minimize
enum ggml_opt_optimizer_type optimizer, // sgd or adamw
ggml_opt_get_optimizer_params get_opt_pars, // callback to get optimizer params, userdata is pointer to epoch (of type int64_t) ggml_opt_get_optimizer_params get_opt_pars, // callback to get optimizer params, userdata is pointer to epoch (of type int64_t)
int64_t nepoch, // how many times the dataset should be iterated over int64_t nepoch, // how many times the dataset should be iterated over
int64_t nbatch_logical, // datapoints optimizer step, must be a multiple of ndata_batch in inputs/outputs int64_t nbatch_logical, // datapoints optimizer step, must be a multiple of ndata_batch in inputs/outputs
float val_split, // fraction of the dataset to use for validation, must be in [0.0f, 1.0f) float val_split, // fraction of the dataset to use for validation, must be in [0.0f, 1.0f)
bool silent); // whether or not info prints to stderr should be suppressed bool silent); // whether or not info prints to stderr should be suppressed
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif

@ -0,0 +1,17 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
// device buffer
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_zdnn_buffer_type(void);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_zdnn_reg(void);
#ifdef __cplusplus
}
#endif

@ -241,7 +241,16 @@
#define GGML_ROPE_TYPE_MROPE 8 #define GGML_ROPE_TYPE_MROPE 8
#define GGML_ROPE_TYPE_VISION 24 #define GGML_ROPE_TYPE_VISION 24
#define GGML_MROPE_SECTIONS 4
#define GGML_UNUSED(x) (void)(x) #define GGML_UNUSED(x) (void)(x)
#ifdef __CUDACC__
template<typename... Args>
__host__ __device__ constexpr inline void ggml_unused_vars_impl(Args&&...) noexcept {}
#define GGML_UNUSED_VARS(...) ggml_unused_vars_impl(__VA_ARGS__)
#else
#define GGML_UNUSED_VARS(...) do { (void)sizeof((__VA_ARGS__, 0)); } while(0)
#endif // __CUDACC__
#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1)) #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
@ -275,19 +284,19 @@
// GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); // GGML_TENSOR_LOCALS(size_t, nb1, src1, nb);
// //
#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \ #define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
const type prefix##0 = (pointer)->array[0]; \ const type prefix##0 = (pointer) ? (pointer)->array[0] : 0; \
GGML_UNUSED(prefix##0); GGML_UNUSED(prefix##0);
#define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \ #define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \
GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \ GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \
const type prefix##1 = (pointer)->array[1]; \ const type prefix##1 = (pointer) ? (pointer)->array[1] : 0; \
GGML_UNUSED(prefix##1); GGML_UNUSED(prefix##1);
#define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \ #define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \ GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \
const type prefix##2 = (pointer)->array[2]; \ const type prefix##2 = (pointer) ? (pointer)->array[2] : 0; \
GGML_UNUSED(prefix##2); GGML_UNUSED(prefix##2);
#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \ #define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \ GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \
const type prefix##3 = (pointer)->array[3]; \ const type prefix##3 = (pointer) ? (pointer)->array[3] : 0; \
GGML_UNUSED(prefix##3); GGML_UNUSED(prefix##3);
#define GGML_TENSOR_UNARY_OP_LOCALS \ #define GGML_TENSOR_UNARY_OP_LOCALS \
@ -502,7 +511,9 @@ extern "C" {
GGML_OP_CONV_TRANSPOSE_1D, GGML_OP_CONV_TRANSPOSE_1D,
GGML_OP_IM2COL, GGML_OP_IM2COL,
GGML_OP_IM2COL_BACK, GGML_OP_IM2COL_BACK,
GGML_OP_IM2COL_3D,
GGML_OP_CONV_2D, GGML_OP_CONV_2D,
GGML_OP_CONV_3D,
GGML_OP_CONV_2D_DW, GGML_OP_CONV_2D_DW,
GGML_OP_CONV_TRANSPOSE_2D, GGML_OP_CONV_TRANSPOSE_2D,
GGML_OP_POOL_1D, GGML_OP_POOL_1D,
@ -540,6 +551,7 @@ extern "C" {
GGML_OP_CROSS_ENTROPY_LOSS, GGML_OP_CROSS_ENTROPY_LOSS,
GGML_OP_CROSS_ENTROPY_LOSS_BACK, GGML_OP_CROSS_ENTROPY_LOSS_BACK,
GGML_OP_OPT_STEP_ADAMW, GGML_OP_OPT_STEP_ADAMW,
GGML_OP_OPT_STEP_SGD,
GGML_OP_GLU, GGML_OP_GLU,
@ -1392,6 +1404,7 @@ extern "C" {
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b); struct ggml_tensor * b);
// note: casting from f32 to i32 will discard the fractional part
GGML_API struct ggml_tensor * ggml_cast( GGML_API struct ggml_tensor * ggml_cast(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
@ -1516,7 +1529,11 @@ extern "C" {
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
// supports 3D: a->ne[2] == b->ne[1] // supports 4D a:
// a [n_embd, ne1, ne2, ne3]
// b I32 [n_rows, ne2, ne3, 1]
//
// return [n_embd, n_rows, ne2, ne3]
GGML_API struct ggml_tensor * ggml_get_rows( GGML_API struct ggml_tensor * ggml_get_rows(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, // data struct ggml_tensor * a, // data
@ -1660,7 +1677,7 @@ extern "C" {
struct ggml_tensor * b, struct ggml_tensor * b,
struct ggml_tensor * c, struct ggml_tensor * c,
int n_dims, int n_dims,
int sections[4], int sections[GGML_MROPE_SECTIONS],
int mode, int mode,
int n_ctx_orig, int n_ctx_orig,
float freq_base, float freq_base,
@ -1686,6 +1703,22 @@ extern "C" {
float beta_fast, float beta_fast,
float beta_slow); float beta_slow);
GGML_API struct ggml_tensor * ggml_rope_multi_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c,
int n_dims,
int sections[GGML_MROPE_SECTIONS],
int mode,
int n_ctx_orig,
float freq_base,
float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow);
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom( GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
@ -1843,6 +1876,41 @@ extern "C" {
int d0, // dilation dimension 0 int d0, // dilation dimension 0
int d1); // dilation dimension 1 int d1); // dilation dimension 1
GGML_API struct ggml_tensor * ggml_im2col_3d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int64_t IC,
int s0, // stride width
int s1, // stride height
int s2, // stride depth
int p0, // padding width
int p1, // padding height
int p2, // padding depth
int d0, // dilation width
int d1, // dilation height
int d2, // dilation depth
enum ggml_type dst_type);
// a: [OC*IC, KD, KH, KW]
// b: [N*IC, ID, IH, IW]
// result: [N*OC, OD, OH, OW]
GGML_API struct ggml_tensor * ggml_conv_3d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int64_t IC,
int s0, // stride width
int s1, // stride height
int s2, // stride depth
int p0, // padding width
int p1, // padding height
int p2, // padding depth
int d0, // dilation width
int d1, // dilation height
int d2 // dilation depth
);
// kernel size is a->ne[0] x a->ne[1] // kernel size is a->ne[0] x a->ne[1]
// stride is equal to kernel size // stride is equal to kernel size
// padding is zero // padding is zero
@ -1914,6 +1982,23 @@ extern "C" {
int d0, // dilation dimension 0 int d0, // dilation dimension 0
int d1); // dilation dimension 1 int d1); // dilation dimension 1
GGML_API struct ggml_tensor * ggml_conv_3d_direct(
struct ggml_context * ctx,
struct ggml_tensor * a, // kernel [KW, KH, KD, IC * OC]
struct ggml_tensor * b, // input [W, H, D, C * N]
int s0, // stride
int s1,
int s2,
int p0, // padding
int p1,
int p2,
int d0, // dilation
int d1,
int d2,
int n_channels,
int n_batch,
int n_channels_out);
enum ggml_op_pool { enum ggml_op_pool {
GGML_OP_POOL_MAX, GGML_OP_POOL_MAX,
GGML_OP_POOL_AVG, GGML_OP_POOL_AVG,
@ -2004,6 +2089,19 @@ extern "C" {
int p2, int p2,
int p3); int p3);
GGML_API struct ggml_tensor * ggml_pad_ext(
struct ggml_context * ctx,
struct ggml_tensor * a,
int lp0,
int rp0,
int lp1,
int rp1,
int lp2,
int rp2,
int lp3,
int rp3
);
// pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c] // pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
GGML_API struct ggml_tensor * ggml_pad_reflect_1d( GGML_API struct ggml_tensor * ggml_pad_reflect_1d(
struct ggml_context * ctx, struct ggml_context * ctx,
@ -2293,7 +2391,14 @@ extern "C" {
struct ggml_tensor * grad, struct ggml_tensor * grad,
struct ggml_tensor * m, struct ggml_tensor * m,
struct ggml_tensor * v, struct ggml_tensor * v,
struct ggml_tensor * adamw_params); // parameters such a the learning rate struct ggml_tensor * adamw_params); // parameters such as the learning rate
// stochastic gradient descent step (with weight decay)
GGML_API struct ggml_tensor * ggml_opt_step_sgd(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * grad,
struct ggml_tensor * sgd_params); // alpha, weight decay
// //
// automatic differentiation // automatic differentiation

@ -114,6 +114,9 @@ message(STATUS "GGML_SYSTEM_ARCH: ${GGML_SYSTEM_ARCH}")
if (NOT MSVC) if (NOT MSVC)
if (GGML_STATIC) if (GGML_STATIC)
if (UNIX AND NOT APPLE)
set(CMAKE_FIND_LIBRARY_SUFFIXES ".a;.so")
endif()
add_link_options(-static) add_link_options(-static)
if (MINGW) if (MINGW)
add_link_options(-static-libgcc -static-libstdc++) add_link_options(-static-libgcc -static-libstdc++)
@ -382,6 +385,7 @@ ggml_add_backend(RPC)
ggml_add_backend(SYCL) ggml_add_backend(SYCL)
ggml_add_backend(Vulkan) ggml_add_backend(Vulkan)
ggml_add_backend(WebGPU) ggml_add_backend(WebGPU)
ggml_add_backend(zDNN)
ggml_add_backend(OpenCL) ggml_add_backend(OpenCL)
foreach (target ggml-base ggml) foreach (target ggml-base ggml)

@ -23,7 +23,7 @@ static bool ggml_is_view(const struct ggml_tensor * t) {
} }
// ops that return true for this function must not use restrict pointers for their backend implementations // ops that return true for this function must not use restrict pointers for their backend implementations
static bool ggml_op_can_inplace(enum ggml_op op) { bool ggml_op_can_inplace(enum ggml_op op) {
switch (op) { switch (op) {
case GGML_OP_SCALE: case GGML_OP_SCALE:
case GGML_OP_DIAG_MASK_ZERO: case GGML_OP_DIAG_MASK_ZERO:
@ -95,39 +95,104 @@ enum ggml_status ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_te
// dynamic tensor allocator // dynamic tensor allocator
#define GGML_VBUFFER_MAX_CHUNKS 16
// relative memory address within an allocation that can be split into multiple buffers (chunks)
struct buffer_address {
int chunk; // index of a backend buffer
size_t offset; // local memory offset within the buffer
};
static const struct buffer_address GGML_BUFFER_ADDRESS_INVALID = { -1, SIZE_MAX };
static bool ggml_buffer_address_less(struct buffer_address a, struct buffer_address b) {
return a.chunk != b.chunk ? a.chunk < b.chunk : a.offset < b.offset;
}
struct free_block { struct free_block {
size_t offset; size_t offset;
size_t size; size_t size;
}; };
struct tallocr_chunk {
struct free_block free_blocks[MAX_FREE_BLOCKS];
int n_free_blocks;
size_t max_size;
};
struct ggml_dyn_tallocr { struct ggml_dyn_tallocr {
size_t alignment; size_t alignment;
int n_free_blocks; size_t max_chunk_size;
struct free_block free_blocks[MAX_FREE_BLOCKS]; struct tallocr_chunk * chunks[GGML_VBUFFER_MAX_CHUNKS];
size_t max_size; int n_chunks;
#ifdef GGML_ALLOCATOR_DEBUG #ifdef GGML_ALLOCATOR_DEBUG
struct { struct {
const struct ggml_tensor * tensor; const struct ggml_tensor * tensor;
size_t offset; struct buffer_address addr;
} allocated_tensors[1024]; } allocated_tensors[1024];
#endif #endif
}; };
static void ggml_dyn_tallocr_insert_block(struct tallocr_chunk * chunk, size_t offset, size_t size) {
GGML_ASSERT(chunk->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
// insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
int insert_pos = 0;
while (insert_pos < chunk->n_free_blocks && chunk->free_blocks[insert_pos].offset < offset) {
insert_pos++;
}
// shift all blocks from insert_pos onward to make room for the new block
for (int i = chunk->n_free_blocks; i > insert_pos; i--) {
chunk->free_blocks[i] = chunk->free_blocks[i-1];
}
// insert the new block
chunk->free_blocks[insert_pos].offset = offset;
chunk->free_blocks[insert_pos].size = size;
chunk->n_free_blocks++;
}
static void ggml_dyn_tallocr_remove_block(struct tallocr_chunk * chunk, int idx) {
// shift all elements after idx by 1 to the left, overwriting the element at idx
for (int i = idx; i < chunk->n_free_blocks; i++) {
chunk->free_blocks[i] = chunk->free_blocks[i+1];
}
chunk->n_free_blocks--;
}
static int ggml_dyn_tallocr_new_chunk(struct ggml_dyn_tallocr * alloc, size_t min_size) {
if (alloc->n_chunks >= GGML_VBUFFER_MAX_CHUNKS) {
return -1;
}
struct tallocr_chunk * chunk = calloc(1, sizeof(struct tallocr_chunk));
chunk->n_free_blocks = 1;
chunk->free_blocks[0].offset = 0;
// available space in a chunk is limited to max_chunk_size, but can be higher if:
// 1. a single tensor exceeds the maximum, and cannot fit any other way
// 2. we are running out of chunks
// backends will either manage to allocate the larger size, or report an error.
chunk->free_blocks[0].size = MAX(min_size, alloc->max_chunk_size);
if (alloc->n_chunks == GGML_VBUFFER_MAX_CHUNKS - 1) {
chunk->free_blocks[0].size = SIZE_MAX/2;
}
alloc->chunks[alloc->n_chunks] = chunk;
alloc->n_chunks++;
return alloc->n_chunks - 1;
}
#ifdef GGML_ALLOCATOR_DEBUG #ifdef GGML_ALLOCATOR_DEBUG
static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) { static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, const struct ggml_tensor * tensor) {
for (int i = 0; i < 1024; i++) { for (int i = 0; i < 1024; i++) {
if (alloc->allocated_tensors[i].tensor == NULL) { if (alloc->allocated_tensors[i].tensor == NULL) {
alloc->allocated_tensors[i].tensor = tensor; alloc->allocated_tensors[i].tensor = tensor;
alloc->allocated_tensors[i].offset = offset; alloc->allocated_tensors[i].addr = addr;
return; return;
} }
} }
GGML_ABORT("out of allocated_tensors"); GGML_ABORT("out of allocated_tensors");
} }
static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) { static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, const struct ggml_tensor * tensor) {
for (int i = 0; i < 1024; i++) { for (int i = 0; i < 1024; i++) {
if (alloc->allocated_tensors[i].offset == offset) { if (alloc->allocated_tensors[i].addr.chunk == addr.chunk && alloc->allocated_tensors[i].addr.offset == addr.offset) {
alloc->allocated_tensors[i].tensor = NULL; alloc->allocated_tensors[i].tensor = NULL;
return; return;
} }
@ -136,76 +201,94 @@ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offs
} }
#endif #endif
static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) { static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
size = aligned_offset(NULL, size, alloc->alignment); size = aligned_offset(NULL, size, alloc->alignment);
AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size); AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
int best_fit_chunk = -1;
int best_fit_block = -1;
size_t max_avail = 0; size_t max_avail = 0;
// find the best fitting free block besides the last block // find the best fitting free block besides the last block, within any chunk
int best_fit_block = -1; for (int c = 0; c < alloc->n_chunks; ++c) {
size_t best_fit_size = SIZE_MAX; struct tallocr_chunk * chunk = alloc->chunks[c];
for (int i = 0; i < alloc->n_free_blocks - 1; i++) { size_t best_fit_size = SIZE_MAX;
struct free_block * block = &alloc->free_blocks[i]; for (int i = 0; i < chunk->n_free_blocks - 1; i++) {
max_avail = MAX(max_avail, block->size); struct free_block * block = &chunk->free_blocks[i];
if (block->size >= size && block->size <= best_fit_size) { max_avail = MAX(max_avail, block->size);
best_fit_block = i; if (block->size >= size && block->size <= best_fit_size) {
best_fit_size = block->size; best_fit_chunk = c;
best_fit_block = i;
best_fit_size = block->size;
}
} }
} }
if (best_fit_block == -1) { if (best_fit_block == -1) {
// the last block is our last resort // no suitable block found, try the last block (this will grow a chunks size)
struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1]; for (int c = 0; c < alloc->n_chunks; ++c) {
max_avail = MAX(max_avail, block->size); struct tallocr_chunk * chunk = alloc->chunks[c];
if (block->size >= size) { if (chunk->n_free_blocks > 0) {
best_fit_block = alloc->n_free_blocks - 1; struct free_block * block = &chunk->free_blocks[chunk->n_free_blocks - 1];
} else { max_avail = MAX(max_avail, block->size);
// this should never happen if (block->size >= size) {
GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n", best_fit_chunk = c;
__func__, size, max_avail); best_fit_block = chunk->n_free_blocks - 1;
GGML_ABORT("not enough space in the buffer"); break;
}
}
struct free_block * block = &alloc->free_blocks[best_fit_block];
size_t offset = block->offset;
block->offset = offset + size;
block->size -= size;
if (block->size == 0) {
// remove block if empty
alloc->n_free_blocks--;
for (int j = best_fit_block; j < alloc->n_free_blocks; j++) {
alloc->free_blocks[j] = alloc->free_blocks[j+1];
}
}
AT_PRINTF("block %d, offset %zu\n", best_fit_block, offset);
#ifdef GGML_ALLOCATOR_DEBUG
add_allocated_tensor(alloc, offset, tensor);
size_t cur_max = offset + size;
if (cur_max > alloc->max_size) {
// sort allocated_tensors by offset
for (int i = 0; i < 1024; i++) {
for (int j = i + 1; j < 1024; j++) {
if (alloc->allocated_tensors[i].offset > alloc->allocated_tensors[j].offset) {
const struct ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor;
size_t tmp_offset = alloc->allocated_tensors[i].offset;
alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor;
alloc->allocated_tensors[i].offset = alloc->allocated_tensors[j].offset;
alloc->allocated_tensors[j].tensor = tmp_tensor;
alloc->allocated_tensors[j].offset = tmp_offset;
} }
} }
} }
GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0); }
if (best_fit_block == -1) {
// none of the existing chunks have enough space left
best_fit_chunk = ggml_dyn_tallocr_new_chunk(alloc, size);
best_fit_block = 0;
}
if (best_fit_chunk == -1) {
// since the last chunk always has virtually endless memory, this should never happen
GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
__func__, size, max_avail);
GGML_ABORT("graph allocation: failed to reserve memory");
}
struct tallocr_chunk * chunk = alloc->chunks[best_fit_chunk];
struct free_block * block = &chunk->free_blocks[best_fit_block];
struct buffer_address addr = {.chunk = best_fit_chunk, .offset = block->offset };
block->offset += size;
block->size -= size;
if (block->size == 0) {
// remove block if empty
ggml_dyn_tallocr_remove_block(chunk, best_fit_block);
}
AT_PRINTF("block %d, offset %zu, chunk %d\n", best_fit_block, addr.offset, addr.chunk);
#ifdef GGML_ALLOCATOR_DEBUG
add_allocated_tensor(alloc, addr, tensor);
size_t cur_max = addr.offset + size;
if (cur_max > alloc->max_size[addr.chunk]) {
// sort allocated_tensors by chunk/offset
for (int i = 0; i < 1024; i++) {
for (int j = i + 1; j < 1024; j++) {
if (ggml_buffer_address_less(alloc->allocated_tensors[j].addr, alloc->allocated_tensors[i].addr)) {
const struct ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor;
struct buffer_address tmp_addr = alloc->allocated_tensors[i].addr;
alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor;
alloc->allocated_tensors[i].addr = alloc->allocated_tensors[j].addr;
alloc->allocated_tensors[j].tensor = tmp_tensor;
alloc->allocated_tensors[j].addr = tmp_addr;
}
}
}
GGML_LOG_DEBUG("max_size[%d] = %.2f MB: tensors: ", addr.chunk, cur_max / 1024.0 / 1024.0);
for (int i = 0; i < 1024; i++) { for (int i = 0; i < 1024; i++) {
if (alloc->allocated_tensors[i].tensor) { if (alloc->allocated_tensors[i].tensor) {
GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name, GGML_LOG_DEBUG("%s [%d: %zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
alloc->allocated_tensors[i].offset, alloc->allocated_tensors[i].addr.chunk,
alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor), alloc->allocated_tensors[i].addr.offset,
alloc->allocated_tensors[i].addr.offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0); ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
} }
} }
@ -213,78 +296,69 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
} }
#endif #endif
alloc->max_size = MAX(alloc->max_size, offset + size); chunk->max_size = MAX(chunk->max_size, addr.offset + size);
return offset; return addr;
GGML_UNUSED(tensor); GGML_UNUSED(tensor);
} }
// this is a very naive implementation, but for our case the number of free blocks should be very small // this is a very naive implementation, but for our case the number of free blocks should be very small
static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, size_t size, const struct ggml_tensor * tensor) { static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size, const struct ggml_tensor * tensor) {
size = aligned_offset(NULL, size, alloc->alignment); size = aligned_offset(NULL, size, alloc->alignment);
AT_PRINTF("%s: freeing %s at %zu (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, offset, size, alloc->n_free_blocks); AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n",
__func__, tensor->name, addr.chunk, addr.offset, size, alloc->chunks[addr.chunk]->n_free_blocks);
#ifdef GGML_ALLOCATOR_DEBUG #ifdef GGML_ALLOCATOR_DEBUG
remove_allocated_tensor(alloc, offset, tensor); remove_allocated_tensor(alloc, addr, tensor);
#endif #endif
struct tallocr_chunk * chunk = alloc->chunks[addr.chunk];
// see if we can merge with an existing block // see if we can merge with an existing block
for (int i = 0; i < alloc->n_free_blocks; i++) { for (int i = 0; i < chunk->n_free_blocks; i++) {
struct free_block * block = &alloc->free_blocks[i]; struct free_block * block = &chunk->free_blocks[i];
// check if ptr is at the end of the block // check if ptr is at the end of the block
if (block->offset + block->size == offset) { if (block->offset + block->size == addr.offset) {
block->size += size; block->size += size;
// check if we can merge with the next block // check if we can merge with the next block
if (i < alloc->n_free_blocks - 1 && block->offset + block->size == alloc->free_blocks[i+1].offset) { if (i < chunk->n_free_blocks - 1) {
block->size += alloc->free_blocks[i+1].size; struct free_block * next = &chunk->free_blocks[i+1];
alloc->n_free_blocks--; if (block->offset + block->size == next->offset) {
for (int j = i+1; j < alloc->n_free_blocks; j++) { block->size += next->size;
alloc->free_blocks[j] = alloc->free_blocks[j+1]; ggml_dyn_tallocr_remove_block(chunk, i+1);
} }
} }
return; return;
} }
// check if ptr is at the beginning of the block // check if ptr is at the beginning of the block
if (offset + size == block->offset) { if (addr.offset + size == block->offset) {
block->offset = offset; block->offset = addr.offset;
block->size += size; block->size += size;
// check if we can merge with the previous block // check if we can merge with the previous block
if (i > 0 && alloc->free_blocks[i-1].offset + alloc->free_blocks[i-1].size == block->offset) { if (i > 0) {
alloc->free_blocks[i-1].size += block->size; struct free_block * prev = &chunk->free_blocks[i-1];
alloc->n_free_blocks--; if (prev->offset + prev->size == block->offset) {
for (int j = i; j < alloc->n_free_blocks; j++) { prev->size += block->size;
alloc->free_blocks[j] = alloc->free_blocks[j+1]; ggml_dyn_tallocr_remove_block(chunk, i);
} }
} }
return; return;
} }
} }
// otherwise, add a new block // otherwise, add a new block
GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks"); ggml_dyn_tallocr_insert_block(chunk, addr.offset, size);
// insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
int insert_pos = 0;
while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].offset < offset) {
insert_pos++;
}
// shift all blocks from insert_pos onward to make room for the new block
for (int i = alloc->n_free_blocks; i > insert_pos; i--) {
alloc->free_blocks[i] = alloc->free_blocks[i-1];
}
// insert the new block
alloc->free_blocks[insert_pos].offset = offset;
alloc->free_blocks[insert_pos].size = size;
alloc->n_free_blocks++;
GGML_UNUSED(tensor); GGML_UNUSED(tensor);
} }
static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) { static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
alloc->n_free_blocks = 1; for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS; i++) {
alloc->free_blocks[0].offset = 0; free(alloc->chunks[i]);
alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows alloc->chunks[i] = NULL;
alloc->max_size = 0; }
alloc->n_chunks = 0;
#ifdef GGML_ALLOCATOR_DEBUG #ifdef GGML_ALLOCATOR_DEBUG
for (int i = 0; i < 1024; i++) { for (int i = 0; i < 1024; i++) {
@ -293,14 +367,14 @@ static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
#endif #endif
} }
static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) { static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment, size_t max_buffer_size) {
struct ggml_dyn_tallocr * alloc = (struct ggml_dyn_tallocr *)malloc(sizeof(struct ggml_dyn_tallocr)); struct ggml_dyn_tallocr * alloc = (struct ggml_dyn_tallocr *)malloc(sizeof(struct ggml_dyn_tallocr));
*alloc = (struct ggml_dyn_tallocr) { *alloc = (struct ggml_dyn_tallocr) {
/*.alignment = */ alignment, /*.alignment = */ alignment,
/*.n_free_blocks = */ 0, /*.max_chunk_size = */ MIN(max_buffer_size, SIZE_MAX/2), // clamp to avoid overflows
/*.free_blocks = */ {{0}}, /*.chunks = */ {NULL},
/*.max_size = */ 0, /*.n_chunks = */ 0,
#ifdef GGML_ALLOCATOR_DEBUG #ifdef GGML_ALLOCATOR_DEBUG
/*.allocated_tensors = */ {{0}}, /*.allocated_tensors = */ {{0}},
#endif #endif
@ -312,11 +386,79 @@ static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
} }
static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) { static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) {
for (int i = 0; i < alloc->n_chunks; ++i) {
free(alloc->chunks[i]);
}
free(alloc); free(alloc);
} }
static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) { static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
return alloc->max_size; size_t max_size = 0;
for (int i = 0; i < alloc->n_chunks; i++) {
max_size += alloc->chunks[i]->max_size;
}
return max_size;
}
// virtual buffer with contiguous memory range, split into multiple backend buffers (chunks)
struct vbuffer {
ggml_backend_buffer_t chunks[GGML_VBUFFER_MAX_CHUNKS];
};
static void ggml_vbuffer_free(struct vbuffer * buf) {
if (buf == NULL) {
return;
}
for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS; ++i) {
ggml_backend_buffer_free(buf->chunks[i]);
}
free(buf);
}
static int ggml_vbuffer_n_chunks(struct vbuffer * buf) {
int n = 0;
while (n < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[n]) n++;
return n;
}
static size_t ggml_vbuffer_size(struct vbuffer * buf) {
size_t size = 0;
for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[i]; ++i) {
size += ggml_backend_buffer_get_size(buf->chunks[i]);
}
return size;
}
static struct vbuffer * ggml_vbuffer_alloc(ggml_backend_buffer_type_t buft, const struct ggml_dyn_tallocr * talloc, enum ggml_backend_buffer_usage usage) {
struct vbuffer * buf = (struct vbuffer *)calloc(1, sizeof(struct vbuffer));
if (buf == NULL) {
return NULL;
}
for (int n = 0; n < talloc->n_chunks; n++) {
size_t chunk_size = talloc->chunks[n]->max_size;
buf->chunks[n] = ggml_backend_buft_alloc_buffer(buft, chunk_size);
if (buf->chunks[n] == NULL) {
ggml_vbuffer_free(buf);
return NULL;
}
ggml_backend_buffer_set_usage(buf->chunks[n], usage);
}
return buf;
}
static void ggml_vbuffer_tensor_alloc(struct vbuffer * buf, struct ggml_tensor * tensor, struct buffer_address buf_addr) {
void * base = ggml_backend_buffer_get_base(buf->chunks[buf_addr.chunk]);
void * addr = (char *)base + buf_addr.offset;
ggml_backend_tensor_alloc(buf->chunks[buf_addr.chunk], tensor, addr);
}
static void ggml_vbuffer_reset(struct vbuffer * buf) {
for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[i]; ++i) {
ggml_backend_buffer_reset(buf->chunks[i]);
}
} }
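
For readers following the allocator rework, a short simplified sketch (fake base pointers, assumed sizes) of how a relative {chunk, offset} buffer_address resolves against per-chunk backend buffers, in the spirit of ggml_vbuffer_tensor_alloc above:

// Simplified illustration; the real code obtains chunk bases via ggml_backend_buffer_get_base().
#include <cstddef>
#include <cstdio>

struct buffer_address { int chunk; size_t offset; };

int main() {
    char chunk0[256], chunk1[256];                        // stand-ins for two backend buffers
    void * chunk_bases[2] = { chunk0, chunk1 };

    buffer_address addr = { /*chunk=*/1, /*offset=*/64 }; // example address from the allocator
    void * tensor_data  = (char *) chunk_bases[addr.chunk] + addr.offset;

    printf("chunk %d + offset %zu -> %p\n", addr.chunk, addr.offset, tensor_data);
    return 0;
}
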
@ -328,13 +470,13 @@ struct hash_node {
int n_children; int n_children;
int n_views; int n_views;
int buffer_id; int buffer_id;
size_t offset; // offset within the buffer struct buffer_address addr;
bool allocated; bool allocated;
}; };
struct tensor_alloc { struct tensor_alloc {
int buffer_id; int buffer_id;
size_t offset; struct buffer_address addr;
size_t size_max; // 0 = pre-allocated, unused, or view size_t size_max; // 0 = pre-allocated, unused, or view
}; };
@ -349,7 +491,7 @@ struct node_alloc {
struct ggml_gallocr { struct ggml_gallocr {
ggml_backend_buffer_type_t * bufts; // [n_buffers] ggml_backend_buffer_type_t * bufts; // [n_buffers]
ggml_backend_buffer_t * buffers; // [n_buffers] struct vbuffer ** buffers; // [n_buffers]
size_t *buffer_sizes; // [n_buffers] size_t *buffer_sizes; // [n_buffers]
struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers] struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
int n_buffers; int n_buffers;
@ -371,7 +513,7 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t)); galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
GGML_ASSERT(galloc->bufts != NULL); GGML_ASSERT(galloc->bufts != NULL);
galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t)); galloc->buffers = calloc(n_bufs, sizeof(struct vbuffer *));
GGML_ASSERT(galloc->buffers != NULL); GGML_ASSERT(galloc->buffers != NULL);
galloc->buffer_sizes = calloc(n_bufs, sizeof(size_t)); galloc->buffer_sizes = calloc(n_bufs, sizeof(size_t));
@ -394,7 +536,8 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
if (galloc->buf_tallocs[i] == NULL) { if (galloc->buf_tallocs[i] == NULL) {
size_t alignment = ggml_backend_buft_get_alignment(bufts[i]); size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment); size_t max_size = ggml_backend_buft_get_max_size(bufts[i]);
galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment, max_size);
} }
} }
galloc->n_buffers = n_bufs; galloc->n_buffers = n_bufs;
@ -422,7 +565,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
} }
} }
if (!freed) { if (!freed) {
ggml_backend_buffer_free(galloc->buffers[i]); ggml_vbuffer_free(galloc->buffers[i]);
} }
} }
if (galloc->buf_tallocs != NULL) { if (galloc->buf_tallocs != NULL) {
@ -472,7 +615,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) { if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
hn->allocated = true; hn->allocated = true;
assert(hn->offset == 0); assert(hn->addr.offset == 0);
// try to reuse a parent's buffer (inplace) // try to reuse a parent's buffer (inplace)
if (ggml_op_can_inplace(node->op)) { if (ggml_op_can_inplace(node->op)) {
@ -506,9 +649,9 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src); struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) { if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name); AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
assert(view_src_hn->offset == p_hn->offset); assert(view_src_hn->addr.chunk == p_hn->addr.chunk && view_src_hn->addr.offset == p_hn->addr.offset);
hn->buffer_id = p_hn->buffer_id; hn->buffer_id = p_hn->buffer_id;
hn->offset = p_hn->offset; hn->addr = p_hn->addr;
p_hn->allocated = false; // avoid freeing the parent p_hn->allocated = false; // avoid freeing the parent
view_src_hn->allocated = false; view_src_hn->allocated = false;
return; return;
@ -516,7 +659,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
} else { } else {
AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name); AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
hn->buffer_id = p_hn->buffer_id; hn->buffer_id = p_hn->buffer_id;
hn->offset = p_hn->offset; hn->addr = p_hn->addr;
p_hn->allocated = false; // avoid freeing the parent p_hn->allocated = false; // avoid freeing the parent
return; return;
} }
@ -527,9 +670,8 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id]; struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id]; ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
size_t size = ggml_backend_buft_get_alloc_size(buft, node); size_t size = ggml_backend_buft_get_alloc_size(buft, node);
size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node);
hn->buffer_id = buffer_id; hn->buffer_id = buffer_id;
hn->offset = offset; hn->addr = ggml_dyn_tallocr_alloc(alloc, size, node);
} }
} }
@ -541,12 +683,11 @@ static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * n
} }
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node); struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
size_t offset = hn->offset;
int buffer_id = hn->buffer_id; int buffer_id = hn->buffer_id;
struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id]; struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id]; ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
size_t size = ggml_backend_buft_get_alloc_size(buft, node); size_t size = ggml_backend_buft_get_alloc_size(buft, node);
ggml_dyn_tallocr_free_tensor(alloc, offset, size, node); ggml_dyn_tallocr_free_tensor(alloc, hn->addr, size, node);
hn->allocated = false; hn->allocated = false;
} }
@ -697,24 +838,24 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
struct node_alloc * node_alloc = &galloc->node_allocs[i]; struct node_alloc * node_alloc = &galloc->node_allocs[i];
if (node->view_src || node->data) { if (node->view_src || node->data) {
node_alloc->dst.buffer_id = -1; node_alloc->dst.buffer_id = -1;
node_alloc->dst.offset = SIZE_MAX; node_alloc->dst.addr = GGML_BUFFER_ADDRESS_INVALID;
node_alloc->dst.size_max = 0; node_alloc->dst.size_max = 0;
} else { } else {
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node); struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
node_alloc->dst.buffer_id = hn->buffer_id; node_alloc->dst.buffer_id = hn->buffer_id;
node_alloc->dst.offset = hn->offset; node_alloc->dst.addr = hn->addr;
node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node); node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
} }
for (int j = 0; j < GGML_MAX_SRC; j++) { for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * src = node->src[j]; struct ggml_tensor * src = node->src[j];
if (!src || src->view_src || src->data) { if (!src || src->view_src || src->data) {
node_alloc->src[j].buffer_id = -1; node_alloc->src[j].buffer_id = -1;
node_alloc->src[j].offset = SIZE_MAX; node_alloc->src[j].addr = GGML_BUFFER_ADDRESS_INVALID;
node_alloc->src[j].size_max = 0; node_alloc->src[j].size_max = 0;
} else { } else {
struct hash_node * hn = ggml_gallocr_hash_get(galloc, src); struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
node_alloc->src[j].buffer_id = hn->buffer_id; node_alloc->src[j].buffer_id = hn->buffer_id;
node_alloc->src[j].offset = hn->offset; node_alloc->src[j].addr = hn->addr;
node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src); node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
} }
} }
@ -730,11 +871,11 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf); struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
if (leaf->view_src || leaf->data) { if (leaf->view_src || leaf->data) {
galloc->leaf_allocs[i].leaf.buffer_id = -1; galloc->leaf_allocs[i].leaf.buffer_id = -1;
galloc->leaf_allocs[i].leaf.offset = SIZE_MAX; galloc->leaf_allocs[i].leaf.addr = GGML_BUFFER_ADDRESS_INVALID;
galloc->leaf_allocs[i].leaf.size_max = 0; galloc->leaf_allocs[i].leaf.size_max = 0;
} else { } else {
galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id; galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id;
galloc->leaf_allocs[i].leaf.offset = hn->offset; galloc->leaf_allocs[i].leaf.addr = hn->addr;
galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf); galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
} }
} }
@ -751,7 +892,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
} }
} }
size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0; size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0;
size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]); size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
// even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
@ -760,18 +901,17 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
#endif #endif
ggml_backend_buffer_free(galloc->buffers[i]); ggml_vbuffer_free(galloc->buffers[i]);
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size); galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
if (galloc->buffers[i]) { if (galloc->buffers[i]) {
galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]); galloc->buffer_sizes[i] = ggml_vbuffer_size(galloc->buffers[i]);
ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
} else { } else {
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size); GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
galloc->buffer_sizes[i] = new_size; galloc->buffer_sizes[i] = new_size;
success = false; success = false;
} }
} else { } else {
galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]); galloc->buffer_sizes[i] = ggml_vbuffer_size(galloc->buffers[i]);
} }
} }
@ -784,11 +924,11 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) { static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
int buffer_id = tensor_alloc->buffer_id; int buffer_id = tensor_alloc->buffer_id;
assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max); assert(tensor->data || tensor->view_src || ggml_backend_buft_get_alloc_size(galloc->bufts[buffer_id], tensor) <= tensor_alloc->size_max);
if (tensor->view_src != NULL) { if (tensor->view_src != NULL) {
if (tensor->buffer == NULL) { if (tensor->buffer == NULL) {
assert(tensor_alloc->offset == SIZE_MAX); assert(tensor_alloc->addr.offset == SIZE_MAX);
if (tensor->view_src->buffer == NULL) { if (tensor->view_src->buffer == NULL) {
// this tensor was allocated without ggml-backend // this tensor was allocated without ggml-backend
return; return;
@ -797,11 +937,9 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
} }
} else { } else {
if (tensor->data == NULL) { if (tensor->data == NULL) {
assert(tensor_alloc->offset != SIZE_MAX); assert(tensor_alloc->addr.offset != SIZE_MAX);
assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max); assert(ggml_backend_buft_get_alloc_size(galloc->bufts[buffer_id], tensor) <= tensor_alloc->size_max);
void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]); ggml_vbuffer_tensor_alloc(galloc->buffers[buffer_id], tensor, tensor_alloc->addr);
void * addr = (char *)base + tensor_alloc->offset;
ggml_backend_tensor_alloc(galloc->buffers[buffer_id], tensor, addr);
} else { } else {
if (tensor->buffer == NULL) { if (tensor->buffer == NULL) {
// this tensor was allocated without ggml-backend // this tensor was allocated without ggml-backend
@ -886,7 +1024,7 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
// reset buffers // reset buffers
for (int i = 0; i < galloc->n_buffers; i++) { for (int i = 0; i < galloc->n_buffers; i++) {
if (galloc->buffers[i] != NULL) { if (galloc->buffers[i] != NULL) {
ggml_backend_buffer_reset(galloc->buffers[i]); ggml_vbuffer_reset(galloc->buffers[i]);
} }
} }
@ -929,7 +1067,7 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
} }
} }
return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]); return ggml_vbuffer_size(galloc->buffers[buffer_id]);
} }
size_t ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) { size_t ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
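The ggml-alloc.c hunks above replace the allocator's single byte offset with a two-part address (a chunk index plus an offset within that chunk), so one logical compute buffer can be backed by several device allocations no larger than the buffer type's max size, and the plain ggml_backend_buffer_t array becomes an array of vbuffer wrappers. A minimal standalone sketch of that addressing scheme, using hypothetical names (buffer_address, chunked_allocator) rather than the real ggml structures:

    // sketch only: chunked {chunk, offset} addressing, not the real ggml API
    #include <cstddef>
    #include <vector>

    struct buffer_address {
        int    chunk;   // index of the backing sub-buffer (chunk)
        size_t offset;  // byte offset inside that chunk
    };

    struct chunked_allocator {
        size_t alignment;       // assumed to be a power of two
        size_t max_chunk_size;  // analogous to ggml_backend_buft_get_max_size()
        std::vector<size_t> chunk_tops; // bytes already planned per chunk

        chunked_allocator(size_t align, size_t max_size)
            : alignment(align), max_chunk_size(max_size) {}

        buffer_address alloc(size_t size) {
            size = (size + alignment - 1) & ~(alignment - 1);
            // reuse the first chunk with enough room, otherwise open a new one
            // (a real allocator would also handle size > max_chunk_size and freeing)
            for (size_t i = 0; i < chunk_tops.size(); ++i) {
                if (chunk_tops[i] + size <= max_chunk_size) {
                    buffer_address addr = { (int) i, chunk_tops[i] };
                    chunk_tops[i] += size;
                    return addr;
                }
            }
            chunk_tops.push_back(size);
            return { (int) chunk_tops.size() - 1, 0 };
        }
    };

Materializing such a plan then amounts to allocating one real backend buffer per chunk and resolving a tensor's pointer as that chunk's base plus addr.offset, which is what the ggml_vbuffer_* calls in these hunks appear to encapsulate.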

View File

@ -8,7 +8,7 @@
extern "C" { extern "C" {
#endif #endif
#define GGML_BACKEND_API_VERSION 1 #define GGML_BACKEND_API_VERSION 2
// //
// Backend buffer type // Backend buffer type
@ -121,6 +121,9 @@ extern "C" {
// wait for an event on a different stream // wait for an event on a different stream
void (*event_wait) (ggml_backend_t backend, ggml_backend_event_t event); void (*event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
// (optional) sort/optimize the nodes in the graph
void (*graph_optimize) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
// (optional) reserves intermediate buffers needed for the computation // (optional) reserves intermediate buffers needed for the computation
// if alloc is true, memory is actually allocated, otherwise the required amount is just returned by buffer_size // if alloc is true, memory is actually allocated, otherwise the required amount is just returned by buffer_size
enum ggml_status (*graph_reserve) (ggml_backend_t backend, struct ggml_cgraph * cgraph, bool alloc); enum ggml_status (*graph_reserve) (ggml_backend_t backend, struct ggml_cgraph * cgraph, bool alloc);
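Alongside the GGML_BACKEND_API_VERSION bump, the interface gains an optional graph_optimize callback; a dispatcher and a per-split call site are added later in ggml-backend.cpp, and backends that do not care simply leave the slot NULL (as the BLAS backend does below). A hedged sketch of a dependency-preserving reordering such a hook could perform, with toy types standing in for ggml_backend_t and ggml_cgraph:

    // sketch only: toy types, not the real ggml structures; a real hook would be
    // wired into a backend's ggml_backend_i table
    #include <algorithm>
    #include <unordered_set>
    #include <vector>

    struct toy_node {
        int id;
        std::vector<int> src; // ids of producers that live in the same split
    };

    // hoist nodes whose producers are all outside this split to the front; a
    // stable partition keeps the relative order of the rest, so the split's
    // topological order is preserved while the "ready immediately" nodes end up
    // grouped for the backend to batch
    static void toy_graph_optimize(std::vector<toy_node> & nodes) {
        std::unordered_set<int> in_split;
        for (const toy_node & n : nodes) {
            in_split.insert(n.id);
        }
        std::stable_partition(nodes.begin(), nodes.end(), [&](const toy_node & n) {
            for (int s : n.src) {
                if (in_split.count(s)) {
                    return false;
                }
            }
            return true;
        });
    }

The scheduler invokes the hook on each split right before copying the split's graph, so whatever a backend does here must keep the node set and its dependencies intact.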

View File

@ -49,6 +49,10 @@
#include "ggml-webgpu.h" #include "ggml-webgpu.h"
#endif #endif
#ifdef GGML_USE_ZDNN
#include "ggml-zdnn.h"
#endif
#ifdef GGML_USE_OPENCL #ifdef GGML_USE_OPENCL
#include "ggml-opencl.h" #include "ggml-opencl.h"
#endif #endif
@ -131,6 +135,10 @@ static void * dl_get_sym(dl_handle * handle, const char * name) {
return p; return p;
} }
static const char * dl_error() {
return "";
}
#else #else
using dl_handle = void; using dl_handle = void;
@ -151,6 +159,11 @@ static void * dl_get_sym(dl_handle * handle, const char * name) {
return dlsym(handle, name); return dlsym(handle, name);
} }
static const char * dl_error() {
const char *rslt = dlerror();
return rslt != nullptr ? rslt : "";
}
#endif #endif
using dl_handle_ptr = std::unique_ptr<dl_handle, dl_handle_deleter>; using dl_handle_ptr = std::unique_ptr<dl_handle, dl_handle_deleter>;
@ -180,6 +193,9 @@ struct ggml_backend_registry {
#ifdef GGML_USE_WEBGPU #ifdef GGML_USE_WEBGPU
register_backend(ggml_backend_webgpu_reg()); register_backend(ggml_backend_webgpu_reg());
#endif #endif
#ifdef GGML_USE_ZDNN
register_backend(ggml_backend_zdnn_reg());
#endif
#ifdef GGML_USE_OPENCL #ifdef GGML_USE_OPENCL
register_backend(ggml_backend_opencl_reg()); register_backend(ggml_backend_opencl_reg());
#endif #endif
@ -238,7 +254,7 @@ struct ggml_backend_registry {
dl_handle_ptr handle { dl_load_library(path) }; dl_handle_ptr handle { dl_load_library(path) };
if (!handle) { if (!handle) {
if (!silent) { if (!silent) {
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_str(path).c_str()); GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path_str(path).c_str(), dl_error());
} }
return nullptr; return nullptr;
} }
@ -398,9 +414,8 @@ ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const
ggml_backend_t ggml_backend_init_best(void) { ggml_backend_t ggml_backend_init_best(void) {
ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU); ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
if (!dev) { dev = dev ? dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU);
dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); dev = dev ? dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
}
if (!dev) { if (!dev) {
return nullptr; return nullptr;
} }
@ -529,7 +544,7 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
if (filename.native().find(file_prefix) == 0 && ext == file_extension) { if (filename.native().find(file_prefix) == 0 && ext == file_extension) {
dl_handle_ptr handle { dl_load_library(entry) }; dl_handle_ptr handle { dl_load_library(entry) };
if (!handle && !silent) { if (!handle && !silent) {
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_str(entry.path()).c_str()); GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path_str(entry.path()).c_str(), dl_error());
} }
if (handle) { if (handle) {
auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score"); auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
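Besides registering the new zDNN backend and extending the device fallback to integrated GPUs, this file now surfaces loader error text when a backend library fails to load: one branch simply returns an empty string, while the dlopen-based branch forwards dlerror(). A small POSIX-only sketch of that pattern, with illustrative names (load_error, try_load_backend) rather than the functions in the diff:

    // sketch only: POSIX dlopen error reporting; link with -ldl on older glibc
    #include <dlfcn.h>
    #include <cstdio>

    static const char * load_error() {
        const char * msg = dlerror();  // may be null if no error is pending
        return msg != nullptr ? msg : "";
    }

    static void * try_load_backend(const char * path) {
        void * handle = dlopen(path, RTLD_NOW | RTLD_LOCAL);
        if (handle == nullptr) {
            std::fprintf(stderr, "failed to load %s: %s\n", path, load_error());
        }
        return handle;
    }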

View File

@ -19,9 +19,8 @@
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include <string>
#include <vector>
#include <algorithm> #include <algorithm>
#include <vector>
#ifdef __APPLE__ #ifdef __APPLE__
#include <sys/types.h> #include <sys/types.h>
@ -32,6 +31,7 @@
// backend buffer type // backend buffer type
const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) { const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
GGML_ASSERT(buft);
return buft->iface.get_name(buft); return buft->iface.get_name(buft);
} }
@ -54,14 +54,17 @@ ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t
return buf; return buf;
} }
GGML_ASSERT(buft);
return buft->iface.alloc_buffer(buft, size); return buft->iface.alloc_buffer(buft, size);
} }
size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) { size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
GGML_ASSERT(buft);
return buft->iface.get_alignment(buft); return buft->iface.get_alignment(buft);
} }
size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) { size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
GGML_ASSERT(buft);
// get_max_size is optional, defaults to SIZE_MAX // get_max_size is optional, defaults to SIZE_MAX
if (buft->iface.get_max_size) { if (buft->iface.get_max_size) {
return buft->iface.get_max_size(buft); return buft->iface.get_max_size(buft);
@ -70,6 +73,7 @@ size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
} }
size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) { size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) {
GGML_ASSERT(buft);
// get_alloc_size is optional, defaults to ggml_nbytes // get_alloc_size is optional, defaults to ggml_nbytes
if (buft->iface.get_alloc_size) { if (buft->iface.get_alloc_size) {
size_t size = buft->iface.get_alloc_size(buft, tensor); size_t size = buft->iface.get_alloc_size(buft, tensor);
@ -80,6 +84,7 @@ size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const s
} }
bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) { bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
GGML_ASSERT(buft);
if (buft->iface.is_host) { if (buft->iface.is_host) {
return buft->iface.is_host(buft); return buft->iface.is_host(buft);
} }
@ -87,6 +92,7 @@ bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
} }
ggml_backend_dev_t ggml_backend_buft_get_device(ggml_backend_buffer_type_t buft) { ggml_backend_dev_t ggml_backend_buft_get_device(ggml_backend_buffer_type_t buft) {
GGML_ASSERT(buft);
return buft->device; return buft->device;
} }
@ -124,10 +130,12 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
} }
size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) { size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
GGML_ASSERT(buffer);
return buffer->size; return buffer->size;
} }
void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) { void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
GGML_ASSERT(buffer);
// get_base is optional if the buffer is zero-sized // get_base is optional if the buffer is zero-sized
if (buffer->size == 0) { if (buffer->size == 0) {
return NULL; return NULL;
@ -147,6 +155,7 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
} }
enum ggml_status ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { enum ggml_status ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
GGML_ASSERT(buffer);
// init_tensor is optional // init_tensor is optional
if (buffer->iface.init_tensor) { if (buffer->iface.init_tensor) {
return buffer->iface.init_tensor(buffer, tensor); return buffer->iface.init_tensor(buffer, tensor);
@ -155,6 +164,7 @@ enum ggml_status ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, s
} }
void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
GGML_ASSERT(buffer);
// clear is optional if the buffer is zero-sized // clear is optional if the buffer is zero-sized
if (buffer->size == 0) { if (buffer->size == 0) {
return; return;
@ -180,6 +190,7 @@ bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
} }
void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) { void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
GGML_ASSERT(buffer);
buffer->usage = usage; buffer->usage = usage;
// FIXME: add a generic callback to the buffer interface // FIXME: add a generic callback to the buffer interface
@ -189,14 +200,17 @@ void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backe
} }
enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage(ggml_backend_buffer_t buffer) { enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage(ggml_backend_buffer_t buffer) {
GGML_ASSERT(buffer);
return buffer->usage; return buffer->usage;
} }
ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) { ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
GGML_ASSERT(buffer);
return buffer->buft; return buffer->buft;
} }
void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) { void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
GGML_ASSERT(buffer);
if (buffer->iface.reset) { if (buffer->iface.reset) {
buffer->iface.reset(buffer); buffer->iface.reset(buffer);
} }
@ -235,6 +249,7 @@ void ggml_backend_free(ggml_backend_t backend) {
} }
ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) { ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
GGML_ASSERT(backend);
return ggml_backend_dev_buffer_type(backend->device); return ggml_backend_dev_buffer_type(backend->device);
} }
@ -251,6 +266,8 @@ size_t ggml_backend_get_max_size(ggml_backend_t backend) {
} }
void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
GGML_ASSERT(backend);
GGML_ASSERT(tensor);
GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
@ -262,6 +279,8 @@ void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor *
} }
void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
GGML_ASSERT(backend);
GGML_ASSERT(tensor);
GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
@ -303,6 +322,7 @@ void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, siz
} }
void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
GGML_ASSERT(tensor);
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
if (size == 0) { if (size == 0) {
@ -318,6 +338,7 @@ void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size
} }
void ggml_backend_synchronize(ggml_backend_t backend) { void ggml_backend_synchronize(ggml_backend_t backend) {
GGML_ASSERT(backend);
if (backend->iface.synchronize == NULL) { if (backend->iface.synchronize == NULL) {
return; return;
} }
@ -326,18 +347,21 @@ void ggml_backend_synchronize(ggml_backend_t backend) {
} }
ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) { ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
GGML_ASSERT(backend);
GGML_ASSERT(backend->iface.graph_plan_create != NULL); GGML_ASSERT(backend->iface.graph_plan_create != NULL);
return backend->iface.graph_plan_create(backend, cgraph); return backend->iface.graph_plan_create(backend, cgraph);
} }
void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
GGML_ASSERT(backend);
GGML_ASSERT(backend->iface.graph_plan_free != NULL); GGML_ASSERT(backend->iface.graph_plan_free != NULL);
backend->iface.graph_plan_free(backend, plan); backend->iface.graph_plan_free(backend, plan);
} }
enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
GGML_ASSERT(backend);
GGML_ASSERT(backend->iface.graph_plan_compute != NULL); GGML_ASSERT(backend->iface.graph_plan_compute != NULL);
return backend->iface.graph_plan_compute(backend, plan); return backend->iface.graph_plan_compute(backend, plan);
@ -350,22 +374,27 @@ enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_
} }
enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) { enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
GGML_ASSERT(backend);
return backend->iface.graph_compute(backend, cgraph); return backend->iface.graph_compute(backend, cgraph);
} }
bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
GGML_ASSERT(backend);
return ggml_backend_dev_supports_op(backend->device, op); return ggml_backend_dev_supports_op(backend->device, op);
} }
bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
GGML_ASSERT(backend);
return ggml_backend_dev_supports_buft(backend->device, buft); return ggml_backend_dev_supports_buft(backend->device, buft);
} }
bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) { bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
GGML_ASSERT(backend);
return ggml_backend_dev_offload_op(backend->device, op); return ggml_backend_dev_offload_op(backend->device, op);
} }
ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) { ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
GGML_ASSERT(backend);
return backend->device; return backend->device;
} }
@ -401,6 +430,7 @@ void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t b
return; return;
} }
GGML_ASSERT(backend_dst);
if (backend_dst->iface.cpy_tensor_async != NULL) { if (backend_dst->iface.cpy_tensor_async != NULL) {
if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) { if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
return; return;
@ -432,38 +462,52 @@ void ggml_backend_event_free(ggml_backend_event_t event) {
} }
void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend) { void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend) {
GGML_ASSERT(backend);
GGML_ASSERT(backend->iface.event_record != NULL); GGML_ASSERT(backend->iface.event_record != NULL);
backend->iface.event_record(backend, event); backend->iface.event_record(backend, event);
} }
void ggml_backend_event_synchronize(ggml_backend_event_t event) { void ggml_backend_event_synchronize(ggml_backend_event_t event) {
GGML_ASSERT(event);
GGML_ASSERT(event->device->iface.event_synchronize); GGML_ASSERT(event->device->iface.event_synchronize);
event->device->iface.event_synchronize(event->device, event); event->device->iface.event_synchronize(event->device, event);
} }
void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) { void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
GGML_ASSERT(backend);
GGML_ASSERT(backend->iface.event_wait != NULL); GGML_ASSERT(backend->iface.event_wait != NULL);
backend->iface.event_wait(backend, event); backend->iface.event_wait(backend, event);
} }
static void ggml_backend_graph_optimize(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
GGML_ASSERT(backend);
if (backend->iface.graph_optimize != NULL) {
backend->iface.graph_optimize(backend, cgraph);
}
}
// Backend device // Backend device
const char * ggml_backend_dev_name(ggml_backend_dev_t device) { const char * ggml_backend_dev_name(ggml_backend_dev_t device) {
GGML_ASSERT(device);
return device->iface.get_name(device); return device->iface.get_name(device);
} }
const char * ggml_backend_dev_description(ggml_backend_dev_t device) { const char * ggml_backend_dev_description(ggml_backend_dev_t device) {
GGML_ASSERT(device);
return device->iface.get_description(device); return device->iface.get_description(device);
} }
void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total) { void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
GGML_ASSERT(device);
device->iface.get_memory(device, free, total); device->iface.get_memory(device, free, total);
} }
enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) { enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) {
GGML_ASSERT(device);
return device->iface.get_type(device); return device->iface.get_type(device);
} }
@ -473,10 +517,12 @@ void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_d
} }
ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device) { ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device) {
GGML_ASSERT(device);
return device->reg; return device->reg;
} }
ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params) { ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params) {
GGML_ASSERT(device);
return device->iface.init_backend(device, params); return device->iface.init_backend(device, params);
} }
@ -489,10 +535,12 @@ void ggml_backend_dev_reset(ggml_backend_dev_t device) {
} }
ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) { ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) {
GGML_ASSERT(device);
return device->iface.get_buffer_type(device); return device->iface.get_buffer_type(device);
} }
ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device) { ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device) {
GGML_ASSERT(device);
if (device->iface.get_host_buffer_type == NULL) { if (device->iface.get_host_buffer_type == NULL) {
return NULL; return NULL;
} }
@ -501,18 +549,22 @@ ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t
} }
ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) { ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) {
GGML_ASSERT(device);
return device->iface.buffer_from_host_ptr(device, ptr, size, max_tensor_size); return device->iface.buffer_from_host_ptr(device, ptr, size, max_tensor_size);
} }
bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) { bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
GGML_ASSERT(device);
return device->iface.supports_op(device, op); return device->iface.supports_op(device, op);
} }
bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft) { bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft) {
GGML_ASSERT(device);
return device->iface.supports_buft(device, buft); return device->iface.supports_buft(device, buft);
} }
bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op) { bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
GGML_ASSERT(device);
if (device->iface.offload_op != NULL) { if (device->iface.offload_op != NULL) {
return device->iface.offload_op(device, op); return device->iface.offload_op(device, op);
} }
@ -523,18 +575,22 @@ bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_te
// Backend (reg) // Backend (reg)
const char * ggml_backend_reg_name(ggml_backend_reg_t reg) { const char * ggml_backend_reg_name(ggml_backend_reg_t reg) {
GGML_ASSERT(reg);
return reg->iface.get_name(reg); return reg->iface.get_name(reg);
} }
size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg) { size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg) {
GGML_ASSERT(reg);
return reg->iface.get_device_count(reg); return reg->iface.get_device_count(reg);
} }
ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index) { ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index) {
GGML_ASSERT(reg);
return reg->iface.get_device(reg, index); return reg->iface.get_device(reg, index);
} }
void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) { void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
GGML_ASSERT(reg);
if (!reg->iface.get_proc_address) { if (!reg->iface.get_proc_address) {
return NULL; return NULL;
} }
@ -549,6 +605,7 @@ struct ggml_backend_multi_buffer_context {
}; };
static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
GGML_ASSERT(buffer);
ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context; ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
for (size_t i = 0; i < ctx->n_buffers; i++) { for (size_t i = 0; i < ctx->n_buffers; i++) {
ggml_backend_buffer_free(ctx->buffers[i]); ggml_backend_buffer_free(ctx->buffers[i]);
@ -560,6 +617,7 @@ static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer)
} }
static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
GGML_ASSERT(buffer);
ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context; ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
for (size_t i = 0; i < ctx->n_buffers; i++) { for (size_t i = 0; i < ctx->n_buffers; i++) {
ggml_backend_buffer_clear(ctx->buffers[i], value); ggml_backend_buffer_clear(ctx->buffers[i], value);
@ -595,10 +653,12 @@ ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer
} }
bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) { bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
GGML_ASSERT(buffer);
return buffer->iface.free_buffer == ggml_backend_multi_buffer_free_buffer; return buffer->iface.free_buffer == ggml_backend_multi_buffer_free_buffer;
} }
void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) { void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
GGML_ASSERT(buffer);
GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer)); GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer));
ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context; ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
for (size_t i = 0; i < ctx->n_buffers; i++) { for (size_t i = 0; i < ctx->n_buffers; i++) {
@ -626,7 +686,7 @@ static bool ggml_is_view_op(enum ggml_op op) {
#endif #endif
#ifndef GGML_SCHED_MAX_SPLIT_INPUTS #ifndef GGML_SCHED_MAX_SPLIT_INPUTS
#define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC #define GGML_SCHED_MAX_SPLIT_INPUTS 30
#endif #endif
#ifndef GGML_SCHED_MAX_COPIES #ifndef GGML_SCHED_MAX_COPIES
@ -883,7 +943,7 @@ static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, stru
} }
// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
// reset splits // reset splits
sched->n_splits = 0; sched->n_splits = 0;
sched->n_graph_inputs = 0; sched->n_graph_inputs = 0;
@ -1279,6 +1339,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
struct ggml_backend_sched_split * split = &sched->splits[i]; struct ggml_backend_sched_split * split = &sched->splits[i];
split->graph = ggml_graph_view(graph, split->i_start, split->i_end); split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
// Optimize this split of the graph. This needs to happen before we make graph_copy,
// so they are in sync.
ggml_backend_graph_optimize(sched->backends[split->backend_id], &split->graph);
// add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
for (int j = 0; j < split->n_inputs; j++) { for (int j = 0; j < split->n_inputs; j++) {
assert(graph_copy->size > (graph_copy->n_nodes + 1)); assert(graph_copy->size > (graph_copy->n_nodes + 1));
@ -1384,17 +1448,22 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
} }
static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) { static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
GGML_ASSERT(sched);
struct ggml_backend_sched_split * splits = sched->splits; struct ggml_backend_sched_split * splits = sched->splits;
for (int i = 0; i < sched->n_splits; i++) { ggml_tensor * prev_ids_tensor = nullptr;
struct ggml_backend_sched_split * split = &splits[i]; std::vector<int32_t> ids;
std::vector<ggml_bitset_t> used_ids;
for (int split_id = 0; split_id < sched->n_splits; split_id++) {
struct ggml_backend_sched_split * split = &splits[split_id];
int split_backend_id = split->backend_id; int split_backend_id = split->backend_id;
ggml_backend_t split_backend = sched->backends[split_backend_id]; ggml_backend_t split_backend = sched->backends[split_backend_id];
// copy the input tensors to the split backend // copy the input tensors to the split backend
for (int j = 0; j < split->n_inputs; j++) { for (int input_id = 0; input_id < split->n_inputs; input_id++) {
ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]); ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[input_id]);
struct ggml_tensor * input = split->inputs[j]; struct ggml_tensor * input = split->inputs[input_id];
struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy); struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);
if (input->flags & GGML_TENSOR_FLAG_INPUT) { if (input->flags & GGML_TENSOR_FLAG_INPUT) {
@ -1412,16 +1481,104 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
} else { } else {
ggml_backend_synchronize(split_backend); ggml_backend_synchronize(split_backend);
} }
// try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
// TODO: add public function to facilitate this, since applications do not have direct access to the backend interface // when offloading MoE weights, we can reduce the amount of data copied by copying only the experts that are used
if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) { ggml_tensor * node = split->graph.nodes[0];
if (split->graph.n_nodes > 0 &&
ggml_backend_buffer_get_usage(input->buffer) == GGML_BACKEND_BUFFER_USAGE_WEIGHTS &&
ggml_backend_buffer_is_host(input->buffer) && (
(node->src[0] == input_cpy && node->op == GGML_OP_MUL_MAT_ID)
//|| (node->src[1] == input_cpy && node->op == GGML_OP_ADD_ID) /* GGML_OP_ADD_ID weights are small and not worth splitting */
)) {
const int64_t n_expert = node->op == GGML_OP_MUL_MAT_ID ? input->ne[2] : input->ne[1];
const size_t expert_size = node->op == GGML_OP_MUL_MAT_ID ? input->nb[2] : input->nb[1];
ggml_backend_synchronize(input_backend); ggml_backend_synchronize(input_backend);
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]); // get the ids
} else { ggml_tensor * ids_tensor = node->src[2];
ggml_backend_synchronize(split_backend); ggml_backend_t ids_backend = split_backend;
// if the ids tensor is also an input of the split, it may not have been copied yet to the split backend
// in that case, we use the original ids tensor
for (int i = input_id + 1; i < split->n_inputs; i++) {
if (ids_tensor == tensor_copy(split->inputs[i], split_backend_id, sched->cur_copy)) {
ids_tensor = split->inputs[i];
ids_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[i]);
break;
}
}
if (ids_tensor != prev_ids_tensor) {
ids.resize(ggml_nbytes(ids_tensor) / sizeof(int32_t));
ggml_backend_tensor_get_async(ids_backend, ids_tensor, ids.data(), 0, ggml_nbytes(ids_tensor));
ggml_backend_synchronize(ids_backend);
// find the used experts
used_ids.clear();
used_ids.resize(ggml_bitset_size(n_expert));
for (int64_t i1 = 0; i1 < ids_tensor->ne[1]; i1++) {
for (int64_t i0 = 0; i0 < ids_tensor->ne[0]; i0++) {
int32_t id = ids[i1 * ids_tensor->nb[1]/sizeof(int32_t) + i0 * ids_tensor->nb[0]/sizeof(int32_t)];
GGML_ASSERT(id >= 0 && id < n_expert);
ggml_bitset_set(used_ids.data(), id);
}
}
prev_ids_tensor = ids_tensor;
}
// group consecutive experts and copy them together
auto copy_experts = [&](int32_t first_id, int32_t last_id) {
const size_t expert_offset = first_id * expert_size;
const size_t expert_size_copy = (last_id - first_id + 1) * expert_size;
const size_t padding = std::min<size_t>(expert_size, 512);
const size_t padding_end = last_id < n_expert - 1 ? padding : 0;
ggml_backend_tensor_set_async(split_backend,
input_cpy,
(const uint8_t *)input->data + expert_offset, expert_offset,
// copy a bit extra at the end to ensure there are no NaNs in the padding of the last expert
// this is necessary for MMQ in the CUDA backend
expert_size_copy + padding_end);
};
int id = 0;
while (!ggml_bitset_get(used_ids.data(), id)) {
id++;
}
int32_t first_id = id;
int32_t last_id = first_id;
for (++id; id < n_expert; ++id) {
if (!ggml_bitset_get(used_ids.data(), id)) {
continue;
}
if (id == last_id + 1) {
last_id = id;
continue;
}
copy_experts(first_id, last_id);
first_id = id;
last_id = id;
}
copy_experts(first_id, last_id);
} else {
// try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
// TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
ggml_backend_synchronize(input_backend);
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
} else {
ggml_backend_synchronize(split_backend);
}
ggml_backend_tensor_copy(input, input_cpy);
} }
ggml_backend_tensor_copy(input, input_cpy);
} }
} }
} }
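The largest behavioral change in this hunk is the MoE-aware input copy: when a host-resident expert-weight tensor feeds a GGML_OP_MUL_MAT_ID node, the scheduler now reads the routing ids, marks the experts that are actually used in a bitset, and uploads only those experts, merging consecutive ids into one transfer (plus a little padding after the last expert for CUDA MMQ). A self-contained sketch of the id-collection and range-merging logic, with simplified types and a fake "copy" in place of the real tensor transfers:

    // sketch only: groups used expert ids into contiguous [first, last] ranges,
    // mirroring the copy_experts batching above; the transfer itself is faked
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    static void copy_used_experts(const std::vector<int32_t> & ids, int n_expert,
                                  size_t expert_size /* bytes per expert */) {
        std::vector<bool> used(n_expert, false);
        for (int32_t id : ids) {
            used[id] = true; // assumes 0 <= id < n_expert, as asserted above
        }

        auto copy_range = [&](int first, int last) {
            // one transfer covering experts [first, last]
            std::printf("copy experts %d..%d (%zu bytes at offset %zu)\n",
                        first, last,
                        (size_t)(last - first + 1) * expert_size,
                        (size_t) first * expert_size);
        };

        int first = -1, last = -1;
        for (int id = 0; id < n_expert; ++id) {
            if (!used[id])   { continue; }
            if (first == -1) { first = last = id; continue; }
            if (id == last + 1) { last = id; continue; }
            copy_range(first, last);
            first = last = id;
        }
        if (first != -1) {
            copy_range(first, last);
        }
    }

Relative to the real code, this skips the stride-aware read of the ids tensor (which walks nb[0]/nb[1] per row), the async get/set calls and backend synchronization, and the end-of-buffer padding, but the range-merging is the same idea.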
@ -1578,6 +1735,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
} }
void ggml_backend_sched_reset(ggml_backend_sched_t sched) { void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
GGML_ASSERT(sched);
// reset state for the next run // reset state for the next run
if (!sched->is_reset) { if (!sched->is_reset) {
ggml_hash_set_reset(&sched->hash_set); ggml_hash_set_reset(&sched->hash_set);
@ -1589,8 +1747,11 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
} }
bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) { bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
GGML_ASSERT(sched);
GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs); GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
ggml_backend_sched_reset(sched);
ggml_backend_sched_synchronize(sched); ggml_backend_sched_synchronize(sched);
ggml_backend_sched_split_graph(sched, measure_graph); ggml_backend_sched_split_graph(sched, measure_graph);
@ -1623,6 +1784,7 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
} }
bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
GGML_ASSERT(sched);
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs); GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
GGML_ASSERT(!sched->is_alloc); GGML_ASSERT(!sched->is_alloc);
@ -1647,6 +1809,7 @@ enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, st
} }
enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
GGML_ASSERT(sched);
if (!sched->is_reset && !sched->is_alloc) { if (!sched->is_reset && !sched->is_alloc) {
ggml_backend_sched_reset(sched); ggml_backend_sched_reset(sched);
} }
@ -1661,6 +1824,7 @@ enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sch
} }
void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) { void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
GGML_ASSERT(sched);
for (int i = 0; i < sched->n_backends; i++) { for (int i = 0; i < sched->n_backends; i++) {
ggml_backend_synchronize(sched->backends[i]); ggml_backend_synchronize(sched->backends[i]);
} }
@ -1673,28 +1837,42 @@ void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
} }
void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) { void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
GGML_ASSERT(sched);
sched->callback_eval = callback; sched->callback_eval = callback;
sched->callback_eval_user_data = user_data; sched->callback_eval_user_data = user_data;
} }
int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) { int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
GGML_ASSERT(sched);
return sched->n_splits; return sched->n_splits;
} }
int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) { int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
GGML_ASSERT(sched);
return sched->n_copies; return sched->n_copies;
} }
int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched) { int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched) {
GGML_ASSERT(sched);
return sched->n_backends; return sched->n_backends;
} }
ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i) { ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i) {
GGML_ASSERT(sched);
GGML_ASSERT(i >= 0 && i < sched->n_backends); GGML_ASSERT(i >= 0 && i < sched->n_backends);
return sched->backends[i]; return sched->backends[i];
} }
ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend) {
GGML_ASSERT(sched);
int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
return sched->bufts[backend_index];
}
size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) { size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
GGML_ASSERT(sched);
int backend_index = ggml_backend_sched_backend_id(sched, backend); int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
@ -1715,6 +1893,7 @@ size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched,
} }
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) { void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
GGML_ASSERT(sched);
int backend_index = ggml_backend_sched_backend_id(sched, backend); int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
tensor_backend_id(node) = backend_index; tensor_backend_id(node) = backend_index;
@ -1723,6 +1902,7 @@ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct gg
} }
ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) { ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
GGML_ASSERT(sched);
int backend_index = tensor_backend_id(node); int backend_index = tensor_backend_id(node);
if (backend_index == -1) { if (backend_index == -1) {
return NULL; return NULL;
@ -1733,6 +1913,7 @@ ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched,
// utils // utils
enum ggml_status ggml_backend_view_init(struct ggml_tensor * tensor) { enum ggml_status ggml_backend_view_init(struct ggml_tensor * tensor) {
GGML_ASSERT(tensor);
GGML_ASSERT(tensor->buffer == NULL); GGML_ASSERT(tensor->buffer == NULL);
GGML_ASSERT(tensor->view_src != NULL); GGML_ASSERT(tensor->view_src != NULL);
GGML_ASSERT(tensor->view_src->buffer != NULL); GGML_ASSERT(tensor->view_src->buffer != NULL);
@ -1744,6 +1925,7 @@ enum ggml_status ggml_backend_view_init(struct ggml_tensor * tensor) {
} }
enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) { enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
GGML_ASSERT(tensor);
GGML_ASSERT(tensor->buffer == NULL); GGML_ASSERT(tensor->buffer == NULL);
GGML_ASSERT(tensor->data == NULL); GGML_ASSERT(tensor->data == NULL);
GGML_ASSERT(tensor->view_src == NULL); GGML_ASSERT(tensor->view_src == NULL);
@ -1817,6 +1999,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_
} }
struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) { struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
GGML_ASSERT(graph);
struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size); struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size);
struct ggml_tensor ** node_copies = (ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT struct ggml_tensor ** node_copies = (ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0])); bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0]));
@ -1961,6 +2144,7 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
// CPU backend - buffer // CPU backend - buffer
static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
GGML_ASSERT(buffer);
uintptr_t data = (uintptr_t)buffer->context; uintptr_t data = (uintptr_t)buffer->context;
// align the buffer // align the buffer
@ -1972,6 +2156,7 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
} }
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
GGML_ASSERT(buffer);
ggml_aligned_free(buffer->context, buffer->size); ggml_aligned_free(buffer->context, buffer->size);
delete buffer; delete buffer;
} }
@ -1981,24 +2166,28 @@ static void ggml_backend_cpu_ptr_buffer_free_buffer(ggml_backend_buffer_t buffer
} }
static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
GGML_ASSERT(tensor);
memset((char *)tensor->data + offset, value, size); memset((char *)tensor->data + offset, value, size);
GGML_UNUSED(buffer); GGML_UNUSED(buffer);
} }
static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
GGML_ASSERT(tensor);
memcpy((char *)tensor->data + offset, data, size); memcpy((char *)tensor->data + offset, data, size);
GGML_UNUSED(buffer); GGML_UNUSED(buffer);
} }
static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
GGML_ASSERT(tensor);
memcpy(data, (const char *)tensor->data + offset, size); memcpy(data, (const char *)tensor->data + offset, size);
GGML_UNUSED(buffer); GGML_UNUSED(buffer);
} }
static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
GGML_ASSERT(src);
if (ggml_backend_buffer_is_host(src->buffer)) { if (ggml_backend_buffer_is_host(src->buffer)) {
memcpy(dst->data, src->data, ggml_nbytes(src)); memcpy(dst->data, src->data, ggml_nbytes(src));
return true; return true;
@ -2009,6 +2198,7 @@ static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con
} }
static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
GGML_ASSERT(buffer);
memset(buffer->context, value, buffer->size); memset(buffer->context, value, buffer->size);
} }

View File

@ -74,7 +74,7 @@ if (BLAS_FOUND)
target_compile_options(ggml-blas PRIVATE ${BLAS_LINKER_FLAGS}) target_compile_options(ggml-blas PRIVATE ${BLAS_LINKER_FLAGS})
if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${GGML_BLAS_VENDOR} MATCHES "Generic" OR ${GGML_BLAS_VENDOR} MATCHES "Intel")) if ("${BLAS_INCLUDE_DIRS}" MATCHES "mkl" AND (${GGML_BLAS_VENDOR} MATCHES "Generic" OR ${GGML_BLAS_VENDOR} MATCHES "Intel"))
add_compile_definitions(GGML_BLAS_USE_MKL) add_compile_definitions(GGML_BLAS_USE_MKL)
endif() endif()

View File

@ -270,6 +270,7 @@ static struct ggml_backend_i blas_backend_i = {
/* .graph_compute = */ ggml_backend_blas_graph_compute, /* .graph_compute = */ ggml_backend_blas_graph_compute,
/* .event_record = */ NULL, /* .event_record = */ NULL,
/* .event_wait = */ NULL, /* .event_wait = */ NULL,
/* .graph_optimize = */ NULL,
}; };
static ggml_guid_t ggml_backend_blas_guid(void) { static ggml_guid_t ggml_backend_blas_guid(void) {

View File

@ -224,7 +224,13 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC SME) foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC SME)
string(FIND "${ARM_FEATURE}" "__ARM_FEATURE_${feature} 1" feature_pos) string(FIND "${ARM_FEATURE}" "__ARM_FEATURE_${feature} 1" feature_pos)
if (NOT ${feature_pos} EQUAL -1) if (NOT ${feature_pos} EQUAL -1)
message(STATUS "ARM feature ${feature} enabled") # Special handling for MATMUL_INT8 when machine doesn't support i8mm
if ("${feature}" STREQUAL "MATMUL_INT8" AND GGML_MACHINE_SUPPORTS_noi8mm)
message(STATUS "ARM feature ${feature} detected but unsetting due to machine not supporting i8mm")
list(APPEND ARCH_FLAGS -U__ARM_FEATURE_MATMUL_INT8)
else()
message(STATUS "ARM feature ${feature} enabled")
endif()
endif() endif()
endforeach() endforeach()
endif() endif()
@ -433,15 +439,31 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
ggml-cpu/arch/riscv/quants.c ggml-cpu/arch/riscv/quants.c
ggml-cpu/arch/riscv/repack.cpp ggml-cpu/arch/riscv/repack.cpp
) )
if (GGML_RVV) if (GGML_CPU_RISCV64_SPACEMIT)
if (GGML_XTHEADVECTOR) target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_RISCV64_SPACEMIT ${RISCV64_SPACEMIT_IME_SPEC})
list(APPEND ARCH_FLAGS -march=rv64gc_xtheadvector -mabi=lp64d) list(APPEND GGML_CPU_SOURCES
elseif (GGML_RV_ZFH) ggml-cpu/spacemit/ime.cpp
list(APPEND ARCH_FLAGS -march=rv64gcv_zfhmin -mabi=lp64d) ggml-cpu/spacemit/ime.h
else() ggml-cpu/spacemit/ime1_kernels.cpp
list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d) ggml-cpu/spacemit/ime_kernels.h
)
endif()
set(MARCH_STR "rv64gc")
if (GGML_RV_ZFH)
string(APPEND MARCH_STR "_zfh")
endif()
if (GGML_XTHEADVECTOR)
string(APPEND MARCH_STR "_xtheadvector")
elseif (GGML_RVV)
string(APPEND MARCH_STR "_v")
if (GGML_RV_ZVFH)
string(APPEND MARCH_STR "_zvfh")
endif() endif()
endif() endif()
if (GGML_RV_ZICBOP)
string(APPEND MARCH_STR "_zicbop")
endif()
list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d)
elseif (GGML_SYSTEM_ARCH STREQUAL "s390x") elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
message(STATUS "s390x detected") message(STATUS "s390x detected")
list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/s390/quants.c) list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/s390/quants.c)
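
Note: for a concrete reading of the MARCH_STR assembly a few lines above: with GGML_RV_ZFH, GGML_RVV and GGML_RV_ZVFH enabled (and no XTheadVector or Zicbop), the compiler is invoked with -march=rv64gc_zfh_v_zvfh -mabi=lp64d. The small C sketch below only mirrors that flag assembly for illustration; the variable names are made up and it is not part of the build system.

#include <stdio.h>
#include <string.h>

int main(void) {
    /* toggles mirroring the GGML_* CMake options in the hunk above */
    int rv_zfh = 1, xtheadvector = 0, rvv = 1, rv_zvfh = 1, rv_zicbop = 0;
    char march[64] = "rv64gc";
    if (rv_zfh)           strcat(march, "_zfh");
    if (xtheadvector)     strcat(march, "_xtheadvector");
    else if (rvv) {
        strcat(march, "_v");
        if (rv_zvfh)      strcat(march, "_zvfh");
    }
    if (rv_zicbop)        strcat(march, "_zicbop");
    printf("-march=%s -mabi=lp64d\n", march);  /* -> -march=rv64gc_zfh_v_zvfh -mabi=lp64d */
    return 0;
}
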
@ -450,7 +472,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
# TODO: Separation to determine activation of VX/VXE/VXE2 # TODO: Separation to determine activation of VX/VXE/VXE2
if (${S390X_M} MATCHES "8561|8562") if (${S390X_M} MATCHES "8561|8562")
set(GGML_NNPA OFF)
message(STATUS "z15 target") message(STATUS "z15 target")
list(APPEND ARCH_FLAGS -march=z15) list(APPEND ARCH_FLAGS -march=z15)
elseif (${S390X_M} MATCHES "3931") elseif (${S390X_M} MATCHES "3931")
@ -460,7 +481,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
# NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version. # NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version.
# binutils must also be updated to the latest for the -march=z17 flag to work. Otherwise, use -march=arch15. # binutils must also be updated to the latest for the -march=z17 flag to work. Otherwise, use -march=arch15.
message(STATUS "z17 target") message(STATUS "z17 target")
list(APPEND ARCH_FLAGS -march=z17) list(APPEND ARCH_FLAGS -march=arch15)
else() else()
message(STATUS "Unknown target") message(STATUS "Unknown target")
message(WARNING "Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF.") message(WARNING "Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF.")
@ -472,11 +493,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
list(APPEND ARCH_FLAGS -mvx -mzvector) list(APPEND ARCH_FLAGS -mvx -mzvector)
list(APPEND ARCH_DEFINITIONS GGML_VXE) list(APPEND ARCH_DEFINITIONS GGML_VXE)
endif() endif()
if (GGML_NNPA)
message(STATUS "NNPA enabled")
list(APPEND ARCH_DEFINITIONS GGML_NNPA)
endif()
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm") elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm")
message(STATUS "Wasm detected") message(STATUS "Wasm detected")
list (APPEND GGML_CPU_SOURCES ggml-cpu/arch/wasm/quants.c) list (APPEND GGML_CPU_SOURCES ggml-cpu/arch/wasm/quants.c)
@ -497,9 +513,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
# Fetch KleidiAI sources: # Fetch KleidiAI sources:
include(FetchContent) include(FetchContent)
set(KLEIDIAI_COMMIT_TAG "v1.11.0") set(KLEIDIAI_COMMIT_TAG "v1.14.0")
set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz") set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
set(KLEIDIAI_ARCHIVE_MD5 "3fe9e5ab964c375c53839296eb71eaa2") set(KLEIDIAI_ARCHIVE_MD5 "45e110675d93f99f82c23a1afcca76bc")
if (POLICY CMP0135) if (POLICY CMP0135)
cmake_policy(SET CMP0135 NEW) cmake_policy(SET CMP0135 NEW)
@ -555,6 +571,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
list(APPEND GGML_KLEIDIAI_SOURCES list(APPEND GGML_KLEIDIAI_SOURCES
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p4x8sb_f32_neon.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c) ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c)
@ -575,8 +592,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa_asm.S
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c) ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c
${KLEIDIAI_SRC}/kai/kai_common_sme_asm.S)
set(PRIVATE_ARCH_FLAGS "-fno-tree-vectorize;${PRIVATE_ARCH_FLAGS}+sve+sve2") set(PRIVATE_ARCH_FLAGS "-fno-tree-vectorize;${PRIVATE_ARCH_FLAGS}+sve+sve2")
endif() endif()

View File

@ -7,7 +7,7 @@
#include "ggml-cpu.h" #include "ggml-cpu.h"
#include "traits.h" #include "traits.h"
#if defined(__gnu_linux__) #if defined(__linux__)
#include <sys/syscall.h> #include <sys/syscall.h>
#include <unistd.h> #include <unistd.h>
#endif #endif
@ -186,7 +186,7 @@ static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_ty
#define XFEATURE_XTILEDATA 18 #define XFEATURE_XTILEDATA 18
static bool ggml_amx_init() { static bool ggml_amx_init() {
#if defined(__gnu_linux__) #if defined(__linux__)
if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) { if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
fprintf(stderr, "AMX is not ready to be used!\n"); fprintf(stderr, "AMX is not ready to be used!\n");
return false; return false;
@ -194,6 +194,8 @@ static bool ggml_amx_init() {
return true; return true;
#elif defined(_WIN32) #elif defined(_WIN32)
return true; return true;
#else
return false;
#endif #endif
} }
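
Note: this hunk widens the guard from __gnu_linux__ to __linux__ (covering musl-based systems) and adds an explicit "return false" for other platforms. A standalone sketch of the same permission request on x86-64 Linux follows; the numeric ARCH_REQ_XCOMP_PERM value is the standard kernel one rather than something shown in this diff, and amx_available is an illustrative name:

#include <stdio.h>
#if defined(__linux__)
#include <sys/syscall.h>   /* SYS_arch_prctl (x86-64 Linux) */
#include <unistd.h>
#define ARCH_REQ_XCOMP_PERM  0x1023   /* assumed standard value from <asm/prctl.h> */
#define XFEATURE_XTILEDATA   18
#endif

static int amx_available(void) {
#if defined(__linux__)
    /* ask the kernel for permission to use AMX tile data */
    return syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA) == 0;
#else
    return 0;   /* mirrors the new #else branch: unknown platforms report "not ready" */
#endif
}

int main(void) {
    printf("AMX tile data permission: %s\n", amx_available() ? "granted" : "unavailable");
    return 0;
}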

View File

@ -40,18 +40,22 @@
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0 #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64) #elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
// repack.cpp // repack.cpp
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8 #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64) #elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
// repack.cpp // repack.cpp
@ -69,7 +73,6 @@
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K #define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K #define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K #define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
// repack.cpp // repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8 #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
@ -80,12 +83,14 @@
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0 #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#elif defined(__loongarch64) #elif defined(__loongarch64)
// quants.c // quants.c
#define quantize_row_q8_K_generic quantize_row_q8_K #define quantize_row_q8_K_generic quantize_row_q8_K
@ -103,12 +108,14 @@
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0 #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#elif defined(__riscv) #elif defined(__riscv)
// quants.c // quants.c
#define quantize_row_q8_K_generic quantize_row_q8_K #define quantize_row_q8_K_generic quantize_row_q8_K
@ -133,16 +140,16 @@
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#elif defined(__s390x__) #elif defined(__s390x__)
// quants.c // quants.c
#define quantize_row_q8_K_generic quantize_row_q8_K #define quantize_row_q8_K_generic quantize_row_q8_K
#define ggml_vec_dot_q5_0_q8_0_generic ggml_vec_dot_q5_0_q8_0
#define ggml_vec_dot_q5_1_q8_1_generic ggml_vec_dot_q5_1_q8_1
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K #define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K #define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
#define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K #define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
@ -153,7 +160,6 @@
#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K #define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K #define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K #define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
// repack.cpp // repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8 #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
@ -164,12 +170,14 @@
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0 #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#elif defined(__wasm__) #elif defined(__wasm__)
// quants.c // quants.c
#define ggml_vec_dot_q4_1_q8_1_generic ggml_vec_dot_q4_1_q8_1 #define ggml_vec_dot_q4_1_q8_1_generic ggml_vec_dot_q4_1_q8_1
@ -195,10 +203,12 @@
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0 #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#endif #endif
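
Note: all of the *_generic defines above follow one pattern: when an architecture has no hand-tuned kernel, the generic implementation is compiled under the arch-specific name via a plain macro alias. A toy C sketch of the mechanism (my_kernel is an illustrative name, not a ggml symbol):

#include <stdio.h>

#define my_kernel_generic my_kernel   /* same trick as e.g. ggml_gemv_iq4_nl_8x8_q8_0_generic */

static void my_kernel_generic(int n) {   /* expands to: static void my_kernel(int n) */
    printf("generic path, n=%d\n", n);
}

int main(void) {
    my_kernel(42);   /* callers always use the arch-specific name */
    return 0;
}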

File diff suppressed because it is too large.

View File

@ -28,6 +28,14 @@ static inline float bf16_to_f32(ggml_bf16_t x) {
return GGML_BF16_TO_FP32(x); return GGML_BF16_TO_FP32(x);
} }
static inline float i32_to_f32(int32_t x) {
return x;
}
static inline int32_t f32_to_i32(float x) {
return x;
}
static inline float f32_to_f32(float x) { static inline float f32_to_f32(float x) {
return x; return x;
} }
@ -54,6 +62,12 @@ struct type_conversion_table<ggml_bf16_t> {
static constexpr ggml_bf16_t (*from_f32)(float) = f32_to_bf16; static constexpr ggml_bf16_t (*from_f32)(float) = f32_to_bf16;
}; };
template <>
struct type_conversion_table<int32_t> {
static constexpr float (*to_f32)(int32_t) = i32_to_f32;
static constexpr int32_t (*from_f32)(float) = f32_to_i32;
};
static std::pair<int64_t, int64_t> get_thread_range(const struct ggml_compute_params * params, const struct ggml_tensor * src0) { static std::pair<int64_t, int64_t> get_thread_range(const struct ggml_compute_params * params, const struct ggml_tensor * src0) {
const int64_t ith = params->ith; const int64_t ith = params->ith;
const int64_t nth = params->nth; const int64_t nth = params->nth;
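
Note: the new int32_t specialization plugs plain C conversions into the existing to_f32/from_f32 table, so float-to-int goes through ordinary truncation toward zero. A tiny standalone C illustration of that behavior (not the ggml template itself):

#include <stdint.h>
#include <stdio.h>

static float   i32_to_f32(int32_t x) { return (float) x; }
static int32_t f32_to_i32(float   x) { return (int32_t) x; }   /* truncates toward zero */

int main(void) {
    printf("%f %d %d\n", i32_to_f32(7), f32_to_i32(3.7f), f32_to_i32(-3.7f));  /* 7.000000 3 -3 */
    return 0;
}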

View File

@ -68,12 +68,6 @@ struct ggml_compute_params {
#endif // __VXE2__ #endif // __VXE2__
#endif // __s390x__ && __VEC__ #endif // __s390x__ && __VEC__
#if defined(__s390x__) && defined(GGML_NNPA)
#ifndef __NNPA__
#define __NNPA__
#endif // __NNPA__
#endif // __s390x__ && GGML_NNPA
#if defined(__ARM_FEATURE_SVE) #if defined(__ARM_FEATURE_SVE)
#include <sys/prctl.h> #include <sys/prctl.h>
#endif #endif
@ -486,6 +480,19 @@ inline static int16x8_t vec_padd_s16(int16x8_t a, int16x8_t b) {
return v_abo + v_abe; return v_abo + v_abe;
} }
/**
* @see https://github.com/ggml-org/llama.cpp/pull/14037
*/
inline static float vec_hsum_f32x4(float32x4_t v) {
float32x4_t v_temp = v + vec_reve(v);
return v_temp[0] + v_temp[1];
}
inline static int32_t vec_hsum_i32x4(int32x4_t v) {
int32x4_t v_temp = v + vec_reve(v);
return v_temp[0] + v_temp[1];
}
inline static int32x4_t ggml_vec_dot(int32x4_t acc, int8x16_t a, int8x16_t b) { inline static int32x4_t ggml_vec_dot(int32x4_t acc, int8x16_t a, int8x16_t b) {
const int16x8_t p = vec_mule(a, b) + vec_mulo(a, b); const int16x8_t p = vec_mule(a, b) + vec_mulo(a, b);
return acc + (vec_unpackh(p) + vec_unpackl(p)); return acc + (vec_unpackh(p) + vec_unpackl(p));
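
Note: the new vec_hsum_* helpers reduce a 4-lane vector to a scalar by adding the vector to its lane-reversed copy and summing the first two lanes. A scalar C model of that reduction (hsum_* names are illustrative):

#include <stdio.h>

/* lane i meets lane 3-i, so the two leading partial sums cover all four lanes */
static float hsum_f32x4(const float v[4]) { return (v[0] + v[3]) + (v[1] + v[2]); }
static int   hsum_i32x4(const int   v[4]) { return (v[0] + v[3]) + (v[1] + v[2]); }

int main(void) {
    const float f[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    const int   i[4] = {1, 2, 3, 4};
    printf("%f %d\n", hsum_f32x4(f), hsum_i32x4(i));  /* 10.000000 10 */
    return 0;
}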

View File

@ -375,6 +375,9 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_Q8_K, .vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1, .nrows = 1,
}, },
[GGML_TYPE_I32] = {
.from_float = (ggml_from_float_t) ggml_cpu_fp32_to_i32,
},
}; };
const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) { const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) {
@ -472,10 +475,10 @@ struct ggml_threadpool {
struct ggml_compute_state { struct ggml_compute_state {
#ifndef GGML_USE_OPENMP #ifndef GGML_USE_OPENMP
ggml_thread_t thrd; ggml_thread_t thrd;
bool cpumask[GGML_MAX_N_THREADS];
int last_graph; int last_graph;
bool pending; bool pending;
#endif #endif
bool cpumask[GGML_MAX_N_THREADS];
struct ggml_threadpool * threadpool; struct ggml_threadpool * threadpool;
int ith; int ith;
}; };
@ -1878,10 +1881,18 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{ {
ggml_compute_forward_im2col_back_f32(params, tensor); ggml_compute_forward_im2col_back_f32(params, tensor);
} break; } break;
case GGML_OP_IM2COL_3D:
{
ggml_compute_forward_im2col_3d(params, tensor);
} break;
case GGML_OP_CONV_2D: case GGML_OP_CONV_2D:
{ {
ggml_compute_forward_conv_2d(params, tensor); ggml_compute_forward_conv_2d(params, tensor);
} break; } break;
case GGML_OP_CONV_3D:
{
ggml_compute_forward_conv_3d(params, tensor);
} break;
case GGML_OP_CONV_2D_DW: case GGML_OP_CONV_2D_DW:
{ {
ggml_compute_forward_conv_2d_dw(params, tensor); ggml_compute_forward_conv_2d_dw(params, tensor);
@ -2024,6 +2035,11 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
ggml_compute_forward_opt_step_adamw(params, tensor); ggml_compute_forward_opt_step_adamw(params, tensor);
} }
break; break;
case GGML_OP_OPT_STEP_SGD:
{
ggml_compute_forward_opt_step_sgd(params, tensor);
}
break;
case GGML_OP_NONE: case GGML_OP_NONE:
{ {
// nop // nop
@ -2248,7 +2264,9 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
} break; } break;
case GGML_OP_IM2COL: case GGML_OP_IM2COL:
case GGML_OP_IM2COL_BACK: case GGML_OP_IM2COL_BACK:
case GGML_OP_IM2COL_3D:
case GGML_OP_CONV_2D: case GGML_OP_CONV_2D:
case GGML_OP_CONV_3D:
case GGML_OP_CONV_2D_DW: case GGML_OP_CONV_2D_DW:
case GGML_OP_CONV_TRANSPOSE_1D: case GGML_OP_CONV_TRANSPOSE_1D:
case GGML_OP_CONV_TRANSPOSE_2D: case GGML_OP_CONV_TRANSPOSE_2D:
@ -2327,6 +2345,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_OP_CROSS_ENTROPY_LOSS: case GGML_OP_CROSS_ENTROPY_LOSS:
case GGML_OP_CROSS_ENTROPY_LOSS_BACK: case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
case GGML_OP_OPT_STEP_ADAMW: case GGML_OP_OPT_STEP_ADAMW:
case GGML_OP_OPT_STEP_SGD:
{ {
n_tasks = n_threads; n_tasks = n_threads;
} break; } break;
@ -2682,7 +2701,10 @@ struct ggml_cplan ggml_graph_plan(
if (ggml_is_quantized(node->type) || if (ggml_is_quantized(node->type) ||
// F16 -> BF16 and BF16 -> F16 copies go through intermediate F32 // F16 -> BF16 and BF16 -> F16 copies go through intermediate F32
(node->src[0]->type == GGML_TYPE_F16 && node->src[1] && node->src[1]->type == GGML_TYPE_BF16) || (node->src[0]->type == GGML_TYPE_F16 && node->src[1] && node->src[1]->type == GGML_TYPE_BF16) ||
(node->src[0]->type == GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == GGML_TYPE_F16)) { (node->src[0]->type == GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == GGML_TYPE_F16) ||
// conversion between F32 and I32
(node->src[0]->type == GGML_TYPE_F32 && node->src[1] && node->src[1]->type == GGML_TYPE_I32) ||
(node->src[0]->type == GGML_TYPE_I32 && node->src[1] && node->src[1]->type == GGML_TYPE_F32)) {
cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks; cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
} }
} break; } break;
@ -2769,6 +2791,7 @@ struct ggml_cplan ggml_graph_plan(
} }
} break; } break;
case GGML_OP_CONV_2D: case GGML_OP_CONV_2D:
case GGML_OP_CONV_3D:
{ {
cur = GGML_IM2COL_WORK_SIZE; cur = GGML_IM2COL_WORK_SIZE;
} break; } break;
@ -3064,7 +3087,14 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
threadpool->workers = workers; threadpool->workers = workers;
#ifndef GGML_USE_OPENMP #ifdef GGML_USE_OPENMP
int32_t cpumask_iter = 0;
// Compute CPU masks for each thread
for (int j = 0; j < tpp->n_threads; j++) {
ggml_thread_cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter);
}
#else // GGML_USE_OPENMP
ggml_mutex_init(&threadpool->mutex); ggml_mutex_init(&threadpool->mutex);
ggml_cond_init(&threadpool->cond); ggml_cond_init(&threadpool->cond);
@ -3137,7 +3167,14 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed); atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
} }
ggml_graph_compute_thread(&threadpool->workers[omp_get_thread_num()]); // Apply thread CPU mask and priority
int ith = omp_get_thread_num();
ggml_thread_apply_priority(threadpool->prio);
if (ggml_thread_cpumask_is_valid(threadpool->workers[ith].cpumask)) {
ggml_thread_apply_affinity(threadpool->workers[ith].cpumask);
}
ggml_graph_compute_thread(&threadpool->workers[ith]);
} }
} else { } else {
atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed); atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed);
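
Note: the OpenMP path now precomputes a per-worker CPU mask and applies it, together with the thread priority, inside the parallel region. ggml_thread_apply_affinity itself is not shown in this diff; assuming it wraps the usual Linux pinning call, a minimal sketch of that kind of affinity setting looks like the following (apply_affinity and the mask layout are illustrative, not ggml's implementation):

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdbool.h>

/* Pin the calling thread to the CPUs whose mask entries are set. */
static int apply_affinity(const bool * cpumask, int n_cpus) {
    cpu_set_t set;
    CPU_ZERO(&set);
    for (int i = 0; i < n_cpus; i++) {
        if (cpumask[i]) {
            CPU_SET(i, &set);
        }
    }
    return pthread_setaffinity_np(pthread_self(), sizeof(set), &set);
}

int main(void) {
    bool mask[4] = { true, false, true, false };   /* pin to CPUs 0 and 2 (illustrative) */
    return apply_affinity(mask, 4);
}
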
@ -3200,20 +3237,12 @@ void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
__m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
_mm_storel_epi64((__m128i *)(y + i), y_vec); _mm_storel_epi64((__m128i *)(y + i), y_vec);
} }
#elif defined(__NNPA__) #elif defined(__riscv_zvfh)
for (; i + 7 < n; i += 8) { for (int vl; i < n; i += vl) {
float32x4_t v_xh = vec_xl(0, (const float *)(x + i + 0)); vl = __riscv_vsetvl_e32m2(n - i);
float32x4_t v_xl = vec_xl(0, (const float *)(x + i + 4)); vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], vl);
uint16x8_t v_yd = vec_round_from_fp32(v_xh, v_xl, 0); vfloat16m1_t vy = __riscv_vfncvt_f_f_w_f16m1(vx, vl);
uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0); __riscv_vse16_v_f16m1((_Float16 *)&y[i], vy, vl);
vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
}
for (; i + 3 < n; i += 4) {
float32x4_t v_x = vec_xl(0, (const float *)(x + i));
float32x4_t v_zero = vec_splats(0.0f);
uint16x8_t v_yd = vec_round_from_fp32(v_x, v_zero, 0);
uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
} }
#endif #endif
for (; i < n; ++i) { for (; i < n; ++i) {
@ -3241,21 +3270,6 @@ void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
__m128 y_vec = _mm_cvtph_ps(x_vec); __m128 y_vec = _mm_cvtph_ps(x_vec);
_mm_storeu_ps(y + i, y_vec); _mm_storeu_ps(y + i, y_vec);
} }
#elif defined(__NNPA__)
for (; i + 7 < n; i += 8) {
uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i));
uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
float32x4_t v_yl = vec_extend_to_fp32_lo(v_yd, 0);
vec_xst(v_yh, 0, (float *)(y + i + 0));
vec_xst(v_yl, 0, (float *)(y + i + 4));
}
for (; i + 3 < n; i += 4) {
uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i));
uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
vec_xst(v_yh, 0, (float *)(y + i));
}
#endif #endif
for (; i < n; ++i) { for (; i < n; ++i) {
@ -3270,6 +3284,13 @@ void ggml_cpu_fp32_to_bf16(const float * x, ggml_bf16_t * y, int64_t n) {
} }
} }
void ggml_cpu_fp32_to_i32(const float * x, int32_t * y, int64_t n) {
int64_t i = 0;
for (; i < n; ++i) {
y[i] = x[i];
}
}
void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) { void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) {
int64_t i = 0; int64_t i = 0;
#if defined(__AVX2__) #if defined(__AVX2__)
@ -3459,14 +3480,6 @@ int ggml_cpu_has_vxe(void) {
#endif #endif
} }
int ggml_cpu_has_nnpa(void) {
#if defined(GGML_NNPA)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_neon(void) { int ggml_cpu_has_neon(void) {
#if defined(__ARM_ARCH) && defined(__ARM_NEON) #if defined(__ARM_ARCH) && defined(__ARM_NEON)
return 1; return 1;

Some files were not shown because too many files have changed in this diff.