Update GGML to b6646 (#12245)

Notable EOLs with this change:
- MacOS v12 and v13 are no longer supported (v14+ required)
- AMD gfx900 and gfx906 are no longer supported
This commit is contained in:
Daniel Hiltgen 2025-10-02 14:47:10 -07:00 committed by GitHub
parent fdb109469f
commit c68f367ef6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
326 changed files with 30615 additions and 20624 deletions

View File

@ -89,9 +89,9 @@ if(CMAKE_CUDA_COMPILER)
)
endif()
set(WINDOWS_AMDGPU_TARGETS_EXCLUDE_REGEX "^gfx(906|908|90a|1200|1201):xnack[+-]$"
set(WINDOWS_AMDGPU_TARGETS_EXCLUDE_REGEX "^gfx(908|90a|1200|1201):xnack[+-]$"
CACHE STRING
"Regular expression describing AMDGPU_TARGETS not supported on Windows. Override to force building these targets. Default \"^gfx(906|908|90a|1200|1201):xnack[+-]$\"."
"Regular expression describing AMDGPU_TARGETS not supported on Windows. Override to force building these targets. Default \"^gfx(908|90a|1200|1201):xnack[+-]$\"."
)
check_language(HIP)
@ -100,7 +100,7 @@ if(CMAKE_HIP_COMPILER)
if(NOT AMDGPU_TARGETS)
find_package(hip REQUIRED)
list(FILTER AMDGPU_TARGETS INCLUDE REGEX "^gfx(900|94[012]|101[02]|1030|110[012]|120[01])$")
list(FILTER AMDGPU_TARGETS INCLUDE REGEX "^gfx(94[012]|101[02]|1030|110[012]|120[01])$")
endif()
if(WIN32 AND WINDOWS_AMDGPU_TARGETS_EXCLUDE_REGEX)

View File

@ -68,7 +68,7 @@
"inherits": [ "ROCm" ],
"cacheVariables": {
"CMAKE_HIP_FLAGS": "-parallel-jobs=4",
"AMDGPU_TARGETS": "gfx900;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-"
"AMDGPU_TARGETS": "gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-"
}
}
],

View File

@ -1,6 +1,6 @@
UPSTREAM=https://github.com/ggml-org/llama.cpp.git
WORKDIR=llama/vendor
FETCH_HEAD=e54d41befcc1575f4c898c5ff4ef43970cead75f
FETCH_HEAD=364a7a6d4a786e98947c8a90430ea581213c0ba9
.PHONY: help
help:

View File

@ -52,13 +52,13 @@ Ollama supports the following AMD GPUs:
### Linux Support
| Family | Cards and accelerators |
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
| AMD Radeon RX | `7900 XTX` `7900 XT` `7900 GRE` `7800 XT` `7700 XT` `7600 XT` `7600` `6950 XT` `6900 XTX` `6900XT` `6800 XT` `6800` `Vega 64` `Vega 56` |
| AMD Radeon PRO | `W7900` `W7800` `W7700` `W7600` `W7500` `W6900X` `W6800X Duo` `W6800X` `W6800` `V620` `V420` `V340` `V320` `Vega II Duo` `Vega II` `VII` `SSG` |
| AMD Instinct | `MI300X` `MI300A` `MI300` `MI250X` `MI250` `MI210` `MI200` `MI100` `MI60` `MI50` |
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------- |
| AMD Radeon RX | `7900 XTX` `7900 XT` `7900 GRE` `7800 XT` `7700 XT` `7600 XT` `7600` `6950 XT` `6900 XTX` `6900XT` `6800 XT` `6800` `Vega 64` |
| AMD Radeon PRO | `W7900` `W7800` `W7700` `W7600` `W7500` `W6900X` `W6800X Duo` `W6800X` `W6800` `V620` `V420` `V340` `V320` `Vega II Duo` `Vega II` `SSG` |
| AMD Instinct | `MI300X` `MI300A` `MI300` `MI250X` `MI250` `MI210` `MI200` `MI100` `MI60` |
### Windows Support
With ROCm v6.1, the following GPUs are supported on Windows.
With ROCm v6.2, the following GPUs are supported on Windows.
| Family | Cards and accelerators |
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
@ -88,8 +88,6 @@ At this time, the known supported GPU types on linux are the following LLVM Targ
This table shows some example GPUs that map to these LLVM targets:
| **LLVM Target** | **An Example GPU** |
|-----------------|---------------------|
| gfx900 | Radeon RX Vega 56 |
| gfx906 | Radeon Instinct MI50 |
| gfx908 | Radeon Instinct MI100 |
| gfx90a | Radeon Instinct MI210 |
| gfx940 | Radeon Instinct MI300 |

View File

@ -2,7 +2,7 @@
## System Requirements
* MacOS Monterey (v12) or newer
* MacOS Sonoma (v14) or newer
* Apple M series (CPU and GPU support) or x86 (CPU only)

View File

@ -36,7 +36,7 @@ func TestLongInputContext(t *testing.T) {
if err := PullIfMissing(ctx, client, req.Model); err != nil {
t.Fatalf("PullIfMissing failed: %v", err)
}
DoGenerate(ctx, t, client, req, []string{"russia", "germany", "france", "england", "austria", "prussia", "europe", "individuals", "coalition", "conflict"}, 120*time.Second, 10*time.Second)
DoGenerate(ctx, t, client, req, []string{"russia", "german", "france", "england", "austria", "prussia", "europe", "individuals", "coalition", "conflict"}, 120*time.Second, 10*time.Second)
}
func TestContextExhaustion(t *testing.T) {

2
llama/build-info.cpp generated vendored
View File

@ -1,4 +1,4 @@
int LLAMA_BUILD_NUMBER = 0;
char const *LLAMA_COMMIT = "e54d41befcc1575f4c898c5ff4ef43970cead75f";
char const *LLAMA_COMMIT = "364a7a6d4a786e98947c8a90430ea581213c0ba9";
char const *LLAMA_COMPILER = "";
char const *LLAMA_BUILD_TARGET = "";

View File

@ -14,6 +14,7 @@
#include <climits>
#include <cmath>
#include <codecvt>
#include <chrono>
#include <cstdarg>
#include <cstring>
#include <ctime>
@ -41,6 +42,7 @@
#endif
#include <locale>
#include <windows.h>
#include <string.h>
#include <fcntl.h>
#include <io.h>
#else
@ -49,6 +51,11 @@
#include <unistd.h>
#endif
#if defined(__linux__)
#include <sys/types.h>
#include <pwd.h>
#endif
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
@ -557,13 +564,6 @@ std::string string_from(const struct llama_context * ctx, const std::vector<llam
auto detokenized = common_token_to_piece(ctx, token);
detokenized.erase(
std::remove_if(
detokenized.begin(),
detokenized.end(),
[](const unsigned char c) { return !std::isprint(c); }),
detokenized.end());
buf << "'" << detokenized << "'"
<< ":" << std::to_string(token);
}
@ -588,13 +588,6 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat
auto detokenized = common_token_to_piece(ctx, batch.token[i]);
detokenized.erase(
std::remove_if(
detokenized.begin(),
detokenized.end(),
[](const unsigned char c) { return !std::isprint(c); }),
detokenized.end());
buf << "\n" << std::to_string(i)
<< ", token '" << detokenized << "'"
<< ", pos " << std::to_string(batch.pos[i])
@ -877,8 +870,20 @@ std::string fs_get_cache_directory() {
#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) || defined(__OpenBSD__)
if (std::getenv("XDG_CACHE_HOME")) {
cache_directory = std::getenv("XDG_CACHE_HOME");
} else {
} else if (std::getenv("HOME")) {
cache_directory = std::getenv("HOME") + std::string("/.cache/");
} else {
#if defined(__linux__)
/* no $HOME is defined, fallback to getpwuid */
struct passwd *pw = getpwuid(getuid());
if ((!pw) || (!pw->pw_dir)) {
throw std::runtime_error("Failed to find $HOME directory");
}
cache_directory = std::string(pw->pw_dir) + std::string("/.cache/");
#else /* defined(__linux__) */
throw std::runtime_error("Failed to find $HOME directory");
#endif /* defined(__linux__) */
}
#elif defined(__APPLE__)
cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
@ -914,7 +919,8 @@ struct common_init_result common_init_from_params(common_params & params) {
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
if (model == NULL) {
LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
LOG_ERR("%s: failed to load model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
__func__, params.model.path.c_str());
return iparams;
}
@ -924,7 +930,8 @@ struct common_init_result common_init_from_params(common_params & params) {
llama_context * lctx = llama_init_from_model(model, cparams);
if (lctx == NULL) {
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
__func__, params.model.path.c_str());
llama_model_free(model);
return iparams;
}
@ -971,15 +978,13 @@ struct common_init_result common_init_from_params(common_params & params) {
bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
bool has_rerank_prompt = llama_model_chat_template(model, "rerank") != NULL;
if (!has_eos && !has_sep) {
LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
if (!has_eos && !has_sep && !has_rerank_prompt) {
LOG_WRN("%s: warning: vocab does not have an EOS token, SEP token, or rerank prompt. Reranking will not work\n", __func__);
ok = false;
} else if (!has_eos) {
LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
} else if (!has_sep) {
LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
ok = false;
}
if (!ok) {
@ -1001,7 +1006,12 @@ struct common_init_result common_init_from_params(common_params & params) {
return iparams;
}
char buf[1024];
la.ptr = lora.get();
llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
la.task_name = buf;
llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
la.prompt_prefix = buf;
iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
}
@ -1165,11 +1175,10 @@ struct llama_context_params common_context_params_to_llama(const common_params &
cparams.yarn_orig_ctx = params.yarn_orig_ctx;
cparams.pooling_type = params.pooling_type;
cparams.attention_type = params.attention_type;
cparams.defrag_thold = params.defrag_thold;
cparams.flash_attn_type = params.flash_attn_type;
cparams.cb_eval = params.cb_eval;
cparams.cb_eval_user_data = params.cb_eval_user_data;
cparams.offload_kqv = !params.no_kv_offload;
cparams.flash_attn = params.flash_attn;
cparams.no_perf = params.no_perf;
cparams.op_offload = !params.no_op_offload;
cparams.swa_full = params.swa_full;
@ -1565,3 +1574,56 @@ ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std
return result;
}
ggml_opt_optimizer_params common_opt_lr_pars(void * userdata) {
ggml_opt_optimizer_params result = ggml_opt_get_default_optimizer_params(nullptr);
const lr_opt & d = *(lr_opt *) userdata;
result.adamw.alpha = result.sgd.alpha = d.get_lr(d.epoch);
result.sgd.wd = result.adamw.wd = d.wd;
return result;
}
// TODO make all command line args case-insensitive
static inline bool eq_case_insensitive(char const* a, char const* b) {
return !
#if defined(_MSC_VER)
_stricmp
#else
strcasecmp
#endif // defined(_MSC_VER)
(a, b);
}
enum ggml_opt_optimizer_type common_opt_get_optimizer(const char * n) {
if (eq_case_insensitive("adamw", n)) {
return GGML_OPT_OPTIMIZER_TYPE_ADAMW;
}
if (eq_case_insensitive("sgd", n)) {
return GGML_OPT_OPTIMIZER_TYPE_SGD;
}
return GGML_OPT_OPTIMIZER_TYPE_COUNT;
}
// TODO simplify to use just log and exp
static float const k_log_2 = std::log(2.f);
void lr_opt::init() {
if (lr_min > 0 && lr_min < lr0) {
float nhalf = std::log(lr0 / lr_min) / k_log_2;
float e = epochs;
if (decay_epochs > 0 && decay_epochs < e) {
e = decay_epochs;
} else {
decay_epochs = e;
}
scale_epoch = nhalf / e;
}
}
float lr_opt::get_lr(float epoch) const {
float r = lr_min <= 0 ? lr0 :
epoch >= decay_epochs ? lr_min :
lr0 * std::pow(0.5f, epoch * scale_epoch);
LOG_INF("epoch %.2g lr=%.2g\n", epoch, r);
return r;
}

View File

@ -2,14 +2,17 @@
#pragma once
#include "llama-cpp.h"
#include <set>
#include <sstream>
#include <string>
#include <string_view>
#include <vector>
#include <map>
#include <sstream>
#include <cmath>
#include "ggml-opt.h"
#include "llama-cpp.h"
#ifdef _WIN32
#define DIRECTORY_SEPARATOR '\\'
@ -31,6 +34,9 @@ struct common_adapter_lora_info {
std::string path;
float scale;
std::string task_name;
std::string prompt_prefix;
struct llama_adapter_lora * ptr;
};
@ -82,6 +88,7 @@ enum llama_example {
LLAMA_EXAMPLE_PARALLEL,
LLAMA_EXAMPLE_TTS,
LLAMA_EXAMPLE_DIFFUSION,
LLAMA_EXAMPLE_FINETUNE,
LLAMA_EXAMPLE_COUNT,
};
@ -190,6 +197,7 @@ struct common_params_model {
std::string url = ""; // model url to download // NOLINT
std::string hf_repo = ""; // HF repo // NOLINT
std::string hf_file = ""; // HF file // NOLINT
std::string docker_repo = ""; // Docker repo // NOLINT
};
struct common_params_speculative {
@ -202,6 +210,7 @@ struct common_params_speculative {
float p_split = 0.1f; // speculative decoding split probability
float p_min = 0.75f; // minimum speculative decoding probability (greedy)
std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
@ -234,14 +243,36 @@ struct common_params_diffusion {
bool add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0
};
// reasoning API response format (not to be confused as chat template's reasoning format)
enum common_reasoning_format {
COMMON_REASONING_FORMAT_NONE,
COMMON_REASONING_FORMAT_AUTO,
COMMON_REASONING_FORMAT_AUTO, // Same as deepseek, using `message.reasoning_content`
COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
COMMON_REASONING_FORMAT_GRANITE, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
// do not extend this enum unless you absolutely have to
// in most cases, use COMMON_REASONING_FORMAT_AUTO
// see: https://github.com/ggml-org/llama.cpp/pull/15408
};
struct lr_opt {
float lr0 = 1e-5; // learning rate at first epoch
float lr_min = -1;
float decay_epochs = -1; // if >0, the learning rate starts at lr0 and decays to lr_min after this many epochs
float scale_epoch = 0;
float wd = 0;
unsigned epochs = 2;
unsigned epoch; // set by optimizer outer (epochs) loop
// learning rate decay - constant LR per epoch only for now
float get_lr(float e) const;
float get_lr() const { return get_lr(epoch); }
// must call after arg parse, before get_lr
void init();
};
struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);
struct common_params {
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 4096; // context size
@ -257,11 +288,10 @@ struct common_params {
float rope_freq_base = 0.0f; // RoPE base frequency
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
float yarn_beta_fast = 32.0f; // YaRN low correction dim
float yarn_beta_slow = 1.0f; // YaRN high correction dim
float yarn_attn_factor = -1.0f; // YaRN magnitude scaling factor
float yarn_beta_fast = -1.0f; // YaRN low correction dim
float yarn_beta_slow = -1.0f; // YaRN high correction dim
int32_t yarn_orig_ctx = 0; // YaRN original context length
float defrag_thold = 0.1f; // KV cache defragmentation threshold
// offload params
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
@ -283,6 +313,7 @@ struct common_params {
enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
enum llama_flash_attn_type flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; // whether to use Flash Attention
struct common_params_sampling sampling;
struct common_params_speculative speculative;
@ -346,9 +377,8 @@ struct common_params {
bool multiline_input = false; // reverse the usage of `\`
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
bool cont_batching = true; // insert new sequences for decoding on-the-fly
bool flash_attn = false; // flash attention
bool no_perf = false; // disable performance metrics
bool ctx_shift = true; // context shift on inifinite text generation
bool ctx_shift = false; // context shift on infinite text generation
bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
bool kv_unified = false; // enable unified KV cache
@ -376,6 +406,11 @@ struct common_params {
bool no_mmproj = false; // explicitly disable multimodal model
std::vector<std::string> image; // path to image file(s)
// finetune
struct lr_opt lr;
enum ggml_opt_optimizer_type optimizer = GGML_OPT_OPTIMIZER_TYPE_ADAMW;
float val_split = 0.05f; // fraction of the data used for the validation set
// embedding
bool embedding = false; // get only sentence embedding
int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
@ -389,6 +424,7 @@ struct common_params {
int32_t timeout_write = timeout_read; // http write timeout in seconds
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
int32_t n_swa_checkpoints = 3; // max number of SWA checkpoints per slot
std::string hostname = "127.0.0.1";
std::string public_path = ""; // NOLINT
@ -409,7 +445,7 @@ struct common_params {
// "advanced" endpoints are disabled by default for better security
bool webui = true;
bool endpoint_slots = false;
bool endpoint_slots = true;
bool endpoint_props = false; // only control POST requests, not GET
bool endpoint_metrics = false;
@ -417,7 +453,7 @@ struct common_params {
std::string slot_save_path;
float slot_prompt_similarity = 0.5f;
float slot_prompt_similarity = 0.1f;
// batched-bench params
bool is_pp_shared = false;
@ -698,8 +734,25 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
}
//
// MoE utils
//
const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_(ch|)exps";
static std::string llm_ffn_exps_block_regex(int idx) {
return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
}
static llama_model_tensor_buft_override llm_ffn_exps_cpu_override() {
return { LLM_FFN_EXPS_REGEX, ggml_backend_cpu_buffer_type() };
}
//
// training utils
//
ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);
// "adamw" or "sgd" (case insensitive)
enum ggml_opt_optimizer_type common_opt_get_optimizer(const char *);

View File

@ -257,12 +257,13 @@ std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
};
static bool is_reserved_name(const std::string & name) {
static std::unordered_set<std::string> RESERVED_NAMES;
if (RESERVED_NAMES.empty()) {
RESERVED_NAMES.insert("root");
for (const auto &p : PRIMITIVE_RULES) RESERVED_NAMES.insert(p.first);
for (const auto &p : STRING_FORMAT_RULES) RESERVED_NAMES.insert(p.first);
}
static const std::unordered_set<std::string> RESERVED_NAMES = [] {
std::unordered_set<std::string> s;
s.insert("root");
for (const auto & p : PRIMITIVE_RULES) s.insert(p.first);
for (const auto & p : STRING_FORMAT_RULES) s.insert(p.first);
return s;
}();
return RESERVED_NAMES.find(name) != RESERVED_NAMES.end();
}
@ -843,9 +844,10 @@ public:
_build_object_rule(
properties, required, name,
schema.contains("additionalProperties") ? schema["additionalProperties"] : json()));
} else if ((schema_type.is_null() || schema_type == "object") && schema.contains("allOf")) {
} else if ((schema_type.is_null() || schema_type == "object" || schema_type == "string") && schema.contains("allOf")) {
std::unordered_set<std::string> required;
std::vector<std::pair<std::string, json>> properties;
std::map<std::string, size_t> enum_values;
std::string hybrid_name = name;
std::function<void(const json &, bool)> add_component = [&](const json & comp_schema, bool is_required) {
if (comp_schema.contains("$ref")) {
@ -857,6 +859,14 @@ public:
required.insert(prop.key());
}
}
} else if (comp_schema.contains("enum")) {
for (const auto & v : comp_schema["enum"]) {
const auto rule = _generate_constant_rule(v);
if (enum_values.find(rule) == enum_values.end()) {
enum_values[rule] = 0;
}
enum_values[rule] += 1;
}
} else {
// todo warning
}
@ -870,6 +880,17 @@ public:
add_component(t, true);
}
}
if (!enum_values.empty()) {
std::vector<std::string> enum_intersection;
for (const auto & p : enum_values) {
if (p.second == schema["allOf"].size()) {
enum_intersection.push_back(p.first);
}
}
if (!enum_intersection.empty()) {
return _add_rule(rule_name, "(" + string_join(enum_intersection, " | ") + ") space");
}
}
return _add_rule(rule_name, _build_object_rule(properties, required, hybrid_name, json()));
} else if ((schema_type.is_null() || schema_type == "array") && (schema.contains("items") || schema.contains("prefixItems"))) {
json items = schema.contains("items") ? schema["items"] : schema["prefixItems"];

View File

@ -4,17 +4,52 @@
#include <condition_variable>
#include <cstdarg>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <mutex>
#include <sstream>
#include <thread>
#include <vector>
#if defined(_WIN32)
# include <io.h>
# include <windows.h>
# define isatty _isatty
# define fileno _fileno
#else
# include <unistd.h>
#endif // defined(_WIN32)
int common_log_verbosity_thold = LOG_DEFAULT_LLAMA;
void common_log_set_verbosity_thold(int verbosity) {
common_log_verbosity_thold = verbosity;
}
// Auto-detect if colors should be enabled based on terminal and environment
static bool common_log_should_use_colors_auto() {
// Check NO_COLOR environment variable (https://no-color.org/)
if (const char * no_color = std::getenv("NO_COLOR")) {
if (no_color[0] != '\0') {
return false;
}
}
// Check TERM environment variable
if (const char * term = std::getenv("TERM")) {
if (std::strcmp(term, "dumb") == 0) {
return false;
}
}
// Check if stdout and stderr are connected to a terminal
// We check both because log messages can go to either
bool stdout_is_tty = isatty(fileno(stdout));
bool stderr_is_tty = isatty(fileno(stderr));
return stdout_is_tty || stderr_is_tty;
}
static int64_t t_us() {
return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
}
@ -353,6 +388,11 @@ struct common_log * common_log_init() {
struct common_log * common_log_main() {
static struct common_log log;
static std::once_flag init_flag;
std::call_once(init_flag, [&]() {
// Set default to auto-detect colors
log.set_colors(common_log_should_use_colors_auto());
});
return &log;
}
@ -380,8 +420,19 @@ void common_log_set_file(struct common_log * log, const char * file) {
log->set_file(file);
}
void common_log_set_colors(struct common_log * log, bool colors) {
log->set_colors(colors);
void common_log_set_colors(struct common_log * log, log_colors colors) {
if (colors == LOG_COLORS_AUTO) {
log->set_colors(common_log_should_use_colors_auto());
return;
}
if (colors == LOG_COLORS_DISABLED) {
log->set_colors(false);
return;
}
GGML_ASSERT(colors == LOG_COLORS_ENABLED);
log->set_colors(true);
}
void common_log_set_prefix(struct common_log * log, bool prefix) {

View File

@ -24,6 +24,12 @@
#define LOG_DEFAULT_DEBUG 1
#define LOG_DEFAULT_LLAMA 0
enum log_colors {
LOG_COLORS_AUTO = -1,
LOG_COLORS_DISABLED = 0,
LOG_COLORS_ENABLED = 1,
};
// needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
// set via common_log_set_verbosity()
extern int common_log_verbosity_thold;
@ -66,7 +72,7 @@ void common_log_add(struct common_log * log, enum ggml_log_level level, const ch
//
void common_log_set_file (struct common_log * log, const char * file); // not thread-safe
void common_log_set_colors (struct common_log * log, bool colors); // not thread-safe
void common_log_set_colors (struct common_log * log, log_colors colors); // not thread-safe
void common_log_set_prefix (struct common_log * log, bool prefix); // whether to output prefix to each log
void common_log_set_timestamps(struct common_log * log, bool timestamps); // whether to output timestamps in the prefix

View File

@ -332,6 +332,7 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
}
if (ctx) {
llama_perf_context_print(ctx);
llama_memory_breakdown_print(ctx);
}
}
@ -426,8 +427,29 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
// helpers
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) {
return &gsmpl->cur_p;
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort) {
auto * res = &gsmpl->cur_p;
if (do_sort && !res->sorted) {
// remember the selected token before sorting
const llama_token id = res->data[res->selected].id;
std::sort(res->data, res->data + res->size, [](const llama_token_data & a, const llama_token_data & b) {
return a.p > b.p;
});
// restore the selected token after sorting
for (size_t i = 0; i < res->size; ++i) {
if (res->data[i].id == id) {
res->selected = i;
break;
}
}
res->sorted = true;
}
return res;
}
llama_token common_sampler_last(const struct common_sampler * gsmpl) {

View File

@ -86,7 +86,9 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
// helpers
// access the internal list of current candidate tokens
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl);
// if do_sort == true, the candidates are guaranteed to be sorted afterwards (in descending order of probability)
// the .sorted flag of the result indicates whether the returned candidates are sorted
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort);
// get the last accepted token
llama_token common_sampler_last(const struct common_sampler * gsmpl);

View File

@ -64,8 +64,6 @@ extern "C" {
typedef struct llama_memory_i * llama_memory_t;
struct llama_kv_cache; // DEPRECATED (use llama_memory instead)
typedef int32_t llama_pos;
typedef int32_t llama_token;
typedef int32_t llama_seq_id;
@ -181,6 +179,14 @@ extern "C" {
LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1,
};
enum llama_flash_attn_type {
LLAMA_FLASH_ATTN_TYPE_AUTO = -1,
LLAMA_FLASH_ATTN_TYPE_DISABLED = 0,
LLAMA_FLASH_ATTN_TYPE_ENABLED = 1,
};
LLAMA_API const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type);
enum llama_split_mode {
LLAMA_SPLIT_MODE_NONE = 0, // single GPU
LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
@ -200,7 +206,7 @@ extern "C" {
llama_token_data * data;
size_t size;
int64_t selected; // this is the index in the data array (i.e. not the token id)
bool sorted;
bool sorted; // note: do not assume the data is sorted - always check this flag
} llama_token_data_array;
typedef bool (*llama_progress_callback)(float progress, void * user_data);
@ -305,6 +311,7 @@ extern "C" {
enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
enum llama_attention_type attention_type; // attention type to use for embeddings
enum llama_flash_attn_type flash_attn_type; // when to enable Flash Attention
// ref: https://github.com/ggml-org/llama.cpp/pull/2054
float rope_freq_base; // RoPE base frequency, 0 = from model
@ -314,7 +321,7 @@ extern "C" {
float yarn_beta_fast; // YaRN low correction dim
float yarn_beta_slow; // YaRN high correction dim
uint32_t yarn_orig_ctx; // YaRN original context size
float defrag_thold; // defragment the KV cache if holes/size > thold, <= 0 disabled (default)
float defrag_thold; // [DEPRECATED] defragment the KV cache if holes/size > thold, <= 0 disabled (default)
ggml_backend_sched_eval_callback cb_eval;
void * cb_eval_user_data;
@ -331,7 +338,6 @@ extern "C" {
// Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
bool embeddings; // if true, extract embeddings (together with logits)
bool offload_kqv; // offload the KQV ops (including the KV cache) to GPU
bool flash_attn; // use flash attention [EXPERIMENTAL]
bool no_perf; // measure performance timings
bool op_offload; // offload host tensor operations to device
bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
@ -469,8 +475,6 @@ extern "C" {
LLAMA_API llama_memory_t llama_get_memory (const struct llama_context * ctx);
LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type
DEPRECATED(LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx), "use llama_get_memory instead");
LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model);
@ -557,10 +561,32 @@ extern "C" {
struct llama_model * model,
const char * path_lora);
// Functions to access the adapter's GGUF metadata scalar values
// - The functions return the length of the string on success, or -1 on failure
// - The output string is always null-terminated and cleared on failure
// - When retrieving a string, an extra byte must be allocated to account for the null terminator
// - GGUF array values are not supported by these functions
// Get metadata value as a string by key name
LLAMA_API int32_t llama_adapter_meta_val_str(const struct llama_adapter_lora * adapter, const char * key, char * buf, size_t buf_size);
// Get the number of metadata key/value pairs
LLAMA_API int32_t llama_adapter_meta_count(const struct llama_adapter_lora * adapter);
// Get metadata key name by index
LLAMA_API int32_t llama_adapter_meta_key_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);
// Get metadata value as a string by index
LLAMA_API int32_t llama_adapter_meta_val_str_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);
// Manually free a LoRA adapter
// Note: loaded adapters will be free when the associated model is deleted
LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
// Get the invocation tokens if the current lora is an alora
LLAMA_API uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter);
LLAMA_API const llama_token * llama_adapter_get_alora_invocation_tokens (const struct llama_adapter_lora * adapter);
// The following functions operate on a llama_context, hence the naming: llama_verb_...
// Add a loaded LoRA adapter to given context
@ -667,111 +693,6 @@ extern "C" {
// Check if the memory supports shifting
LLAMA_API bool llama_memory_can_shift(llama_memory_t mem);
//
// KV cache for self-attention (TODO: deprecate in favor of llama_memory)
//
// Returns the number of tokens in the KV cache (slow, use only for debug)
// If a KV cell has multiple sequences assigned to it, it will be counted multiple times
DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx),
"Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
// Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx),
"Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
// Clear the KV cache - both cell info is erased and KV data is zeroed
DEPRECATED(LLAMA_API void llama_kv_self_clear(
struct llama_context * ctx),
"Use llama_memory_clear() instead");
// Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
// Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
// seq_id < 0 : match any sequence
// p0 < 0 : [0, p1]
// p1 < 0 : [p0, inf)
DEPRECATED(LLAMA_API bool llama_kv_self_seq_rm(
struct llama_context * ctx,
llama_seq_id seq_id,
llama_pos p0,
llama_pos p1),
"Use llama_memory_seq_rm() instead");
// Copy all tokens that belong to the specified sequence to another sequence
// Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
// p0 < 0 : [0, p1]
// p1 < 0 : [p0, inf)
DEPRECATED(LLAMA_API void llama_kv_self_seq_cp(
struct llama_context * ctx,
llama_seq_id seq_id_src,
llama_seq_id seq_id_dst,
llama_pos p0,
llama_pos p1),
"Use llama_memory_seq_cp() instead");
// Removes all tokens that do not belong to the specified sequence
DEPRECATED(LLAMA_API void llama_kv_self_seq_keep(
struct llama_context * ctx,
llama_seq_id seq_id),
"Use llama_memory_seq_keep() instead");
// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
// If the KV cache is RoPEd, the KV data is updated accordingly:
// - lazily on next llama_decode()
// p0 < 0 : [0, p1]
// p1 < 0 : [p0, inf)
DEPRECATED(LLAMA_API void llama_kv_self_seq_add(
struct llama_context * ctx,
llama_seq_id seq_id,
llama_pos p0,
llama_pos p1,
llama_pos delta),
"Use llama_memory_seq_add() instead");
// Integer division of the positions by factor of `d > 1`
// If the KV cache is RoPEd, the KV data is updated accordingly:
// - lazily on next llama_decode()
// p0 < 0 : [0, p1]
// p1 < 0 : [p0, inf)
DEPRECATED(LLAMA_API void llama_kv_self_seq_div(
struct llama_context * ctx,
llama_seq_id seq_id,
llama_pos p0,
llama_pos p1,
int d),
"Use llama_memory_seq_div() instead");
// Returns the smallest position present in the KV cache for the specified sequence
// This is typically non-zero only for SWA caches
// Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
// Return -1 if the sequence is empty
DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_min(
struct llama_context * ctx,
llama_seq_id seq_id),
"Use llama_memory_seq_pos_min() instead");
// Returns the largest position present in the KV cache for the specified sequence
// Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
// Return -1 if the sequence is empty
DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_max(
struct llama_context * ctx,
llama_seq_id seq_id),
"Use llama_memory_seq_pos_max() instead");
// Defragment the KV cache
// This will be applied:
// - lazily on next llama_decode()
DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx),
"simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'");
// Check if the context supports KV cache shifting
DEPRECATED(LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx),
"use llama_memory_can_shift() instead");
// Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx),
"simply remove this call, updates are applied lazily on the next llama_decode()");
//
// State / sessions
//
@ -870,6 +791,29 @@ extern "C" {
size_t n_token_capacity,
size_t * n_token_count_out);
#define LLAMA_STATE_SEQ_FLAGS_SWA_ONLY 1
typedef uint32_t llama_state_seq_flags;
LLAMA_API size_t llama_state_seq_get_size_ext(
struct llama_context * ctx,
llama_seq_id seq_id,
llama_state_seq_flags flags);
LLAMA_API size_t llama_state_seq_get_data_ext(
struct llama_context * ctx,
uint8_t * dst,
size_t size,
llama_seq_id seq_id,
llama_state_seq_flags flags);
LLAMA_API size_t llama_state_seq_set_data_ext(
struct llama_context * ctx,
const uint8_t * src,
size_t size,
llama_seq_id dest_seq_id,
llama_state_seq_flags flags);
//
// Decoding
//
@ -1216,11 +1160,6 @@ extern "C" {
LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed);
/// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
/// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void),
"will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");
/// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
/// Setting k <= 0 makes this a noop
LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
@ -1390,24 +1329,25 @@ extern "C" {
//
// Performance utils
//
// NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
// NOTE: Used by llama.cpp examples/tools, avoid using in third-party apps. Instead, do your own performance measurements.
//
struct llama_perf_context_data {
double t_start_ms;
double t_load_ms;
double t_p_eval_ms;
double t_eval_ms;
// ms == milliseconds
double t_start_ms; // absolute start time
double t_load_ms; // time needed for loading the model
double t_p_eval_ms; // time needed for processing the prompt
double t_eval_ms; // time needed for generating tokens
int32_t n_p_eval;
int32_t n_eval;
int32_t n_p_eval; // number of prompt tokens
int32_t n_eval; // number of generated tokens
int32_t n_reused; // number of times a ggml compute graph had been reused
};
struct llama_perf_sampler_data {
double t_sample_ms;
double t_sample_ms; // time needed for sampling in ms
int32_t n_sample;
int32_t n_sample; // number of sampled tokens
};
LLAMA_API struct llama_perf_context_data llama_perf_context (const struct llama_context * ctx);
@ -1419,6 +1359,9 @@ extern "C" {
LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain);
// print a breakdown of per-device memory use via LLAMA_LOG:
LLAMA_API void llama_memory_breakdown_print(const struct llama_context * ctx);
//
// training
//
@ -1437,6 +1380,8 @@ extern "C" {
ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
void * get_opt_pars_ud; // userdata for calculating optimizer parameters
enum ggml_opt_optimizer_type optimizer_type;
};
LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params);

View File

@ -6,6 +6,7 @@
#include <map>
#include <cassert>
#include <sstream>
#include <stdexcept>
// vec
@ -163,13 +164,38 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
// check metadata
{
const gguf_context * gguf_ctx = ctx_gguf.get();
LLAMA_LOG_INFO("%s: Dumping metadata keys/values.\n", __func__);
// get metadata as string
for (int i = 0; i < gguf_get_n_kv(gguf_ctx); i++) {
gguf_type type = gguf_get_kv_type(gguf_ctx, i);
const std::string type_name =
type == GGUF_TYPE_ARRAY
? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(gguf_ctx, i)), gguf_get_arr_n(gguf_ctx, i))
: gguf_type_name(type);
const char * name = gguf_get_key(gguf_ctx, i);
const std::string value = gguf_kv_to_str(gguf_ctx, i);
if (type != GGUF_TYPE_ARRAY) {
adapter.gguf_kv.emplace(name, value);
}
const size_t MAX_VALUE_LEN = 40;
std::string print_value = value.size() > MAX_VALUE_LEN ? format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str()) : value;
replace_all(print_value, "\n", "\\n");
LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), print_value.c_str());
}
auto get_kv_str = [&](const std::string & key) -> std::string {
int id = gguf_find_key(ctx_gguf.get(), key.c_str());
return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf.get(), id));
int id = gguf_find_key(gguf_ctx, key.c_str());
return id < 0 ? "" : std::string(gguf_get_val_str(gguf_ctx, id));
};
auto get_kv_f32 = [&](const std::string & key) -> float {
int id = gguf_find_key(ctx_gguf.get(), key.c_str());
return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf.get(), id);
int id = gguf_find_key(gguf_ctx, key.c_str());
return id < 0 ? 0.0f : gguf_get_val_f32(gguf_ctx, id);
};
LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
@ -190,6 +216,26 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
}
adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA));
// parse alora invocation sequence vector
const auto & key = llm_kv(LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS);
const int kid = gguf_find_key(ctx_gguf.get(), key.c_str());
if (kid >= 0) {
if (gguf_get_kv_type(ctx_gguf.get(), kid) != GGUF_TYPE_ARRAY) {
throw std::runtime_error("invalid gguf type for " + key);
}
const auto arr_type = gguf_get_arr_type(ctx_gguf.get(), kid);
if (arr_type != GGUF_TYPE_UINT32) {
throw std::runtime_error("invalid gguf element type for " + key);
}
const size_t seq_len = gguf_get_arr_n(ctx_gguf.get(), kid);
const void * data = gguf_get_arr_data(ctx_gguf.get(), kid);
adapter.alora_invocation_tokens.resize(seq_len);
std::copy(
(const llama_token *)data,
(const llama_token *)data + seq_len,
adapter.alora_invocation_tokens.begin());
}
}
int n_tensors = gguf_get_n_tensors(ctx_gguf.get());
@ -383,6 +429,57 @@ llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * p
return nullptr;
}
int32_t llama_adapter_meta_val_str(const llama_adapter_lora * adapter, const char * key, char * buf, size_t buf_size) {
const auto & it = adapter->gguf_kv.find(key);
if (it == adapter->gguf_kv.end()) {
if (buf_size > 0) {
buf[0] = '\0';
}
return -1;
}
return snprintf(buf, buf_size, "%s", it->second.c_str());
}
int32_t llama_adapter_meta_count(const llama_adapter_lora * adapter) {
return (int)adapter->gguf_kv.size();
}
int32_t llama_adapter_meta_key_by_index(const llama_adapter_lora * adapter, int i, char * buf, size_t buf_size) {
if (i < 0 || i >= (int)adapter->gguf_kv.size()) {
if (buf_size > 0) {
buf[0] = '\0';
}
return -1;
}
auto it = adapter->gguf_kv.begin();
std::advance(it, i);
return snprintf(buf, buf_size, "%s", it->first.c_str());
}
int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size) {
if (i < 0 || i >= (int)adapter->gguf_kv.size()) {
if (buf_size > 0) {
buf[0] = '\0';
}
return -1;
}
auto it = adapter->gguf_kv.begin();
std::advance(it, i);
return snprintf(buf, buf_size, "%s", it->second.c_str());
}
void llama_adapter_lora_free(llama_adapter_lora * adapter) {
delete adapter;
}
uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter) {
if (!adapter) {
return 0;
}
return adapter->alora_invocation_tokens.size();
}
const llama_token * llama_adapter_get_alora_invocation_tokens(const llama_adapter_lora * adapter) {
GGML_ASSERT(adapter);
return adapter->alora_invocation_tokens.data();
}

View File

@ -67,6 +67,12 @@ struct llama_adapter_lora {
float alpha;
// gguf metadata
std::unordered_map<std::string, std::string> gguf_kv;
// activated lora (aLoRA)
std::vector<llama_token> alora_invocation_tokens;
llama_adapter_lora() = default;
~llama_adapter_lora() = default;

View File

@ -22,6 +22,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" },
{ LLM_ARCH_NEO_BERT, "neo-bert" },
{ LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
{ LLM_ARCH_JINA_BERT_V3, "jina-bert-v3" },
{ LLM_ARCH_BLOOM, "bloom" },
{ LLM_ARCH_STABLELM, "stablelm" },
{ LLM_ARCH_QWEN, "qwen" },
@ -44,6 +45,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GEMMA2, "gemma2" },
{ LLM_ARCH_GEMMA3, "gemma3" },
{ LLM_ARCH_GEMMA3N, "gemma3n" },
{ LLM_ARCH_GEMMA_EMBEDDING, "gemma-embedding" },
{ LLM_ARCH_STARCODER2, "starcoder2" },
{ LLM_ARCH_MAMBA, "mamba" },
{ LLM_ARCH_MAMBA2, "mamba2" },
@ -68,6 +70,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_T5ENCODER, "t5encoder" },
{ LLM_ARCH_JAIS, "jais" },
{ LLM_ARCH_NEMOTRON, "nemotron" },
{ LLM_ARCH_NEMOTRON_H, "nemotron_h" },
{ LLM_ARCH_EXAONE, "exaone" },
{ LLM_ARCH_EXAONE4, "exaone4" },
{ LLM_ARCH_RWKV6, "rwkv6" },
@ -94,6 +97,9 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_DREAM, "dream" },
{ LLM_ARCH_SMALLTHINKER, "smallthinker" },
{ LLM_ARCH_LLADA, "llada" },
{ LLM_ARCH_LLADA_MOE, "llada-moe" },
{ LLM_ARCH_SEED_OSS, "seed_oss" },
{ LLM_ARCH_GROVEMOE, "grovemoe" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
@ -121,6 +127,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
{ LLM_KV_EXPERT_FEED_FORWARD_LENGTH, "%s.expert_feed_forward_length" },
{ LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, "%s.expert_shared_feed_forward_length" },
{ LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH, "%s.expert_chunk_feed_forward_length" },
{ LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
{ LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
{ LLM_KV_EXPERT_COUNT, "%s.expert_count" },
@ -129,12 +136,16 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
{ LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" },
{ LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" },
{ LLM_KV_EXPERT_GROUP_SCALE, "%s.expert_group_scale" },
{ LLM_KV_EXPERTS_PER_GROUP, "%s.experts_per_group" },
{ LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" },
{ LLM_KV_NEXTN_PREDICT_LAYERS, "%s.nextn_predict_layers" },
{ LLM_KV_POOLING_TYPE, "%s.pooling_type" },
{ LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
{ LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
{ LLM_KV_DECODER_BLOCK_COUNT, "%s.decoder_block_count" },
{ LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" },
{ LLM_KV_ROUTER_LOGIT_SOFTCAPPING, "%s.router_logit_softcapping" },
{ LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" },
{ LLM_KV_SWIN_NORM, "%s.swin_norm" },
{ LLM_KV_RESCALE_EVERY_N_LAYERS, "%s.rescale_every_n_layers" },
@ -165,6 +176,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
{ LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
@ -179,6 +192,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
{ LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
{ LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" },
{ LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, "%s.rope.scaling.yarn_ext_factor" },
{ LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, "%s.rope.scaling.yarn_attn_factor" },
{ LLM_KV_ROPE_SCALING_YARN_BETA_FAST, "%s.rope.scaling.yarn_beta_fast" },
{ LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, "%s.rope.scaling.yarn_beta_slow" },
{ LLM_KV_SPLIT_NO, "split.no" },
{ LLM_KV_SPLIT_COUNT, "split.count" },
@ -237,6 +254,9 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ADAPTER_TYPE, "adapter.type" },
{ LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
{ LLM_KV_ADAPTER_LORA_TASK_NAME, "adapter.lora.task_name" },
{ LLM_KV_ADAPTER_LORA_PROMPT_PREFIX, "adapter.lora.prompt_prefix" },
{ LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS, "adapter.alora.invocation_tokens" },
// deprecated
{ LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
@ -392,12 +412,16 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
{ LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
{ LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
{ LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
{ LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
},
@ -576,6 +600,20 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_CLS, "cls" },
},
},
{
LLM_ARCH_JINA_BERT_V3,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
{ LLM_TENSOR_TOKEN_TYPES, "token_types" },
{ LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
},
},
{
LLM_ARCH_BLOOM,
{
@ -689,6 +727,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_CLS_OUT, "cls.output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
@ -1021,6 +1060,27 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_LAUREL_POST_NORM, "blk.%d.laurel_post_norm" },
},
},
{
LLM_ARCH_GEMMA_EMBEDDING,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
},
},
{
LLM_ARCH_STARCODER2,
{
@ -1534,6 +1594,31 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_NEMOTRON_H,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
// mamba(2) ssm layers
{ LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
{ LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
{ LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
{ LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
{ LLM_TENSOR_SSM_D, "blk.%d.ssm_d" },
{ LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" },
{ LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
// attention layers
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
// dense FFN
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_EXAONE,
{
@ -2030,6 +2115,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_SHORTCONV_OUTPROJ, "blk.%d.shortconv.out_proj" },
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
}
},
{
@ -2087,6 +2173,66 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_LLADA_MOE,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
},
},
{
LLM_ARCH_SEED_OSS,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_GROVEMOE,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
{ LLM_TENSOR_FFN_GATE_CHEXPS, "blk.%d.ffn_gate_chexps" },
{ LLM_TENSOR_FFN_DOWN_CHEXPS, "blk.%d.ffn_down_chexps" },
{ LLM_TENSOR_FFN_UP_CHEXPS, "blk.%d.ffn_up_chexps" },
},
},
{
LLM_ARCH_UNKNOWN,
{
@ -2219,6 +2365,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_FFN_DOWN_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
{LLM_TENSOR_FFN_GATE_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
{LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
{LLM_TENSOR_FFN_DOWN_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
{LLM_TENSOR_FFN_GATE_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
{LLM_TENSOR_FFN_UP_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
{LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
// altup / laurel (gemma 3n)
{LLM_TENSOR_PER_LAYER_TOKEN_EMBD, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
@ -2340,6 +2489,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
case LLM_ARCH_PLAMO2:
case LLM_ARCH_GRANITE_HYBRID:
case LLM_ARCH_LFM2:
case LLM_ARCH_NEMOTRON_H:
return true;
default:
return false;
@ -2350,6 +2500,7 @@ bool llm_arch_is_diffusion(const llm_arch & arch) {
switch (arch) {
case LLM_ARCH_DREAM:
case LLM_ARCH_LLADA:
case LLM_ARCH_LLADA_MOE:
return true;
default:
return false;

View File

@ -26,6 +26,7 @@ enum llm_arch {
LLM_ARCH_NOMIC_BERT_MOE,
LLM_ARCH_NEO_BERT,
LLM_ARCH_JINA_BERT_V2,
LLM_ARCH_JINA_BERT_V3,
LLM_ARCH_BLOOM,
LLM_ARCH_STABLELM,
LLM_ARCH_QWEN,
@ -48,6 +49,7 @@ enum llm_arch {
LLM_ARCH_GEMMA2,
LLM_ARCH_GEMMA3,
LLM_ARCH_GEMMA3N,
LLM_ARCH_GEMMA_EMBEDDING,
LLM_ARCH_STARCODER2,
LLM_ARCH_MAMBA,
LLM_ARCH_MAMBA2,
@ -72,6 +74,7 @@ enum llm_arch {
LLM_ARCH_T5ENCODER,
LLM_ARCH_JAIS,
LLM_ARCH_NEMOTRON,
LLM_ARCH_NEMOTRON_H,
LLM_ARCH_EXAONE,
LLM_ARCH_EXAONE4,
LLM_ARCH_RWKV6,
@ -98,6 +101,9 @@ enum llm_arch {
LLM_ARCH_DREAM,
LLM_ARCH_SMALLTHINKER,
LLM_ARCH_LLADA,
LLM_ARCH_LLADA_MOE,
LLM_ARCH_SEED_OSS,
LLM_ARCH_GROVEMOE,
LLM_ARCH_UNKNOWN,
};
@ -125,6 +131,7 @@ enum llm_kv {
LLM_KV_FEED_FORWARD_LENGTH,
LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH,
LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH,
LLM_KV_USE_PARALLEL_RESIDUAL,
LLM_KV_TENSOR_DATA_LAYOUT,
LLM_KV_EXPERT_COUNT,
@ -133,12 +140,16 @@ enum llm_kv {
LLM_KV_EXPERT_WEIGHTS_SCALE,
LLM_KV_EXPERT_WEIGHTS_NORM,
LLM_KV_EXPERT_GATING_FUNC,
LLM_KV_EXPERT_GROUP_SCALE,
LLM_KV_EXPERTS_PER_GROUP,
LLM_KV_MOE_EVERY_N_LAYERS,
LLM_KV_NEXTN_PREDICT_LAYERS,
LLM_KV_POOLING_TYPE,
LLM_KV_LOGIT_SCALE,
LLM_KV_DECODER_START_TOKEN_ID,
LLM_KV_DECODER_BLOCK_COUNT,
LLM_KV_ATTN_LOGIT_SOFTCAPPING,
LLM_KV_ROUTER_LOGIT_SOFTCAPPING,
LLM_KV_FINAL_LOGIT_SOFTCAPPING,
LLM_KV_SWIN_NORM,
LLM_KV_RESCALE_EVERY_N_LAYERS,
@ -169,6 +180,8 @@ enum llm_kv {
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_OUTPUT_SCALE,
LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
@ -183,6 +196,10 @@ enum llm_kv {
LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
LLM_KV_ROPE_SCALING_FINETUNED,
LLM_KV_ROPE_SCALING_YARN_LOG_MUL,
LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR,
LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR,
LLM_KV_ROPE_SCALING_YARN_BETA_FAST,
LLM_KV_ROPE_SCALING_YARN_BETA_SLOW,
LLM_KV_SPLIT_NO,
LLM_KV_SPLIT_COUNT,
@ -231,6 +248,9 @@ enum llm_kv {
LLM_KV_ADAPTER_TYPE,
LLM_KV_ADAPTER_LORA_ALPHA,
LLM_KV_ADAPTER_LORA_TASK_NAME,
LLM_KV_ADAPTER_LORA_PROMPT_PREFIX,
LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS,
LLM_KV_POSNET_EMBEDDING_LENGTH,
LLM_KV_POSNET_BLOCK_COUNT,
@ -287,6 +307,9 @@ enum llm_tensor {
LLM_TENSOR_FFN_DOWN_SHEXP,
LLM_TENSOR_FFN_GATE_SHEXP,
LLM_TENSOR_FFN_UP_SHEXP,
LLM_TENSOR_FFN_DOWN_CHEXPS,
LLM_TENSOR_FFN_GATE_CHEXPS,
LLM_TENSOR_FFN_UP_CHEXPS,
LLM_TENSOR_FFN_EXP_PROBS_B,
LLM_TENSOR_ATTN_Q_NORM,
LLM_TENSOR_ATTN_K_NORM,

View File

@ -477,7 +477,7 @@ llama_ubatch llama_batch_allocr::split_simple(uint32_t n_ubatch) {
llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch, bool sequential) {
if (sequential && has_cpl) {
LLAMA_LOG_ERROR("%s: sequential split is not supported when there are coupled sequences in the input batch\n", __func__);
LLAMA_LOG_ERROR("%s: sequential split is not supported when there are coupled sequences in the input batch (you may need to use the -kvu flag)\n", __func__);
return {};
}

View File

@ -16,10 +16,10 @@
static std::string trim(const std::string & str) {
size_t start = 0;
size_t end = str.size();
while (start < end && isspace(str[start])) {
while (start < end && isspace(static_cast<unsigned char>(str[start]))) {
start += 1;
}
while (end > start && isspace(str[end - 1])) {
while (end > start && isspace(static_cast<unsigned char>(str[end - 1]))) {
end -= 1;
}
return str.substr(start, end - start);
@ -69,6 +69,8 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
{ "gpt-oss", LLM_CHAT_TEMPLATE_OPENAI_MOE },
{ "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE },
{ "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 },
{ "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS },
{ "grok-2", LLM_CHAT_TEMPLATE_GROK_2 },
};
llm_chat_template llm_chat_template_from_str(const std::string & name) {
@ -201,6 +203,10 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE;
} else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
return LLM_CHAT_TEMPLATE_KIMI_K2;
} else if (tmpl_contains("<seed:bos>")) {
return LLM_CHAT_TEMPLATE_SEED_OSS;
} else if (tmpl_contains("'Assistant: ' + message['content'] + '<|separator|>")) {
return LLM_CHAT_TEMPLATE_GROK_2;
}
return LLM_CHAT_TEMPLATE_UNKNOWN;
}
@ -752,6 +758,28 @@ int32_t llm_chat_apply_template(
if (add_ass) {
ss << "<|im_assistant|>assistant<|im_middle|>";
}
} else if (tmpl == LLM_CHAT_TEMPLATE_SEED_OSS) {
for (auto message: chat) {
std::string role(message->role);
ss << "<seed:bos>" << role << "\n" << (role == "assistant" ? trim(message->content) : message->content) << "<seed:eos>";
}
if (add_ass) {
ss << "<seed:bos>assistant\n";
}
} else if (tmpl == LLM_CHAT_TEMPLATE_GROK_2) {
for (auto message : chat) {
std::string role(message->role);
if (role == "system") {
ss << "System: " << trim(message->content) << "<|separator|>\n\n";
} else if (role == "user") {
ss << "Human: " << trim(message->content) << "<|separator|>\n\n";
} else if (role == "assistant") {
ss << "Assistant: " << message->content << "<|separator|>\n\n";
}
}
if (add_ass) {
ss << "Assistant:";
}
} else {
// template not supported
return -1;

View File

@ -49,6 +49,8 @@ enum llm_chat_template {
LLM_CHAT_TEMPLATE_OPENAI_MOE,
LLM_CHAT_TEMPLATE_HUNYUAN_DENSE,
LLM_CHAT_TEMPLATE_KIMI_K2,
LLM_CHAT_TEMPLATE_SEED_OSS,
LLM_CHAT_TEMPLATE_GROK_2,
LLM_CHAT_TEMPLATE_UNKNOWN,
};

View File

@ -35,14 +35,12 @@ llama_context::llama_context(
cparams.n_threads = params.n_threads;
cparams.n_threads_batch = params.n_threads_batch;
cparams.yarn_ext_factor = params.yarn_ext_factor;
cparams.yarn_attn_factor = params.yarn_attn_factor;
cparams.yarn_beta_fast = params.yarn_beta_fast;
cparams.yarn_beta_slow = params.yarn_beta_slow;
cparams.defrag_thold = params.defrag_thold;
cparams.yarn_ext_factor = params.yarn_ext_factor >= 0.0f ? params.yarn_ext_factor : hparams.yarn_ext_factor;
cparams.yarn_attn_factor = params.yarn_attn_factor >= 0.0f ? params.yarn_attn_factor : hparams.yarn_attn_factor;
cparams.yarn_beta_fast = params.yarn_beta_fast >= 0.0f ? params.yarn_beta_fast : hparams.yarn_beta_fast;
cparams.yarn_beta_slow = params.yarn_beta_slow >= 0.0f ? params.yarn_beta_slow : hparams.yarn_beta_slow;
cparams.embeddings = params.embeddings;
cparams.offload_kqv = params.offload_kqv;
cparams.flash_attn = params.flash_attn;
cparams.no_perf = params.no_perf;
cparams.pooling_type = params.pooling_type;
cparams.warmup = false;
@ -87,13 +85,15 @@ llama_context::llama_context(
cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL;
}
cparams.flash_attn = params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED;
// with causal attention, the batch size is limited by the context size
cparams.n_batch = cparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
// the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
// this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
// ref: https://github.com/ggerganov/llama.cpp/pull/5021
// TODO: this padding is not needed for the cache-less context so we should probably move it to llama_context_kv_self
// TODO: this padding is not needed for the cache-less context so we should probably move it to llama_memory
if (cparams.n_batch < GGML_KQ_MASK_PAD) {
LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
cparams.n_batch = GGML_KQ_MASK_PAD;
@ -103,16 +103,6 @@ llama_context::llama_context(
cparams.op_offload = params.op_offload;
cparams.kv_unified = params.kv_unified;
{
const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : supports_set_rows;
if (!supports_set_rows && !cparams.kv_unified) {
LLAMA_LOG_WARN("%s: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n", __func__);
cparams.kv_unified = true;
}
}
{
const char * LLAMA_GRAPH_REUSE_DISABLE = getenv("LLAMA_GRAPH_REUSE_DISABLE");
graph_reuse_disable = LLAMA_GRAPH_REUSE_DISABLE ? (atoi(LLAMA_GRAPH_REUSE_DISABLE) != 0) : graph_reuse_disable;
@ -130,7 +120,7 @@ llama_context::llama_context(
LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
LLAMA_LOG_INFO("%s: causal_attn = %d\n", __func__, cparams.causal_attn);
LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn);
LLAMA_LOG_INFO("%s: flash_attn = %s\n", __func__, llama_flash_attn_type_name(params.flash_attn_type));
LLAMA_LOG_INFO("%s: kv_unified = %s\n", __func__, cparams.kv_unified ? "true" : "false");
LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
@ -145,11 +135,6 @@ llama_context::llama_context(
__func__, n_ctx_per_seq, hparams.n_ctx_train);
}
if (!params.swa_full && cparams.n_seq_max > 1 && hparams.is_swa_any()) {
LLAMA_LOG_WARN("%s: requested n_seq_max (%u) > 1, but swa_full is not enabled -- performance may be degraded: %s\n",
__func__, cparams.n_seq_max, "https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573");
}
if (!hparams.vocab_only) {
// GPU backends
for (auto * dev : model.devices) {
@ -196,7 +181,7 @@ llama_context::llama_context(
// graph outputs buffer
{
// resized during inference when a batch uses more outputs
if ((uint32_t) output_reserve(params.n_seq_max) < params.n_seq_max) {
if (output_reserve(params.n_seq_max) < params.n_seq_max) {
throw std::runtime_error("failed to reserve initial output buffer");
}
@ -285,28 +270,75 @@ llama_context::llama_context(
}
}
// reserve worst-case graph
if (!hparams.vocab_only && memory) {
if (!hparams.vocab_only) {
llama_memory_context_ptr mctx;
if (memory) {
LLAMA_LOG_DEBUG("%s: reserving full memory module\n", __func__);
mctx = memory->init_full();
if (!mctx) {
throw std::runtime_error("failed to initialize memory module");
}
}
cross.v_embd.clear();
const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
// avoid reserving graphs with zero outputs - assume one output per sequence
n_outputs = n_seqs;
LLAMA_LOG_DEBUG("%s: worst-case: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);
// resolve automatic Flash Attention use
if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO) {
auto * gf = graph_reserve(1, n_seqs, n_outputs, mctx.get(), true);
if (!gf) {
throw std::runtime_error("failed to split graph for Flash Attention check");
}
const size_t prefix_len = strlen(LLAMA_TENSOR_NAME_FATTN) + 1;
bool fa_device_mismatch = false;
for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
ggml_tensor * n = ggml_graph_node(gf, i);
if (n->op != GGML_OP_FLASH_ATTN_EXT) {
continue;
}
ggml_backend_dev_t device_fa = ggml_backend_get_device(
ggml_backend_sched_get_tensor_backend(sched.get(), n));
// TODO: instead of the tensor names, use a map to keep track of which (FA) tensors belong to which layer
GGML_ASSERT(strncmp(n->name, LLAMA_TENSOR_NAME_FATTN "-", prefix_len) == 0);
const int il = std::stoi(n->name + prefix_len);
ggml_backend_dev_t device_kv = model.dev_layer(il);
if (device_fa != device_kv) {
LLAMA_LOG_WARN("%s: layer %d is assigned to device %s but the Flash Attention tensor "
"is assigned to device %s (usually due to missing support)\n",
__func__, il, ggml_backend_dev_name(device_kv), ggml_backend_dev_name(device_fa));
// FIXME: fa_device_mismatch logic is wrong for --no-kv-offload, but this is broken anyways
fa_device_mismatch = true;
break;
}
}
if (fa_device_mismatch) {
cparams.flash_attn = false;
LLAMA_LOG_WARN("%s: Flash Attention was auto, set to disabled\n", __func__);
if (ggml_is_quantized(params.type_v)) {
throw std::runtime_error("quantized V cache was requested, but this requires Flash Attention");
}
} else {
cparams.flash_attn = true;
LLAMA_LOG_INFO("%s: Flash Attention was auto, set to enabled\n", __func__);
}
}
// reserve worst-case graph
int n_splits_pp = -1;
int n_nodes_pp = -1;
int n_splits_tg = -1;
int n_nodes_tg = -1;
// simulate full KV cache
const auto mctx = memory->init_full();
if (!mctx) {
throw std::runtime_error("failed to initialize KV cache");
}
cross.v_embd.clear();
// reserve pp (prompt processing) graph first so that buffers are only allocated once
{
auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
@ -444,26 +476,12 @@ llama_memory_t llama_context::get_memory() const {
return memory.get();
}
// deprecated
void llama_context::kv_self_defrag_sched() {
if (!memory) {
return;
}
memory_force_optimize = true;
}
// deprecated
bool llama_context::kv_self_update(bool optimize) {
bool llama_context::memory_update(bool optimize) {
if (!memory) {
return false;
}
{
// TODO: remove in the future
optimize |= memory_force_optimize;
memory_force_optimize = false;
const auto mctx = memory->init_update(this, optimize);
switch (mctx->get_status()) {
case LLAMA_MEMORY_STATUS_SUCCESS:
@ -908,12 +926,6 @@ int llama_context::encode(const llama_batch & batch_inp) {
}
}
if (!supports_set_rows) {
// Reset state for the next token before backend sync, to allow the CPU activities in the reset to
// overlap with device computation.
ggml_backend_sched_reset(sched.get());
}
// TODO: hacky solution
if (model.arch == LLM_ARCH_T5 && t_embd) {
//cross.t_embd = t_embd;
@ -996,8 +1008,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
bool did_optimize = false;
// handle any pending defrags/shifts
kv_self_update(false);
// handle any pending shifts/copies
memory_update(false);
llama_memory_context_ptr mctx;
@ -1022,7 +1034,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
if (!did_optimize) {
did_optimize = true;
if (kv_self_update(true)) {
if (memory_update(true)) {
LLAMA_LOG_DEBUG("%s: retrying batch size %d after cache optimization\n", __func__, balloc->get_n_tokens());
continue;
@ -1075,7 +1087,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status);
if (!res) {
// the last ubatch failed or was aborted -> remove all positions of that ubatch from the KV cache
// the last ubatch failed or was aborted -> remove all positions of that ubatch from the memory module
llama_pos pos_min[LLAMA_MAX_SEQ];
for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
pos_min[s] = std::numeric_limits<llama_pos>::max();
@ -1092,7 +1104,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
continue;
}
LLAMA_LOG_WARN("%s: removing KV cache entries for seq_id = %d, pos = [%d, +inf)\n", __func__, s, pos_min[s]);
LLAMA_LOG_WARN("%s: removing memory module entries for seq_id = %d, pos = [%d, +inf)\n", __func__, s, pos_min[s]);
memory->seq_rm(s, pos_min[s], -1);
}
@ -1243,12 +1255,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
// wait for the computation to finish (automatically done when obtaining the model output)
//synchronize();
if (!supports_set_rows) {
// Reset state for the next token before backend sync, to allow the CPU activities in the reset to
// overlap with device computation.
ggml_backend_sched_reset(sched.get());
}
return 0;
}
@ -1362,8 +1368,9 @@ llm_graph_result * llama_context::get_gf_res_reserve() const {
return static_cast<llm_graph_result *>(gf_res_reserve.get());
}
ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx) {
ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only) {
LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs);
GGML_ASSERT(n_outputs >= 1);
if (n_tokens % n_seqs != 0) {
n_tokens = ((n_tokens + (n_seqs - 1)) / n_seqs) * n_seqs; // round to next multiple of n_seqs
@ -1397,7 +1404,9 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
this->n_outputs = save_n_outputs;
// initialize scheduler with the specified graph
if (!ggml_backend_sched_reserve(sched.get(), gf)) {
if (split_only) {
ggml_backend_sched_split_graph(sched.get(), gf);
} else if (!ggml_backend_sched_reserve(sched.get(), gf)) {
LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
return nullptr;
}
@ -1437,8 +1446,10 @@ ggml_status llama_context::graph_compute(
if (backend_cpu != nullptr) {
auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_cpu));
auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool");
if (set_threadpool_fn) {
set_threadpool_fn(backend_cpu, tp);
}
}
// set the number of threads for all the backends
for (const auto & set_n_threads_fn : set_n_threads_fns) {
@ -1656,30 +1667,30 @@ size_t llama_context::state_set_data(const uint8_t * src, size_t size) {
}
}
size_t llama_context::state_seq_get_size(llama_seq_id seq_id) {
size_t llama_context::state_seq_get_size(llama_seq_id seq_id, llama_state_seq_flags flags) {
llama_io_write_dummy io;
try {
return state_seq_write_data(io, seq_id);
return state_seq_write_data(io, seq_id, flags);
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what());
return 0;
}
}
size_t llama_context::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) {
size_t llama_context::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size, llama_state_seq_flags flags) {
llama_io_write_buffer io(dst, size);
try {
return state_seq_write_data(io, seq_id);
return state_seq_write_data(io, seq_id, flags);
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what());
return 0;
}
}
size_t llama_context::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) {
size_t llama_context::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size, llama_state_seq_flags flags) {
llama_io_read_buffer io(src, size);
try {
return state_seq_read_data(io, seq_id);
return state_seq_read_data(io, seq_id, flags);
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what());
return 0;
@ -1777,7 +1788,7 @@ size_t llama_context::state_seq_load_file(llama_seq_id seq_id, const char * file
{
const size_t state_size = file.size() - file.tell();
llama_io_read_file io(&file);
const size_t nread = state_seq_read_data(io, seq_id);
const size_t nread = state_seq_read_data(io, seq_id, 0);
if (!nread) {
LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__);
return 0;
@ -1801,7 +1812,7 @@ size_t llama_context::state_seq_save_file(llama_seq_id seq_id, const char * file
// save the context state using stream saving
llama_io_write_file io(&file);
state_seq_write_data(io, seq_id);
state_seq_write_data(io, seq_id, 0);
const size_t res = file.tell();
GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + io.n_bytes());
@ -1876,7 +1887,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
}
if (memory != nullptr) {
LLAMA_LOG_DEBUG("%s: - writing KV self\n", __func__);
LLAMA_LOG_DEBUG("%s: - writing memory module\n", __func__);
memory->state_write(io);
}
@ -1962,7 +1973,7 @@ size_t llama_context::state_read_data(llama_io_read_i & io) {
}
if (memory) {
LLAMA_LOG_DEBUG("%s: - reading KV self\n", __func__);
LLAMA_LOG_DEBUG("%s: - reading memory module\n", __func__);
memory->state_read(io);
}
@ -1970,21 +1981,21 @@ size_t llama_context::state_read_data(llama_io_read_i & io) {
return io.n_bytes();
}
size_t llama_context::state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id) {
size_t llama_context::state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
GGML_UNUSED(seq_id);
if (memory) {
memory->state_write(io, seq_id);
memory->state_write(io, seq_id, flags);
}
return io.n_bytes();
}
size_t llama_context::state_seq_read_data(llama_io_read_i & io, llama_seq_id seq_id) {
size_t llama_context::state_seq_read_data(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
GGML_UNUSED(seq_id);
if (memory) {
memory->state_read(io, seq_id);
memory->state_read(io, seq_id, flags);
}
return io.n_bytes();
@ -2015,6 +2026,21 @@ void llama_context::perf_reset() {
n_reused = 0;
}
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> llama_context::memory_breakdown() const {
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> ret;
for (const auto & buft_size : model.memory_breakdown()) {
ret[buft_size.first].model += buft_size.second;
}
for (const auto & buft_size : memory->memory_breakdown()) {
ret[buft_size.first].context += buft_size.second;
}
for (const auto & backend_ptr : backends) {
ggml_backend_t backend = backend_ptr.get();
ret[ggml_backend_sched_get_buffer_type(sched.get(), backend)].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend);
}
return ret;
}
//
// training
//
@ -2047,7 +2073,7 @@ void llama_context::opt_init(struct llama_model * model, struct llama_opt_params
opt_params.opt_period = n_batch / n_ubatch;
opt_params.get_opt_pars = lopt_params.get_opt_pars;
opt_params.get_opt_pars_ud = lopt_params.get_opt_pars_ud;
opt_params.optimizer = lopt_params.optimizer_type;
opt_ctx = ggml_opt_init(opt_params);
llama_opt_param_filter param_filter = lopt_params.param_filter;
@ -2247,12 +2273,13 @@ llama_context_params llama_context_default_params() {
/*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
/*.pooling_type =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
/*.attention_type =*/ LLAMA_ATTENTION_TYPE_UNSPECIFIED,
/*.flash_attn_type =*/ LLAMA_FLASH_ATTN_TYPE_AUTO,
/*.rope_freq_base =*/ 0.0f,
/*.rope_freq_scale =*/ 0.0f,
/*.yarn_ext_factor =*/ -1.0f,
/*.yarn_attn_factor =*/ 1.0f,
/*.yarn_beta_fast =*/ 32.0f,
/*.yarn_beta_slow =*/ 1.0f,
/*.yarn_attn_factor =*/ -1.0f,
/*.yarn_beta_fast =*/ -1.0f,
/*.yarn_beta_slow =*/ -1.0f,
/*.yarn_orig_ctx =*/ 0,
/*.defrag_thold =*/ -1.0f,
/*.cb_eval =*/ nullptr,
@ -2263,7 +2290,6 @@ llama_context_params llama_context_default_params() {
/*.abort_callback_data =*/ nullptr,
/*.embeddings =*/ false,
/*.offload_kqv =*/ true,
/*.flash_attn =*/ false,
/*.no_perf =*/ true,
/*.op_offload =*/ true,
/*.swa_full =*/ true,
@ -2291,12 +2317,30 @@ llama_context * llama_init_from_model(
return nullptr;
}
if (params.flash_attn && model->arch == LLM_ARCH_GROK) {
if (params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED && model->arch == LLM_ARCH_GROK) {
LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
params.flash_attn = false;
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
}
if (ggml_is_quantized(params.type_v) && !params.flash_attn) {
if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO && ggml_is_quantized(params.type_k)) {
const uint32_t blck_size = ggml_blck_size(params.type_k);
if (model->hparams.n_embd_head_k % blck_size != 0) {
LLAMA_LOG_ERROR("%s: K cache type %s with block size %u does not divide n_embd_head_k=%u\n",
__func__, ggml_type_name(params.type_k), blck_size, model->hparams.n_embd_head_k);
return nullptr;
}
}
if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO && ggml_is_quantized(params.type_v)) {
const uint32_t blck_size = ggml_blck_size(params.type_v);
if (model->hparams.n_embd_head_v % blck_size != 0) {
LLAMA_LOG_ERROR("%s: V cache type %s with block size %u does not divide n_embd_head_k=%u\n",
__func__, ggml_type_name(params.type_v), blck_size, model->hparams.n_embd_head_v);
return nullptr;
}
}
if (ggml_is_quantized(params.type_v) && params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_DISABLED) {
LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
return nullptr;
}
@ -2342,16 +2386,6 @@ const llama_model * llama_get_model(const llama_context * ctx) {
return &ctx->get_model();
}
// deprecated
llama_kv_cache * llama_get_kv_self(llama_context * ctx) {
return dynamic_cast<llama_kv_cache *>(ctx->get_memory());
}
// deprecated
void llama_kv_self_update(llama_context * ctx) {
ctx->kv_self_update(false);
}
enum llama_pooling_type llama_pooling_type(const llama_context * ctx) {
return ctx->pooling_type();
}
@ -2569,168 +2603,6 @@ bool llama_memory_can_shift(llama_memory_t mem) {
return mem->get_can_shift();
}
//
// kv cache
//
// deprecated
int32_t llama_kv_self_n_tokens(const llama_context * ctx) {
const auto * kv = llama_get_memory(ctx);
if (!kv) {
return 0;
}
int32_t res = 0;
for (uint32_t s = 0; s < ctx->get_cparams().n_seq_max; s++) {
const llama_pos p0 = kv->seq_pos_min(s);
const llama_pos p1 = kv->seq_pos_max(s);
if (p0 >= 0) {
res += (p1 - p0) + 1;
}
}
return res;
}
// deprecated
// note: this is the same as above - will be removed anyway, so it's ok
int32_t llama_kv_self_used_cells(const llama_context * ctx) {
const auto * kv = llama_get_memory(ctx);
if (!kv) {
return 0;
}
int32_t res = 0;
for (uint32_t s = 0; s < ctx->get_cparams().n_seq_max; s++) {
const llama_pos p0 = kv->seq_pos_min(s);
const llama_pos p1 = kv->seq_pos_max(s);
if (p0 >= 0) {
res += (p1 - p0) + 1;
}
}
return res;
}
// deprecated
void llama_kv_self_clear(llama_context * ctx) {
auto * kv = llama_get_memory(ctx);
if (!kv) {
return;
}
llama_memory_clear(kv, true);
}
// deprecated
bool llama_kv_self_seq_rm(
llama_context * ctx,
llama_seq_id seq_id,
llama_pos p0,
llama_pos p1) {
auto * kv = llama_get_memory(ctx);
if (!kv) {
return true;
}
return llama_memory_seq_rm(kv, seq_id, p0, p1);
}
// deprecated
void llama_kv_self_seq_cp(
llama_context * ctx,
llama_seq_id seq_id_src,
llama_seq_id seq_id_dst,
llama_pos p0,
llama_pos p1) {
auto * kv = llama_get_memory(ctx);
if (!kv) {
return;
}
llama_memory_seq_cp(kv, seq_id_src, seq_id_dst, p0, p1);
}
// deprecated
void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) {
auto * kv = llama_get_memory(ctx);
if (!kv) {
return;
}
llama_memory_seq_keep(kv, seq_id);
}
// deprecated
void llama_kv_self_seq_add(
llama_context * ctx,
llama_seq_id seq_id,
llama_pos p0,
llama_pos p1,
llama_pos delta) {
auto * kv = llama_get_memory(ctx);
if (!kv) {
return;
}
llama_memory_seq_add(kv, seq_id, p0, p1, delta);
}
// deprecated
void llama_kv_self_seq_div(
llama_context * ctx,
llama_seq_id seq_id,
llama_pos p0,
llama_pos p1,
int d) {
auto * kv = llama_get_memory(ctx);
if (!kv) {
return;
}
llama_memory_seq_div(kv, seq_id, p0, p1, d);
}
// deprecated
llama_pos llama_kv_self_seq_pos_min(llama_context * ctx, llama_seq_id seq_id) {
auto * kv = llama_get_memory(ctx);
if (!kv) {
return -1;
}
return llama_memory_seq_pos_min(kv, seq_id);
}
// deprecated
llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) {
auto * kv = llama_get_memory(ctx);
if (!kv) {
return -1;
}
return llama_memory_seq_pos_max(kv, seq_id);
}
// deprecated
void llama_kv_self_defrag(llama_context * ctx) {
// force defrag
ctx->kv_self_defrag_sched();
}
// deprecated
bool llama_kv_self_can_shift(const llama_context * ctx) {
auto * kv = llama_get_memory(ctx);
if (!kv) {
return false;
}
return llama_memory_can_shift(kv);
}
// llama state API
// deprecated
@ -2800,19 +2672,31 @@ bool llama_state_save_file(llama_context * ctx, const char * path_session, const
}
size_t llama_state_seq_get_size(llama_context * ctx, llama_seq_id seq_id) {
return ctx->state_seq_get_size(seq_id);
return llama_state_seq_get_size_ext(ctx, seq_id, 0);
}
size_t llama_state_seq_get_data(llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id) {
ctx->synchronize();
return ctx->state_seq_get_data(seq_id, dst, size);
return llama_state_seq_get_data_ext(ctx, dst, size, seq_id, 0);
}
size_t llama_state_seq_set_data(llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id seq_id) {
return llama_state_seq_set_data_ext(ctx, src, size, seq_id, 0);
}
size_t llama_state_seq_get_size_ext(llama_context * ctx, llama_seq_id seq_id, llama_state_seq_flags flags) {
return ctx->state_seq_get_size(seq_id, flags);
}
size_t llama_state_seq_get_data_ext(llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id, llama_state_seq_flags flags) {
ctx->synchronize();
return ctx->state_seq_set_data(seq_id, src, size);
return ctx->state_seq_get_data(seq_id, dst, size, flags);
}
size_t llama_state_seq_set_data_ext(llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id seq_id, llama_state_seq_flags flags) {
ctx->synchronize();
return ctx->state_seq_set_data(seq_id, src, size, flags);
}
size_t llama_state_seq_save_file(llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
@ -2895,6 +2779,142 @@ void llama_perf_context_reset(llama_context * ctx) {
ctx->perf_reset();
}
void llama_memory_breakdown_print(const struct llama_context * ctx) {
const std::vector<ggml_backend_dev_t> & devices = ctx->get_model().devices;
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();
std::vector<std::array<std::string, 9>> table_data;
table_data.reserve(devices.size());
const std::string template_header = "%s: | %s | %s %s %s %s %s %s %s |\n";
const std::string template_gpu = "%s: | %s | %s = %s + (%s = %s + %s + %s) + %s |\n";
const std::string template_other = "%s: | %s | %s %s %s = %s + %s + %s %s |\n";
table_data.push_back({template_header, "memory breakdown [MiB]", "total", "free", "self", "model", "context", "compute", "unaccounted"});
constexpr size_t MiB = 1024 * 1024;
const std::vector<std::string> desc_prefixes_strip = {"NVIDIA ", "GeForce ", "Tesla ", "AMD ", "Radeon ", "Instinct "};
// track seen buffer types to avoid double counting:
std::set<ggml_backend_buffer_type_t> seen_buffer_types;
// accumulative memory breakdown for each device and for host:
std::vector<llama_memory_breakdown_data> mb_dev(devices.size());
llama_memory_breakdown_data mb_host;
for (const auto & buft_mb : memory_breakdown) {
ggml_backend_buffer_type_t buft = buft_mb.first;
const llama_memory_breakdown_data & mb = buft_mb.second;
if (ggml_backend_buft_is_host(buft)) {
mb_host.model += mb.model;
mb_host.context += mb.context;
mb_host.compute += mb.compute;
seen_buffer_types.insert(buft);
continue;
}
ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
if (dev) {
int i_dev = -1;
for (size_t i = 0; i < devices.size(); i++) {
if (devices[i] == dev) {
i_dev = i;
break;
}
}
if (i_dev != -1) {
mb_dev[i_dev].model += mb.model;
mb_dev[i_dev].context += mb.context;
mb_dev[i_dev].compute += mb.compute;
seen_buffer_types.insert(buft);
continue;
}
}
}
// print memory breakdown for each device:
for (size_t i = 0; i < devices.size(); i++) {
ggml_backend_dev_t dev = devices[i];
llama_memory_breakdown_data mb = mb_dev[i];
const std::string name = ggml_backend_dev_name(dev);
std::string desc = ggml_backend_dev_description(dev);
for (const std::string & prefix : desc_prefixes_strip) {
if (desc.length() >= prefix.length() && desc.substr(0, prefix.length()) == prefix) {
desc = desc.substr(prefix.length());
}
}
size_t free, total;
ggml_backend_dev_memory(dev, &free, &total);
const size_t self = mb.model + mb.context + mb.compute;
const size_t unaccounted = total - self - free;
table_data.push_back({
template_gpu,
" - " + name + " (" + desc + ")",
std::to_string(total / MiB),
std::to_string(free / MiB),
std::to_string(self / MiB),
std::to_string(mb.model / MiB),
std::to_string(mb.context / MiB),
std::to_string(mb.compute / MiB),
std::to_string(unaccounted / MiB)});
}
// print memory breakdown for host:
{
const size_t self = mb_host.model + mb_host.context + mb_host.compute;
table_data.push_back({
template_other,
" - Host",
"", // total
"", // free
std::to_string(self / MiB),
std::to_string(mb_host.model / MiB),
std::to_string(mb_host.context / MiB),
std::to_string(mb_host.compute / MiB),
""}); // unaccounted
}
// print memory breakdown for all remaining buffer types:
for (const auto & buft_mb : memory_breakdown) {
ggml_backend_buffer_type_t buft = buft_mb.first;
const llama_memory_breakdown_data & mb = buft_mb.second;
if (seen_buffer_types.count(buft) == 1) {
continue;
}
const std::string name = ggml_backend_buft_name(buft);
const size_t self = mb.model + mb.context + mb.compute;
table_data.push_back({
template_other,
" - " + name,
"", // total
"", // free
std::to_string(self / MiB),
std::to_string(mb.model / MiB),
std::to_string(mb.context / MiB),
std::to_string(mb.compute / MiB),
""}); // unaccounted
seen_buffer_types.insert(buft);
}
for (size_t j = 1; j < table_data[0].size(); j++) {
size_t max_len = 0;
for (const auto & td : table_data) {
max_len = std::max(max_len, td[j].length());
}
for (auto & td : table_data) {
td[j].insert(j == 1 ? td[j].length() : 0, max_len - td[j].length(), ' ');
}
}
for (const auto & td : table_data) {
LLAMA_LOG_INFO(td[0].c_str(),
__func__, td[1].c_str(), td[2].c_str(), td[3].c_str(), td[4].c_str(), td[5].c_str(),
td[6].c_str(), td[7].c_str(), td[8].c_str());
}
}
//
// training
//

View File

@ -17,9 +17,17 @@ class llama_batch_allocr;
class llama_io_read_i;
class llama_io_write_i;
// "memory" as in abstract memory for the context
struct llama_memory_i;
struct llama_memory_context_i;
// "memory" as in physical memory for a buffer type, in bytes
struct llama_memory_breakdown_data {
size_t model = 0; // memory allocated for the model
size_t context = 0; // memory allocated for the context
size_t compute = 0; // memory allocated for temporary compute buffers
};
struct llama_context {
// init scheduler and compute buffers, reserve worst-case graphs
llama_context(
@ -46,10 +54,8 @@ struct llama_context {
llama_memory_t get_memory() const;
// return true of the KV cache was updated
// TODO: remove
bool kv_self_update(bool optimize);
void kv_self_defrag_sched();
// return true if the memory was updated
bool memory_update(bool optimize);
enum llama_pooling_type pooling_type() const;
@ -111,9 +117,9 @@ struct llama_context {
size_t state_get_data( uint8_t * dst, size_t size);
size_t state_set_data(const uint8_t * src, size_t size);
size_t state_seq_get_size(llama_seq_id seq_id);
size_t state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size);
size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size);
size_t state_seq_get_size(llama_seq_id seq_id, llama_state_seq_flags flags);
size_t state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size, llama_state_seq_flags flags);
size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size, llama_state_seq_flags flags);
bool state_load_file(
const char * filepath,
@ -146,12 +152,15 @@ struct llama_context {
llama_perf_context_data perf_get_data() const;
void perf_reset();
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown() const;
//
// training
//
void opt_init(struct llama_model * model, struct llama_opt_params lopt_params);
// TODO: more flexible combinations of logical/physical batch size and context size
void opt_epoch(
ggml_opt_dataset_t dataset,
ggml_opt_result_t result_train,
@ -197,7 +206,7 @@ public:
ggml_status graph_compute(ggml_cgraph * gf, bool batched);
// reserve a graph with a dummy ubatch of the specified size
ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx);
ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false);
private:
llm_graph_params graph_params(
@ -212,8 +221,8 @@ private:
size_t state_write_data(llama_io_write_i & io);
size_t state_read_data (llama_io_read_i & io);
size_t state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id);
size_t state_seq_read_data (llama_io_read_i & io, llama_seq_id seq_id);
size_t state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags);
size_t state_seq_read_data (llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags);
//
// members
@ -229,9 +238,6 @@ private:
std::unique_ptr<llama_memory_i> memory;
// TODO: temporary, until the llama_kv_self_defrag() API is removed
bool memory_force_optimize = false;
// decode output (2-dimensional array: [n_outputs][n_vocab])
size_t logits_size = 0; // capacity (of floats) for logits
float * logits = nullptr;
@ -287,10 +293,6 @@ private:
bool has_evaluated_once = false;
// env: LLAMA_SET_ROWS (temporary)
// ref: https://github.com/ggml-org/llama.cpp/pull/14285
bool supports_set_rows = true;
// env: LLAMA_GRAPH_REUSE_DISABLE
bool graph_reuse_disable = false;

View File

@ -4,7 +4,7 @@
#include <cstdint>
#define LLAMA_MAX_SEQ 64
#define LLAMA_MAX_SEQ 256
struct llama_cparams {
uint32_t n_ctx; // context size used during inference
@ -24,7 +24,6 @@ struct llama_cparams {
float yarn_attn_factor;
float yarn_beta_fast;
float yarn_beta_slow;
float defrag_thold;
bool embeddings;
bool causal_attn;

View File

@ -4,8 +4,8 @@
#include "llama-batch.h"
#include "llama-cparams.h"
#include "llama-kv-cache-unified.h"
#include "llama-kv-cache-unified-iswa.h"
#include "llama-kv-cache.h"
#include "llama-kv-cache-iswa.h"
#include "llama-memory-hybrid.h"
#include "llama-memory-recurrent.h"
@ -204,7 +204,10 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
std::vector<int> target_pos(n_seqs_unq, -1);
std::vector<int> target_row(n_seqs_unq, -1);
bool last = cparams.pooling_type == LLAMA_POOLING_TYPE_LAST;
const bool last = (
cparams.pooling_type == LLAMA_POOLING_TYPE_LAST ||
(cparams.pooling_type == LLAMA_POOLING_TYPE_RANK && arch == LLM_ARCH_QWEN3) // qwen3 reranking & embedding models use last token
);
for (int i = 0; i < n_tokens; ++i) {
const llama_pos pos = ubatch->pos[i];
@ -258,6 +261,36 @@ void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
}
}
static void print_mask(float * data, int64_t n_tokens, int64_t n_kv, int64_t n_swa, llama_swa_type swa_type) {
LLAMA_LOG_DEBUG("%s: === Attention mask ===\n", __func__);
const char * swa_type_str = (swa_type == LLAMA_SWA_TYPE_NONE) ? "LLAMA_SWA_TYPE_NONE" :
(swa_type == LLAMA_SWA_TYPE_STANDARD) ? "LLAMA_SWA_TYPE_STANDARD" :
(swa_type == LLAMA_SWA_TYPE_CHUNKED) ? "LLAMA_SWA_TYPE_CHUNKED" :
(swa_type == LLAMA_SWA_TYPE_SYMMETRIC) ? "LLAMA_SWA_TYPE_SYMMETRIC" : "unknown";
LLAMA_LOG_DEBUG("%s: n_swa : %d, n_kv: %d, swq_type: %s\n", __func__, (int)n_swa, (int)n_kv, swa_type_str);
LLAMA_LOG_DEBUG("%s: '0' = can attend, '∞' = masked\n", __func__);
LLAMA_LOG_DEBUG("%s: Rows = query tokens, Columns = key/value tokens\n\n", __func__);
LLAMA_LOG_DEBUG(" ");
for (int j = 0; j < std::min((int64_t)20, n_kv); ++j) {
LLAMA_LOG_DEBUG("%2d", j);
}
LLAMA_LOG_DEBUG("\n");
for (int i = 0; i < std::min((int64_t)20, n_tokens); ++i) {
LLAMA_LOG_DEBUG(" %2d ", i);
for (int j = 0; j < std::min((int64_t)20, n_kv); ++j) {
float val = data[i * n_kv + j];
if (val == -INFINITY) {
LLAMA_LOG_DEBUG("");
} else {
LLAMA_LOG_DEBUG(" 0");
}
}
LLAMA_LOG_DEBUG("\n");
}
}
void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
const int64_t n_kv = ubatch->n_tokens;
const int64_t n_tokens = ubatch->n_tokens;
@ -267,6 +300,9 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
float * data = (float *) kq_mask->data;
// [TAG_NO_CACHE_ISWA]
GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "TODO: implement");
for (int h = 0; h < 1; ++h) {
for (int i1 = 0; i1 < n_tokens; ++i1) {
const llama_seq_id s1 = ubatch->seq_id[i1][0];
@ -277,32 +313,44 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
for (int s = 0; s < ubatch->n_seq_id[i0]; ++s) {
const llama_seq_id s0 = ubatch->seq_id[i0][0];
if (s0 != s1) {
continue; // skip different sequences
}
if (cparams.causal_attn && ubatch->pos[i0] > ubatch->pos[i1]) {
continue; // skip future tokens for causal attention
}
// TODO: this does not take into account that some layers are SWA and others are note (i.e. iSWA) [TAG_NO_CACHE_ISWA]
//if (hparams.is_masked_swa(ubatch->pos[i0], ubatch->pos[i1])) {
// continue; // skip masked tokens for SWA
//}
// TODO: reimplement this like in llama_kv_cache_unified
if (s0 == s1 && (!cparams.causal_attn || ubatch->pos[i0] <= ubatch->pos[i1])) {
if (hparams.use_alibi) {
f = -std::abs(ubatch->pos[i0] - ubatch->pos[i1]);
} else {
f = 0.0f;
}
break;
}
}
data[h*(n_kv*n_tokens) + i1*n_kv + i0] = f;
}
}
}
if (debug) {
print_mask(data, n_tokens, n_kv, hparams.n_swa, hparams.swa_type);
}
}
void llm_graph_input_attn_kv_unified::set_input(const llama_ubatch * ubatch) {
void llm_graph_input_attn_kv::set_input(const llama_ubatch * ubatch) {
mctx->set_input_k_idxs(self_k_idxs, ubatch);
mctx->set_input_v_idxs(self_v_idxs, ubatch);
mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
}
bool llm_graph_input_attn_kv_unified::can_reuse(const llm_graph_params & params) {
const auto * mctx = static_cast<const llama_kv_cache_unified_context *>(params.mctx);
bool llm_graph_input_attn_kv::can_reuse(const llm_graph_params & params) {
const auto * mctx = static_cast<const llama_kv_cache_context *>(params.mctx);
this->mctx = mctx;
@ -314,12 +362,10 @@ bool llm_graph_input_attn_kv_unified::can_reuse(const llm_graph_params & params)
res &= self_kq_mask->ne[0] == mctx->get_n_kv();
res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
res &= mctx->get_supports_set_rows(); // TODO: tmp
return res;
}
void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch) {
void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) {
mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch);
mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch);
@ -331,8 +377,8 @@ void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch
mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
}
bool llm_graph_input_attn_kv_unified_iswa::can_reuse(const llm_graph_params & params) {
const auto * mctx = static_cast<const llama_kv_cache_unified_iswa_context *>(params.mctx);
bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
const auto * mctx = static_cast<const llama_kv_cache_iswa_context *>(params.mctx);
this->mctx = mctx;
@ -350,8 +396,6 @@ bool llm_graph_input_attn_kv_unified_iswa::can_reuse(const llm_graph_params & pa
res &= self_kq_mask_swa->ne[0] == mctx->get_swa()->get_n_kv();
res &= self_kq_mask_swa->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
res &= mctx->get_base()->get_supports_set_rows(); // TODO: tmp
return res;
}
@ -879,15 +923,29 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
selection_probs = logits;
}
if (arch == LLM_ARCH_GROVEMOE) {
selection_probs = ggml_sigmoid(ctx0, logits); // [n_expert, n_tokens]
cb(selection_probs, "ffn_moe_probs_biased", il);
}
// select experts
ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
cb(selected_experts->src[0], "ffn_moe_argsort", il);
cb(selected_experts, "ffn_moe_topk", il);
ggml_tensor * weights = ggml_get_rows(ctx0,
ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
if (arch == LLM_ARCH_GROVEMOE && n_expert != hparams.n_expert) {
// TODO: Use scalar div instead when/if implemented
ggml_tensor * f_sel = ggml_cast(ctx0, selected_experts, GGML_TYPE_F32);
selected_experts = ggml_cast(ctx0, ggml_scale(ctx0, f_sel, 1.0f / float(hparams.n_group_experts)), GGML_TYPE_I32);
probs = ggml_reshape_3d(ctx0, probs, 1, hparams.n_expert, n_tokens);
} else {
probs = ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens);
}
ggml_tensor * weights = ggml_get_rows(ctx0, probs, selected_experts); // [1, n_expert_used, n_tokens]
cb(weights, "ffn_moe_weights", il);
if (gating_op == LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT) {
weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);
weights = ggml_soft_max(ctx0, weights); // [n_expert_used, n_tokens]
@ -911,6 +969,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
cb(weights, "ffn_moe_weights_scaled", il);
}
//call early so that topk-moe can be used
ggml_build_forward_expand(gf, weights);
cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);
if (weight_before_ffn) {
@ -1136,7 +1197,7 @@ ggml_tensor * llm_graph_context::build_inp_mean() const {
}
ggml_tensor * llm_graph_context::build_inp_cls() const {
auto inp = std::make_unique<llm_graph_input_cls>(cparams);
auto inp = std::make_unique<llm_graph_input_cls>(cparams, arch);
auto & cur = inp->cls;
@ -1186,7 +1247,7 @@ ggml_tensor * llm_graph_context::build_inp_pos_bucket_enc() const {
}
ggml_tensor * llm_graph_context::build_inp_pos_bucket_dec() const {
const auto * mctx_cur = static_cast<const llama_kv_cache_unified_context *>(mctx);
const auto * mctx_cur = static_cast<const llama_kv_cache_context *>(mctx);
auto inp = std::make_unique<llm_graph_input_pos_bucket_kv>(hparams, mctx_cur);
@ -1223,15 +1284,16 @@ ggml_tensor * llm_graph_context::build_attn_mha(
ggml_tensor * v,
ggml_tensor * kq_b,
ggml_tensor * kq_mask,
ggml_tensor * v_mla,
ggml_tensor * sinks,
float kq_scale) const {
ggml_tensor * v_mla,
float kq_scale,
int il) const {
const bool v_trans = v->nb[1] > v->nb[2];
// split the batch into streams if needed
const auto n_stream = k->ne[3];
q = ggml_reshape_4d(ctx0, q, q->ne[0], q->ne[1], q->ne[2]/n_stream, n_stream);
q = ggml_view_4d(ctx0, q, q->ne[0], q->ne[1], q->ne[2]/n_stream, n_stream, q->nb[1], q->nb[2], q->nb[3]/n_stream, 0);
q = ggml_permute(ctx0, q, 0, 2, 1, 3);
k = ggml_permute(ctx0, k, 0, 2, 1, 3);
@ -1260,6 +1322,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias,
hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
cb(cur, LLAMA_TENSOR_NAME_FATTN, il);
ggml_flash_attn_ext_add_sinks(cur, sinks);
ggml_flash_attn_ext_set_prec (cur, GGML_PREC_F32);
@ -1275,6 +1338,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
// The permutations are noops and only change how the tensor data is interpreted.
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
cur = ggml_mul_mat(ctx0, v_mla, cur);
cb(cur, "fattn_mla", il);
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
cur = ggml_cont(ctx0, cur); // Needed because ggml_reshape_2d expects contiguous inputs.
#endif
@ -1283,6 +1347,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*cur->ne[1], cur->ne[2]*cur->ne[3]);
} else {
ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
cb(kq, "kq", il);
// note: this op tends to require high floating point range
// while for some models F16 is enough, for others it is not, so we default to F32 here
@ -1290,38 +1355,48 @@ ggml_tensor * llm_graph_context::build_attn_mha(
if (arch == LLM_ARCH_GROK) {
// need to do the following:
// multiply by attn_output_multiplyer of 0.08838834764831845
// multiply by attn_output_multiplier
// and then :
// kq = 30 * tanh(kq / 30)
// before the softmax below
kq = ggml_tanh(ctx0, ggml_scale(ctx0, kq, 0.08838834764831845f/30.0f));
kq = ggml_scale(ctx0, kq, 30);
kq = ggml_tanh(ctx0, ggml_scale(ctx0, kq, hparams.f_attn_out_scale / hparams.f_attn_logit_softcapping));
cb(kq, "kq_tanh", il);
kq = ggml_scale(ctx0, kq, hparams.f_attn_logit_softcapping);
cb(kq, "kq_scaled", il);
}
if (hparams.attn_soft_cap) {
kq = ggml_scale(ctx0, kq, 1.0f / hparams.f_attn_logit_softcapping);
cb(kq, "kq_scaled_1", il);
kq = ggml_tanh (ctx0, kq);
cb(kq, "kq_tanh", il);
kq = ggml_scale(ctx0, kq, hparams.f_attn_logit_softcapping);
cb(kq, "kq_scaled_2", il);
}
if (kq_b) {
kq = ggml_add(ctx0, kq, kq_b);
cb(kq, "kq_plus_kq_b", il);
}
kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
ggml_soft_max_add_sinks(kq, sinks);
cb(kq, "kq_soft_max", il);
if (!v_trans) {
// note: avoid this branch
v = ggml_cont(ctx0, ggml_transpose(ctx0, v));
cb(v, "v_cont", il);
}
ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);
cb(kqv, "kqv", il);
// for MLA with the absorption optimization, we need to "decompress" from MQA back to MHA
if (v_mla) {
kqv = ggml_mul_mat(ctx0, v_mla, kqv);
cb(kqv, "kqv_mla", il);
}
cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
@ -1360,6 +1435,7 @@ ggml_tensor * llm_graph_context::build_attn(
ggml_tensor * k_cur,
ggml_tensor * v_cur,
ggml_tensor * kq_b,
ggml_tensor * sinks,
ggml_tensor * v_mla,
float kq_scale,
int il) const {
@ -1375,13 +1451,14 @@ ggml_tensor * llm_graph_context::build_attn(
// [TAG_NO_CACHE_PAD]
// TODO: if ubatch.equal_seqs() == true, we can split the three tensors below into ubatch.n_seqs_unq streams
assert(!ubatch.equal_seqs());
// but it might not be worth it: https://github.com/ggml-org/llama.cpp/pull/15636
//assert(!ubatch.equal_seqs() || (k_cur->ne[3] == 1 && k_cur->ne[3] == ubatch.n_seqs_unq));
ggml_tensor * q = q_cur;
ggml_tensor * k = k_cur;
ggml_tensor * v = v_cur;
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, nullptr, kq_scale);
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
cb(cur, "kqv_out", il);
if (wo) {
@ -1399,17 +1476,17 @@ ggml_tensor * llm_graph_context::build_attn(
return cur;
}
static std::unique_ptr<llm_graph_input_attn_kv_unified> build_attn_inp_kv_unified_impl(
static std::unique_ptr<llm_graph_input_attn_kv> build_attn_inp_kv_impl(
ggml_context * ctx0,
const llama_ubatch & ubatch,
const llama_hparams & hparams,
const llama_cparams & cparams,
const llama_kv_cache_unified_context * mctx_cur) {
const llama_kv_cache_context * mctx_cur) {
auto inp = std::make_unique<llm_graph_input_attn_kv_unified>(hparams, cparams, mctx_cur);
auto inp = std::make_unique<llm_graph_input_attn_kv>(hparams, cparams, mctx_cur);
{
GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_unified_iswa for SWA");
GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_iswa for SWA");
const auto n_kv = mctx_cur->get_n_kv();
const auto n_tokens = ubatch.n_tokens;
@ -1427,22 +1504,23 @@ static std::unique_ptr<llm_graph_input_attn_kv_unified> build_attn_inp_kv_unifie
return inp;
}
llm_graph_input_attn_kv_unified * llm_graph_context::build_attn_inp_kv_unified() const {
const auto * mctx_cur = static_cast<const llama_kv_cache_unified_context *>(mctx);
llm_graph_input_attn_kv * llm_graph_context::build_attn_inp_kv() const {
const auto * mctx_cur = static_cast<const llama_kv_cache_context *>(mctx);
auto inp = build_attn_inp_kv_unified_impl(ctx0, ubatch, hparams, cparams, mctx_cur);
auto inp = build_attn_inp_kv_impl(ctx0, ubatch, hparams, cparams, mctx_cur);
return (llm_graph_input_attn_kv_unified *) res->add_input(std::move(inp));
return (llm_graph_input_attn_kv *) res->add_input(std::move(inp));
}
ggml_tensor * llm_graph_context::build_attn(
llm_graph_input_attn_kv_unified * inp,
llm_graph_input_attn_kv * inp,
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * q_cur,
ggml_tensor * k_cur,
ggml_tensor * v_cur,
ggml_tensor * kq_b,
ggml_tensor * sinks,
ggml_tensor * v_mla,
float kq_scale,
int il) const {
@ -1469,7 +1547,7 @@ ggml_tensor * llm_graph_context::build_attn(
ggml_tensor * k = mctx_cur->get_k(ctx0, il);
ggml_tensor * v = mctx_cur->get_v(ctx0, il);
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, nullptr, kq_scale);
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
cb(cur, "kqv_out", il);
if (wo) {
@ -1488,40 +1566,15 @@ ggml_tensor * llm_graph_context::build_attn(
}
ggml_tensor * llm_graph_context::build_attn(
llm_graph_input_attn_kv_unified_iswa * inp,
llm_graph_input_attn_kv_iswa * inp,
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * q_cur,
ggml_tensor * k_cur,
ggml_tensor * v_cur,
ggml_tensor * kq_b,
ggml_tensor * v_mla,
float kq_scale,
int il) const {
return build_attn_with_sinks(
inp,
wo,
wo_b,
q_cur,
k_cur,
v_cur,
kq_b,
v_mla,
nullptr,
kq_scale,
il);
}
ggml_tensor * llm_graph_context::build_attn_with_sinks(
llm_graph_input_attn_kv_unified_iswa * inp,
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * q_cur,
ggml_tensor * k_cur,
ggml_tensor * v_cur,
ggml_tensor * kq_b,
ggml_tensor * v_mla,
ggml_tensor * sinks,
ggml_tensor * v_mla,
float kq_scale,
int il) const {
// these nodes are added to the graph together so that they are not reordered
@ -1561,7 +1614,7 @@ ggml_tensor * llm_graph_context::build_attn_with_sinks(
ggml_tensor * k = mctx_cur->get_k(ctx0, il);
ggml_tensor * v = mctx_cur->get_v(ctx0, il);
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, sinks, kq_scale);
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
cb(cur, "kqv_out", il);
if (wo) {
@ -1600,6 +1653,7 @@ ggml_tensor * llm_graph_context::build_attn(
ggml_tensor * k_cur,
ggml_tensor * v_cur,
ggml_tensor * kq_b,
ggml_tensor * sinks,
ggml_tensor * v_mla,
float kq_scale,
int il) const {
@ -1615,7 +1669,7 @@ ggml_tensor * llm_graph_context::build_attn(
ggml_tensor * k = k_cur;
ggml_tensor * v = v_cur;
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, nullptr, kq_scale);
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
cb(cur, "kqv_out", il);
if (wo) {
@ -1636,10 +1690,10 @@ ggml_tensor * llm_graph_context::build_attn(
// TODO: maybe separate the inner implementation into a separate function
// like with the non-sliding window equivalent
// once sliding-window hybrid caches are a thing.
llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unified_iswa() const {
const auto * mctx_cur = static_cast<const llama_kv_cache_unified_iswa_context *>(mctx);
llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const {
const auto * mctx_cur = static_cast<const llama_kv_cache_iswa_context *>(mctx);
auto inp = std::make_unique<llm_graph_input_attn_kv_unified_iswa>(hparams, cparams, mctx_cur);
auto inp = std::make_unique<llm_graph_input_attn_kv_iswa>(hparams, cparams, mctx_cur);
const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq;
@ -1656,7 +1710,7 @@ llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unif
}
{
GGML_ASSERT(hparams.swa_type != LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_unified for non-SWA");
GGML_ASSERT(hparams.swa_type != LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache for non-SWA");
const auto n_kv = mctx_cur->get_swa()->get_n_kv();
@ -1669,7 +1723,7 @@ llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unif
inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
}
return (llm_graph_input_attn_kv_unified_iswa *) res->add_input(std::move(inp));
return (llm_graph_input_attn_kv_iswa *) res->add_input(std::move(inp));
}
ggml_tensor * llm_graph_context::build_rs(
@ -1792,7 +1846,7 @@ llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx);
auto inp_rs = build_rs_inp_impl(ctx0, ubatch, mctx_cur->get_recr());
auto inp_attn = build_attn_inp_kv_unified_impl(ctx0, ubatch, hparams, cparams, mctx_cur->get_attn());
auto inp_attn = build_attn_inp_kv_impl(ctx0, ubatch, hparams, cparams, mctx_cur->get_attn());
auto inp = std::make_unique<llm_graph_input_mem_hybrid>(std::move(inp_attn), std::move(inp_rs), mctx_cur);
@ -1843,34 +1897,32 @@ void llm_graph_context::build_pooling(
case LLAMA_POOLING_TYPE_RANK:
{
ggml_tensor * inp_cls = build_inp_cls();
inp = ggml_get_rows(ctx0, inp, inp_cls);
cur = ggml_get_rows(ctx0, inp, inp_cls);
if (cls) {
// classification head
// https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
cur = ggml_mul_mat(ctx0, cls, inp);
if (cls) {
cur = ggml_mul_mat(ctx0, cls, cur);
if (cls_b) {
cur = ggml_add(ctx0, cur, cls_b);
}
cur = ggml_tanh(ctx0, cur);
}
// some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
// https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
// Single layer classification head (direct projection)
// https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476
if (cls_out) {
cur = ggml_mul_mat(ctx0, cls_out, cur);
if (cls_out_b) {
cur = ggml_add(ctx0, cur, cls_out_b);
}
}
} else if (cls_out) {
// Single layer classification head (direct projection)
// https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476
cur = ggml_mul_mat(ctx0, cls_out, inp);
if (cls_out_b) {
cur = ggml_add(ctx0, cur, cls_out_b);
}
} else {
GGML_ABORT("RANK pooling requires either cls+cls_b or cls_out+cls_out_b");
// softmax for qwen3 reranker
if (arch == LLM_ARCH_QWEN3) {
cur = ggml_soft_max(ctx0, cur);
}
} break;
default:

View File

@ -19,8 +19,8 @@ struct llama_cparams;
struct llama_memory_context_i;
class llama_kv_cache_unified_context;
class llama_kv_cache_unified_iswa_context;
class llama_kv_cache_context;
class llama_kv_cache_iswa_context;
class llama_memory_recurrent_context;
class llama_memory_hybrid_context;
@ -78,6 +78,11 @@ struct llm_graph_params;
class llm_graph_input_i {
public:
llm_graph_input_i() {
const char * LLAMA_GRAPH_INPUT_DEBUG = getenv("LLAMA_GRAPH_INPUT_DEBUG");
debug = LLAMA_GRAPH_INPUT_DEBUG ? atoi(LLAMA_GRAPH_INPUT_DEBUG) : 0;
}
virtual ~llm_graph_input_i() = default;
virtual void set_input(const llama_ubatch * ubatch) = 0;
@ -90,6 +95,9 @@ public:
GGML_UNUSED(params);
return false;
}
protected:
// env: LLAMA_GRAPH_INPUT_DEBUG
int debug = 0;
};
using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>;
@ -152,7 +160,7 @@ class llm_graph_input_pos_bucket_kv : public llm_graph_input_i {
public:
llm_graph_input_pos_bucket_kv(
const llama_hparams & hparams,
const llama_kv_cache_unified_context * mctx) : hparams(hparams), mctx(mctx) {}
const llama_kv_cache_context * mctx) : hparams(hparams), mctx(mctx) {}
virtual ~llm_graph_input_pos_bucket_kv() = default;
void set_input(const llama_ubatch * ubatch) override;
@ -161,7 +169,7 @@ public:
const llama_hparams hparams;
const llama_kv_cache_unified_context * mctx;
const llama_kv_cache_context * mctx;
};
class llm_graph_input_out_ids : public llm_graph_input_i {
@ -198,7 +206,7 @@ public:
class llm_graph_input_cls : public llm_graph_input_i {
public:
llm_graph_input_cls(const llama_cparams & cparams) : cparams(cparams) {}
llm_graph_input_cls(const llama_cparams & cparams, const llm_arch arch) : cparams(cparams), arch(arch) {}
virtual ~llm_graph_input_cls() = default;
void set_input(const llama_ubatch * ubatch) override;
@ -206,6 +214,7 @@ public:
ggml_tensor * cls; // I32 [n_batch]
const llama_cparams cparams;
const llm_arch arch;
};
class llm_graph_input_rs : public llm_graph_input_i {
@ -257,17 +266,17 @@ public:
const llama_cparams cparams;
};
class llm_graph_input_attn_kv_unified : public llm_graph_input_i {
class llm_graph_input_attn_kv : public llm_graph_input_i {
public:
llm_graph_input_attn_kv_unified(
llm_graph_input_attn_kv(
const llama_hparams & hparams,
const llama_cparams & cparams,
const llama_kv_cache_unified_context * mctx) :
const llama_kv_cache_context * mctx) :
hparams(hparams),
cparams(cparams),
mctx(mctx) {
}
~llm_graph_input_attn_kv_unified() = default;
~llm_graph_input_attn_kv() = default;
void set_input(const llama_ubatch * ubatch) override;
@ -290,20 +299,20 @@ public:
const llama_hparams hparams;
const llama_cparams cparams;
const llama_kv_cache_unified_context * mctx;
const llama_kv_cache_context * mctx;
};
class llm_graph_input_attn_kv_unified_iswa : public llm_graph_input_i {
class llm_graph_input_attn_kv_iswa : public llm_graph_input_i {
public:
llm_graph_input_attn_kv_unified_iswa(
llm_graph_input_attn_kv_iswa(
const llama_hparams & hparams,
const llama_cparams & cparams,
const llama_kv_cache_unified_iswa_context * mctx) :
const llama_kv_cache_iswa_context * mctx) :
hparams(hparams),
cparams(cparams),
mctx(mctx) {
}
~llm_graph_input_attn_kv_unified_iswa() = default;
~llm_graph_input_attn_kv_iswa() = default;
void set_input(const llama_ubatch * ubatch) override;
@ -330,7 +339,7 @@ public:
const llama_hparams hparams;
const llama_cparams cparams;
const llama_kv_cache_unified_iswa_context * mctx;
const llama_kv_cache_iswa_context * mctx;
};
class llm_graph_input_attn_cross : public llm_graph_input_i {
@ -351,7 +360,7 @@ public:
class llm_graph_input_mem_hybrid : public llm_graph_input_i {
public:
llm_graph_input_mem_hybrid(
std::unique_ptr<llm_graph_input_attn_kv_unified> inp_attn,
std::unique_ptr<llm_graph_input_attn_kv> inp_attn,
std::unique_ptr<llm_graph_input_rs> inp_rs,
const llama_memory_hybrid_context * mctx) :
inp_attn(std::move(inp_attn)),
@ -361,10 +370,10 @@ public:
void set_input(const llama_ubatch * ubatch) override;
std::unique_ptr<llm_graph_input_attn_kv_unified> inp_attn;
std::unique_ptr<llm_graph_input_attn_kv> inp_attn;
std::unique_ptr<llm_graph_input_rs> inp_rs;
llm_graph_input_attn_kv_unified * get_attn() const { return inp_attn.get(); }
llm_graph_input_attn_kv * get_attn() const { return inp_attn.get(); }
llm_graph_input_rs * get_recr() const { return inp_rs.get(); }
const llama_memory_hybrid_context * mctx;
@ -685,9 +694,10 @@ struct llm_graph_context {
ggml_tensor * v, // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false)
ggml_tensor * kq_b,
ggml_tensor * kq_mask,
ggml_tensor * sinks,
ggml_tensor * sinks, // [n_head_q]
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
float kq_scale) const;
float kq_scale,
int il) const;
llm_graph_input_attn_no_cache * build_attn_inp_no_cache() const;
@ -699,50 +709,39 @@ struct llm_graph_context {
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
ggml_tensor * kq_b,
ggml_tensor * sinks, // [n_head_q]
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
float kq_scale,
int il) const;
llm_graph_input_attn_kv_unified * build_attn_inp_kv_unified() const;
llm_graph_input_attn_kv * build_attn_inp_kv() const;
ggml_tensor * build_attn(
llm_graph_input_attn_kv_unified * inp,
llm_graph_input_attn_kv * inp,
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
ggml_tensor * kq_b,
ggml_tensor * sinks, // [n_head_q]
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
float kq_scale,
int il) const;
llm_graph_input_attn_kv_unified_iswa * build_attn_inp_kv_unified_iswa() const;
llm_graph_input_attn_kv_iswa * build_attn_inp_kv_iswa() const;
// note: if k_cur or v_cur are not provided, they will not be stored in the memory
ggml_tensor * build_attn(
llm_graph_input_attn_kv_unified_iswa * inp,
llm_graph_input_attn_kv_iswa * inp,
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] optional
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] optional
ggml_tensor * kq_b,
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
float kq_scale,
int il) const;
// TODO: temporary to keep the diff small. after the code is public will refactor to simplify this
ggml_tensor * build_attn_with_sinks(
llm_graph_input_attn_kv_unified_iswa * inp,
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] optional
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] optional
ggml_tensor * kq_b,
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
ggml_tensor * sinks, // [n_head_q]
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
float kq_scale,
int il) const;
@ -756,6 +755,7 @@ struct llm_graph_context {
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
ggml_tensor * kq_b,
ggml_tensor * sinks, // [n_head_q]
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
float kq_scale,
int il) const;
@ -765,7 +765,7 @@ struct llm_graph_context {
//
// TODO: move this implementation to llama_memory_recurrent.
// this is analogous to llama_kv_cache_unified::cpy_k / cpy_v
// this is analogous to llama_kv_cache::cpy_k / cpy_v
// when moving, avoid passing `ggml_cgraph` - only pass `ggml_context`. would likely need to split the
// implementation in 2 separate methods. the goal is to avoid calling `ggml_build_forward_expand` in
// `llama_memory_recurrent`

View File

@ -1,6 +1,7 @@
#include "llama-hparams.h"
#include "ggml.h"
#include <cassert>
void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
if (dense_first) {
@ -161,3 +162,64 @@ bool llama_hparams::is_swa(uint32_t il) const {
GGML_ABORT("fatal error");
}
bool llama_hparams::has_kv(uint32_t il) const {
if (n_layer_kv_from_start >= 0) {
if (il < (uint32_t) n_layer_kv_from_start) {
return true;
}
return false;
}
// by default, all layers have kv
return true;
}
uint32_t llama_hparams::n_layer_kv() const {
uint32_t res = 0;
for (uint32_t il = 0; il < n_layer; ++il) {
if (has_kv(il)) {
res++;
}
}
return res;
}
bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1) {
assert(p0 >= 0 && p1 >= 0);
switch (swa_type) {
case LLAMA_SWA_TYPE_NONE:
{
} break;
case LLAMA_SWA_TYPE_STANDARD:
{
if (p1 - p0 >= (int32_t) n_swa) {
return true;
}
} break;
case LLAMA_SWA_TYPE_CHUNKED:
{
const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
if (p0 < pos_chunk_start) {
return true;
}
} break;
case LLAMA_SWA_TYPE_SYMMETRIC:
{
const int32_t half_n_swa = (int32_t) n_swa / 2;
const int32_t pos_diff = p1 - p0;
// Mask if outside the symmetric window
if (pos_diff < -half_n_swa || pos_diff > half_n_swa) {
return true;
}
} break;
}
return false;
}

View File

@ -19,6 +19,7 @@ enum llama_swa_type {
LLAMA_SWA_TYPE_NONE = 0,
LLAMA_SWA_TYPE_STANDARD = 1,
LLAMA_SWA_TYPE_CHUNKED = 2,
LLAMA_SWA_TYPE_SYMMETRIC = 3,
};
struct llama_hparams_posnet {
@ -41,6 +42,7 @@ struct llama_hparams {
uint32_t n_embd;
uint32_t n_embd_features = 0;
uint32_t n_layer;
int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
uint32_t n_rot;
uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
@ -69,10 +71,13 @@ struct llama_hparams {
uint32_t n_lora_kv = 0;
uint32_t n_ff_exp = 0;
uint32_t n_ff_shexp = 0;
uint32_t n_ff_chexp = 0;
uint32_t n_expert_shared = 0;
uint32_t n_norm_groups = 0;
uint32_t n_group_experts = 0;
float expert_weights_scale = 0.0;
float expert_group_scale = 0.05f;
float expert_weights_scale = 0.0f;
bool expert_weights_norm = false;
uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
uint32_t moe_every_n_layers = 0;
@ -83,6 +88,7 @@ struct llama_hparams {
float f_norm_group_eps;
float f_attn_logit_softcapping = 50.0f;
float f_router_logit_softcapping = 30.0f;
float f_final_logit_softcapping = 30.0f;
// for RWKV
@ -104,6 +110,11 @@ struct llama_hparams {
uint32_t n_ctx_orig_yarn;
float rope_yarn_log_mul = 0.0f;
float yarn_ext_factor = -1.0f;
float yarn_attn_factor = 1.0f;
float yarn_beta_fast = 32.0f;
float yarn_beta_slow = 1.0f;
std::array<int, 4> rope_sections;
// Sliding Window Attention (SWA)
@ -136,10 +147,14 @@ struct llama_hparams {
float f_embedding_scale = 0.0f;
float f_attention_scale = 0.0f;
// grok-2
float f_attn_out_scale = 0.0f;
uint32_t attn_temp_length = 0;
bool causal_attn = true;
bool use_alibi = false;
bool attn_soft_cap = false;
bool use_kq_norm = true;
bool use_kq_norm = false;
// for Classifiers
uint32_t n_cls_out = 1;
@ -159,6 +174,7 @@ struct llama_hparams {
// needed by encoder-decoder models (e.g. T5, FLAN-T5)
// ref: https://github.com/ggerganov/llama.cpp/pull/8141
llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
uint32_t dec_n_layer = 0;
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
@ -226,6 +242,16 @@ struct llama_hparams {
bool n_bskcn(uint32_t n, uint32_t il) const;
bool is_swa(uint32_t il) const;
bool has_kv(uint32_t il) const;
// number of layers for which has_kv() returns true
uint32_t n_layer_kv() const;
// note that this function uses different SWA parameters from those in the hparams
// TODO: think of a better place for this function
// TODO: pack the SWA params in a struct?
static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1);
};
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");

View File

@ -59,3 +59,5 @@ std::string llama_format_tensor_shape(const std::vector<int64_t> & ne);
std::string llama_format_tensor_shape(const struct ggml_tensor * t);
std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i);
#define LLAMA_TENSOR_NAME_FATTN "__fattn__"

View File

@ -1,4 +1,4 @@
#include "llama-kv-cache-unified-iswa.h"
#include "llama-kv-cache-iswa.h"
#include "llama-impl.h"
#include "llama-batch.h"
@ -8,10 +8,10 @@
#include <cassert>
//
// llama_kv_cache_unified_iswa
// llama_kv_cache_iswa
//
llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
llama_kv_cache_iswa::llama_kv_cache_iswa(
const llama_model & model,
ggml_type type_k,
ggml_type type_v,
@ -22,9 +22,26 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
uint32_t kv_size,
uint32_t n_seq_max,
uint32_t n_ubatch,
uint32_t n_pad) : hparams(model.hparams), unified(unified) {
llama_kv_cache_unified::layer_filter_cb filter_base = [&](int32_t il) { return !model.hparams.is_swa(il); };
llama_kv_cache_unified::layer_filter_cb filter_swa = [&](int32_t il) { return model.hparams.is_swa(il); };
uint32_t n_pad,
const layer_filter_cb & filter,
const layer_reuse_cb & reuse) : hparams(model.hparams), unified(unified) {
// chain filters
const layer_filter_cb filter_base = [&](int32_t il) {
if (filter && !filter(il)) {
return false;
}
return !model.hparams.is_swa(il);
};
const layer_filter_cb filter_swa = [&](int32_t il) {
if (filter && !filter(il)) {
return false;
}
return model.hparams.is_swa(il);
};
const uint32_t size_base = kv_size;
@ -40,25 +57,25 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, size_base);
kv_base = std::make_unique<llama_kv_cache_unified>(
model, std::move(filter_base), type_k, type_v,
kv_base = std::make_unique<llama_kv_cache>(
model, type_k, type_v,
v_trans, offload, unified, size_base, n_seq_max, n_pad,
0, LLAMA_SWA_TYPE_NONE);
0, LLAMA_SWA_TYPE_NONE, filter_base, reuse);
LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa);
kv_swa = std::make_unique<llama_kv_cache_unified>(
model, std::move(filter_swa), type_k, type_v,
kv_swa = std::make_unique<llama_kv_cache>(
model, type_k, type_v,
v_trans, offload, unified, size_swa, n_seq_max, n_pad,
hparams.n_swa, hparams.swa_type);
hparams.n_swa, hparams.swa_type, filter_swa, reuse);
}
void llama_kv_cache_unified_iswa::clear(bool data) {
void llama_kv_cache_iswa::clear(bool data) {
kv_base->clear(data);
kv_swa ->clear(data);
}
bool llama_kv_cache_unified_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
bool llama_kv_cache_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
bool res = true;
res = res & kv_base->seq_rm(seq_id, p0, p1);
@ -67,36 +84,44 @@ bool llama_kv_cache_unified_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llam
return res;
}
void llama_kv_cache_unified_iswa::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
void llama_kv_cache_iswa::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
kv_base->seq_cp(seq_id_src, seq_id_dst, p0, p1);
kv_swa ->seq_cp(seq_id_src, seq_id_dst, p0, p1);
}
void llama_kv_cache_unified_iswa::seq_keep(llama_seq_id seq_id) {
void llama_kv_cache_iswa::seq_keep(llama_seq_id seq_id) {
kv_base->seq_keep(seq_id);
kv_swa ->seq_keep(seq_id);
}
void llama_kv_cache_unified_iswa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
void llama_kv_cache_iswa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
kv_base->seq_add(seq_id, p0, p1, shift);
kv_swa ->seq_add(seq_id, p0, p1, shift);
}
void llama_kv_cache_unified_iswa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
void llama_kv_cache_iswa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
kv_base->seq_div(seq_id, p0, p1, d);
kv_swa ->seq_div(seq_id, p0, p1, d);
}
llama_pos llama_kv_cache_unified_iswa::seq_pos_min(llama_seq_id seq_id) const {
llama_pos llama_kv_cache_iswa::seq_pos_min(llama_seq_id seq_id) const {
// the base cache is a superset of the SWA cache, so we can just check the SWA cache
return kv_swa->seq_pos_min(seq_id);
}
llama_pos llama_kv_cache_unified_iswa::seq_pos_max(llama_seq_id seq_id) const {
llama_pos llama_kv_cache_iswa::seq_pos_max(llama_seq_id seq_id) const {
return kv_swa->seq_pos_max(seq_id);
}
llama_memory_context_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache_iswa::memory_breakdown() const {
std::map<ggml_backend_buffer_type_t, size_t> mb = kv_base->memory_breakdown();
for (const auto & buft_size : kv_swa->memory_breakdown()) {
mb[buft_size.first] += buft_size.second;
}
return mb;
}
llama_memory_context_ptr llama_kv_cache_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
GGML_UNUSED(embd_all);
// first try simple split
@ -136,7 +161,7 @@ llama_memory_context_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_all
assert(sinfos_base.size() == sinfos_swa.size());
return std::make_unique<llama_kv_cache_unified_iswa_context>(
return std::make_unique<llama_kv_cache_iswa_context>(
this, std::move(sinfos_base), std::move(sinfos_swa), std::move(ubatches));
} while (false);
@ -172,61 +197,67 @@ llama_memory_context_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_all
assert(sinfos_base.size() == sinfos_swa.size());
return std::make_unique<llama_kv_cache_unified_iswa_context>(
return std::make_unique<llama_kv_cache_iswa_context>(
this, std::move(sinfos_base), std::move(sinfos_swa), std::move(ubatches));
} while (false);
// TODO: if we fail again, we should attempt different splitting strategies
// but to do that properly, we first have to refactor the batches to be more flexible
return std::make_unique<llama_kv_cache_unified_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
return std::make_unique<llama_kv_cache_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
}
llama_memory_context_ptr llama_kv_cache_unified_iswa::init_full() {
return std::make_unique<llama_kv_cache_unified_iswa_context>(this);
llama_memory_context_ptr llama_kv_cache_iswa::init_full() {
return std::make_unique<llama_kv_cache_iswa_context>(this);
}
llama_memory_context_ptr llama_kv_cache_unified_iswa::init_update(llama_context * lctx, bool optimize) {
return std::make_unique<llama_kv_cache_unified_iswa_context>(this, lctx, optimize);
llama_memory_context_ptr llama_kv_cache_iswa::init_update(llama_context * lctx, bool optimize) {
return std::make_unique<llama_kv_cache_iswa_context>(this, lctx, optimize);
}
bool llama_kv_cache_unified_iswa::get_can_shift() const {
bool llama_kv_cache_iswa::get_can_shift() const {
return kv_base->get_size() == kv_swa->get_size();
}
void llama_kv_cache_unified_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
kv_base->state_write(io, seq_id);
kv_swa ->state_write(io, seq_id);
void llama_kv_cache_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
if ((flags & LLAMA_STATE_SEQ_FLAGS_SWA_ONLY) == 0) {
kv_base->state_write(io, seq_id, flags);
}
kv_swa->state_write(io, seq_id, flags);
}
void llama_kv_cache_unified_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
kv_base->state_read(io, seq_id);
kv_swa ->state_read(io, seq_id);
void llama_kv_cache_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
if ((flags & LLAMA_STATE_SEQ_FLAGS_SWA_ONLY) == 0) {
kv_base->state_read(io, seq_id, flags);
}
kv_swa->state_read(io, seq_id, flags);
}
llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_base() const {
llama_kv_cache * llama_kv_cache_iswa::get_base() const {
return kv_base.get();
}
llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_swa() const {
llama_kv_cache * llama_kv_cache_iswa::get_swa() const {
return kv_swa.get();
}
//
// llama_kv_cache_unified_iswa_context
// llama_kv_cache_iswa_context
//
llama_kv_cache_unified_iswa_context::llama_kv_cache_unified_iswa_context(llama_memory_status status) : status(status) {}
llama_kv_cache_iswa_context::llama_kv_cache_iswa_context(llama_memory_status status) : status(status) {}
llama_kv_cache_unified_iswa_context::llama_kv_cache_unified_iswa_context(
llama_kv_cache_unified_iswa * kv) :
llama_kv_cache_iswa_context::llama_kv_cache_iswa_context(
llama_kv_cache_iswa * kv) :
ctx_base(kv->get_base()->init_full()),
ctx_swa (kv->get_swa ()->init_full()),
status(llama_memory_status_combine(ctx_base->get_status(), ctx_swa->get_status())) {
}
llama_kv_cache_unified_iswa_context::llama_kv_cache_unified_iswa_context(
llama_kv_cache_unified_iswa * kv,
llama_kv_cache_iswa_context::llama_kv_cache_iswa_context(
llama_kv_cache_iswa * kv,
llama_context * lctx,
bool optimize) :
ctx_base(kv->get_base()->init_update(lctx, optimize)),
@ -234,21 +265,21 @@ llama_kv_cache_unified_iswa_context::llama_kv_cache_unified_iswa_context(
status(llama_memory_status_combine(ctx_base->get_status(), ctx_swa->get_status())) {
}
llama_kv_cache_unified_iswa_context::llama_kv_cache_unified_iswa_context(
llama_kv_cache_unified_iswa * kv,
llama_kv_cache_iswa_context::llama_kv_cache_iswa_context(
llama_kv_cache_iswa * kv,
slot_info_vec_t sinfos_base,
slot_info_vec_t sinfos_swa,
std::vector<llama_ubatch> ubatches) :
ubatches(std::move(ubatches)),
// note: here we copy the ubatches. not sure if this is ideal
ctx_base(new llama_kv_cache_unified_context(kv->get_base(), std::move(sinfos_base), this->ubatches)),
ctx_swa (new llama_kv_cache_unified_context(kv->get_swa (), std::move(sinfos_swa), this->ubatches)),
ctx_base(new llama_kv_cache_context(kv->get_base(), std::move(sinfos_base), this->ubatches)),
ctx_swa (new llama_kv_cache_context(kv->get_swa (), std::move(sinfos_swa), this->ubatches)),
status(llama_memory_status_combine(ctx_base->get_status(), ctx_swa->get_status())) {
}
llama_kv_cache_unified_iswa_context:: ~llama_kv_cache_unified_iswa_context() = default;
llama_kv_cache_iswa_context:: ~llama_kv_cache_iswa_context() = default;
bool llama_kv_cache_unified_iswa_context::next() {
bool llama_kv_cache_iswa_context::next() {
assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
ctx_base->next();
@ -261,7 +292,7 @@ bool llama_kv_cache_unified_iswa_context::next() {
return true;
}
bool llama_kv_cache_unified_iswa_context::apply() {
bool llama_kv_cache_iswa_context::apply() {
assert(!llama_memory_status_is_fail(status));
bool res = true;
@ -272,24 +303,24 @@ bool llama_kv_cache_unified_iswa_context::apply() {
return res;
}
llama_memory_status llama_kv_cache_unified_iswa_context::get_status() const {
llama_memory_status llama_kv_cache_iswa_context::get_status() const {
return status;
}
const llama_ubatch & llama_kv_cache_unified_iswa_context::get_ubatch() const {
const llama_ubatch & llama_kv_cache_iswa_context::get_ubatch() const {
assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
return ubatches[i_next];
}
const llama_kv_cache_unified_context * llama_kv_cache_unified_iswa_context::get_base() const {
const llama_kv_cache_context * llama_kv_cache_iswa_context::get_base() const {
assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
return static_cast<const llama_kv_cache_unified_context *>(ctx_base.get());
return static_cast<const llama_kv_cache_context *>(ctx_base.get());
}
const llama_kv_cache_unified_context * llama_kv_cache_unified_iswa_context::get_swa() const {
const llama_kv_cache_context * llama_kv_cache_iswa_context::get_swa() const {
assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
return static_cast<const llama_kv_cache_unified_context *>(ctx_swa.get());
return static_cast<const llama_kv_cache_context *>(ctx_swa.get());
}

View File

@ -1,19 +1,19 @@
#pragma once
#include "llama-kv-cache-unified.h"
#include "llama-kv-cache.h"
#include <vector>
//
// llama_kv_cache_unified_iswa
// llama_kv_cache_iswa
//
// utilizes two instances of llama_kv_cache_unified
// utilizes two instances of llama_kv_cache
// the first instance is for the non-SWA layers of the model and the second instance is for the SWA layers
class llama_kv_cache_unified_iswa : public llama_memory_i {
class llama_kv_cache_iswa : public llama_memory_i {
public:
llama_kv_cache_unified_iswa(
llama_kv_cache_iswa(
const llama_model & model,
ggml_type type_k,
ggml_type type_v,
@ -24,9 +24,11 @@ public:
uint32_t kv_size,
uint32_t n_seq_max,
uint32_t n_ubatch,
uint32_t n_pad);
uint32_t n_pad,
const layer_filter_cb & filter,
const layer_reuse_cb & reuse);
~llama_kv_cache_unified_iswa() = default;
~llama_kv_cache_iswa() = default;
//
// llama_memory_i
@ -54,52 +56,54 @@ public:
llama_pos seq_pos_min(llama_seq_id seq_id) const override;
llama_pos seq_pos_max(llama_seq_id seq_id) const override;
std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
// state write/load
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
//
// llama_kv_cache_unified_iswa specific API
// llama_kv_cache_iswa specific API
//
llama_kv_cache_unified * get_base() const;
llama_kv_cache_unified * get_swa () const;
llama_kv_cache * get_base() const;
llama_kv_cache * get_swa () const;
private:
const llama_hparams & hparams;
const bool unified;
std::unique_ptr<llama_kv_cache_unified> kv_base;
std::unique_ptr<llama_kv_cache_unified> kv_swa;
std::unique_ptr<llama_kv_cache> kv_base;
std::unique_ptr<llama_kv_cache> kv_swa;
};
class llama_kv_cache_unified_iswa_context : public llama_memory_context_i {
class llama_kv_cache_iswa_context : public llama_memory_context_i {
public:
using slot_info_vec_t = llama_kv_cache_unified::slot_info_vec_t;
using slot_info_vec_t = llama_kv_cache::slot_info_vec_t;
// used for errors
llama_kv_cache_unified_iswa_context(llama_memory_status status);
llama_kv_cache_iswa_context(llama_memory_status status);
// used to create a full-cache context
llama_kv_cache_unified_iswa_context(
llama_kv_cache_unified_iswa * kv);
llama_kv_cache_iswa_context(
llama_kv_cache_iswa * kv);
// used to create an update context
llama_kv_cache_unified_iswa_context(
llama_kv_cache_unified_iswa * kv,
llama_kv_cache_iswa_context(
llama_kv_cache_iswa * kv,
llama_context * lctx,
bool optimize);
// used to create a batch processing context from a batch
llama_kv_cache_unified_iswa_context(
llama_kv_cache_unified_iswa * kv,
llama_kv_cache_iswa_context(
llama_kv_cache_iswa * kv,
slot_info_vec_t sinfos_base,
slot_info_vec_t sinfos_swa,
std::vector<llama_ubatch> ubatches);
virtual ~llama_kv_cache_unified_iswa_context();
virtual ~llama_kv_cache_iswa_context();
//
// llama_memory_context_i
@ -112,14 +116,14 @@ public:
const llama_ubatch & get_ubatch() const override;
//
// llama_kv_cache_unified_iswa_context specific API
// llama_kv_cache_iswa_context specific API
//
const llama_kv_cache_unified_context * get_base() const;
const llama_kv_cache_unified_context * get_swa() const;
const llama_kv_cache_context * get_base() const;
const llama_kv_cache_context * get_swa() const;
private:
//llama_kv_cache_unified_iswa * kv;
//llama_kv_cache_iswa * kv;
// the index of the next ubatch to process
size_t i_next = 0;

View File

@ -14,27 +14,13 @@ struct llama_model;
struct llama_context;
//
// llama_kv_cache_unified
// llama_kv_cache
//
class llama_kv_cache_unified : public llama_memory_i {
class llama_kv_cache : public llama_memory_i {
public:
static uint32_t get_padding(const llama_cparams & cparams);
// this callback is used to filter out layers that should not be included in the cache
using layer_filter_cb = std::function<bool(int32_t il)>;
struct defrag_info {
bool empty() const {
return ids.empty();
}
// contains information about which cell moves where:
// - cell i moves to ids[i]
// - if ids[i] == i || ids[i] == ids.size(), then cell i is not moved
std::vector<uint32_t> ids;
};
struct stream_copy_info {
bool empty() const {
assert(ssrc.size() == sdst.size());
@ -52,8 +38,8 @@ public:
using idx_vec_t = std::vector<uint32_t>;
// number of streams: ns = s1 - s0 + 1
llama_seq_id s0;
llama_seq_id s1;
uint32_t s0;
uint32_t s1;
std::vector<llama_seq_id> strm; // [ns]
std::vector<idx_vec_t> idxs; // [ns]
@ -92,9 +78,8 @@ public:
using slot_info_vec_t = std::vector<slot_info>;
llama_kv_cache_unified(
llama_kv_cache(
const llama_model & model,
layer_filter_cb && filter,
ggml_type type_k,
ggml_type type_v,
bool v_trans,
@ -104,9 +89,11 @@ public:
uint32_t n_seq_max,
uint32_t n_pad,
uint32_t n_swa,
llama_swa_type swa_type);
llama_swa_type swa_type,
const layer_filter_cb & filter,
const layer_reuse_cb & reuse);
~llama_kv_cache_unified() = default;
~llama_kv_cache() = default;
//
// llama_memory_i
@ -134,13 +121,15 @@ public:
llama_pos seq_pos_min(llama_seq_id seq_id) const override;
llama_pos seq_pos_max(llama_seq_id seq_id) const override;
std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
// state write/load
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
//
// llama_kv_cache_unified specific API
// llama_kv_cache specific API
//
uint32_t get_size() const;
@ -152,10 +141,7 @@ public:
// graph_build API
//
uint32_t get_n_kv() const;
// TODO: temporary
bool get_supports_set_rows() const;
uint32_t get_n_kv(const slot_info & sinfo) const;
// get views of the current state of the cache
ggml_tensor * get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const;
@ -173,7 +159,7 @@ public:
// return empty vector on failure
slot_info_vec_t prepare(const std::vector<llama_ubatch> & ubatches);
bool update(llama_context * lctx, bool do_shift, const defrag_info & dinfo, const stream_copy_info & sc_info);
bool update(llama_context * lctx, bool do_shift, const stream_copy_info & sc_info);
// find a slot of kv cells that can hold the ubatch
// if cont == true, then the slot must be continuous
@ -228,10 +214,7 @@ private:
// env: LLAMA_KV_CACHE_DEBUG
int debug = 0;
// env: LLAMA_SET_ROWS (temporary)
// ref: https://github.com/ggml-org/llama.cpp/pull/14285
bool supports_set_rows = true;
// this is the SWA type of the cache - not to be confused with the model SWA type
const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
std::vector<ggml_context_ptr> ctxs;
@ -241,7 +224,7 @@ private:
// note: this is not part of the KV state and it's only used to speed-up the find_slot() method
std::vector<uint32_t> v_heads;
std::vector<llama_kv_cells_unified> v_cells;
std::vector<llama_kv_cells> v_cells;
// maps from a sequence id to a stream id
std::vector<uint32_t> seq_to_stream;
@ -254,9 +237,6 @@ private:
// model layer id -> KV cache layer id
std::unordered_map<int32_t, int32_t> map_layer_ids;
// return non-empty vector if cells have been moved
defrag_info defrag_prepare(int32_t n_max_nodes) const;
size_t total_size() const;
size_t size_k_bytes() const;
@ -277,11 +257,6 @@ private:
llm_graph_result * res,
llama_context * lctx) const;
ggml_cgraph * build_graph_defrag(
llm_graph_result * res,
llama_context * lctx,
const defrag_info & dinfo) const;
struct cell_ranges_t {
uint32_t strm;
@ -295,35 +270,33 @@ private:
bool state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count);
};
class llama_kv_cache_unified_context : public llama_memory_context_i {
class llama_kv_cache_context : public llama_memory_context_i {
public:
// some shorthands
using slot_info_vec_t = llama_kv_cache_unified::slot_info_vec_t;
using defrag_info = llama_kv_cache_unified::defrag_info;
using stream_copy_info = llama_kv_cache_unified::stream_copy_info;
using slot_info_vec_t = llama_kv_cache::slot_info_vec_t;
using stream_copy_info = llama_kv_cache::stream_copy_info;
// used for errors
llama_kv_cache_unified_context(llama_memory_status status);
llama_kv_cache_context(llama_memory_status status);
// used to create a full-cache context
llama_kv_cache_unified_context(
llama_kv_cache_unified * kv);
llama_kv_cache_context(
llama_kv_cache * kv);
// used to create an update context
llama_kv_cache_unified_context(
llama_kv_cache_unified * kv,
llama_kv_cache_context(
llama_kv_cache * kv,
llama_context * lctx,
bool do_shift,
defrag_info dinfo,
stream_copy_info sc_info);
// used to create a batch procesing context from a batch
llama_kv_cache_unified_context(
llama_kv_cache_unified * kv,
llama_kv_cache_context(
llama_kv_cache * kv,
slot_info_vec_t sinfos,
std::vector<llama_ubatch> ubatches);
virtual ~llama_kv_cache_unified_context();
virtual ~llama_kv_cache_context();
//
// llama_memory_context_i
@ -336,22 +309,27 @@ public:
const llama_ubatch & get_ubatch() const override;
//
// llama_kv_cache_unified_context specific API
// llama_kv_cache_context specific API
//
uint32_t get_n_kv() const;
// TODO: temporary
bool get_supports_set_rows() const;
// get views of the current state of the cache
ggml_tensor * get_k(ggml_context * ctx, int32_t il) const;
ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;
// store k_cur and v_cur in the cache based on the provided head location
// note: the heads in k_cur and v_cur should be layed out contiguously in memory
// - k_cur [n_embd_head_k, n_head_k, n_tokens]
// - k_idxs [n_tokens]
// - v_cur [n_embd_head_v, n_head_v, n_tokens]
// - v_idxs [n_tokens] or [n_tokens*n_embd_v_gqa] depending if V cache is transposed
ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const;
ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il) const;
// create destination indices for each head of the current batch for where it would be written in the KV cache
// the indices address the global KV cache (not per stream) - this is not relevant for the user of this API, but
// helps understand the implementation logic of cpy_k and cpy_v
ggml_tensor * build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
ggml_tensor * build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
@ -365,7 +343,7 @@ public:
private:
llama_memory_status status;
llama_kv_cache_unified * kv;
llama_kv_cache * kv;
llama_context * lctx;
//
@ -374,8 +352,6 @@ private:
bool do_shift = false;
defrag_info dinfo;
stream_copy_info sc_info;
//

View File

@ -11,7 +11,7 @@
// meta information about KV cells that can be part of multiple sequences at the same time
// TODO: add unit tests
class llama_kv_cells_unified {
class llama_kv_cells {
public:
void reset() {
for (uint32_t i = 0; i < pos.size(); ++i) {
@ -77,30 +77,30 @@ public:
}
// move cell isrc to idst (used during defrag)
void mv(uint32_t isrc, uint32_t idst) {
assert(isrc < pos.size());
assert(idst < pos.size());
//void mv(uint32_t isrc, uint32_t idst) {
// assert(isrc < pos.size());
// assert(idst < pos.size());
assert(pos[idst] == -1);
assert(pos[isrc] != -1);
// assert(pos[idst] == -1);
// assert(pos[isrc] != -1);
pos [idst] = pos [isrc];
shift[idst] = shift[isrc];
seq [idst] = seq [isrc];
// pos [idst] = pos [isrc];
// shift[idst] = shift[isrc];
// seq [idst] = seq [isrc];
pos [isrc] = -1;
shift[isrc] = 0;
seq [isrc].reset();
// pos [isrc] = -1;
// shift[isrc] = 0;
// seq [isrc].reset();
used.erase (isrc);
used.insert(idst);
}
// used.erase (isrc);
// used.insert(idst);
//}
// copy the state of cells [i, i + n) (used for save/restore the state of the cells)
llama_kv_cells_unified cp(uint32_t i, uint32_t n) const {
llama_kv_cells cp(uint32_t i, uint32_t n) const {
assert(i + n <= pos.size());
llama_kv_cells_unified res;
llama_kv_cells res;
res.resize(n);
@ -117,8 +117,8 @@ public:
}
// copy the state of cells [idxs[0], idxs[1], ..., idxs[idxs.size() - 1])
llama_kv_cells_unified cp(const std::vector<uint32_t> & idxs) const {
llama_kv_cells_unified res;
llama_kv_cells cp(const std::vector<uint32_t> & idxs) const {
llama_kv_cells res;
res.resize(idxs.size());
@ -135,7 +135,7 @@ public:
}
// set the state of cells [i, i + other.pos.size()) (used for save/restore the state of the cells)
void set(uint32_t i, const llama_kv_cells_unified & other) {
void set(uint32_t i, const llama_kv_cells & other) {
assert(i + other.pos.size() <= pos.size());
for (uint32_t j = 0; j < other.pos.size(); ++j) {
@ -165,7 +165,7 @@ public:
}
// set the state of cells [idxs[0], idxs[1], ..., idxs[idxs.size() - 1])
void set(const std::vector<uint32_t> & idxs, const llama_kv_cells_unified & other) {
void set(const std::vector<uint32_t> & idxs, const llama_kv_cells & other) {
assert(idxs.size() == other.pos.size());
for (uint32_t j = 0; j < other.pos.size(); ++j) {

View File

@ -27,14 +27,11 @@ llama_memory_hybrid::llama_memory_hybrid(
bool offload,
bool unified,
/* layer filters */
layer_filter_cb && filter_attn,
layer_filter_cb && filter_recr) :
const layer_filter_cb & filter_attn,
const layer_filter_cb & filter_recr) :
hparams(model.hparams),
mem_attn(new llama_kv_cache_unified(
mem_attn(new llama_kv_cache(
model,
filter_attn == nullptr ?
[&](int32_t il) { return !hparams.is_recurrent(il); }
: filter_attn,
type_k,
type_v,
v_trans,
@ -44,18 +41,22 @@ llama_memory_hybrid::llama_memory_hybrid(
n_seq_max,
n_pad,
n_swa,
swa_type
swa_type,
filter_attn == nullptr ?
[&](int32_t il) { return !hparams.is_recurrent(il); }
: filter_attn,
nullptr
)),
mem_recr(new llama_memory_recurrent(
model,
filter_recr == nullptr ?
[&](int32_t il) { return hparams.is_recurrent(il); }
: filter_recr,
type_r,
type_s,
offload,
rs_size,
n_seq_max
n_seq_max,
filter_recr == nullptr ?
[&](int32_t il) { return hparams.is_recurrent(il); }
: filter_recr
)) {}
llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
@ -165,17 +166,29 @@ llama_pos llama_memory_hybrid::seq_pos_max(llama_seq_id seq_id) const {
return std::min(mem_attn->seq_pos_max(seq_id), mem_recr->seq_pos_max(seq_id));
}
void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
std::map<ggml_backend_buffer_type_t, size_t> llama_memory_hybrid::memory_breakdown() const {
std::map<ggml_backend_buffer_type_t, size_t> mb = mem_attn->memory_breakdown();
for (const auto & buft_size : mem_recr->memory_breakdown()) {
mb[buft_size.first] += buft_size.second;
}
return mb;
}
void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
GGML_UNUSED(flags);
mem_attn->state_write(io, seq_id);
mem_recr->state_write(io, seq_id);
}
void llama_memory_hybrid::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
void llama_memory_hybrid::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
GGML_UNUSED(flags);
mem_attn->state_read(io, seq_id);
mem_recr->state_read(io, seq_id);
}
llama_kv_cache_unified * llama_memory_hybrid::get_mem_attn() const {
llama_kv_cache * llama_memory_hybrid::get_mem_attn() const {
return mem_attn.get();
}
@ -206,7 +219,7 @@ llama_memory_hybrid_context::llama_memory_hybrid_context(
std::vector<llama_ubatch> ubatches) :
ubatches(std::move(ubatches)),
// note: here we copy the ubatches. not sure if this is ideal
ctx_attn(new llama_kv_cache_unified_context(mem->get_mem_attn(), std::move(sinfos_attn), this->ubatches)),
ctx_attn(new llama_kv_cache_context(mem->get_mem_attn(), std::move(sinfos_attn), this->ubatches)),
ctx_recr(new llama_memory_recurrent_context(mem->get_mem_recr(), this->ubatches)),
status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
}
@ -244,8 +257,8 @@ const llama_ubatch & llama_memory_hybrid_context::get_ubatch() const {
return ubatches[i_next];
}
const llama_kv_cache_unified_context * llama_memory_hybrid_context::get_attn() const {
return static_cast<const llama_kv_cache_unified_context *>(ctx_attn.get());
const llama_kv_cache_context * llama_memory_hybrid_context::get_attn() const {
return static_cast<const llama_kv_cache_context *>(ctx_attn.get());
}
const llama_memory_recurrent_context * llama_memory_hybrid_context::get_recr() const {

View File

@ -2,7 +2,7 @@
#include "llama-batch.h"
#include "llama-graph.h"
#include "llama-kv-cache-unified.h"
#include "llama-kv-cache.h"
#include "llama-memory.h"
#include "llama-memory-recurrent.h"
@ -13,15 +13,11 @@
// llama_memory_hybrid
//
// utilizes instances of llama_memory_recurrent and llama_kv_cache_unified to
// utilizes instances of llama_memory_recurrent and llama_kv_cache to
// support models where each layer may be either attention-based or recurrent
class llama_memory_hybrid : public llama_memory_i {
public:
// this callback is used to filter out layers that should not be included in the cache
using layer_filter_cb = std::function<bool(int32_t il)>;
llama_memory_hybrid(
const llama_model & model,
/* attn */
@ -41,8 +37,8 @@ public:
bool offload,
bool unified,
/* layer filters */
layer_filter_cb && filter_attn = nullptr,
layer_filter_cb && filter_recr = nullptr);
const layer_filter_cb & filter_attn = nullptr,
const layer_filter_cb & filter_recr = nullptr);
~llama_memory_hybrid() = default;
@ -72,28 +68,30 @@ public:
llama_pos seq_pos_min(llama_seq_id seq_id) const override;
llama_pos seq_pos_max(llama_seq_id seq_id) const override;
std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
// state write/load
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
//
// llama_memory_hybrid specific API
//
llama_kv_cache_unified * get_mem_attn() const;
llama_kv_cache * get_mem_attn() const;
llama_memory_recurrent * get_mem_recr() const;
private:
const llama_hparams & hparams;
const std::unique_ptr<llama_kv_cache_unified> mem_attn;
const std::unique_ptr<llama_kv_cache> mem_attn;
const std::unique_ptr<llama_memory_recurrent> mem_recr;
};
class llama_memory_hybrid_context : public llama_memory_context_i {
public:
using slot_info_vec_t = llama_kv_cache_unified::slot_info_vec_t;
using slot_info_vec_t = llama_kv_cache::slot_info_vec_t;
// init failure
explicit llama_memory_hybrid_context(llama_memory_status status);
@ -125,7 +123,7 @@ public:
// llama_memory_hybrid_context
//
const llama_kv_cache_unified_context * get_attn() const;
const llama_kv_cache_context * get_attn() const;
const llama_memory_recurrent_context * get_recr() const;
private:

View File

@ -17,12 +17,12 @@
llama_memory_recurrent::llama_memory_recurrent(
const llama_model & model,
layer_filter_cb && filter,
ggml_type type_r,
ggml_type type_s,
bool offload,
uint32_t mem_size,
uint32_t n_seq_max) : hparams(model.hparams), n_seq_max(n_seq_max) {
uint32_t n_seq_max,
const layer_filter_cb & filter) : hparams(model.hparams), n_seq_max(n_seq_max) {
const int32_t n_layer = hparams.n_layer;
head = 0;
@ -359,6 +359,14 @@ llama_pos llama_memory_recurrent::seq_pos_max(llama_seq_id seq_id) const {
return result;
}
std::map<ggml_backend_buffer_type_t, size_t> llama_memory_recurrent::memory_breakdown() const {
std::map<ggml_backend_buffer_type_t, size_t> ret;
for (const ggml_backend_buffer_ptr & buf_ptr : bufs) {
ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
}
return ret;
}
llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
do {
balloc.split_reset();
@ -680,7 +688,9 @@ size_t llama_memory_recurrent::size_s_bytes() const {
return size_s_bytes;
}
void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
GGML_UNUSED(flags);
std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
uint32_t cell_count = 0;
@ -718,7 +728,9 @@ void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq
state_write_data(io, cell_ranges);
}
void llama_memory_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
void llama_memory_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
GGML_UNUSED(flags);
uint32_t cell_count;
io.read_to(&cell_count, sizeof(cell_count));

View File

@ -4,6 +4,7 @@
#include "llama-graph.h"
#include "llama-memory.h"
#include <map>
#include <set>
#include <vector>
@ -12,21 +13,17 @@
//
// TODO: extract the cache state used for graph computation into llama_memory_recurrent_context_i
// see the implementation of llama_kv_cache_unified_context_i for an example how to do it
// see the implementation of llama_kv_cache_context_i for an example how to do it
class llama_memory_recurrent : public llama_memory_i {
public:
// this callback is used to filter out layers that should not be included in the cache
using layer_filter_cb = std::function<bool(int32_t il)>;
llama_memory_recurrent(
const llama_model & model,
layer_filter_cb && filter,
ggml_type type_r,
ggml_type type_s,
bool offload,
uint32_t mem_size,
uint32_t n_seq_max);
uint32_t n_seq_max,
const layer_filter_cb & filter);
~llama_memory_recurrent() = default;
@ -54,6 +51,8 @@ public:
llama_pos seq_pos_min(llama_seq_id seq_id) const override;
llama_pos seq_pos_max(llama_seq_id seq_id) const override;
std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
bool prepare(const std::vector<llama_ubatch> & ubatches);
// find a contiguous slot of memory cells and emplace the ubatch there
@ -63,8 +62,8 @@ public:
// state write/load
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot())
uint32_t size = 0; // total number of cells, shared across all sequences

View File

@ -2,7 +2,9 @@
#include "llama.h"
#include <map>
#include <memory>
#include <functional>
struct llama_ubatch;
@ -36,8 +38,8 @@ bool llama_memory_status_is_fail(llama_memory_status status);
// the interface for managing the memory context during batch processing
// this interface is implemented per memory type. see:
// - llama_kv_cache_unified_context
// - llama_kv_cache_unified_iswa_context
// - llama_kv_cache_context
// - llama_kv_cache_iswa_context
// ...
//
// the only method that should mutate the memory and the memory context is llama_memory_i::apply()
@ -64,6 +66,13 @@ using llama_memory_context_ptr = std::unique_ptr<llama_memory_context_i>;
// general concept of LLM memory
// the KV cache is a type of LLM memory, but there can be other types
struct llama_memory_i {
// this callback is used to filter out layers that should not be included in the cache
using layer_filter_cb = std::function<bool(int32_t il)>;
// this callback is used to specify which layers should reuse memory from other layers
// return negative value to indicate that the layer il should not reuse memory
using layer_reuse_cb = std::function<int32_t(int32_t il)>;
virtual ~llama_memory_i() = default;
// split the input batch into a set of ubatches and verify that they can fit into the cache
@ -77,7 +86,7 @@ struct llama_memory_i {
// simulate full cache, used for allocating worst-case compute buffers
virtual llama_memory_context_ptr init_full() = 0;
// prepare for any pending memory updates, such as shifts, defrags, etc.
// prepare for any pending memory updates, such as shifts, copies, etc.
// status == LLAMA_MEMORY_STATUS_NO_UPDATE if there is nothing to update
virtual llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) = 0;
@ -100,17 +109,14 @@ struct llama_memory_i {
virtual llama_pos seq_pos_min(llama_seq_id seq_id) const = 0;
virtual llama_pos seq_pos_max(llama_seq_id seq_id) const = 0;
virtual std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const = 0;
//
// state write/read
//
virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const = 0;
virtual void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) = 0;
virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const = 0;
virtual void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) = 0;
};
using llama_memory_ptr = std::unique_ptr<llama_memory_i>;
// TODO: temporary until the llama_kv_cache is removed from the public API
struct llama_kv_cache : public llama_memory_i {
virtual ~llama_kv_cache() = default;
};

View File

@ -789,6 +789,7 @@ const struct ggml_tensor * llama_model_loader::check_tensor_dims(const std::stri
}
struct ggml_tensor * llama_model_loader::create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags) {
LLAMA_LOG_DEBUG("%s: loading tensor %s\n", __func__, name.c_str());
const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));
if (cur == NULL) {

File diff suppressed because it is too large Load Diff

View File

@ -7,6 +7,7 @@
#include "llama-memory.h"
#include "llama-vocab.h"
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
@ -28,6 +29,7 @@ enum llm_type {
LLM_TYPE_80M,
LLM_TYPE_109M,
LLM_TYPE_137M,
LLM_TYPE_140M,
LLM_TYPE_160M,
LLM_TYPE_190M,
LLM_TYPE_220M,
@ -36,12 +38,15 @@ enum llm_type {
LLM_TYPE_270M,
LLM_TYPE_335M,
LLM_TYPE_350M,
LLM_TYPE_360M,
LLM_TYPE_410M,
LLM_TYPE_450M,
LLM_TYPE_475M,
LLM_TYPE_558M,
LLM_TYPE_700M,
LLM_TYPE_770M,
LLM_TYPE_780M,
LLM_TYPE_950M,
LLM_TYPE_0_3B,
LLM_TYPE_0_5B,
LLM_TYPE_0_6B,
@ -54,6 +59,7 @@ enum llm_type {
LLM_TYPE_1_7B,
LLM_TYPE_1_8B,
LLM_TYPE_2B,
LLM_TYPE_2_6B,
LLM_TYPE_2_8B,
LLM_TYPE_2_9B,
LLM_TYPE_3B,
@ -76,9 +82,11 @@ enum llm_type {
LLM_TYPE_32B,
LLM_TYPE_34B,
LLM_TYPE_35B,
LLM_TYPE_36B,
LLM_TYPE_40B,
LLM_TYPE_65B,
LLM_TYPE_70B,
LLM_TYPE_120B,
LLM_TYPE_142B,
LLM_TYPE_236B,
LLM_TYPE_290B,
@ -268,6 +276,11 @@ struct llama_layer {
struct ggml_tensor * ffn_down_shexp = nullptr;
struct ggml_tensor * ffn_up_shexp = nullptr;
// ff adjugate experts (chexps)
struct ggml_tensor * ffn_gate_chexps = nullptr;
struct ggml_tensor * ffn_down_chexps = nullptr;
struct ggml_tensor * ffn_up_chexps = nullptr;
// ff bias
struct ggml_tensor * ffn_gate_b = nullptr;
struct ggml_tensor * ffn_down_b = nullptr; // b2
@ -449,10 +462,12 @@ struct llama_model {
std::string desc() const;
size_t size() const;
size_t size() const; // file size
size_t n_tensors() const;
size_t n_devices() const;
std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const;
// total number of parameters in the model
uint64_t n_elements() const;

View File

@ -725,7 +725,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
// attention layers have a non-zero number of kv heads
int32_t n_attn_layer = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0);
if (llama_model_has_encoder(&model)) {
n_attn_layer *= 3;
// now n_attn_layer is the number of attention layers in the encoder
// for each decoder block, there are 2 attention layers
n_attn_layer += 2 * model.hparams.dec_n_layer;
}
GGML_ASSERT((qs.n_attention_wv == n_attn_layer - pruned_attention_w) && "n_attention_wv is unexpected");
}
@ -920,7 +922,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
new_type = tensor->type;
new_data = tensor->data;
new_size = ggml_nbytes(tensor);
LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
LLAMA_LOG_INFO("size = %8.3f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0);
} else {
const int64_t nelements = ggml_nelements(tensor);
@ -1037,8 +1039,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
}
close_ofstream();
LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
LLAMA_LOG_INFO("%s: model size = %8.2f MiB\n", __func__, total_size_org/1024.0/1024.0);
LLAMA_LOG_INFO("%s: quant size = %8.2f MiB\n", __func__, total_size_new/1024.0/1024.0);
if (qs.n_fallback > 0) {
LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n",

View File

@ -128,6 +128,89 @@ struct ring_buffer {
std::vector<T> data;
};
// writes result in res, does not mutate cur
static void llama_token_data_array_partial_sort(const llama_token_data_array & cur, int npartial, std::vector<llama_token_data> & res) {
static const auto comp = [](const llama_token_data & a, const llama_token_data & b) {
return a.logit > b.logit;
};
constexpr int nbuckets = 128;
constexpr float bucket_low = -10.0f;
constexpr float bucket_high = 10.0f;
constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
constexpr float bucket_inter = -bucket_low * bucket_scale;
std::vector<int> bucket_idx;
std::vector<int> histo(nbuckets, 0);
std::vector<llama_token_data*> bucket_ptrs;
bucket_idx.reserve(cur.size);
for (int i = 0; i < (int)cur.size; ++i) {
const float val = cur.data[i].logit;
int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
ib = std::max(0, std::min(nbuckets - 1, ib));
bucket_idx.push_back(ib);
++histo[ib];
}
int nhave = 0;
int ib = nbuckets - 1;
for ( ; ib >= 0; --ib) {
nhave += histo[ib];
if (nhave >= npartial) {
break;
}
}
res.resize(nhave);
auto * ptr = res.data();
bucket_ptrs.reserve(nbuckets - ib);
for (int j = nbuckets - 1; j >= ib; --j) {
bucket_ptrs.push_back(ptr);
ptr += histo[j];
}
for (int i = 0; i < (int)cur.size; ++i) {
int j = bucket_idx[i];
if (j >= ib) {
*bucket_ptrs[nbuckets - 1 - j]++ = cur.data[i];
}
}
ptr = res.data();
int ndone = 0;
for (int j = nbuckets - 1; j > ib; --j) {
std::sort(ptr, ptr + histo[j], comp);
ptr += histo[j];
ndone += histo[j];
}
std::partial_sort(ptr, ptr + npartial - ndone, ptr + histo[ib], comp);
}
// reduces the size of cur_p to npartial, keeping only the top npartial elements
static void llama_token_data_array_partial_sort_inplace(llama_token_data_array * cur_p, int npartial) {
static const auto comp = [](const llama_token_data & a, const llama_token_data & b) {
return a.logit > b.logit;
};
if (npartial <= 128) {
std::partial_sort(cur_p->data, cur_p->data + npartial, cur_p->data + cur_p->size, comp);
cur_p->size = npartial;
cur_p->sorted = true;
return;
}
std::vector<llama_token_data> tmp;
llama_token_data_array_partial_sort(*cur_p, npartial, tmp);
std::copy(tmp.data(), tmp.data() + npartial, cur_p->data);
cur_p->size = npartial;
cur_p->sorted = true;
}
static int llama_sample_dist(llama_token_data_array * cur_p, std::mt19937 & rng) {
// iterator for the probabilities
#ifdef __GNUC__
@ -200,18 +283,21 @@ static void llama_sampler_temp_impl(llama_token_data_array * cur_p, float temp)
}
}
static void llama_sampler_softmax_impl(llama_token_data_array * cur_p) {
static void llama_sampler_softmax_impl(llama_token_data_array * cur_p, bool do_sort) {
GGML_ASSERT(cur_p->size > 0);
// Sort the logits in descending order
if (!cur_p->sorted) {
std::sort(cur_p->data, cur_p->data + cur_p->size, [](const llama_token_data & a, const llama_token_data & b) {
return a.logit > b.logit;
});
cur_p->sorted = true;
// Sort the logits in descending order if requested
if (do_sort && !cur_p->sorted) {
llama_token_data_array_partial_sort_inplace(cur_p, cur_p->size);
}
float max_l = cur_p->data[0].logit;
if (!cur_p->sorted) {
for (size_t i = 1; i < cur_p->size; ++i) {
max_l = std::max(max_l, cur_p->data[i].logit);
}
}
float cum_sum = 0.0f;
for (size_t i = 0; i < cur_p->size; ++i) {
@ -226,7 +312,6 @@ static void llama_sampler_softmax_impl(llama_token_data_array * cur_p) {
}
static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) {
// TODO: move bucket sort to separate function so that top_p/typical/softmax first is equally fast
// if (k >= (int32_t)cur_p->size) {
// return;
// }
@ -239,64 +324,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
// Sort scores in descending order
if (!cur_p->sorted) {
auto comp = [](const llama_token_data & a, const llama_token_data & b) {
return a.logit > b.logit;
};
if (k <= 128) {
std::partial_sort(cur_p->data, cur_p->data + k, cur_p->data + cur_p->size, comp);
} else {
constexpr int nbuckets = 128;
constexpr float bucket_low = -10.0f;
constexpr float bucket_high = 10.0f;
constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
constexpr float bucket_inter = -bucket_low * bucket_scale;
std::vector<int> bucket_idx(cur_p->size);
std::vector<int> histo(nbuckets, 0);
for (int i = 0; i < (int)cur_p->size; ++i) {
const float val = cur_p->data[i].logit;
int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
ib = std::max(0, std::min(nbuckets - 1, ib));
bucket_idx[i] = ib;
++histo[ib];
}
int nhave = 0;
int ib = nbuckets - 1;
for ( ; ib >= 0; --ib) {
nhave += histo[ib];
if (nhave >= k) {
break;
}
}
std::vector<llama_token_data> tmp_tokens(nhave);
auto * ptr = tmp_tokens.data();
std::vector<llama_token_data*> bucket_ptrs;
bucket_ptrs.reserve(nbuckets - ib);
for (int j = nbuckets - 1; j >= ib; --j) {
bucket_ptrs.push_back(ptr);
ptr += histo[j];
}
for (int i = 0; i < (int)cur_p->size; ++i) {
int j = bucket_idx[i];
if (j >= ib) {
*bucket_ptrs[nbuckets - 1 - j]++ = cur_p->data[i];
}
}
ptr = tmp_tokens.data();
int ndone = 0;
for (int j = nbuckets - 1; j > ib; --j) {
std::sort(ptr, ptr + histo[j], comp);
ptr += histo[j];
ndone += histo[j];
}
std::partial_sort(ptr, ptr + k - ndone, ptr + histo[ib], comp);
std::memcpy(cur_p->data, tmp_tokens.data(), k*sizeof(llama_token_data));
}
cur_p->sorted = true;
llama_token_data_array_partial_sort_inplace(cur_p, k);
}
cur_p->size = k;
@ -576,9 +604,73 @@ static const char * llama_sampler_dist_name(const struct llama_sampler * /*smpl*
static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
auto * ctx = (llama_sampler_dist *) smpl->ctx;
llama_sampler_softmax_impl(cur_p);
// edge cases
if (cur_p->size == 0) {
cur_p->selected = -1;
return;
}
cur_p->selected = 0;
if (cur_p->size == 1) {
cur_p->data[0].p = 1.0f;
return;
}
// max logit for numerical stability
float max_l = cur_p->data[0].logit;
if (!cur_p->sorted) {
for (size_t i = 1; i < cur_p->size; ++i) {
max_l = std::max(max_l, cur_p->data[i].logit);
}
}
// apply softmax to obtain the probabilities
double sum_cum = 0.0f;
for (size_t i = 0; i < cur_p->size; ++i) {
float p = expf(cur_p->data[i].logit - max_l);
cur_p->data[i].p = p;
sum_cum += p;
}
#if 1
// sample from the obtained probabilities and normalize the probs in a single pass
// this is ~3x faster on Mac with full gpt-oss vocab than the version below
//
std::uniform_real_distribution<double> dist(0.0f, 1.0f);
const double rnd = dist(ctx->rng);
double sum_run = 0.0f;
const double sum_tgt = sum_cum*rnd;
bool found = false;
for (size_t i = 0; i < cur_p->size; ++i) {
if (!found) {
// accumulate probs until we reach the target sum
sum_run += cur_p->data[i].p;
if (sum_run >= sum_tgt) {
cur_p->selected = i;
found = true;
}
}
// normalize probs
cur_p->data[i].p /= sum_cum;
}
// fallback to the last token (don't think this can happen)
assert(found);
if (!found) {
cur_p->selected = cur_p->size - 1;
}
#else
// for clarity, this is the same as above but does one pass for normalization and one extra pass for sampling
for (size_t i = 0; i < cur_p->size; ++i) {
cur_p->data[i].p /= sum_cum;
}
cur_p->selected = llama_sample_dist(cur_p, ctx->rng);
#endif
}
static struct llama_sampler * llama_sampler_dist_clone(const struct llama_sampler * smpl) {
@ -626,32 +718,6 @@ struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
);
}
// softmax
static const char * llama_sampler_softmax_name(const struct llama_sampler * /*smpl*/) {
return "softmax";
}
static void llama_sampler_softmax_apply(struct llama_sampler * /*smpl*/, llama_token_data_array * cur_p) {
llama_sampler_softmax_impl(cur_p);
}
static struct llama_sampler_i llama_sampler_softmax_i = {
/* .name = */ llama_sampler_softmax_name,
/* .accept = */ nullptr,
/* .apply = */ llama_sampler_softmax_apply,
/* .reset = */ nullptr,
/* .clone = */ nullptr,
/* .free = */ nullptr,
};
struct llama_sampler * llama_sampler_init_softmax() {
return llama_sampler_init(
/* .iface = */ &llama_sampler_softmax_i,
/* .ctx = */ nullptr
);
}
// top-k
struct llama_sampler_top_k {
@ -663,7 +729,7 @@ static const char * llama_sampler_top_k_name(const struct llama_sampler * /*smpl
}
static void llama_sampler_top_k_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
const auto * ctx = (llama_sampler_top_k *) smpl->ctx;
auto * ctx = (llama_sampler_top_k *) smpl->ctx;
llama_sampler_top_k_impl(cur_p, ctx->k);
}
@ -699,6 +765,8 @@ struct llama_sampler * llama_sampler_init_top_k(int32_t k) {
struct llama_sampler_top_p {
const float p;
const size_t min_keep;
std::vector<llama_token_data> buf_sort;
};
static const char * llama_sampler_top_p_name(const struct llama_sampler * /*smpl*/) {
@ -706,20 +774,35 @@ static const char * llama_sampler_top_p_name(const struct llama_sampler * /*smpl
}
static void llama_sampler_top_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
const auto * ctx = (llama_sampler_top_p *) smpl->ctx;
auto * ctx = (llama_sampler_top_p *) smpl->ctx;
if (ctx->p >= 1.0f) {
return;
}
llama_sampler_softmax_impl(cur_p);
llama_sampler_softmax_impl(cur_p, false);
size_t k = cur_p->size;
auto * pdata = cur_p->data;
auto & buf_sort = ctx->buf_sort;
// if not sorted, try adaptive top-k sorting
if (!cur_p->sorted && cur_p->size > 1024) {
k = std::min<size_t>(256, cur_p->size);
llama_token_data_array_partial_sort(*cur_p, k, buf_sort);
pdata = buf_sort.data();
} else if (!cur_p->sorted) {
// small candidates -> sort inplace
llama_token_data_array_partial_sort_inplace(cur_p, k);
}
// Compute the cumulative probabilities
float cum_sum = 0.0f;
size_t last_idx = cur_p->size;
for (size_t i = 0; i < cur_p->size; ++i) {
cum_sum += cur_p->data[i].p;
cum_sum += pdata[i].p;
// Check if the running sum is at least p or if we have kept at least min_keep tokens
// we set the last index to i+1 to indicate that the current iterate should be included in the set
@ -727,9 +810,21 @@ static void llama_sampler_top_p_apply(struct llama_sampler * smpl, llama_token_d
last_idx = i + 1;
break;
}
// we exceeded the current top-k heuristic -> increase k and continue
if (!cur_p->sorted && i == k - 1) {
k = cur_p->size;
llama_token_data_array_partial_sort(*cur_p, k, buf_sort);
pdata = buf_sort.data();
}
}
// Resize the output vector to keep only the top-p tokens
if (!cur_p->sorted) {
std::copy(buf_sort.data(), buf_sort.data() + last_idx, cur_p->data);
cur_p->sorted = true;
}
cur_p->size = last_idx;
}
@ -757,6 +852,7 @@ struct llama_sampler * llama_sampler_init_top_p(float p, size_t min_keep) {
/* .ctx = */ new llama_sampler_top_p {
/* .p = */ p,
/* .min_keep = */ min_keep,
/* .buf_sort = */ {},
}
);
}
@ -773,7 +869,7 @@ static const char * llama_sampler_min_p_name(const struct llama_sampler * /*smpl
}
static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
const auto * ctx = (llama_sampler_min_p *) smpl->ctx;
auto * ctx = (llama_sampler_min_p *) smpl->ctx;
if (ctx->p <= 0.0f || !cur_p->size) {
return;
@ -799,7 +895,7 @@ static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_d
// if we have enough values the operation was a success
if (!filtered_tokens.empty() && filtered_tokens.size() >= ctx->min_keep) {
memcpy(cur_p->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data));
std::copy(filtered_tokens.begin(), filtered_tokens.end(), cur_p->data);
cur_p->size = filtered_tokens.size();
min_p_applied = true;
}
@ -809,10 +905,7 @@ static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_d
if (!min_p_applied) {
// Sort the logits in descending order
if (!cur_p->sorted) {
std::sort(cur_p->data, cur_p->data + cur_p->size, [](const llama_token_data & a, const llama_token_data & b) {
return a.logit > b.logit;
});
cur_p->sorted = true;
llama_token_data_array_partial_sort_inplace(cur_p, cur_p->size);
}
const float min_logit = cur_p->data[0].logit + logf(ctx->p); // min logit for p_i >= p * p_max
@ -869,7 +962,7 @@ static const char * llama_sampler_typical_name(const struct llama_sampler * /*sm
}
static void llama_sampler_typical_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
const auto * ctx = (llama_sampler_typical *) smpl->ctx;
auto * ctx = (llama_sampler_typical *) smpl->ctx;
// Reference implementation:
// https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr
@ -878,7 +971,7 @@ static void llama_sampler_typical_apply(struct llama_sampler * smpl, llama_token
}
// Compute the softmax of logits and calculate entropy
llama_sampler_softmax_impl(cur_p);
llama_sampler_softmax_impl(cur_p, true);
float entropy = 0.0f;
for (size_t i = 0; i < cur_p->size; ++i) {
@ -1012,7 +1105,7 @@ static const char * llama_sampler_temp_ext_name(const struct llama_sampler * /*s
}
static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
const auto * ctx = (llama_sampler_temp_ext *) smpl->ctx;
auto * ctx = (llama_sampler_temp_ext *) smpl->ctx;
if (ctx->delta > 0) {
const float min_temp = std::max(0.0f, ctx->temp - ctx->delta);
const float max_temp = ctx->temp + ctx->delta;
@ -1027,7 +1120,7 @@ static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_toke
// Calculate maximum possible entropy
float max_entropy = -logf(1.0f / cur_p->size);
llama_sampler_softmax_impl(cur_p);
llama_sampler_softmax_impl(cur_p, true);
// Calculate entropy of the softmax probabilities
float entropy = 0.0f;
@ -1139,17 +1232,20 @@ static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data
std::uniform_real_distribution<float> distribution(0.0f, 1.0f);
float chance = distribution(ctx->rng);
if (chance > ctx->probability) return;
if (chance > ctx->probability) {
return;
}
// in case it's not sorted/recalculated yet
llama_sampler_softmax_impl(cur_p);
llama_sampler_softmax_impl(cur_p, true);
int pos_last = 0;
for (size_t i = 0; i < cur_p->size; ++i) {
if (cur_p->data[i].p >= ctx->threshold) {
pos_last = i;
} else break;
} else {
break;
}
}
if (cur_p->size - pos_last >= ctx->min_keep && pos_last > 0) {
@ -1231,7 +1327,7 @@ static const char * llama_sampler_mirostat_name(const struct llama_sampler * /*s
static void llama_sampler_mirostat_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
auto * ctx = (llama_sampler_mirostat *) smpl->ctx;
llama_sampler_softmax_impl(cur_p);
llama_sampler_softmax_impl(cur_p, true);
// Estimate s_hat using the most probable m tokens
float s_hat = 0.0;
@ -1250,7 +1346,8 @@ static void llama_sampler_mirostat_apply(struct llama_sampler * smpl, llama_toke
float k = powf((epsilon_hat * powf(2, ctx->mu)) / (1 - powf(ctx->n_vocab, -epsilon_hat)), 1 / s_hat);
llama_sampler_top_k_impl(cur_p, std::max(int(k), 1));
llama_sampler_softmax_impl(cur_p);
llama_sampler_softmax_impl(cur_p, true);
const int idx = llama_sample_dist(cur_p, ctx->rng);
@ -1336,7 +1433,7 @@ static const char * llama_sampler_mirostat_v2_name(const struct llama_sampler *
static void llama_sampler_mirostat_v2_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
auto * ctx = (llama_sampler_mirostat_v2 *) smpl->ctx;
llama_sampler_softmax_impl(cur_p);
llama_sampler_softmax_impl(cur_p, true);
// Truncate the words with surprise values greater than mu
cur_p->size = std::distance(cur_p->data, std::find_if(cur_p->data, cur_p->data + cur_p->size, [&](const llama_token_data & candidate) {
@ -1348,7 +1445,7 @@ static void llama_sampler_mirostat_v2_apply(struct llama_sampler * smpl, llama_t
}
// Normalize the probabilities of the remaining words
llama_sampler_softmax_impl(cur_p);
llama_sampler_softmax_impl(cur_p, true);
const int idx = llama_sample_dist(cur_p, ctx->rng);
@ -1540,7 +1637,7 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
trigger_pattern += std::regex_replace(trigger_words[i], special_chars, "\\$0");
}
trigger_pattern += ")[\\s\\S]*";
auto trigger_pattern_c = trigger_pattern.c_str();
const auto * trigger_pattern_c = trigger_pattern.c_str();
trigger_patterns = &trigger_pattern_c;
num_trigger_patterns = 1;
}
@ -1748,7 +1845,7 @@ static const char * llama_sampler_top_n_sigma_name(const struct llama_sampler *
}
static void llama_sampler_top_n_sigma_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
const auto * ctx = (llama_sampler_top_n_sigma *) smpl->ctx;
auto * ctx = (llama_sampler_top_n_sigma *) smpl->ctx;
if (ctx->n <= 0.0f || cur_p->size <= 1) {
return;
@ -1780,13 +1877,14 @@ static void llama_sampler_top_n_sigma_apply(struct llama_sampler * smpl, llama_t
}
float std = valid_count > 0 ? sqrt(acc/valid_count) : 0;
//apply mask
// apply mask
for (size_t i = 0; i < cur_p->size; ++i) {
if (cur_p->data[i].logit < max - (ctx->n * std)) {
cur_p->data[i].logit = -INFINITY;
}
}
llama_sampler_softmax_impl(cur_p);
llama_sampler_softmax_impl(cur_p, true);
}
static struct llama_sampler * llama_sampler_top_n_sigma_clone(const struct llama_sampler * smpl) {
@ -1991,7 +2089,9 @@ static void llama_sampler_dry_apply(struct llama_sampler * smpl, llama_token_dat
{
const int last = last_n_repeat - 1;
int rt = 0, lt = 0;
int rt = 0;
int lt = 0;
for (int k = 1; k < last_n_repeat; ++k) {
if (k > rt) {
@ -2135,8 +2235,8 @@ static struct llama_sampler_i llama_sampler_dry_i = {
/* .free = */ llama_sampler_dry_free,
};
struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab, int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
int32_t effective_dry_penalty_last_n = (dry_penalty_last_n == -1) ? context_size : std::max(dry_penalty_last_n, 0);
struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab, int32_t n_ctx_train, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
int32_t effective_dry_penalty_last_n = (dry_penalty_last_n == -1) ? n_ctx_train : std::max(dry_penalty_last_n, 0);
std::unordered_multimap<llama_token, std::vector<llama_token>> processed_breakers;
const int MAX_CHAR_LEN = 40;
const int MAX_SEQ_LEN = 20;
@ -2169,7 +2269,7 @@ struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab,
return llama_sampler_init(
/* .iface = */ &llama_sampler_dry_i,
/* .ctx = */ new llama_sampler_dry {
/* .total_context_size = */ context_size,
/* .total_context_size = */ n_ctx_train,
/* .dry_multiplier = */ dry_multiplier,
/* .dry_base = */ dry_base,
/* .dry_allowed_length = */ dry_allowed_length,
@ -2308,7 +2408,7 @@ static const char * llama_sampler_infill_name(const struct llama_sampler * /*smp
static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
auto * ctx = (llama_sampler_infill *) smpl->ctx;
llama_sampler_softmax_impl(cur_p);
llama_sampler_softmax_impl(cur_p, true);
#if defined(GGML_DEBUG_SAMPLER_INFILL)
#define LOG_DBG_CUR LLAMA_LOG_DEBUG

View File

@ -434,6 +434,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
};
break;
case LLAMA_VOCAB_PRE_TYPE_GROK_2:
regex_exprs = {
// original regex from tokenizer.json
// "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
};
break;
default:
// default regex for BPE tokenization pre-processing
regex_exprs = {
@ -1763,7 +1770,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
const size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx);
const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
#ifdef IS_BIG_ENDIAN
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
// correct endiannes of data in precompiled_charsmap binary blob
uint32_t * xcda_blob_size = (uint32_t *) &precompiled_charsmap[0];
*xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
@ -1944,7 +1951,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION;
clean_spaces = false;
} else if (
tokenizer_pre == "bailingmoe") {
tokenizer_pre == "bailingmoe" ||
tokenizer_pre == "llada-moe") {
pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
clean_spaces = false;
} else if (
@ -1963,6 +1971,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "kimi-k2") {
pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;
clean_spaces = false;
} else if (
tokenizer_pre == "grok-2") {
pre_type = LLAMA_VOCAB_PRE_TYPE_GROK_2;
clean_spaces = false;
} else {
LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
@ -2331,7 +2343,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
// @ngxson : quick hack for gpt-oss, always render these tokens
for (const auto & t : token_to_id) {
if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>") {
if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") {
id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
}
}
@ -2378,6 +2390,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
if (has_return && has_call && has_end) {
special_eog_ids.erase(end_id);
id_to_token[end_id].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
}
}
@ -2459,7 +2472,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
// set attributes by model/tokenizer/architecture name
if (false
|| _contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})
|| _contains_any(general_arch, {"nomic-bert-moe"})
|| _contains_any(general_arch, {"nomic-bert-moe", "jina-bert-v3"})
) {
if (token_to_id.count("<mask>") == 0) {
LLAMA_LOG_WARN("%s: Mask token is missing in vocab, please reconvert model!\n", __func__);

View File

@ -47,6 +47,7 @@ enum llama_vocab_pre_type {
LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36,
LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37,
LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38,
LLAMA_VOCAB_PRE_TYPE_GROK_2 = 39,
};
struct LLM_KV;

View File

@ -25,6 +25,18 @@
// interface implementation
//
const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type) {
switch (flash_attn_type) {
case LLAMA_FLASH_ATTN_TYPE_AUTO:
return "auto";
case LLAMA_FLASH_ATTN_TYPE_DISABLED:
return "disabled";
case LLAMA_FLASH_ATTN_TYPE_ENABLED:
return "enabled";
}
GGML_ABORT("fatal error");
}
struct llama_sampler_chain_params llama_sampler_chain_default_params() {
struct llama_sampler_chain_params result = {
/*.no_perf =*/ true,
@ -47,6 +59,7 @@ bool llama_supports_mlock(void) {
bool llama_supports_gpu_offload(void) {
return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU) != nullptr ||
llama_supports_rpc();
}
@ -71,8 +84,10 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
GGML_ASSERT(dev && "CPU backend is not loaded");
auto * reg = ggml_backend_dev_backend_reg(dev);
auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init");
if (numa_init_fn) {
numa_init_fn(numa);
}
}
}
void llama_backend_free(void) {
@ -170,8 +185,13 @@ static struct llama_model * llama_model_load_from_file_impl(
model->devices.push_back(*dev);
}
} else {
// default device selection
// build list of available devices
std::vector<ggml_backend_dev_t> gpus;
std::vector<ggml_backend_dev_t> igpus;
std::vector<ggml_backend_dev_t> rpc_servers;
// use all available devices
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
switch (ggml_backend_dev_type(dev)) {
@ -180,19 +200,51 @@ static struct llama_model * llama_model_load_from_file_impl(
// skip CPU backends since they are handled separately
break;
case GGML_BACKEND_DEVICE_TYPE_GPU:
case GGML_BACKEND_DEVICE_TYPE_GPU: {
ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
if (ggml_backend_reg_name(reg) == std::string("RPC")) {
rpc_servers.push_back(dev);
} else {
model->devices.push_back(dev);
// check if there is already a GPU with the same device id
ggml_backend_dev_props props;
ggml_backend_dev_get_props(dev, &props);
auto it = std::find_if(gpus.begin(), gpus.end(), [&props](ggml_backend_dev_t d) {
ggml_backend_dev_props d_props;
ggml_backend_dev_get_props(d, &d_props);
if (props.device_id && d_props.device_id) {
return strcmp(props.device_id, d_props.device_id) == 0;
}
return false;
});
if (it != gpus.end()) {
LLAMA_LOG_INFO("%s: skipping device %s (%s) with id %s - already using device %s (%s) with the same id\n",
__func__,
ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
props.device_id ? props.device_id : "unknown id",
ggml_backend_dev_name(*it), ggml_backend_dev_description(*it));
} else {
gpus.push_back(dev);
}
}
break;
}
case GGML_BACKEND_DEVICE_TYPE_IGPU:
igpus.push_back(dev);
break;
}
// add RPC servers at the front of the list
if (!rpc_servers.empty()) {
}
// add RPC servers at the front of the list to minimize network transfers
model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
// add GPUs
model->devices.insert(model->devices.end(), gpus.begin(), gpus.end());
// add integrated GPUs only if no other devices were found
if (model->devices.empty()) {
model->devices.insert(model->devices.end(), igpus.begin(), igpus.end());
}
}
@ -213,9 +265,12 @@ static struct llama_model * llama_model_load_from_file_impl(
}
for (auto * dev : model->devices) {
size_t free, total; // NOLINT
ggml_backend_dev_memory(dev, &free, &total);
LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024);
ggml_backend_dev_props props;
ggml_backend_dev_get_props(dev, &props);
LLAMA_LOG_INFO("%s: using device %s (%s) (%s) - %zu MiB free\n", __func__,
ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
props.device_id ? props.device_id : "unknown id",
props.memory_free/1024/1024);
}
const int status = llama_model_load(path_model, splits, *model, params);

View File

@ -4,6 +4,7 @@
#include <string>
#include <vector>
// TODO: reimplement this structure in endian-independent way
struct unicode_cpt_flags {
enum {
UNDEFINED = 0x0001,
@ -15,6 +16,10 @@ struct unicode_cpt_flags {
SYMBOL = 0x0040, // regex: \p{S}
CONTROL = 0x0080, // regex: \p{C}
MASK_CATEGORIES = 0x00FF,
WHITESPACE = 0x0100,
LOWERCASE = 0x0200,
UPPERCASE = 0x0400,
NFD = 0x0800,
};
// codepoint type
@ -34,11 +39,49 @@ struct unicode_cpt_flags {
// decode from uint16
inline unicode_cpt_flags(const uint16_t flags = 0) {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
*reinterpret_cast<uint16_t*>(this) = flags;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
is_undefined = (flags & UNDEFINED) ? 1 : 0;
is_number = (flags & NUMBER) ? 1 : 0;
is_letter = (flags & LETTER) ? 1 : 0;
is_separator = (flags & SEPARATOR) ? 1 : 0;
is_accent_mark = (flags & ACCENT_MARK) ? 1 : 0;
is_punctuation = (flags & PUNCTUATION) ? 1 : 0;
is_symbol = (flags & SYMBOL) ? 1 : 0;
is_control = (flags & CONTROL) ? 1 : 0;
is_whitespace = (flags & WHITESPACE) ? 1 : 0;
is_lowercase = (flags & LOWERCASE) ? 1 : 0;
is_uppercase = (flags & UPPERCASE) ? 1 : 0;
is_nfd = (flags & NFD) ? 1 : 0;
#else
#error Unexpected or undefined __BYTE_ORDER__
#endif
}
inline uint16_t as_uint() const {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
return *reinterpret_cast<const uint16_t*>(this);
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
uint16_t result =
is_undefined * UNDEFINED
+ is_number * NUMBER
+ is_letter * LETTER
+ is_separator * SEPARATOR
+ is_accent_mark * ACCENT_MARK
+ is_punctuation * PUNCTUATION
+ is_symbol * SYMBOL
+ is_control * CONTROL
+ is_whitespace * WHITESPACE
+ is_lowercase * LOWERCASE
+ is_uppercase * UPPERCASE
+ is_nfd * NFD
;
return result;
#else
#error Unexpected or undefined __BYTE_ORDER__
#endif
}
inline uint16_t category_flag() const {

View File

@ -44,6 +44,7 @@
#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern"
#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size"
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
#define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num"
// audio-specific
#define KEY_A_NUM_MEL_BINS "clip.audio.num_mel_bins"
@ -81,6 +82,7 @@
#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
#define TN_IMAGE_NEWLINE "model.image_newline"
#define TN_MM_INP_NORM "mm.input_norm.weight"
#define TN_MM_INP_NORM_B "mm.input_norm.bias"
#define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3
#define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3
#define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3
@ -132,6 +134,8 @@ enum projector_type {
PROJECTOR_TYPE_QWEN2A,
PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
PROJECTOR_TYPE_VOXTRAL,
PROJECTOR_TYPE_LFM2,
PROJECTOR_TYPE_KIMIVL,
PROJECTOR_TYPE_UNKNOWN,
};
@ -152,6 +156,8 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_QWEN2A, "qwen2a"},
{ PROJECTOR_TYPE_QWEN25O, "qwen2.5o"},
{ PROJECTOR_TYPE_VOXTRAL, "voxtral"},
{ PROJECTOR_TYPE_LFM2, "lfm2"},
{ PROJECTOR_TYPE_KIMIVL, "kimivl"},
};
static projector_type clip_projector_type_from_string(const std::string & str) {

View File

@ -214,6 +214,7 @@ struct clip_hparams {
// legacy
bool has_llava_projector = false;
int minicpmv_version = 0;
int32_t minicpmv_query_num = 0; // MiniCPM-V query number
};
struct clip_layer {
@ -277,6 +278,7 @@ struct clip_model {
// LLaVA projection
ggml_tensor * mm_input_norm_w = nullptr;
ggml_tensor * mm_input_norm_b = nullptr;
ggml_tensor * mm_0_w = nullptr;
ggml_tensor * mm_0_b = nullptr;
ggml_tensor * mm_2_w = nullptr;
@ -417,6 +419,7 @@ struct clip_ctx {
}
if (!backend) {
backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr);
backend = backend ? backend : ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU, nullptr);
}
}
@ -500,11 +503,17 @@ struct clip_graph {
ggml_cgraph * build_siglip() {
ggml_tensor * inp = build_inp();
ggml_tensor * learned_pos_embd = model.position_embeddings;
if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) {
learned_pos_embd = resize_position_embeddings();
}
ggml_tensor * cur = build_vit(
inp, n_patches,
NORM_TYPE_NORMAL,
hparams.ffn_op,
model.position_embeddings,
learned_pos_embd,
nullptr);
if (ctx->proj_type() == PROJECTOR_TYPE_GEMMA3) {
@ -513,8 +522,8 @@ struct clip_graph {
const int patches_per_image = n_patches_x;
const int kernel_size = hparams.proj_scale_factor;
cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
cur = ggml_reshape_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size);
cur = ggml_transpose(ctx0, cur);
cur = ggml_cont_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size);
// doing a pool2d to reduce the number of output tokens
cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0);
@ -531,29 +540,27 @@ struct clip_graph {
cur);
} else if (ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3) {
// pixel_shuffle
// https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
const int scale_factor = model.hparams.proj_scale_factor;
const int n_embd = cur->ne[0];
const int seq = cur->ne[1];
const int bsz = 1; // batch size, always 1 for now since we don't support batching
const int height = std::sqrt(seq);
const int width = std::sqrt(seq);
GGML_ASSERT(scale_factor != 0);
cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height, bsz);
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur),
n_embd * scale_factor * scale_factor,
height / scale_factor,
width / scale_factor,
bsz);
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
cur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, cur),
n_embd * scale_factor * scale_factor,
seq / (scale_factor * scale_factor),
bsz);
cur = build_patch_merge_permute(cur, scale_factor);
cur = ggml_mul_mat(ctx0, model.projection, cur);
} else if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) {
// pixel unshuffle block
const int scale_factor = model.hparams.proj_scale_factor;
cur = build_patch_merge_permute(cur, scale_factor);
// projection
cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
cur = ggml_add(ctx0, cur, model.mm_1_b);
cur = ggml_gelu(ctx0, cur);
cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
cur = ggml_add(ctx0, cur, model.mm_2_b);
} else {
GGML_ABORT("SigLIP: Unsupported projector type");
}
@ -681,15 +688,15 @@ struct clip_graph {
auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
inp = ggml_add(ctx0, inp, inp_1);
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b]
inp = ggml_reshape_4d(
inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b]
inp = ggml_cont_4d(
ctx0, inp,
n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
inp = ggml_reshape_4d(
ctx0, inp,
n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3));
inp = ggml_reshape_3d(
inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
inp = ggml_cont_3d(
ctx0, inp,
n_embd, n_patches_x * n_patches_y, batch_size);
}
@ -879,21 +886,8 @@ struct clip_graph {
int n_embd = clip_n_mmproj_embd(ctx);
const int d_head = 128;
int n_head = n_embd/d_head;
int num_query = 96;
if (ctx->model.hparams.minicpmv_version == 2) {
// MiniCPM-V 2.5
num_query = 96;
} else if (ctx->model.hparams.minicpmv_version == 3) {
// MiniCPM-V 2.6
num_query = 64;
} else if (ctx->model.hparams.minicpmv_version == 4) {
// MiniCPM-o 2.6
num_query = 64;
} else if (ctx->model.hparams.minicpmv_version == 5) {
// MiniCPM-V 4.0
num_query = 64;
}
// Use actual config value if available, otherwise fall back to hardcoded values
int num_query = ctx->model.hparams.minicpmv_query_num;
ggml_tensor * Q = ggml_add(ctx0,
ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q),
model.mm_model_attn_q_b);
@ -967,14 +961,14 @@ struct clip_graph {
GGML_ASSERT(scale_factor > 0);
cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, height / scale_factor, width, bsz);
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur),
cur = ggml_cont_4d(ctx0, cur,
n_embd * scale_factor * scale_factor,
height / scale_factor,
width / scale_factor,
bsz);
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
// flatten to 2D
cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, cur),
cur = ggml_cont_2d(ctx0, cur,
n_embd * scale_factor * scale_factor,
cur->ne[1] * cur->ne[2]);
}
@ -1060,14 +1054,14 @@ struct clip_graph {
n_patches_y,
bsz);
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur),
cur = ggml_cont_4d(ctx0, cur,
n_embd * scale_factor * scale_factor,
n_patches_x / scale_factor,
n_patches_y / scale_factor,
bsz);
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
//cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
// flatten to 2D
cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, cur),
cur = ggml_cont_2d(ctx0, cur,
n_embd * scale_factor * scale_factor,
n_patches / scale_factor / scale_factor);
cb(cur, "pixel_shuffle", -1);
@ -1092,6 +1086,67 @@ struct clip_graph {
return gf;
}
ggml_cgraph * build_kimivl() {
// 2D input positions
ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
ggml_set_name(pos_h, "pos_h");
ggml_set_input(pos_h);
ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
ggml_set_name(pos_w, "pos_w");
ggml_set_input(pos_w);
ggml_tensor * learned_pos_embd = resize_position_embeddings();
// build ViT with 2D position embeddings
auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
// first half is X axis and second half is Y axis
return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false);
};
ggml_tensor * inp = build_inp();
ggml_tensor * cur = build_vit(
inp, n_patches,
NORM_TYPE_NORMAL,
hparams.ffn_op,
learned_pos_embd,
add_pos);
cb(cur, "vit_out", -1);
{
// patch_merger
const int scale_factor = model.hparams.proj_scale_factor;
cur = build_patch_merge_permute(cur, scale_factor);
// projection norm
int proj_inp_dim = cur->ne[0];
cur = ggml_view_2d(ctx0, cur,
n_embd, cur->ne[1] * scale_factor * scale_factor,
ggml_row_size(cur->type, n_embd), 0);
cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
cur = ggml_view_2d(ctx0, cur,
proj_inp_dim, cur->ne[1] / scale_factor / scale_factor,
ggml_row_size(cur->type, proj_inp_dim), 0);
cb(cur, "proj_inp_normed", -1);
// projection mlp
cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
cur = ggml_add(ctx0, cur, model.mm_1_b);
cur = ggml_gelu(ctx0, cur);
cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
cur = ggml_add(ctx0, cur, model.mm_2_b);
cb(cur, "proj_out", -1);
}
// build the graph
ggml_build_forward_expand(gf, cur);
return gf;
}
// this graph is used by llava, granite and glm
// due to having embedding_stack (used by granite), we cannot reuse build_vit
ggml_cgraph * build_llava() {
@ -1300,8 +1355,8 @@ struct clip_graph {
ggml_tensor * block_1 = nullptr;
{
// transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24]
mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3));
mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
mlp_3 = ggml_permute(ctx0, mlp_3, 1, 0, 2, 3);
mlp_3 = ggml_cont_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
// stride = 1, padding = 1, bias is nullptr
block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
@ -1406,9 +1461,9 @@ struct clip_graph {
mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b);
// mlp_2 ne = [2048, 576, 1, 1]
// // AVG Pool Layer 2*2, strides = 2
mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 0, 2, 3));
mlp_2 = ggml_permute(ctx0, mlp_2, 1, 0, 2, 3);
// mlp_2 ne = [576, 2048, 1, 1]
mlp_2 = ggml_reshape_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]);
mlp_2 = ggml_cont_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]);
// mlp_2 ne [24, 24, 2048, 1]
mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
// weight ne = [3, 3, 2048, 1]
@ -1428,8 +1483,8 @@ struct clip_graph {
// glm projector
else if (ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE) {
size_t gridsz = (size_t)sqrt(embeddings->ne[1]);
embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings,1,0,2,3));
embeddings = ggml_reshape_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]);
embeddings = ggml_permute(ctx0,embeddings,1,0,2,3);
embeddings = ggml_cont_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]);
embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1);
embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size);
embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3));
@ -1585,6 +1640,29 @@ private:
}
}
// siglip2 naflex
ggml_tensor * resize_position_embeddings() {
ggml_tensor * pos_embd = model.position_embeddings;
const int height = img.ny / patch_size;
const int width = img.nx / patch_size;
const uint32_t mode = GGML_SCALE_MODE_BILINEAR;
const int n_per_side = (int)std::sqrt(pos_embd->ne[1]);
GGML_ASSERT(pos_embd);
if (height == n_per_side && width == n_per_side) {
return pos_embd;
}
pos_embd = ggml_reshape_3d(ctx0, pos_embd, n_embd, n_per_side, n_per_side); // -> (n_embd, n_per_side, n_per_side)
pos_embd = ggml_permute(ctx0, pos_embd, 2, 0, 1, 3); // -> (n_per_side, n_per_side, n_embd)
pos_embd = ggml_interpolate(ctx0, pos_embd, width, height, n_embd, 1, mode); // -> (width, height, n_embd)
pos_embd = ggml_permute(ctx0, pos_embd, 1, 2, 0, 3); // -> (n_embd, width, height)
pos_embd = ggml_cont_2d(ctx0, pos_embd, n_embd, width * height); // -> (n_embd, width * height)
return pos_embd;
}
// build vision transformer (ViT) cgraph
// this function should cover most of the models
// if your model has specific features, you should probably duplicate this function
@ -1963,7 +2041,6 @@ private:
ggml_row_size(cur->type, n_dim),
ggml_row_size(cur->type, n_dim*n_head),
n_dim/2 * ggml_element_size(cur));
second = ggml_cont(ctx0, second); // copy, because ggml_rope don't play well with non-contiguous tensors
second = ggml_rope_ext(
ctx0,
second,
@ -1980,6 +2057,39 @@ private:
return cur;
}
// aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL)
// support dynamic resolution
ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int scale_factor) {
GGML_ASSERT(scale_factor > 1);
const int n_embd = cur->ne[0];
int width = img.nx / patch_size;
int height = img.ny / patch_size;
// pad width and height to factor
const int64_t pad_width = CLIP_ALIGN(width, scale_factor) - width;
const int64_t pad_height = CLIP_ALIGN(height, scale_factor) - height;
cur = ggml_reshape_3d(ctx0, cur, n_embd, width, height);
if (pad_width || pad_height) {
cur = ggml_pad(ctx0, cur, 0, pad_width, pad_height, 0);
width += pad_width;
height += pad_height;
}
// unshuffle h
cur = ggml_reshape_3d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height);
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
// unshuffle w
cur = ggml_cont_3d(ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor, width / scale_factor);
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
cur = ggml_cont_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]);
cb(cur, "pixel_shuffle", -1);
return cur;
}
};
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
@ -1991,6 +2101,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
switch (ctx->proj_type()) {
case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_IDEFICS3:
case PROJECTOR_TYPE_LFM2:
{
res = graph.build_siglip();
} break;
@ -2021,6 +2132,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
{
res = graph.build_whisper_enc();
} break;
case PROJECTOR_TYPE_KIMIVL:
{
res = graph.build_kimivl();
} break;
default:
{
res = graph.build_llava();
@ -2151,7 +2266,21 @@ struct clip_model_loader {
get_u32(KEY_PATCH_SIZE, hparams.patch_size);
get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy
get_u32(KEY_MINICPMV_QUERY_NUM, hparams.minicpmv_query_num, false);
if (hparams.minicpmv_query_num == 0) {
// Fallback to hardcoded values for legacy models
if (hparams.minicpmv_version == 3) {
hparams.minicpmv_query_num = 64;
} else if (hparams.minicpmv_version == 4) {
hparams.minicpmv_query_num = 64;
} else if (hparams.minicpmv_version == 5) {
hparams.minicpmv_query_num = 64;
} else if (hparams.minicpmv_version == 6) {
hparams.minicpmv_query_num = 64;
} else {
hparams.minicpmv_query_num = 96;
}
}
} else if (is_audio) {
get_u32(KEY_A_NUM_MEL_BINS, hparams.n_mel_bins);
@ -2243,6 +2372,7 @@ struct clip_model_loader {
}
} break;
case PROJECTOR_TYPE_IDEFICS3:
case PROJECTOR_TYPE_LFM2:
case PROJECTOR_TYPE_INTERNVL:
{
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
@ -2256,6 +2386,12 @@ struct clip_model_loader {
hparams.image_size = 1024;
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size, false);
} break;
case PROJECTOR_TYPE_KIMIVL:
{
hparams.rope_theta = 10000.0f;
hparams.warmup_image_size = hparams.patch_size * 8;
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
} break;
case PROJECTOR_TYPE_GEMMA3:
{
// default value (used by all model sizes in gemma 3 family)
@ -2420,7 +2556,20 @@ struct clip_model_loader {
// some models already exported with legacy (incorrect) naming which is quite messy, let's fix it here
// note: Qwen model converted from the old surgery script has n_ff = 0, so we cannot use n_ff to check!
if (layer.ff_up_w && layer.ff_down_w && layer.ff_down_w->ne[0] == hparams.n_embd) {
bool is_ffn_swapped = (
// only old models need this fix
model.proj_type == PROJECTOR_TYPE_MLP
|| model.proj_type == PROJECTOR_TYPE_MLP_NORM
|| model.proj_type == PROJECTOR_TYPE_LDP
|| model.proj_type == PROJECTOR_TYPE_LDPV2
|| model.proj_type == PROJECTOR_TYPE_QWEN2VL
|| model.proj_type == PROJECTOR_TYPE_QWEN25VL
|| model.proj_type == PROJECTOR_TYPE_GLM_EDGE
|| model.proj_type == PROJECTOR_TYPE_GEMMA3
|| model.proj_type == PROJECTOR_TYPE_IDEFICS3
|| model.proj_type == PROJECTOR_TYPE_MINICPMV
) && layer.ff_up_w && layer.ff_down_w && layer.ff_down_w->ne[0] == hparams.n_embd;
if (is_ffn_swapped) {
// swap up and down weights
ggml_tensor * tmp = layer.ff_up_w;
layer.ff_up_w = layer.ff_down_w;
@ -2429,6 +2578,9 @@ struct clip_model_loader {
tmp = layer.ff_up_b;
layer.ff_up_b = layer.ff_down_b;
layer.ff_down_b = tmp;
if (il == 0) {
LOG_WRN("%s: ffn up/down are swapped\n", __func__);
}
}
}
@ -2546,6 +2698,16 @@ struct clip_model_loader {
{
model.projection = get_tensor(TN_MM_PROJECTOR);
} break;
case PROJECTOR_TYPE_LFM2:
case PROJECTOR_TYPE_KIMIVL:
{
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM);
model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B);
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
} break;
case PROJECTOR_TYPE_PIXTRAL:
{
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
@ -2944,7 +3106,7 @@ struct image_manipulation {
dst.buf.resize(3 * target_width * target_height);
float Cc;
float C[5];
float C[5] = {};
float d0, d2, d3, a0, a1, a2, a3;
int i, j, k, jj;
int x, y;
@ -3467,6 +3629,45 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
res_imgs->grid_y = inst.grid_size.height;
return true;
} else if ( ctx->proj_type() == PROJECTOR_TYPE_LFM2
|| ctx->proj_type() == PROJECTOR_TYPE_KIMIVL
) {
GGML_ASSERT(params.proj_scale_factor);
// smart resize
const int width = img->nx;
const int height = img->ny;
const int total_factor = params.patch_size * params.proj_scale_factor;
constexpr int min_image_tokens = 64;
constexpr int max_image_tokens = 1024;
const float min_pixels = min_image_tokens * total_factor * total_factor;
const float max_pixels = max_image_tokens * total_factor * total_factor;
auto round_by_factor = [f = total_factor](float x) { return static_cast<int>(std::nearbyintf(x / static_cast<float>(f))) * f; };
auto ceil_by_factor = [f = total_factor](float x) { return static_cast<int>(std::ceil(x / static_cast<float>(f))) * f; };
auto floor_by_factor = [f = total_factor](float x) { return static_cast<int>(std::floor(x / static_cast<float>(f))) * f; };
int h_bar = std::max(total_factor, round_by_factor(height));
int w_bar = std::max(total_factor, round_by_factor(width));
if (h_bar * w_bar > max_pixels) {
const auto beta = std::sqrt((height * width) / max_pixels);
h_bar = std::max(total_factor, floor_by_factor(height / beta));
w_bar = std::max(total_factor, floor_by_factor(width / beta));
} else if (h_bar * w_bar < min_pixels) {
const auto beta = std::sqrt(min_pixels / (height * width));
h_bar = ceil_by_factor(height * beta);
w_bar = ceil_by_factor(width * beta);
}
const std::array<uint8_t, 3> pad_color = {122, 116, 104};
clip_image_u8 resized_img;
image_manipulation::resize_and_pad_image(*img, resized_img, clip_image_size{w_bar, h_bar}, pad_color);
clip_image_f32_ptr res(clip_image_f32_init());
normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std);
res_imgs->entries.push_back(std::move(res));
return true;
}
// the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
@ -3506,10 +3707,10 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
}
return true;
} else {
GGML_ABORT("Unknown image preprocessing type");
}
GGML_ASSERT(false && "Unknown image preprocessing type");
}
ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
@ -3573,8 +3774,9 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
const auto & params = ctx->model.hparams;
// only for models using fixed size square images
int n_patches_sq = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
// for models with fixed size image, the input image is already pre-processed and resized to square
int patch_size = params.patch_size;
int n_patches = (img->nx / patch_size) * (img->ny / patch_size);
projector_type proj = ctx->proj_type();
@ -3588,89 +3790,97 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
case PROJECTOR_TYPE_LDPV2:
case PROJECTOR_TYPE_GLM_EDGE:
{
n_patches_sq /= 4;
n_patches /= 4;
if (ctx->model.mm_glm_tok_boi) {
n_patches_sq += 2; // for BOI and EOI token embeddings
n_patches += 2; // for BOI and EOI token embeddings
}
} break;
case PROJECTOR_TYPE_MINICPMV:
{
// Use actual config value if available, otherwise fall back to hardcoded values
if (params.minicpmv_query_num > 0) {
n_patches = params.minicpmv_query_num;
} else {
// Fallback to hardcoded values for legacy models
if (params.minicpmv_version == 2) {
// MiniCPM-V 2.5
n_patches_sq = 96;
n_patches = 96;
} else if (params.minicpmv_version == 3) {
// MiniCPM-V 2.6
n_patches_sq = 64;
n_patches = 64;
} else if (params.minicpmv_version == 4) {
// MiniCPM-o 2.6
n_patches_sq = 64;
n_patches = 64;
} else if (params.minicpmv_version == 5) {
// MiniCPM-V 4.0
n_patches_sq = 64;
n_patches = 64;
} else if (params.minicpmv_version == 6) {
// MiniCPM-V 4.5
n_patches = 64;
} else {
GGML_ABORT("Unknown minicpmv version");
}
}
} break;
case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL:
{
// dynamic size
// dynamic size (2 conv, so double patch size)
int patch_size = params.patch_size * 2;
int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
n_patches_sq = x_patch * y_patch;
n_patches = x_patch * y_patch;
} break;
case PROJECTOR_TYPE_GEMMA3:
{
int n_per_side = params.image_size / params.patch_size;
int n_per_side_2d_pool = n_per_side / params.proj_scale_factor;
n_patches_sq = n_per_side_2d_pool * n_per_side_2d_pool;
} break;
case PROJECTOR_TYPE_IDEFICS3:
case PROJECTOR_TYPE_INTERNVL:
case PROJECTOR_TYPE_LLAMA4:
{
// both W and H are divided by proj_scale_factor
n_patches_sq /= (params.proj_scale_factor * params.proj_scale_factor);
// both X and Y are downscaled by the scale factor
int scale_factor = ctx->model.hparams.proj_scale_factor;
n_patches /= (scale_factor * scale_factor);
} break;
case PROJECTOR_TYPE_LFM2:
case PROJECTOR_TYPE_KIMIVL:
{
// dynamic size
int scale_factor = ctx->model.hparams.proj_scale_factor;
int out_patch_size = params.patch_size * scale_factor;
int x_patch = CLIP_ALIGN(img->nx, out_patch_size) / out_patch_size;
int y_patch = CLIP_ALIGN(img->ny, out_patch_size) / out_patch_size;
n_patches = x_patch * y_patch;
} break;
case PROJECTOR_TYPE_PIXTRAL:
{
// dynamic size
int n_merge = params.spatial_merge_size;
int n_patches_x = img->nx / params.patch_size / (n_merge > 0 ? n_merge : 1);
int n_patches_y = img->ny / params.patch_size / (n_merge > 0 ? n_merge : 1);
n_patches_sq = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
} break;
case PROJECTOR_TYPE_LLAMA4:
{
int scale_factor = ctx->model.hparams.proj_scale_factor;
n_patches_sq /= (scale_factor * scale_factor);
int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1);
int n_patches_y = img->ny / patch_size / (n_merge > 0 ? n_merge : 1);
n_patches = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
} break;
case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_ULTRAVOX:
case PROJECTOR_TYPE_QWEN2A:
{
n_patches_sq = img->nx;
n_patches = img->nx;
const int proj_stack_factor = ctx->model.hparams.proj_stack_factor;
if (ctx->model.audio_has_stack_frames()) {
GGML_ASSERT(proj_stack_factor > 0);
const int n_len = CLIP_ALIGN(n_patches_sq, proj_stack_factor);
n_patches_sq = n_len / proj_stack_factor;
const int n_len = CLIP_ALIGN(n_patches, proj_stack_factor);
n_patches = n_len / proj_stack_factor;
}
// whisper downscales input token by half after conv1d
n_patches_sq /= 2;
n_patches /= 2;
if (ctx->model.audio_has_avgpool()) {
// divide by 2 because of nn.AvgPool1d(2, stride=2)
n_patches_sq /= 2;
n_patches /= 2;
}
} break;
default:
GGML_ABORT("unsupported projector type");
}
return n_patches_sq;
return n_patches;
}
static std::vector<std::vector<std::vector<float>>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector<std::vector<float>> & pos) {
@ -4019,6 +4229,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
set_input_i32("positions", positions);
} break;
case PROJECTOR_TYPE_PIXTRAL:
case PROJECTOR_TYPE_KIMIVL:
{
// set the 2D positions
int n_patches_per_col = image_size_width / patch_size;
@ -4070,6 +4281,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
case PROJECTOR_TYPE_INTERNVL:
case PROJECTOR_TYPE_QWEN2A:
case PROJECTOR_TYPE_ULTRAVOX:
case PROJECTOR_TYPE_LFM2:
case PROJECTOR_TYPE_VOXTRAL:
{
// do nothing
@ -4141,7 +4353,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
}
int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
const auto & hparams = ctx->model.hparams;
switch (ctx->model.proj_type) {
case PROJECTOR_TYPE_LDP:
return ctx->model.mm_model_block_1_block_2_1_b->ne[0];
@ -4153,20 +4364,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
case PROJECTOR_TYPE_MLP_NORM:
return ctx->model.mm_3_b->ne[0];
case PROJECTOR_TYPE_MINICPMV:
if (hparams.minicpmv_version == 2) {
// MiniCPM-V 2.5
return 4096;
} else if (hparams.minicpmv_version == 3) {
// MiniCPM-V 2.6
return 3584;
} else if (hparams.minicpmv_version == 4) {
// MiniCPM-o 2.6
return 3584;
} else if (hparams.minicpmv_version == 5) {
// MiniCPM-V 4.0
return 2560;
}
GGML_ABORT("Unknown minicpmv version");
return ctx->model.mm_model_proj->ne[0];
case PROJECTOR_TYPE_GLM_EDGE:
return ctx->model.mm_model_mlp_3_w->ne[1];
case PROJECTOR_TYPE_QWEN2VL:
@ -4185,6 +4383,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
return ctx->model.mm_model_proj->ne[1];
case PROJECTOR_TYPE_QWEN2A:
return ctx->model.mm_fc_w->ne[1];
case PROJECTOR_TYPE_LFM2:
case PROJECTOR_TYPE_KIMIVL:
return ctx->model.mm_2_w->ne[1];
default:
GGML_ABORT("Unknown projector type");
}

View File

@ -82,11 +82,6 @@ struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch
*/
void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);
bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
/** interpret bytes as an image file with length bytes_length, and use the result to populate img */
bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );

View File

@ -217,7 +217,7 @@ struct mtmd_context {
tok_row_end_trail = false; // no trailing end-of-row token
ov_img_first = true;
} else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5) {
} else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5 || minicpmv_version == 6) {
// minicpmv 2.6 format:
// <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_6;

File diff suppressed because it is too large Load Diff

View File

@ -116,7 +116,11 @@ func NewContextParams(numCtx int, batchSize int, numSeqMax int, threads int, fla
params.n_threads = C.int(threads)
params.n_threads_batch = params.n_threads
params.embeddings = C.bool(true)
params.flash_attn = C.bool(flashAttention)
if flashAttention {
params.flash_attn_type = C.LLAMA_FLASH_ATTN_TYPE_ENABLED
} else {
params.flash_attn_type = C.LLAMA_FLASH_ATTN_TYPE_DISABLED
}
params.type_k = kvCacheTypeFromStr(strings.ToLower(kvCacheType))
params.type_v = kvCacheTypeFromStr(strings.ToLower(kvCacheType))

View File

@ -15,18 +15,18 @@ problem.
ggml/src/ggml-backend.cpp | 9 +++++++--
ggml/src/ggml-cann/ggml-cann.cpp | 2 ++
ggml/src/ggml-cuda/ggml-cuda.cu | 3 +++
ggml/src/ggml-metal/ggml-metal.m | 1 +
ggml/src/ggml-metal/ggml-metal.cpp | 2 ++
ggml/src/ggml-opencl/ggml-opencl.cpp | 1 +
ggml/src/ggml-rpc/ggml-rpc.cpp | 1 +
ggml/src/ggml-sycl/ggml-sycl.cpp | 3 +++
ggml/src/ggml-vulkan/ggml-vulkan.cpp | 2 ++
8 files changed, 20 insertions(+), 2 deletions(-)
8 files changed, 21 insertions(+), 2 deletions(-)
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 1b9d29e9..97f47abd 100644
index ff9135fe..8ba86f82 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -107,7 +107,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
@@ -113,7 +113,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
if (buffer->iface.free_buffer != NULL) {
buffer->iface.free_buffer(buffer);
}
@ -34,7 +34,7 @@ index 1b9d29e9..97f47abd 100644
}
size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
@@ -529,6 +528,7 @@ static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer)
@@ -586,6 +585,7 @@ static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer)
free(ctx->buffers);
free(ctx);
@ -42,9 +42,9 @@ index 1b9d29e9..97f47abd 100644
}
static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@@ -1890,6 +1890,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -2075,6 +2075,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
GGML_ASSERT(buffer);
ggml_aligned_free(buffer->context, buffer->size);
+ delete buffer;
+}
@ -54,7 +54,7 @@ index 1b9d29e9..97f47abd 100644
}
static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
@@ -1937,7 +1942,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
@@ -2127,7 +2132,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
};
static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
@ -64,10 +64,10 @@ index 1b9d29e9..97f47abd 100644
/* .init_tensor = */ NULL, // no initialization required
/* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index cf575b36..ca1addfa 100755
index b51b554e..3ba0f5a6 100755
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -826,6 +826,7 @@ static void ggml_backend_cann_buffer_free_buffer(
@@ -843,6 +843,7 @@ static void ggml_backend_cann_buffer_free_buffer(
ggml_backend_cann_buffer_context* ctx =
(ggml_backend_cann_buffer_context*)buffer->context;
delete ctx;
@ -75,7 +75,7 @@ index cf575b36..ca1addfa 100755
}
/**
@@ -1572,6 +1573,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf
@@ -1630,6 +1631,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf
*/
static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
ACL_CHECK(aclrtFreeHost(buffer->context));
@ -84,7 +84,7 @@ index cf575b36..ca1addfa 100755
/**
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index d9110491..37ee2a6d 100644
index b7e81b21..fdf8c63d 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -567,6 +567,7 @@ struct ggml_backend_cuda_buffer_context {
@ -111,23 +111,31 @@ index d9110491..37ee2a6d 100644
}
static void * ggml_cuda_host_malloc(size_t size) {
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index cb8eff4a..7bccc7bf 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -6032,6 +6032,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
index e11555a7..909e17de 100644
--- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -25,6 +25,7 @@ static void ggml_backend_metal_buffer_shared_free_buffer(ggml_backend_buffer_t b
GGML_ASSERT(ggml_metal_buffer_is_shared(ctx));
ggml_metal_buffer_free(ctx);
+ delete buffer;
}
free(ctx);
+ free(buffer);
static void * ggml_backend_metal_buffer_shared_get_base(ggml_backend_buffer_t buffer) {
@@ -99,6 +100,7 @@ static void ggml_backend_metal_buffer_private_free_buffer(ggml_backend_buffer_t
GGML_ASSERT(!ggml_metal_buffer_is_shared(ctx));
ggml_metal_buffer_free(ctx);
+ delete buffer;
}
static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
static void * ggml_backend_metal_buffer_private_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index 8ba1e00d..8163e8dc 100644
index 0cf3b924..09d706b5 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -2745,6 +2745,7 @@ struct ggml_backend_opencl_buffer_context {
@@ -3215,6 +3215,7 @@ struct ggml_backend_opencl_buffer_context {
static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
delete ctx;
@ -136,10 +144,10 @@ index 8ba1e00d..8163e8dc 100644
static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
index df6ba540..2e395968 100644
index f99681c8..59591770 100644
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
@@ -486,6 +486,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@@ -505,6 +505,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
RPC_STATUS_ASSERT(status);
delete ctx;
@ -148,7 +156,7 @@ index df6ba540..2e395968 100644
static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 3992dad0..67503951 100644
index 4ac919ea..447ea3c4 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -331,6 +331,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
@ -176,10 +184,10 @@ index 3992dad0..67503951 100644
static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 4070e248..394a2839 100644
index 2608cbd0..061cd078 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -10209,6 +10209,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@@ -11603,6 +11603,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
ggml_vk_destroy_buffer(ctx->dev_buffer);
delete ctx;
@ -187,7 +195,7 @@ index 4070e248..394a2839 100644
}
static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -10352,6 +10353,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
@@ -11746,6 +11747,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
ggml_vk_host_free(vk_instance.devices[0], buffer->context);

View File

@ -10,10 +10,10 @@ logs instead of throwing an error
1 file changed, 3 insertions(+), 11 deletions(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index f7e03e70..8ebe11cf 100644
index da938af0..2a38abf4 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1804,16 +1804,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
@@ -1811,16 +1811,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
if (type == LLAMA_VOCAB_TYPE_BPE) {
add_space_prefix = false;
clean_spaces = true;
@ -31,8 +31,8 @@ index f7e03e70..8ebe11cf 100644
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
tokenizer_pre == "llama3" ||
@@ -1975,7 +1966,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;
@@ -1987,7 +1978,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
pre_type = LLAMA_VOCAB_PRE_TYPE_GROK_2;
clean_spaces = false;
} else {
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));

View File

@ -10,7 +10,7 @@ filesystems for paths that include wide characters
1 file changed, 39 insertions(+)
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 20c21733..f4f69cfc 100644
index 210ecc88..355219a9 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -28,6 +28,19 @@
@ -33,7 +33,7 @@ index 20c21733..f4f69cfc 100644
struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
enum ffn_op_type {
@@ -2597,7 +2610,29 @@ struct clip_model_loader {
@@ -2759,7 +2772,29 @@ struct clip_model_loader {
{
std::vector<uint8_t> read_buf;
@ -63,7 +63,7 @@ index 20c21733..f4f69cfc 100644
if (!fin) {
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
}
@@ -2624,7 +2659,11 @@ struct clip_model_loader {
@@ -2786,7 +2821,11 @@ struct clip_model_loader {
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
}
}

View File

@ -15,10 +15,10 @@ adds support for the Solar Pro architecture
7 files changed, 248 insertions(+)
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 18dcc6dd..4b285646 100644
index 4e8d54c4..f98a3574 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -78,6 +78,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
@@ -81,6 +81,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_GRANITE_HYBRID, "granitehybrid" },
{ LLM_ARCH_CHAMELEON, "chameleon" },
@ -26,15 +26,15 @@ index 18dcc6dd..4b285646 100644
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_PLM, "plm" },
{ LLM_ARCH_BAILINGMOE, "bailingmoe" },
@@ -164,6 +165,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
@@ -177,6 +178,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
{ LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
+ { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
@@ -1794,6 +1796,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
@@ -1879,6 +1881,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
},
},
@ -59,7 +59,7 @@ index 18dcc6dd..4b285646 100644
{
LLM_ARCH_WAVTOKENIZER_DEC,
{
@@ -2219,6 +2239,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
@@ -2368,6 +2388,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_LAUREL_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
@ -68,10 +68,10 @@ index 18dcc6dd..4b285646 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 7af587e7..3ea994c7 100644
index b5c6f3d7..aa8e0e7b 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -82,6 +82,7 @@ enum llm_arch {
@@ -85,6 +85,7 @@ enum llm_arch {
LLM_ARCH_GRANITE_MOE,
LLM_ARCH_GRANITE_HYBRID,
LLM_ARCH_CHAMELEON,
@ -79,15 +79,15 @@ index 7af587e7..3ea994c7 100644
LLM_ARCH_WAVTOKENIZER_DEC,
LLM_ARCH_PLM,
LLM_ARCH_BAILINGMOE,
@@ -168,6 +169,7 @@ enum llm_kv {
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW,
@@ -181,6 +182,7 @@ enum llm_kv {
LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_OUTPUT_SCALE,
LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
+ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
@@ -394,6 +396,7 @@ enum llm_tensor {
@@ -417,6 +419,7 @@ enum llm_tensor {
LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
@ -96,10 +96,10 @@ index 7af587e7..3ea994c7 100644
LLM_TENSOR_CONVNEXT_DW,
LLM_TENSOR_CONVNEXT_NORM,
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
index 7a06368d..35fc054f 100644
index c04ac58f..24a515a0 100644
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@@ -146,6 +146,14 @@ uint32_t llama_hparams::n_pos_per_embd() const {
@@ -147,6 +147,14 @@ uint32_t llama_hparams::n_pos_per_embd() const {
return rope_type == LLAMA_ROPE_TYPE_MROPE ? 4 : 1;
}
@ -115,10 +115,10 @@ index 7a06368d..35fc054f 100644
if (il < n_layer) {
return swa_layers[il];
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index bd231224..29bd9056 100644
index 0fe4b569..eb13709f 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -62,6 +62,8 @@ struct llama_hparams {
@@ -64,6 +64,8 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
@ -127,7 +127,7 @@ index bd231224..29bd9056 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0;
@@ -220,6 +222,9 @@ struct llama_hparams {
@@ -236,6 +238,9 @@ struct llama_hparams {
uint32_t n_pos_per_embd() const;
@ -135,10 +135,10 @@ index bd231224..29bd9056 100644
+ bool n_bskcn(uint32_t n, uint32_t il) const;
+
bool is_swa(uint32_t il) const;
};
bool has_kv(uint32_t il) const;
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index f71c40f8..7eab9b68 100644
index 8182a9ad..daef900c 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -465,6 +465,7 @@ namespace GGUFMeta {
@ -150,10 +150,10 @@ index f71c40f8..7eab9b68 100644
llama_model_loader::llama_model_loader(
const std::string & fname,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 58ca7df7..280129e1 100644
index 2470f878..0398b553 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1706,6 +1706,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -1845,6 +1845,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
@ -175,7 +175,7 @@ index 58ca7df7..280129e1 100644
case LLM_ARCH_WAVTOKENIZER_DEC:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -4793,6 +4808,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
@@ -5113,6 +5128,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
@ -210,7 +210,7 @@ index 58ca7df7..280129e1 100644
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -15495,6 +15538,165 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
@@ -16273,6 +16316,165 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
}
};
@ -229,7 +229,7 @@ index 58ca7df7..280129e1 100644
+ struct ggml_tensor * inp_pos = build_inp_pos();
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();
+
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
@ -316,7 +316,7 @@ index 58ca7df7..280129e1 100644
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+ cb(cur, "attn_out", il);
+ }
+
@ -376,7 +376,7 @@ index 58ca7df7..280129e1 100644
// ref: https://github.com/facebookresearch/chameleon
// based on the original build_llama() function, changes:
// * qk-norm
@@ -18439,6 +18641,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
@@ -19552,6 +19754,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
{
llm = std::make_unique<llm_build_chameleon>(*this, params);
} break;
@ -387,7 +387,7 @@ index 58ca7df7..280129e1 100644
case LLM_ARCH_WAVTOKENIZER_DEC:
{
llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
@@ -18652,6 +18858,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
@@ -19770,6 +19976,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_GRANITE_HYBRID:
case LLM_ARCH_CHAMELEON:
@ -396,10 +396,10 @@ index 58ca7df7..280129e1 100644
case LLM_ARCH_NEO_BERT:
case LLM_ARCH_SMOLLM3:
diff --git a/src/llama-model.h b/src/llama-model.h
index 6fcd74d5..09964533 100644
index d73ce969..c086f94e 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -70,6 +70,7 @@ enum llm_type {
@@ -76,6 +76,7 @@ enum llm_type {
LLM_TYPE_15B,
LLM_TYPE_16B,
LLM_TYPE_20B,
@ -407,7 +407,7 @@ index 6fcd74d5..09964533 100644
LLM_TYPE_27B,
LLM_TYPE_30B,
LLM_TYPE_32B,
@@ -367,6 +368,8 @@ struct llama_layer {
@@ -380,6 +381,8 @@ struct llama_layer {
// openai-moe
struct ggml_tensor * attn_sinks = nullptr;

View File

@ -12,7 +12,7 @@ regex
2 files changed, 22 insertions(+), 1 deletion(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 8ebe11cf..c011008f 100644
index 2a38abf4..26fa9fad 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -299,7 +299,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {

View File

@ -8,10 +8,10 @@ Subject: [PATCH] maintain ordering for rules for grammar
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index 637891f5..98b8280f 100644
index db1f0b23..f4de7e34 100644
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -307,7 +307,7 @@ private:
@@ -308,7 +308,7 @@ private:
friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
std::function<json(const std::string &)> _fetch_json;
bool _dotall;

View File

@ -11,10 +11,10 @@ with the fastest acceleration is loaded
1 file changed, 13 insertions(+), 8 deletions(-)
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 6c315137..3040b2aa 100644
index 136afec7..f794d9cf 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -162,7 +162,7 @@ struct ggml_backend_reg_entry {
@@ -175,7 +175,7 @@ struct ggml_backend_reg_entry {
struct ggml_backend_registry {
std::vector<ggml_backend_reg_entry> backends;
@ -23,7 +23,7 @@ index 6c315137..3040b2aa 100644
ggml_backend_registry() {
#ifdef GGML_USE_CUDA
@@ -207,7 +207,7 @@ struct ggml_backend_registry {
@@ -223,7 +223,7 @@ struct ggml_backend_registry {
}
}
@ -32,7 +32,7 @@ index 6c315137..3040b2aa 100644
if (!reg) {
return;
}
@@ -218,15 +218,20 @@ struct ggml_backend_registry {
@@ -234,15 +234,20 @@ struct ggml_backend_registry {
#endif
backends.push_back({ reg, std::move(handle) });
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
@ -56,7 +56,7 @@ index 6c315137..3040b2aa 100644
}
ggml_backend_reg_t load_backend(const fs::path & path, bool silent) {
@@ -270,7 +275,7 @@ struct ggml_backend_registry {
@@ -286,7 +291,7 @@ struct ggml_backend_registry {
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path_str(path).c_str());
@ -65,7 +65,7 @@ index 6c315137..3040b2aa 100644
return reg;
}
@@ -293,7 +298,7 @@ struct ggml_backend_registry {
@@ -309,7 +314,7 @@ struct ggml_backend_registry {
// remove devices
devices.erase(
std::remove_if(devices.begin(), devices.end(),
@ -74,7 +74,7 @@ index 6c315137..3040b2aa 100644
devices.end());
// remove backend
@@ -351,7 +356,7 @@ size_t ggml_backend_dev_count() {
@@ -367,7 +372,7 @@ size_t ggml_backend_dev_count() {
ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
GGML_ASSERT(index < ggml_backend_dev_count());

View File

@ -8,10 +8,10 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants
1 file changed, 2 insertions(+)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 177fb282..f5a5079a 100644
index c8f3d859..ff6229a0 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -304,6 +304,7 @@ function(ggml_add_cpu_backend_variant tag_name)
@@ -307,6 +307,7 @@ function(ggml_add_cpu_backend_variant tag_name)
endif()
ggml_add_cpu_backend_variant_impl(${tag_name})
@ -19,7 +19,7 @@ index 177fb282..f5a5079a 100644
endfunction()
ggml_add_backend(CPU)
@@ -314,6 +315,7 @@ if (GGML_CPU_ALL_VARIANTS)
@@ -317,6 +318,7 @@ if (GGML_CPU_ALL_VARIANTS)
elseif (GGML_CPU_ARM_ARCH)
message(FATAL_ERROR "Cannot use both GGML_CPU_ARM_ARCH and GGML_CPU_ALL_VARIANTS")
endif()

View File

@ -9,10 +9,10 @@ disable amx as it reduces performance on some systems
1 file changed, 4 deletions(-)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index f5a5079a..5158acd6 100644
index ff6229a0..33b3a15f 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -324,10 +324,6 @@ if (GGML_CPU_ALL_VARIANTS)
@@ -327,10 +327,6 @@ if (GGML_CPU_ALL_VARIANTS)
ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)

View File

@ -25,7 +25,7 @@ index 79ee2020..3efb22f0 100644
// get ith C string from array with given key_id
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int64_t key_id, size_t i);
diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp
index 53504399..0f71d5f3 100644
index 8cc4ef1c..d950dbdf 100644
--- a/ggml/src/gguf.cpp
+++ b/ggml/src/gguf.cpp
@@ -805,10 +805,14 @@ enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int64_t key_id
@ -53,10 +53,10 @@ index 53504399..0f71d5f3 100644
}
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index c011008f..fa388b03 100644
index 26fa9fad..64c78a16 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1760,9 +1760,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
@@ -1767,9 +1767,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
if (precompiled_charsmap_keyidx != -1) {
const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);
@ -66,4 +66,4 @@ index c011008f..fa388b03 100644
+ const size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx);
const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
#ifdef IS_BIG_ENDIAN
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__

View File

@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor
1 file changed, 6 insertions(+)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index d89cd8f4..a5689c18 100644
index dbc07301..f8574d01 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -15,6 +15,8 @@
@ -20,7 +20,7 @@ index d89cd8f4..a5689c18 100644
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -2858,6 +2860,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
@@ -2881,6 +2883,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
ggml_compute_forward(&params, node);

View File

@ -184,10 +184,10 @@ index f8c291de..2a3a62db 100644
const char * grammar_root,
bool lazy,
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index bfbf5fa2..11f93f42 100644
index 2186f827..8fb86009 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -1466,7 +1466,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
@@ -1563,7 +1563,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
trigger_patterns_c.push_back(trigger_pattern.pattern.c_str());
}
@ -196,7 +196,7 @@ index bfbf5fa2..11f93f42 100644
ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(),
ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size());
@@ -1548,7 +1548,7 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
@@ -1645,7 +1645,7 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
/* .vocab = */ vocab,
/* .grammar_str = */ grammar_str,
/* .grammar_root = */ grammar_root,

View File

@ -4,17 +4,18 @@ Date: Thu, 1 May 2025 13:45:12 -0700
Subject: [PATCH] add argsort and cuda copy for i32
---
ggml/src/ggml-cpu/ops.cpp | 43 +++++++++++++
ggml/src/ggml-cuda/argsort.cu | 102 ++++++++++++++++++++++++++++++-
ggml/src/ggml-cpu/ops.cpp | 43 +++++++++++
ggml/src/ggml-cuda/argsort.cu | 102 ++++++++++++++++++++++++++-
ggml/src/ggml-cuda/cpy-utils.cuh | 6 ++
ggml/src/ggml-cuda/cpy.cu | 43 +++++++++++++
4 files changed, 192 insertions(+), 2 deletions(-)
ggml/src/ggml-cuda/cpy.cu | 43 +++++++++++
ggml/src/ggml-metal/ggml-metal.metal | 64 +++++++++++++++++
5 files changed, 256 insertions(+), 2 deletions(-)
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 854f1c2b..a2924757 100644
index 14f7dcf4..f7f8da35 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -8146,6 +8146,45 @@ static void ggml_compute_forward_argsort_f32(
@@ -7893,6 +7893,45 @@ static void ggml_compute_forward_argsort_f32(
}
}
@ -60,7 +61,7 @@ index 854f1c2b..a2924757 100644
void ggml_compute_forward_argsort(
const ggml_compute_params * params,
ggml_tensor * dst) {
@@ -8157,6 +8196,10 @@ void ggml_compute_forward_argsort(
@@ -7904,6 +7943,10 @@ void ggml_compute_forward_argsort(
{
ggml_compute_forward_argsort_f32(params, dst);
} break;
@ -196,12 +197,12 @@ index 607ded85..53b02634 100644
+ }
}
diff --git a/ggml/src/ggml-cuda/cpy-utils.cuh b/ggml/src/ggml-cuda/cpy-utils.cuh
index 410c12b7..b8e9e107 100644
index e621cb98..597c0c8b 100644
--- a/ggml/src/ggml-cuda/cpy-utils.cuh
+++ b/ggml/src/ggml-cuda/cpy-utils.cuh
@@ -223,3 +223,9 @@ template<typename src_t, typename dst_t>
@@ -215,3 +215,9 @@ template<typename src_t, typename dst_t>
static __device__ void cpy_1_flt(const char * cxi, char * cdsti) {
convert_flt((const src_t *)cxi, (dst_t *)cdsti);
*(dst_t *) cdsti = ggml_cuda_cast<dst_t>(*(const src_t *) cxi);
}
+
+static __device__ void cpy_1_i32_i32(const char * cxi, char * cdsti) {
@ -210,10 +211,10 @@ index 410c12b7..b8e9e107 100644
+ *dst = *src;
+}
diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
index f9bb0256..9c3774e5 100644
index 746f4396..911220e9 100644
--- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu
@@ -278,6 +278,47 @@ static void ggml_cpy_f32_iq4_nl_cuda(
@@ -277,6 +277,47 @@ static void ggml_cpy_f32_iq4_nl_cuda(
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
}
@ -261,7 +262,7 @@ index f9bb0256..9c3774e5 100644
void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1, bool disable_indirection_for_this_node) {
const int64_t ne = ggml_nelements(src0);
GGML_ASSERT(ne == ggml_nelements(src1));
@@ -369,6 +410,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
@@ -372,6 +413,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
ggml_cpy_flt_cuda<half, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
ggml_cpy_flt_cuda<half, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
@ -270,3 +271,80 @@ index f9bb0256..9c3774e5 100644
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16) {
ggml_cpy_flt_cuda<nv_bfloat16, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F16) {
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 96df6f0c..44dc31c0 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -4428,8 +4428,72 @@ kernel void kernel_argsort_f32_i32(
}
}
+typedef void (i32_argsort_t)(
+ constant ggml_metal_kargs_argsort & args,
+ device const int32_t * x,
+ device int32_t * dst,
+ threadgroup int32_t * shared_values [[threadgroup(0)]],
+ uint3 tgpig[[threadgroup_position_in_grid]],
+ uint3 tpitg[[thread_position_in_threadgroup]]);
+
+template<ggml_sort_order order>
+kernel void kernel_argsort_i32_i32(
+ constant ggml_metal_kargs_argsort & args,
+ device const int32_t * x,
+ device int32_t * dst,
+ threadgroup int32_t * shared_values [[threadgroup(0)]],
+ uint3 tgpig[[threadgroup_position_in_grid]],
+ uint3 tpitg[[thread_position_in_threadgroup]]) {
+ // bitonic sort
+ int col = tpitg[0];
+ int row = tgpig[1];
+
+ if (col >= args.ncols_pad) return;
+
+ device const int32_t * x_row = x + row * args.ncols;
+ threadgroup int32_t * dst_row = shared_values;
+
+ // initialize indices
+ dst_row[col] = col;
+
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+
+ for (int k = 2; k <= args.ncols_pad; k *= 2) {
+ for (int j = k / 2; j > 0; j /= 2) {
+ int ixj = col ^ j;
+ if (ixj > col) {
+ if ((col & k) == 0) {
+ if (dst_row[col] >= args.ncols ||
+ (dst_row[ixj] < args.ncols && (order == GGML_SORT_ORDER_ASC ?
+ x_row[dst_row[col]] > x_row[dst_row[ixj]] :
+ x_row[dst_row[col]] < x_row[dst_row[ixj]]))
+ ) {
+ SWAP(dst_row[col], dst_row[ixj]);
+ }
+ } else {
+ if (dst_row[ixj] >= args.ncols ||
+ (dst_row[col] < args.ncols && (order == GGML_SORT_ORDER_ASC ?
+ x_row[dst_row[col]] < x_row[dst_row[ixj]] :
+ x_row[dst_row[col]] > x_row[dst_row[ixj]]))
+ ) {
+ SWAP(dst_row[col], dst_row[ixj]);
+ }
+ }
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ }
+ }
+
+ // copy the result to dst without the padding
+ if (col < args.ncols) {
+ dst[row * args.ncols + col] = dst_row[col];
+ }
+}
+
template [[host_name("kernel_argsort_f32_i32_asc")]] kernel argsort_t kernel_argsort_f32_i32<GGML_SORT_ORDER_ASC>;
template [[host_name("kernel_argsort_f32_i32_desc")]] kernel argsort_t kernel_argsort_f32_i32<GGML_SORT_ORDER_DESC>;
+template [[host_name("kernel_argsort_i32_i32_asc")]] kernel i32_argsort_t kernel_argsort_i32_i32<GGML_SORT_ORDER_ASC>;
+template [[host_name("kernel_argsort_i32_i32_desc")]] kernel i32_argsort_t kernel_argsort_i32_i32<GGML_SORT_ORDER_DESC>;
kernel void kernel_leaky_relu_f32(
constant ggml_metal_kargs_leaky_relu & args,

View File

@ -6,12 +6,12 @@ Subject: [PATCH] graph memory reporting on failure
---
ggml/include/ggml-alloc.h | 1 +
ggml/include/ggml-backend.h | 1 +
ggml/src/ggml-alloc.c | 36 ++++++++++++++++++++++++++++++++----
ggml/src/ggml-alloc.c | 34 +++++++++++++++++++++++++++++++---
ggml/src/ggml-backend.cpp | 7 +++++++
4 files changed, 41 insertions(+), 4 deletions(-)
4 files changed, 40 insertions(+), 3 deletions(-)
diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h
index 2cb150fd2..7ab3f0192 100644
index 2cb150fd..7ab3f019 100644
--- a/ggml/include/ggml-alloc.h
+++ b/ggml/include/ggml-alloc.h
@@ -65,6 +65,7 @@ GGML_API bool ggml_gallocr_reserve_n(
@ -23,31 +23,31 @@ index 2cb150fd2..7ab3f0192 100644
// Utils
// Create a buffer and allocate all the tensors in a ggml_context
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index a2977ea2e..e8cf30841 100644
index 62b6d65e..fe20dca3 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -303,6 +303,7 @@ extern "C" {
GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
@@ -316,6 +316,7 @@ extern "C" {
GGML_API ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend);
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
+ GGML_API size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index 8b6e60283..b58bd671d 100644
index fa46f3b4..421ff7c7 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -350,6 +350,7 @@ struct node_alloc {
@@ -492,6 +492,7 @@ struct node_alloc {
struct ggml_gallocr {
ggml_backend_buffer_type_t * bufts; // [n_buffers]
ggml_backend_buffer_t * buffers; // [n_buffers]
struct vbuffer ** buffers; // [n_buffers]
+ size_t *buffer_sizes; // [n_buffers]
struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
int n_buffers;
@@ -373,6 +374,9 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
@@ -515,6 +516,9 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
galloc->buffers = calloc(n_bufs, sizeof(struct vbuffer *));
GGML_ASSERT(galloc->buffers != NULL);
+ galloc->buffer_sizes = calloc(n_bufs, sizeof(size_t));
@ -56,7 +56,7 @@ index 8b6e60283..b58bd671d 100644
galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
GGML_ASSERT(galloc->buf_tallocs != NULL);
@@ -439,6 +443,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
@@ -582,6 +586,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
ggml_hash_set_free(&galloc->hash_set);
free(galloc->hash_values);
free(galloc->bufts);
@ -64,7 +64,7 @@ index 8b6e60283..b58bd671d 100644
free(galloc->buffers);
free(galloc->buf_tallocs);
free(galloc->node_allocs);
@@ -734,6 +739,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
@@ -875,6 +880,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
}
}
@ -73,23 +73,21 @@ index 8b6e60283..b58bd671d 100644
// reallocate buffers if needed
for (int i = 0; i < galloc->n_buffers; i++) {
// if the buffer type is used multiple times, we reuse the same buffer
@@ -755,15 +762,20 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
@@ -896,14 +903,19 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
ggml_backend_buffer_free(galloc->buffers[i]);
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
ggml_vbuffer_free(galloc->buffers[i]);
galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
- if (galloc->buffers[i] == NULL) {
+ if (galloc->buffers[i]) {
+ galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
+ ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
+ galloc->buffer_sizes[i] = ggml_vbuffer_size(galloc->buffers[i]);
+ } else {
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
- return false;
+ galloc->buffer_sizes[i] = new_size;
+ success = false;
}
- ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
+ } else {
+ galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
+ galloc->buffer_sizes[i] = ggml_vbuffer_size(galloc->buffers[i]);
}
}
@ -98,8 +96,8 @@ index 8b6e60283..b58bd671d 100644
}
bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
@@ -920,6 +932,22 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
@@ -1058,6 +1070,22 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
return ggml_vbuffer_size(galloc->buffers[buffer_id]);
}
+size_t ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
@ -122,10 +120,10 @@ index 8b6e60283..b58bd671d 100644
static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 97f47abd2..d02a40e60 100644
index 8ba86f82..cb2b9956 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1631,6 +1631,13 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
@@ -1809,6 +1809,13 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
}
@ -137,5 +135,5 @@ index 97f47abd2..d02a40e60 100644
+}
+
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
GGML_ASSERT(sched);
int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);

View File

@ -7,27 +7,27 @@ This enables matching up devices and information reported by the backend
with tools (e.g. nvidia-smi) and system management libraries (e.g. nvml).
---
ggml/include/ggml-backend.h | 1 +
ggml/src/ggml-cuda/ggml-cuda.cu | 67 +++++++++++++++++++++++++++++---
ggml/src/ggml-metal/ggml-metal.m | 1 +
ggml/src/ggml-cuda/ggml-cuda.cu | 67 +++++++++++++++++++++++++++---
ggml/src/ggml-metal/ggml-metal.cpp | 1 +
3 files changed, 63 insertions(+), 6 deletions(-)
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 8a91b381..9424394e 100644
index fe20dca3..48777212 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -152,6 +152,7 @@ extern "C" {
struct ggml_backend_dev_props {
const char * name;
@@ -158,6 +158,7 @@ extern "C" {
const char * description;
+ const char * id;
// device free memory in bytes
size_t memory_free;
+ const char * id;
// device total memory in bytes
size_t memory_total;
enum ggml_backend_dev_type type;
// device type
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 37ee2a6d..57eae461 100644
index fdf8c63d..ad389ece 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -179,6 +179,51 @@ static int ggml_cuda_parse_id(char devName[]) {
@@ -183,6 +183,51 @@ static int ggml_cuda_parse_id(char devName[]) {
}
#endif // defined(GGML_USE_HIP)
@ -77,9 +77,9 @@ index 37ee2a6d..57eae461 100644
+}
+
static ggml_cuda_device_info ggml_cuda_init() {
#if defined(GGML_USE_HIP)
// Workaround for a rocBLAS bug when using multiple graphics cards:
@@ -267,22 +312,24 @@ static ggml_cuda_device_info ggml_cuda_init() {
ggml_cuda_device_info info = {};
@@ -249,22 +294,24 @@ static ggml_cuda_device_info ggml_cuda_init() {
info.devices[id].cc += prop.minor * 0x10;
}
}
@ -107,18 +107,18 @@ index 37ee2a6d..57eae461 100644
+ GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
+ id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
+ ggml_cuda_parse_uuid(prop, id).c_str());
#endif // defined(GGML_USE_HIP)
}
@@ -3144,6 +3191,7 @@ struct ggml_backend_cuda_device_context {
int device;
std::string device_name(prop.name);
if (device_name == "NVIDIA GeForce MX450") {
turing_devices_without_mma.push_back({ id, device_name });
@@ -3273,6 +3320,7 @@ struct ggml_backend_cuda_device_context {
std::string name;
std::string description;
std::string pci_bus_id;
+ std::string id;
};
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -3156,6 +3204,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
@@ -3285,6 +3333,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
return ctx->description.c_str();
}
@ -130,31 +130,31 @@ index 37ee2a6d..57eae461 100644
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
ggml_cuda_set_device(ctx->device);
@@ -3170,6 +3223,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
@@ -3301,6 +3354,7 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
props->name = ggml_backend_cuda_device_get_name(dev);
props->description = ggml_backend_cuda_device_get_description(dev);
+ props->id = ggml_backend_cuda_device_get_id(dev);
props->type = ggml_backend_cuda_device_get_type(dev);
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
@@ -3767,6 +3821,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
@@ -3871,6 +3925,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
dev_ctx->description = prop.name;
+ dev_ctx->id = ggml_cuda_parse_uuid(prop, i);
ggml_backend_dev_t dev = new ggml_backend_device {
/* .iface = */ ggml_backend_cuda_device_interface,
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index 7bccc7bf..fe7b2f0a 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -6522,6 +6522,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
char pci_bus_id[16] = {};
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
index 909e17de..08ab4fc9 100644
--- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -538,6 +538,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
props->name = ggml_backend_metal_device_get_name(dev);
props->description = ggml_backend_metal_device_get_description(dev);
+ props->id = "0";
props->type = ggml_backend_metal_device_get_type(dev);
ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
props->caps = (struct ggml_backend_dev_caps) {

View File

@ -10,7 +10,7 @@ Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
2 files changed, 13 insertions(+)
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index a05373d5..6f70f7f4 100644
index cd022c5e..3d680945 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -79,6 +79,16 @@ enum mtmd_slice_tmpl {

View File

@ -8,10 +8,10 @@ Subject: [PATCH] no power throttling win32 with gnuc
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index a5689c18..85af19a3 100644
index f8574d01..530efce0 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -2412,7 +2412,7 @@ static bool ggml_thread_apply_priority(int32_t prio) {
@@ -2431,7 +2431,7 @@ static bool ggml_thread_apply_priority(int32_t prio) {
// Newer Windows 11 versions aggresively park (offline) CPU cores and often place
// all our threads onto the first 4 cores which results in terrible performance with
// n_threads > 4

View File

@ -5,23 +5,24 @@ Subject: [PATCH] BF16 macos version guard
Only enable BF16 on supported MacOS versions (v14+)
---
ggml/src/ggml-metal/ggml-metal.m | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
ggml/src/ggml-metal/ggml-metal-context.m | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index fe7b2f0a..e4c31268 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -106,7 +106,11 @@ static id<MTLDevice> ggml_backend_metal_device_acq(struct ggml_backend_metal_dev
ctx->has_bfloat |= [ctx->mtl_device supportsFamily:MTLGPUFamilyApple6];
diff --git a/ggml/src/ggml-metal/ggml-metal-context.m b/ggml/src/ggml-metal/ggml-metal-context.m
index 052efb7a..b47dc787 100644
--- a/ggml/src/ggml-metal/ggml-metal-context.m
+++ b/ggml/src/ggml-metal/ggml-metal-context.m
@@ -125,7 +125,12 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
#if defined(GGML_METAL_USE_BF16)
- ctx->use_bfloat = ctx->has_bfloat;
res->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
- res->use_bfloat = props_dev->has_bfloat;
+ if (@available(macOS 14.0, *)) {
+ ctx->use_bfloat = ctx->has_bfloat;
+ res->use_bfloat = props_dev->has_bfloat;
+ } else {
+ ctx->use_bfloat = false;
+ res->use_bfloat = false;
+ }
#else
ctx->use_bfloat = false;
#endif
+
res->use_fusion = getenv("GGML_METAL_FUSION_DISABLE") == nil;
res->use_concurrency = getenv("GGML_METAL_CONCURRENCY_DISABLE") == nil;

View File

@ -13,10 +13,10 @@ checks.
1 file changed, 18 insertions(+)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 57eae461..c7f9dc3a 100644
index ad389ece..e51c5035 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2671,12 +2671,24 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
@@ -2686,14 +2686,26 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
// Loop over nodes in GGML graph to obtain info needed for CUDA graph
cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
@ -36,12 +36,14 @@ index 57eae461..c7f9dc3a 100644
const std::string ffn_moe_gate_bias_prefix = "ffn_moe_gate_biased";
const std::string ffn_moe_up_bias_prefix = "ffn_moe_up_biased";
const std::string ffn_moe_down_bias_prefix = "ffn_moe_down_biased";
const std::string nemotron_h_block_out_prefix = "nemotron_h_block_out";
const std::string mamba2_y_add_d_prefix = "mamba2_y_add_d";
+
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
@@ -2700,6 +2712,12 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
@@ -2717,6 +2729,12 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
if (node->op == GGML_OP_ADD &&
node->src[1] && node->src[1]->ne[1] > 1 &&

View File

@ -8,10 +8,10 @@ Subject: [PATCH] Disable ggml-blas on macos v13 and older
1 file changed, 5 insertions(+)
diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp
index aeac2e57..40738d5b 100644
index 5b888cdd..2a9ff7f6 100644
--- a/ggml/src/ggml-blas/ggml-blas.cpp
+++ b/ggml/src/ggml-blas/ggml-blas.cpp
@@ -505,6 +505,11 @@ static const struct ggml_backend_reg_i ggml_backend_blas_reg_i = {
@@ -506,6 +506,11 @@ static const struct ggml_backend_reg_i ggml_backend_blas_reg_i = {
};
ggml_backend_reg_t ggml_backend_blas_reg(void) {

View File

@ -16,10 +16,10 @@ must be recreated with no-alloc set to false before loading data.
5 files changed, 310 insertions(+), 44 deletions(-)
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 2773cc310..ae94887dd 100644
index 48777212..d4352663 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -291,6 +291,7 @@ extern "C" {
@@ -303,6 +303,7 @@ extern "C" {
// Initialize a backend scheduler, backends with low index are given priority over backends with high index
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload);
@ -28,7 +28,7 @@ index 2773cc310..ae94887dd 100644
// Initialize backend buffers from a measure graph
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
index c36c12d65..369e9e25a 100644
index 07784d6f..869dc07d 100644
--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
@@ -26,12 +26,17 @@ extern "C" {
@ -57,10 +57,10 @@ index c36c12d65..369e9e25a 100644
};
GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
@@ -114,6 +120,16 @@ extern "C" {
void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);
// wait for an event on on a different stream
void (*event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
@@ -117,6 +123,16 @@ extern "C" {
// (optional) sort/optimize the nodes in the graph
void (*graph_optimize) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+
+ // (optional) reserves intermediate buffers needed for the compution
+ // if alloc is true, memory is actually allocated, otherwise the required amount is just returned by buffer_size
@ -75,7 +75,7 @@ index c36c12d65..369e9e25a 100644
struct ggml_backend {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index d02a40e60..6b4dee4c7 100644
index cb2b9956..6ef5eeaf 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -41,6 +41,19 @@ ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t
@ -95,10 +95,10 @@ index d02a40e60..6b4dee4c7 100644
+ return buf;
+ }
+
GGML_ASSERT(buft);
return buft->iface.alloc_buffer(buft, size);
}
@@ -89,7 +102,8 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
@@ -95,7 +108,8 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
/* .buft = */ buft,
/* .context = */ context,
/* .size = */ size,
@ -108,7 +108,7 @@ index d02a40e60..6b4dee4c7 100644
};
return buffer;
@@ -119,6 +133,12 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -127,6 +141,12 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
return NULL;
}
@ -121,7 +121,7 @@ index d02a40e60..6b4dee4c7 100644
void * base = buffer->iface.get_base(buffer);
GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
@@ -663,6 +683,12 @@ struct ggml_backend_sched {
@@ -723,6 +743,12 @@ struct ggml_backend_sched {
bool op_offload;
int debug;
@ -134,7 +134,7 @@ index d02a40e60..6b4dee4c7 100644
};
#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
@@ -1449,6 +1475,17 @@ ggml_backend_sched_t ggml_backend_sched_new(
@@ -1606,6 +1632,17 @@ ggml_backend_sched_t ggml_backend_sched_new(
size_t graph_size,
bool parallel,
bool op_offload) {
@ -152,7 +152,7 @@ index d02a40e60..6b4dee4c7 100644
GGML_ASSERT(n_backends > 0);
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
@@ -1490,10 +1527,13 @@ ggml_backend_sched_t ggml_backend_sched_new(
@@ -1647,10 +1684,13 @@ ggml_backend_sched_t ggml_backend_sched_new(
sched->events[b][c] = ggml_backend_event_new(backends[b]->device);
}
}
@ -166,7 +166,7 @@ index d02a40e60..6b4dee4c7 100644
ggml_backend_sched_reset(sched);
@@ -1508,6 +1548,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
@@ -1665,6 +1705,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
for (int c = 0; c < sched->n_copies; c++) {
ggml_backend_event_free(sched->events[b][c]);
}
@ -177,7 +177,7 @@ index d02a40e60..6b4dee4c7 100644
}
ggml_gallocr_free(sched->galloc);
ggml_free(sched->ctx);
@@ -1547,6 +1591,24 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
@@ -1708,6 +1752,24 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
return false;
}
@ -202,7 +202,7 @@ index d02a40e60..6b4dee4c7 100644
ggml_backend_sched_reset(sched);
return true;
@@ -1635,7 +1697,13 @@ size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched,
@@ -1813,7 +1875,13 @@ size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched,
int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
@ -218,7 +218,7 @@ index d02a40e60..6b4dee4c7 100644
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 2e5d48797..b915ee1b8 100644
index c4246b65..448badf0 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -35,6 +35,31 @@
@ -253,7 +253,7 @@ index 2e5d48797..b915ee1b8 100644
#define STRINGIZE_IMPL(...) #__VA_ARGS__
#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
@@ -771,6 +796,9 @@ struct ggml_cuda_pool {
@@ -880,6 +905,9 @@ struct ggml_cuda_pool {
virtual void * alloc(size_t size, size_t * actual_size) = 0;
virtual void free(void * ptr, size_t size) = 0;
@ -263,7 +263,7 @@ index 2e5d48797..b915ee1b8 100644
};
template<typename T>
@@ -914,11 +942,11 @@ struct ggml_backend_cuda_context {
@@ -1023,11 +1051,11 @@ struct ggml_backend_cuda_context {
// pool
std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES];
@ -277,7 +277,7 @@ index 2e5d48797..b915ee1b8 100644
}
return *pools[device];
}
@@ -926,4 +954,20 @@ struct ggml_backend_cuda_context {
@@ -1035,4 +1063,20 @@ struct ggml_backend_cuda_context {
ggml_cuda_pool & pool() {
return pool(device);
}
@ -299,7 +299,7 @@ index 2e5d48797..b915ee1b8 100644
+ }
};
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index c7f9dc3a5..d5abe09e0 100644
index e51c5035..d324bc68 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -350,6 +350,8 @@ const ggml_cuda_device_info & ggml_cuda_info() {
@ -540,7 +540,7 @@ index c7f9dc3a5..d5abe09e0 100644
};
ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
@@ -2936,6 +2998,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
@@ -3008,6 +3070,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
@ -548,7 +548,7 @@ index c7f9dc3a5..d5abe09e0 100644
// flag used to determine whether it is an integrated_gpu
const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;
@@ -2951,6 +3014,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
@@ -3023,6 +3086,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
continue;
}
@ -559,8 +559,8 @@ index c7f9dc3a5..d5abe09e0 100644
+
static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
if (!disable_fusion) {
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL }, {})) {
@@ -3022,6 +3090,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
@@ -3149,6 +3217,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
@ -568,7 +568,7 @@ index c7f9dc3a5..d5abe09e0 100644
ggml_cuda_set_device(cuda_ctx->device);
@@ -3101,6 +3170,71 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
@@ -3228,6 +3297,71 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
return GGML_STATUS_SUCCESS;
}
@ -640,10 +640,10 @@ index c7f9dc3a5..d5abe09e0 100644
static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
@@ -3140,6 +3274,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
/* .graph_compute = */ ggml_backend_cuda_graph_compute,
@@ -3268,6 +3402,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
/* .event_record = */ ggml_backend_cuda_event_record,
/* .event_wait = */ ggml_backend_cuda_event_wait,
/* .graph_optimize = */ NULL,
+ /* .graph_reserve = */ ggml_backend_cuda_graph_reserve,
+ /* .buffer_size = */ ggml_backend_cuda_buffer_size,
+ /* .reset = */ ggml_backend_cuda_reset,

View File

@ -8,10 +8,10 @@ Subject: [PATCH] decode: disable output_all
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 26a5cf9c..6ece5263 100644
index d8a8b5e6..09247cef 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -962,8 +962,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
@@ -974,8 +974,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
const int64_t n_vocab = vocab.n_tokens();
const int64_t n_embd = hparams.n_embd;

View File

@ -15,10 +15,10 @@ unused then it can be reset to free these data structures.
5 files changed, 29 insertions(+), 2 deletions(-)
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index b602a7c78..fda5ceb24 100644
index d4352663..0a2dae26 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -167,6 +167,7 @@ extern "C" {
@@ -178,6 +178,7 @@ extern "C" {
GGML_API void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props);
GGML_API ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device);
GGML_API ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params);
@ -27,10 +27,10 @@ index b602a7c78..fda5ceb24 100644
GGML_API ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device);
GGML_API ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size);
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
index 81749a5a3..6f10c353b 100644
index 869dc07d..4889df79 100644
--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
@@ -178,6 +178,10 @@ extern "C" {
@@ -195,6 +195,10 @@ extern "C" {
ggml_backend_event_t (*event_new) (ggml_backend_dev_t dev);
void (*event_free) (ggml_backend_dev_t dev, ggml_backend_event_t event);
void (*event_synchronize) (ggml_backend_dev_t dev, ggml_backend_event_t event);
@ -42,10 +42,10 @@ index 81749a5a3..6f10c353b 100644
struct ggml_backend_device {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 05a842ed5..6556943b0 100644
index 6ef5eeaf..0b757af5 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -477,6 +477,14 @@ ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * par
@@ -526,6 +526,14 @@ ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * par
return device->iface.init_backend(device, params);
}
@ -58,13 +58,13 @@ index 05a842ed5..6556943b0 100644
+}
+
ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) {
GGML_ASSERT(device);
return device->iface.get_buffer_type(device);
}
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index c7f9dc3a5..e43fde523 100644
index d324bc68..531d6e27 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -103,6 +103,11 @@ int ggml_cuda_get_device() {
@@ -107,6 +107,11 @@ int ggml_cuda_get_device() {
return id;
}
@ -76,10 +76,10 @@ index c7f9dc3a5..e43fde523 100644
static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
ggml_cuda_set_device(device);
cudaError_t err;
@@ -3243,7 +3248,10 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
props->description = ggml_backend_cuda_device_get_description(dev);
@@ -3512,7 +3517,10 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
props->id = ggml_backend_cuda_device_get_id(dev);
props->type = ggml_backend_cuda_device_get_type(dev);
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
- ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
+
+ // Memory reporting is disabled to avoid allocation of a CUDA primary context (~300 MB per device).
@ -88,7 +88,7 @@ index c7f9dc3a5..e43fde523 100644
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
#ifdef GGML_CUDA_NO_PEER_COPY
@@ -3700,6 +3708,11 @@ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g
@@ -3945,6 +3953,11 @@ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g
CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
}
@ -100,7 +100,7 @@ index c7f9dc3a5..e43fde523 100644
static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
/* .get_name = */ ggml_backend_cuda_device_get_name,
/* .get_description = */ ggml_backend_cuda_device_get_description,
@@ -3716,6 +3729,7 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
@@ -3961,6 +3974,7 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
/* .event_new = */ ggml_backend_cuda_device_event_new,
/* .event_free = */ ggml_backend_cuda_device_event_free,
/* .event_synchronize = */ ggml_backend_cuda_device_event_synchronize,
@ -108,7 +108,7 @@ index c7f9dc3a5..e43fde523 100644
};
// backend reg
@@ -3835,7 +3849,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
@@ -4076,7 +4090,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
dev_ctx->device = i;
dev_ctx->name = GGML_CUDA_NAME + std::to_string(i);
@ -117,10 +117,10 @@ index c7f9dc3a5..e43fde523 100644
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
dev_ctx->description = prop.name;
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
index c31f31923..cf22e60d2 100644
index 37386afc..06f9e7c1 100644
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -40,6 +40,7 @@
@@ -41,6 +41,7 @@
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
#define cudaDeviceProp hipDeviceProp_t

View File

@ -8,23 +8,23 @@ management libraries for more accurate VRAM usage reporting if available.
---
ggml/include/ggml-backend.h | 9 +
ggml/src/CMakeLists.txt | 2 +
ggml/src/ggml-cuda/ggml-cuda.cu | 75 +++++-
ggml/src/ggml-cuda/vendors/hip.h | 1 +
ggml/src/ggml-cuda/ggml-cuda.cu | 72 +++++
ggml/src/ggml-cuda/vendors/hip.h | 4 +
ggml/src/ggml-impl.h | 8 +
ggml/src/ggml-metal/ggml-metal.m | 2 +
ggml/src/mem_hip.cpp | 449 +++++++++++++++++++++++++++++++
ggml/src/mem_nvml.cpp | 172 ++++++++++++
8 files changed, 717 insertions(+), 1 deletion(-)
ggml/src/ggml-metal/ggml-metal.cpp | 3 +-
ggml/src/mem_hip.cpp | 449 +++++++++++++++++++++++++++++
ggml/src/mem_nvml.cpp | 172 +++++++++++
8 files changed, 718 insertions(+), 1 deletion(-)
create mode 100644 ggml/src/mem_hip.cpp
create mode 100644 ggml/src/mem_nvml.cpp
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index fda5ceb24..7c2d86703 100644
index 0a2dae26..a6bf3378 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -158,6 +158,15 @@ extern "C" {
size_t memory_total;
enum ggml_backend_dev_type type;
@@ -169,6 +169,15 @@ extern "C" {
const char * device_id;
// device capabilities
struct ggml_backend_dev_caps caps;
+ int driver_major;
+ int driver_minor;
@ -39,10 +39,10 @@ index fda5ceb24..7c2d86703 100644
GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device);
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 5158acd6a..3a428a22d 100644
index 33b3a15f..86191ef2 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -203,6 +203,8 @@ add_library(ggml-base
@@ -206,6 +206,8 @@ add_library(ggml-base
ggml-threading.h
ggml-quants.c
ggml-quants.h
@ -52,10 +52,10 @@ index 5158acd6a..3a428a22d 100644
target_include_directories(ggml-base PRIVATE .)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index e43fde523..14baf0fb1 100644
index 531d6e27..3fa3a057 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -279,6 +279,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
@@ -261,6 +261,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
for (int id = 0; id < info.device_count; ++id) {
int device_vmm = 0;
@ -72,7 +72,7 @@ index e43fde523..14baf0fb1 100644
#if defined(GGML_USE_VMM)
CUdevice device;
CU_CHECK(cuDeviceGet(&device, id));
@@ -332,9 +342,15 @@ static ggml_cuda_device_info ggml_cuda_init() {
@@ -314,6 +324,11 @@ static ggml_cuda_device_info ggml_cuda_init() {
#else
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
info.devices[id].cc = 100*prop.major + 10*prop.minor;
@ -84,33 +84,29 @@ index e43fde523..14baf0fb1 100644
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
ggml_cuda_parse_uuid(prop, id).c_str());
+
#endif // defined(GGML_USE_HIP)
}
@@ -3215,6 +3231,14 @@ struct ggml_backend_cuda_device_context {
std::string name;
@@ -3481,6 +3496,14 @@ struct ggml_backend_cuda_device_context {
std::string description;
std::string pci_bus_id;
std::string id;
+ int major;
+ int minor;
+ int driver_major;
+ int driver_minor;
+ int integrated;
+ int pci_bus_id;
+ int pci_device_id;
+ int pci_domain_id;
+ int pciBusID;
+ int pciDeviceID;
+ int pciDomainID;
};
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -3235,6 +3259,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
@@ -3501,6 +3524,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
ggml_cuda_set_device(ctx->device);
+
+#if defined(GGML_USE_HIP)
+ if (ggml_hip_mgmt_init() == 0) {
+ int status = ggml_hip_get_device_memory(ctx->pci_bus_id, ctx->pci_device_id, free, total);
+ int status = ggml_hip_get_device_memory(ctx->pciBusID, ctx->pciDeviceID, free, total);
+ if (status == 0) {
+ GGML_LOG_DEBUG("%s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, *free, *total);
+ ggml_hip_mgmt_release();
@ -132,19 +128,18 @@ index e43fde523..14baf0fb1 100644
CUDA_CHECK(cudaMemGetInfo(free, total));
}
@@ -3243,6 +3289,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
@@ -3509,6 +3554,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
return GGML_BACKEND_DEVICE_TYPE_GPU;
}
+#define GGML_HIP_NAME "HIP"
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
props->name = ggml_backend_cuda_device_get_name(dev);
props->description = ggml_backend_cuda_device_get_description(dev);
@@ -3253,6 +3300,23 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
@@ -3522,6 +3568,22 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
// If you need the memory data, call ggml_backend_dev_memory() explicitly.
props->memory_total = props->memory_free = 0;
+ ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
+#if defined(GGML_USE_HIP)
+ int cc = ggml_cuda_info().devices[ctx->device].cc - GGML_CUDA_CC_OFFSET_AMD;
+ props->compute_major = cc / 0x100;
@ -156,15 +151,15 @@ index e43fde523..14baf0fb1 100644
+ props->driver_major = ctx->driver_major;
+ props->driver_minor = ctx->driver_minor;
+ props->integrated = ctx->integrated;
+ props->pci_bus_id = ctx->pci_bus_id;
+ props->pci_device_id = ctx->pci_device_id;
+ props->pci_domain_id = ctx->pci_domain_id;
+ props->pci_bus_id = ctx->pciBusID;
+ props->pci_device_id = ctx->pciDeviceID;
+ props->pci_domain_id = ctx->pciDomainID;
+ props->library = GGML_CUDA_NAME;
+
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
#ifdef GGML_CUDA_NO_PEER_COPY
bool events = false;
@@ -3843,6 +3907,8 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
@@ -4084,6 +4146,8 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
std::lock_guard<std::mutex> lock(mutex);
if (!initialized) {
ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
@ -173,27 +168,36 @@ index e43fde523..14baf0fb1 100644
for (int i = 0; i < ggml_cuda_info().device_count; i++) {
ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
@@ -3853,7 +3919,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
dev_ctx->description = prop.name;
dev_ctx->id = ggml_cuda_parse_uuid(prop, i);
-
@@ -4099,6 +4163,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
dev_ctx->pci_bus_id = pci_bus_id;
+ dev_ctx->major = prop.major;
+ dev_ctx->minor = prop.minor;
+ dev_ctx->driver_major = driverVersion / 1000;
+ dev_ctx->driver_minor = (driverVersion - (dev_ctx->driver_major * 1000)) / 10;
+ dev_ctx->integrated = prop.integrated;
+ dev_ctx->pci_bus_id = prop.pciBusID;
+ dev_ctx->pci_device_id = prop.pciDeviceID;
+ dev_ctx->pci_domain_id = prop.pciDomainID;
+ dev_ctx->pciBusID = prop.pciBusID;
+ dev_ctx->pciDeviceID = prop.pciDeviceID;
+ dev_ctx->pciDomainID = prop.pciDomainID;
ggml_backend_dev_t dev = new ggml_backend_device {
/* .iface = */ ggml_backend_cuda_device_interface,
/* .reg = */ &reg,
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
index cf22e60d2..957a795f2 100644
index 06f9e7c1..eb8f66cb 100644
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -42,6 +42,7 @@
@@ -5,6 +5,9 @@
#include <hipblas/hipblas.h>
#include <hip/hip_fp16.h>
#include <hip/hip_bf16.h>
+// for rocblas_initialize()
+#include "rocblas/rocblas.h"
+
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
@@ -43,6 +46,7 @@
#define cudaDeviceProp hipDeviceProp_t
#define cudaDeviceReset hipDeviceReset
#define cudaDeviceSynchronize hipDeviceSynchronize
@ -202,11 +206,11 @@ index cf22e60d2..957a795f2 100644
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index 19a7adb2d..b9b102a5e 100644
index 86a1ebf6..9fc9fbfc 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -602,6 +602,14 @@ static inline bool ggml_can_fuse(const struct ggml_cgraph * cgraph, int node_idx
return true;
@@ -635,6 +635,14 @@ static inline bool ggml_can_fuse(const struct ggml_cgraph * cgraph, int node_idx
return ggml_can_fuse_ext(cgraph, idxs, ops, num_ops);
}
+// Management libraries for fetching more accurate free VRAM data
@ -220,28 +224,30 @@ index 19a7adb2d..b9b102a5e 100644
#ifdef __cplusplus
}
#endif
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index e4c31268f..ec6b385ba 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -6523,12 +6523,14 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
index 08ab4fc9..17999a61 100644
--- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -535,6 +535,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
GGML_UNUSED(dev);
}
+#define GGML_METAL_NAME "Metal"
static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
props->name = ggml_backend_metal_device_get_name(dev);
props->description = ggml_backend_metal_device_get_description(dev);
props->id = "0";
@@ -542,7 +543,7 @@ static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, ggml_bac
props->type = ggml_backend_metal_device_get_type(dev);
ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
-
+ props->library = GGML_METAL_NAME;
props->caps = (struct ggml_backend_dev_caps) {
/* .async = */ false,
props->caps = {
/* .async = */ true,
/* .host_buffer = */ false,
diff --git a/ggml/src/mem_hip.cpp b/ggml/src/mem_hip.cpp
new file mode 100644
index 000000000..8ef19b8cf
index 00000000..8ef19b8c
--- /dev/null
+++ b/ggml/src/mem_hip.cpp
@@ -0,0 +1,449 @@
@ -697,7 +703,7 @@ index 000000000..8ef19b8cf
\ No newline at end of file
diff --git a/ggml/src/mem_nvml.cpp b/ggml/src/mem_nvml.cpp
new file mode 100644
index 000000000..aa05e9dc1
index 00000000..aa05e9dc
--- /dev/null
+++ b/ggml/src/mem_nvml.cpp
@@ -0,0 +1,172 @@

View File

@ -1,57 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@ollama.com>
Date: Tue, 23 Sep 2025 15:41:58 -0700
Subject: [PATCH] ggml: Backport scale kernel fixes
The GGML scale kernel uses signed 32-bit ints to represent
the number of elements in the tensor. For large images,
mistral-small3.2 overflows this, triggering CUDA errors due
to negative arguments.
Currently, this can happen when the user passes a large image
to mistral-small3.2. However, with upcoming changes to reserve
CUDA memory, it happens every time mistral-small is loaded as
we reserve using a worst case batch.
This patch is part of an upstream GGML commit and should be removed
after GGML is updated past 0a1b398 "ggml: add ops for WAN video model
(cuda && cpu) (#15669)".
Fixes #10388
---
ggml/src/ggml-cuda/scale.cu | 19 ++++++++++---------
1 file changed, 10 insertions(+), 9 deletions(-)
diff --git a/ggml/src/ggml-cuda/scale.cu b/ggml/src/ggml-cuda/scale.cu
index 2ee9e5889..0ddeff6a1 100644
--- a/ggml/src/ggml-cuda/scale.cu
+++ b/ggml/src/ggml-cuda/scale.cu
@@ -1,18 +1,19 @@
#include "scale.cuh"
-static __global__ void scale_f32(const float * x, float * dst, const float scale, const float bias, const int k) {
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
+#define MAX_GRIDDIM_X 0x7FFFFFFF
- if (i >= k) {
- return;
- }
+static __global__ void scale_f32(const float * x, float * dst, const float scale, const float bias, const int64_t nelements) {
+ int64_t tid = (int64_t)blockIdx.x * (int64_t)blockDim.x + (int64_t)threadIdx.x;
+ int64_t stride = (int64_t)blockDim.x * (int64_t)gridDim.x;
- dst[i] = scale * x[i] + bias;
+ for (int64_t i = tid; i < nelements; i += stride) {
+ dst[i] = scale * x[i] + bias;
+ }
}
-static void scale_f32_cuda(const float * x, float * dst, const float scale, const float bias, const int k, cudaStream_t stream) {
- const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
- scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, bias, k);
+static void scale_f32_cuda(const float * x, float * dst, const float scale, const float bias, const int64_t nelements, cudaStream_t stream) {
+ const int64_t num_blocks = (nelements + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
+ scale_f32<<<MIN(MAX_GRIDDIM_X, num_blocks), CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, bias, nelements);
}
void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

View File

@ -132,6 +132,8 @@ extern "C" {
GGML_BACKEND_DEVICE_TYPE_CPU,
// GPU device using dedicated memory
GGML_BACKEND_DEVICE_TYPE_GPU,
// integrated GPU device using host memory
GGML_BACKEND_DEVICE_TYPE_IGPU,
// accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
GGML_BACKEND_DEVICE_TYPE_ACCEL
};
@ -150,12 +152,22 @@ extern "C" {
// all the device properties
struct ggml_backend_dev_props {
// device name
const char * name;
// device description
const char * description;
const char * id;
// device free memory in bytes
size_t memory_free;
const char * id;
// device total memory in bytes
size_t memory_total;
// device type
enum ggml_backend_dev_type type;
// device id
// for PCI devices, this should be the PCI bus id formatted as "domain:bus:device.function" (e.g. "0000:01:00.0")
// if the id is unknown, this should be NULL
const char * device_id;
// device capabilities
struct ggml_backend_dev_caps caps;
int driver_major;
int driver_minor;
@ -314,12 +326,16 @@ extern "C" {
GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
GGML_API ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend);
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
GGML_API size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
// Split graph without allocating it
GGML_API void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
// Allocate and compute graph on the backend scheduler
GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); // returns success
GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);

View File

@ -101,7 +101,6 @@ extern "C" {
GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
GGML_BACKEND_API int ggml_cpu_has_vsx (void);
GGML_BACKEND_API int ggml_cpu_has_vxe (void);
GGML_BACKEND_API int ggml_cpu_has_nnpa (void);
GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
@ -135,6 +134,7 @@ extern "C" {
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
GGML_BACKEND_API void ggml_cpu_fp32_to_fp32(const float *, float *, int64_t);
GGML_BACKEND_API void ggml_cpu_fp32_to_i32 (const float *, int32_t *, int64_t);
GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);

View File

@ -39,18 +39,13 @@ extern "C" {
// user-code should use only these functions
//
// TODO: remove in the future
GGML_BACKEND_API ggml_backend_t ggml_backend_metal_init(void);
GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend);
GGML_DEPRECATED(
GGML_BACKEND_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size),
"obsoleted by the new device interface - https://github.com/ggml-org/llama.cpp/pull/9713");
GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
// helper to check if the device supports a specific family
// ideally, the user code should be doing these checks
// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf

View File

@ -74,16 +74,26 @@ extern "C" {
GGML_OPT_BUILD_TYPE_OPT = 30,
};
enum ggml_opt_optimizer_type {
GGML_OPT_OPTIMIZER_TYPE_ADAMW,
GGML_OPT_OPTIMIZER_TYPE_SGD,
GGML_OPT_OPTIMIZER_TYPE_COUNT
};
// parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
struct ggml_opt_optimizer_params {
// AdamW optimizer parameters
struct {
float alpha; // learning rate
float beta1;
float beta2;
float beta1; // first AdamW momentum
float beta2; // second AdamW momentum
float eps; // epsilon for numerical stability
float wd; // weight decay for AdamW, use 0.0f to disable
float wd; // weight decay - 0.0f to disable
} adamw;
struct {
float alpha; // learning rate
float wd; // weight decay
} sgd;
};
// callback to calculate optimizer parameters prior to a backward pass
@ -114,6 +124,9 @@ extern "C" {
ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
void * get_opt_pars_ud; // userdata for calculating optimizer parameters
// only GGML_OPT_OPTIMIZER_TYPE_ADAMW needs m, v momenta per parameter tensor
enum ggml_opt_optimizer_type optimizer;
};
// get parameters for an optimization context with defaults set where possible
@ -142,6 +155,10 @@ extern "C" {
// get the gradient accumulator for a node from the forward graph
GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node);
GGML_API enum ggml_opt_optimizer_type ggml_opt_context_optimizer_type(ggml_opt_context_t); //TODO consistent naming scheme
GGML_API const char * ggml_opt_optimizer_name(enum ggml_opt_optimizer_type);
// ====== Optimization Result ======
GGML_API ggml_opt_result_t ggml_opt_result_init(void);
@ -226,12 +243,14 @@ extern "C" {
struct ggml_tensor * outputs, // output tensor, must have shape [ne_label, ndata_batch] if labels are used
ggml_opt_dataset_t dataset, // dataset with data and optionally also labels
enum ggml_opt_loss_type loss_type, // loss to minimize
enum ggml_opt_optimizer_type optimizer, // sgd or adamw
ggml_opt_get_optimizer_params get_opt_pars, // callback to get optimizer params, userdata is pointer to epoch (of type int64_t)
int64_t nepoch, // how many times the dataset should be iterated over
int64_t nbatch_logical, // datapoints optimizer step, must be a multiple of ndata_batch in inputs/outputs
float val_split, // fraction of the dataset to use for validation, must be in [0.0f, 1.0f)
bool silent); // whether or not info prints to stderr should be suppressed
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,17 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
// device buffer
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_zdnn_buffer_type(void);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_zdnn_reg(void);
#ifdef __cplusplus
}
#endif

View File

@ -241,7 +241,16 @@
#define GGML_ROPE_TYPE_MROPE 8
#define GGML_ROPE_TYPE_VISION 24
#define GGML_MROPE_SECTIONS 4
#define GGML_UNUSED(x) (void)(x)
#ifdef __CUDACC__
template<typename... Args>
__host__ __device__ constexpr inline void ggml_unused_vars_impl(Args&&...) noexcept {}
#define GGML_UNUSED_VARS(...) ggml_unused_vars_impl(__VA_ARGS__)
#else
#define GGML_UNUSED_VARS(...) do { (void)sizeof((__VA_ARGS__, 0)); } while(0)
#endif // __CUDACC__
#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
@ -275,19 +284,19 @@
// GGML_TENSOR_LOCALS(size_t, nb1, src1, nb);
//
#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
const type prefix##0 = (pointer)->array[0]; \
const type prefix##0 = (pointer) ? (pointer)->array[0] : 0; \
GGML_UNUSED(prefix##0);
#define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \
GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \
const type prefix##1 = (pointer)->array[1]; \
const type prefix##1 = (pointer) ? (pointer)->array[1] : 0; \
GGML_UNUSED(prefix##1);
#define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \
const type prefix##2 = (pointer)->array[2]; \
const type prefix##2 = (pointer) ? (pointer)->array[2] : 0; \
GGML_UNUSED(prefix##2);
#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \
const type prefix##3 = (pointer)->array[3]; \
const type prefix##3 = (pointer) ? (pointer)->array[3] : 0; \
GGML_UNUSED(prefix##3);
#define GGML_TENSOR_UNARY_OP_LOCALS \
@ -502,7 +511,9 @@ extern "C" {
GGML_OP_CONV_TRANSPOSE_1D,
GGML_OP_IM2COL,
GGML_OP_IM2COL_BACK,
GGML_OP_IM2COL_3D,
GGML_OP_CONV_2D,
GGML_OP_CONV_3D,
GGML_OP_CONV_2D_DW,
GGML_OP_CONV_TRANSPOSE_2D,
GGML_OP_POOL_1D,
@ -540,6 +551,7 @@ extern "C" {
GGML_OP_CROSS_ENTROPY_LOSS,
GGML_OP_CROSS_ENTROPY_LOSS_BACK,
GGML_OP_OPT_STEP_ADAMW,
GGML_OP_OPT_STEP_SGD,
GGML_OP_GLU,
@ -1392,6 +1404,7 @@ extern "C" {
struct ggml_tensor * a,
struct ggml_tensor * b);
// note: casting from f32 to i32 will discard the fractional part
GGML_API struct ggml_tensor * ggml_cast(
struct ggml_context * ctx,
struct ggml_tensor * a,
@ -1516,7 +1529,11 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a);
// supports 3D: a->ne[2] == b->ne[1]
// supports 4D a:
// a [n_embd, ne1, ne2, ne3]
// b I32 [n_rows, ne2, ne3, 1]
//
// return [n_embd, n_rows, ne2, ne3]
GGML_API struct ggml_tensor * ggml_get_rows(
struct ggml_context * ctx,
struct ggml_tensor * a, // data
@ -1660,7 +1677,7 @@ extern "C" {
struct ggml_tensor * b,
struct ggml_tensor * c,
int n_dims,
int sections[4],
int sections[GGML_MROPE_SECTIONS],
int mode,
int n_ctx_orig,
float freq_base,
@ -1686,6 +1703,22 @@ extern "C" {
float beta_fast,
float beta_slow);
GGML_API struct ggml_tensor * ggml_rope_multi_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c,
int n_dims,
int sections[GGML_MROPE_SECTIONS],
int mode,
int n_ctx_orig,
float freq_base,
float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow);
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom(
struct ggml_context * ctx,
struct ggml_tensor * a,
@ -1843,6 +1876,41 @@ extern "C" {
int d0, // dilation dimension 0
int d1); // dilation dimension 1
GGML_API struct ggml_tensor * ggml_im2col_3d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int64_t IC,
int s0, // stride width
int s1, // stride height
int s2, // stride depth
int p0, // padding width
int p1, // padding height
int p2, // padding depth
int d0, // dilation width
int d1, // dilation height
int d2, // dilation depth
enum ggml_type dst_type);
// a: [OC*IC, KD, KH, KW]
// b: [N*IC, ID, IH, IW]
// result: [N*OC, OD, OH, OW]
GGML_API struct ggml_tensor * ggml_conv_3d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int64_t IC,
int s0, // stride width
int s1, // stride height
int s2, // stride depth
int p0, // padding width
int p1, // padding height
int p2, // padding depth
int d0, // dilation width
int d1, // dilation height
int d2 // dilation depth
);
// kernel size is a->ne[0] x a->ne[1]
// stride is equal to kernel size
// padding is zero
@ -1914,6 +1982,23 @@ extern "C" {
int d0, // dilation dimension 0
int d1); // dilation dimension 1
GGML_API struct ggml_tensor * ggml_conv_3d_direct(
struct ggml_context * ctx,
struct ggml_tensor * a, // kernel [KW, KH, KD, IC * OC]
struct ggml_tensor * b, // input [W, H, D, C * N]
int s0, // stride
int s1,
int s2,
int p0, // padding
int p1,
int p2,
int d0, // dilation
int d1,
int d2,
int n_channels,
int n_batch,
int n_channels_out);
enum ggml_op_pool {
GGML_OP_POOL_MAX,
GGML_OP_POOL_AVG,
@ -2004,6 +2089,19 @@ extern "C" {
int p2,
int p3);
GGML_API struct ggml_tensor * ggml_pad_ext(
struct ggml_context * ctx,
struct ggml_tensor * a,
int lp0,
int rp0,
int lp1,
int rp1,
int lp2,
int rp2,
int lp3,
int rp3
);
// pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
GGML_API struct ggml_tensor * ggml_pad_reflect_1d(
struct ggml_context * ctx,
@ -2293,7 +2391,14 @@ extern "C" {
struct ggml_tensor * grad,
struct ggml_tensor * m,
struct ggml_tensor * v,
struct ggml_tensor * adamw_params); // parameters such a the learning rate
struct ggml_tensor * adamw_params); // parameters such as the learning rate
// stochastic gradient descent step (with weight decay)
GGML_API struct ggml_tensor * ggml_opt_step_sgd(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * grad,
struct ggml_tensor * sgd_params); // alpha, weight decay
//
// automatic differentiation

View File

@ -114,6 +114,9 @@ message(STATUS "GGML_SYSTEM_ARCH: ${GGML_SYSTEM_ARCH}")
if (NOT MSVC)
if (GGML_STATIC)
if (UNIX AND NOT APPLE)
set(CMAKE_FIND_LIBRARY_SUFFIXES ".a;.so")
endif()
add_link_options(-static)
if (MINGW)
add_link_options(-static-libgcc -static-libstdc++)
@ -382,6 +385,7 @@ ggml_add_backend(RPC)
ggml_add_backend(SYCL)
ggml_add_backend(Vulkan)
ggml_add_backend(WebGPU)
ggml_add_backend(zDNN)
ggml_add_backend(OpenCL)
foreach (target ggml-base ggml)

View File

@ -23,7 +23,7 @@ static bool ggml_is_view(const struct ggml_tensor * t) {
}
// ops that return true for this function must not use restrict pointers for their backend implementations
static bool ggml_op_can_inplace(enum ggml_op op) {
bool ggml_op_can_inplace(enum ggml_op op) {
switch (op) {
case GGML_OP_SCALE:
case GGML_OP_DIAG_MASK_ZERO:
@ -95,39 +95,104 @@ enum ggml_status ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_te
// dynamic tensor allocator
#define GGML_VBUFFER_MAX_CHUNKS 16
// relative memory address within an allocation that can be split into multiple buffers (chunks)
struct buffer_address {
int chunk; // index of a backend buffer
size_t offset; // local memory offset within the buffer
};
static const struct buffer_address GGML_BUFFER_ADDRESS_INVALID = { -1, SIZE_MAX };
static bool ggml_buffer_address_less(struct buffer_address a, struct buffer_address b) {
return a.chunk != b.chunk ? a.chunk < b.chunk : a.offset < b.offset;
}
struct free_block {
size_t offset;
size_t size;
};
struct tallocr_chunk {
struct free_block free_blocks[MAX_FREE_BLOCKS];
int n_free_blocks;
size_t max_size;
};
struct ggml_dyn_tallocr {
size_t alignment;
int n_free_blocks;
struct free_block free_blocks[MAX_FREE_BLOCKS];
size_t max_size;
size_t max_chunk_size;
struct tallocr_chunk * chunks[GGML_VBUFFER_MAX_CHUNKS];
int n_chunks;
#ifdef GGML_ALLOCATOR_DEBUG
struct {
const struct ggml_tensor * tensor;
size_t offset;
struct buffer_address addr;
} allocated_tensors[1024];
#endif
};
static void ggml_dyn_tallocr_insert_block(struct tallocr_chunk * chunk, size_t offset, size_t size) {
GGML_ASSERT(chunk->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
// insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
int insert_pos = 0;
while (insert_pos < chunk->n_free_blocks && chunk->free_blocks[insert_pos].offset < offset) {
insert_pos++;
}
// shift all blocks from insert_pos onward to make room for the new block
for (int i = chunk->n_free_blocks; i > insert_pos; i--) {
chunk->free_blocks[i] = chunk->free_blocks[i-1];
}
// insert the new block
chunk->free_blocks[insert_pos].offset = offset;
chunk->free_blocks[insert_pos].size = size;
chunk->n_free_blocks++;
}
static void ggml_dyn_tallocr_remove_block(struct tallocr_chunk * chunk, int idx) {
// shift all elements after idx by 1 to the left, overwriting the element at idx
for (int i = idx; i < chunk->n_free_blocks; i++) {
chunk->free_blocks[i] = chunk->free_blocks[i+1];
}
chunk->n_free_blocks--;
}
static int ggml_dyn_tallocr_new_chunk(struct ggml_dyn_tallocr * alloc, size_t min_size) {
if (alloc->n_chunks >= GGML_VBUFFER_MAX_CHUNKS) {
return -1;
}
struct tallocr_chunk * chunk = calloc(1, sizeof(struct tallocr_chunk));
chunk->n_free_blocks = 1;
chunk->free_blocks[0].offset = 0;
// available space in a chunk is limited to max_chunk_size, but can be higher if:
// 1. a single tensor exceeds the maximum, and cannot fit any other way
// 2. we are running out of chunks
// backends will either manage to allocate the larger size, or report an error.
chunk->free_blocks[0].size = MAX(min_size, alloc->max_chunk_size);
if (alloc->n_chunks == GGML_VBUFFER_MAX_CHUNKS - 1) {
chunk->free_blocks[0].size = SIZE_MAX/2;
}
alloc->chunks[alloc->n_chunks] = chunk;
alloc->n_chunks++;
return alloc->n_chunks - 1;
}
#ifdef GGML_ALLOCATOR_DEBUG
static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, const struct ggml_tensor * tensor) {
for (int i = 0; i < 1024; i++) {
if (alloc->allocated_tensors[i].tensor == NULL) {
alloc->allocated_tensors[i].tensor = tensor;
alloc->allocated_tensors[i].offset = offset;
alloc->allocated_tensors[i].addr = addr;
return;
}
}
GGML_ABORT("out of allocated_tensors");
}
static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, const struct ggml_tensor * tensor) {
for (int i = 0; i < 1024; i++) {
if (alloc->allocated_tensors[i].offset == offset) {
if (alloc->allocated_tensors[i].addr.chunk == addr.chunk && alloc->allocated_tensors[i].addr.offset == addr.offset) {
alloc->allocated_tensors[i].tensor = NULL;
return;
}
@ -136,76 +201,94 @@ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offs
}
#endif
static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
size = aligned_offset(NULL, size, alloc->alignment);
AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
int best_fit_chunk = -1;
int best_fit_block = -1;
size_t max_avail = 0;
// find the best fitting free block besides the last block
int best_fit_block = -1;
// find the best fitting free block besides the last block, within any chunk
for (int c = 0; c < alloc->n_chunks; ++c) {
struct tallocr_chunk * chunk = alloc->chunks[c];
size_t best_fit_size = SIZE_MAX;
for (int i = 0; i < alloc->n_free_blocks - 1; i++) {
struct free_block * block = &alloc->free_blocks[i];
for (int i = 0; i < chunk->n_free_blocks - 1; i++) {
struct free_block * block = &chunk->free_blocks[i];
max_avail = MAX(max_avail, block->size);
if (block->size >= size && block->size <= best_fit_size) {
best_fit_chunk = c;
best_fit_block = i;
best_fit_size = block->size;
}
}
}
if (best_fit_block == -1) {
// the last block is our last resort
struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
// no suitable block found, try the last block (this will grow a chunks size)
for (int c = 0; c < alloc->n_chunks; ++c) {
struct tallocr_chunk * chunk = alloc->chunks[c];
if (chunk->n_free_blocks > 0) {
struct free_block * block = &chunk->free_blocks[chunk->n_free_blocks - 1];
max_avail = MAX(max_avail, block->size);
if (block->size >= size) {
best_fit_block = alloc->n_free_blocks - 1;
} else {
// this should never happen
GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
__func__, size, max_avail);
GGML_ABORT("not enough space in the buffer");
best_fit_chunk = c;
best_fit_block = chunk->n_free_blocks - 1;
break;
}
}
}
}
struct free_block * block = &alloc->free_blocks[best_fit_block];
size_t offset = block->offset;
block->offset = offset + size;
if (best_fit_block == -1) {
// none of the existing chunks have enough space left
best_fit_chunk = ggml_dyn_tallocr_new_chunk(alloc, size);
best_fit_block = 0;
}
if (best_fit_chunk == -1) {
// since the last chunk always has virtually endless memory, this should never happen
GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
__func__, size, max_avail);
GGML_ABORT("graph allocation: failed to reserve memory");
}
struct tallocr_chunk * chunk = alloc->chunks[best_fit_chunk];
struct free_block * block = &chunk->free_blocks[best_fit_block];
struct buffer_address addr = {.chunk = best_fit_chunk, .offset = block->offset };
block->offset += size;
block->size -= size;
if (block->size == 0) {
// remove block if empty
alloc->n_free_blocks--;
for (int j = best_fit_block; j < alloc->n_free_blocks; j++) {
alloc->free_blocks[j] = alloc->free_blocks[j+1];
}
ggml_dyn_tallocr_remove_block(chunk, best_fit_block);
}
AT_PRINTF("block %d, offset %zu\n", best_fit_block, offset);
AT_PRINTF("block %d, offset %zu, chunk %d\n", best_fit_block, addr.offset, addr.chunk);
#ifdef GGML_ALLOCATOR_DEBUG
add_allocated_tensor(alloc, offset, tensor);
size_t cur_max = offset + size;
if (cur_max > alloc->max_size) {
// sort allocated_tensors by offset
add_allocated_tensor(alloc, addr, tensor);
size_t cur_max = addr.offset + size;
if (cur_max > alloc->max_size[addr.chunk]) {
// sort allocated_tensors by chunk/offset
for (int i = 0; i < 1024; i++) {
for (int j = i + 1; j < 1024; j++) {
if (alloc->allocated_tensors[i].offset > alloc->allocated_tensors[j].offset) {
if (ggml_buffer_address_less(alloc->allocated_tensors[j].addr, alloc->allocated_tensors[i].addr)) {
const struct ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor;
size_t tmp_offset = alloc->allocated_tensors[i].offset;
struct buffer_address tmp_addr = alloc->allocated_tensors[i].addr;
alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor;
alloc->allocated_tensors[i].offset = alloc->allocated_tensors[j].offset;
alloc->allocated_tensors[i].addr = alloc->allocated_tensors[j].addr;
alloc->allocated_tensors[j].tensor = tmp_tensor;
alloc->allocated_tensors[j].offset = tmp_offset;
alloc->allocated_tensors[j].addr = tmp_addr;
}
}
}
GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
GGML_LOG_DEBUG("max_size[%d] = %.2f MB: tensors: ", addr.chunk, cur_max / 1024.0 / 1024.0);
for (int i = 0; i < 1024; i++) {
if (alloc->allocated_tensors[i].tensor) {
GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
alloc->allocated_tensors[i].offset,
alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
GGML_LOG_DEBUG("%s [%d: %zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
alloc->allocated_tensors[i].addr.chunk,
alloc->allocated_tensors[i].addr.offset,
alloc->allocated_tensors[i].addr.offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
}
}
@ -213,78 +296,69 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
}
#endif
alloc->max_size = MAX(alloc->max_size, offset + size);
chunk->max_size = MAX(chunk->max_size, addr.offset + size);
return offset;
return addr;
GGML_UNUSED(tensor);
}
// this is a very naive implementation, but for our case the number of free blocks should be very small
static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, size_t size, const struct ggml_tensor * tensor) {
static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size, const struct ggml_tensor * tensor) {
size = aligned_offset(NULL, size, alloc->alignment);
AT_PRINTF("%s: freeing %s at %zu (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, offset, size, alloc->n_free_blocks);
AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n",
__func__, tensor->name, addr.chunk, addr.offset, size, alloc->chunks[addr.chunk]->n_free_blocks);
#ifdef GGML_ALLOCATOR_DEBUG
remove_allocated_tensor(alloc, offset, tensor);
remove_allocated_tensor(alloc, addr, tensor);
#endif
struct tallocr_chunk * chunk = alloc->chunks[addr.chunk];
// see if we can merge with an existing block
for (int i = 0; i < alloc->n_free_blocks; i++) {
struct free_block * block = &alloc->free_blocks[i];
for (int i = 0; i < chunk->n_free_blocks; i++) {
struct free_block * block = &chunk->free_blocks[i];
// check if ptr is at the end of the block
if (block->offset + block->size == offset) {
if (block->offset + block->size == addr.offset) {
block->size += size;
// check if we can merge with the next block
if (i < alloc->n_free_blocks - 1 && block->offset + block->size == alloc->free_blocks[i+1].offset) {
block->size += alloc->free_blocks[i+1].size;
alloc->n_free_blocks--;
for (int j = i+1; j < alloc->n_free_blocks; j++) {
alloc->free_blocks[j] = alloc->free_blocks[j+1];
if (i < chunk->n_free_blocks - 1) {
struct free_block * next = &chunk->free_blocks[i+1];
if (block->offset + block->size == next->offset) {
block->size += next->size;
ggml_dyn_tallocr_remove_block(chunk, i+1);
}
}
return;
}
// check if ptr is at the beginning of the block
if (offset + size == block->offset) {
block->offset = offset;
if (addr.offset + size == block->offset) {
block->offset = addr.offset;
block->size += size;
// check if we can merge with the previous block
if (i > 0 && alloc->free_blocks[i-1].offset + alloc->free_blocks[i-1].size == block->offset) {
alloc->free_blocks[i-1].size += block->size;
alloc->n_free_blocks--;
for (int j = i; j < alloc->n_free_blocks; j++) {
alloc->free_blocks[j] = alloc->free_blocks[j+1];
if (i > 0) {
struct free_block * prev = &chunk->free_blocks[i-1];
if (prev->offset + prev->size == block->offset) {
prev->size += block->size;
ggml_dyn_tallocr_remove_block(chunk, i);
}
}
return;
}
}
// otherwise, add a new block
GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
// insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
int insert_pos = 0;
while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].offset < offset) {
insert_pos++;
}
// shift all blocks from insert_pos onward to make room for the new block
for (int i = alloc->n_free_blocks; i > insert_pos; i--) {
alloc->free_blocks[i] = alloc->free_blocks[i-1];
}
// insert the new block
alloc->free_blocks[insert_pos].offset = offset;
alloc->free_blocks[insert_pos].size = size;
alloc->n_free_blocks++;
ggml_dyn_tallocr_insert_block(chunk, addr.offset, size);
GGML_UNUSED(tensor);
}
static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
alloc->n_free_blocks = 1;
alloc->free_blocks[0].offset = 0;
alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
alloc->max_size = 0;
for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS; i++) {
free(alloc->chunks[i]);
alloc->chunks[i] = NULL;
}
alloc->n_chunks = 0;
#ifdef GGML_ALLOCATOR_DEBUG
for (int i = 0; i < 1024; i++) {
@ -293,14 +367,14 @@ static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
#endif
}
static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment, size_t max_buffer_size) {
struct ggml_dyn_tallocr * alloc = (struct ggml_dyn_tallocr *)malloc(sizeof(struct ggml_dyn_tallocr));
*alloc = (struct ggml_dyn_tallocr) {
/*.alignment = */ alignment,
/*.n_free_blocks = */ 0,
/*.free_blocks = */ {{0}},
/*.max_size = */ 0,
/*.max_chunk_size = */ MIN(max_buffer_size, SIZE_MAX/2), // clamp to avoid overflows
/*.chunks = */ {NULL},
/*.n_chunks = */ 0,
#ifdef GGML_ALLOCATOR_DEBUG
/*.allocated_tensors = */ {{0}},
#endif
@ -312,11 +386,79 @@ static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
}
static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) {
for (int i = 0; i < alloc->n_chunks; ++i) {
free(alloc->chunks[i]);
}
free(alloc);
}
static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
return alloc->max_size;
size_t max_size = 0;
for (int i = 0; i < alloc->n_chunks; i++) {
max_size += alloc->chunks[i]->max_size;
}
return max_size;
}
// virtual buffer with contiguous memory range, split into multiple backend buffers (chunks)
struct vbuffer {
ggml_backend_buffer_t chunks[GGML_VBUFFER_MAX_CHUNKS];
};
static void ggml_vbuffer_free(struct vbuffer * buf) {
if (buf == NULL) {
return;
}
for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS; ++i) {
ggml_backend_buffer_free(buf->chunks[i]);
}
free(buf);
}
static int ggml_vbuffer_n_chunks(struct vbuffer * buf) {
int n = 0;
while (n < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[n]) n++;
return n;
}
static size_t ggml_vbuffer_size(struct vbuffer * buf) {
size_t size = 0;
for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[i]; ++i) {
size += ggml_backend_buffer_get_size(buf->chunks[i]);
}
return size;
}
static struct vbuffer * ggml_vbuffer_alloc(ggml_backend_buffer_type_t buft, const struct ggml_dyn_tallocr * talloc, enum ggml_backend_buffer_usage usage) {
struct vbuffer * buf = (struct vbuffer *)calloc(1, sizeof(struct vbuffer));
if (buf == NULL) {
return NULL;
}
for (int n = 0; n < talloc->n_chunks; n++) {
size_t chunk_size = talloc->chunks[n]->max_size;
buf->chunks[n] = ggml_backend_buft_alloc_buffer(buft, chunk_size);
if (buf->chunks[n] == NULL) {
ggml_vbuffer_free(buf);
return NULL;
}
ggml_backend_buffer_set_usage(buf->chunks[n], usage);
}
return buf;
}
static void ggml_vbuffer_tensor_alloc(struct vbuffer * buf, struct ggml_tensor * tensor, struct buffer_address buf_addr) {
void * base = ggml_backend_buffer_get_base(buf->chunks[buf_addr.chunk]);
void * addr = (char *)base + buf_addr.offset;
ggml_backend_tensor_alloc(buf->chunks[buf_addr.chunk], tensor, addr);
}
static void ggml_vbuffer_reset(struct vbuffer * buf) {
for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[i]; ++i) {
ggml_backend_buffer_reset(buf->chunks[i]);
}
}
@ -328,13 +470,13 @@ struct hash_node {
int n_children;
int n_views;
int buffer_id;
size_t offset; // offset within the buffer
struct buffer_address addr;
bool allocated;
};
struct tensor_alloc {
int buffer_id;
size_t offset;
struct buffer_address addr;
size_t size_max; // 0 = pre-allocated, unused, or view
};
@ -349,7 +491,7 @@ struct node_alloc {
struct ggml_gallocr {
ggml_backend_buffer_type_t * bufts; // [n_buffers]
ggml_backend_buffer_t * buffers; // [n_buffers]
struct vbuffer ** buffers; // [n_buffers]
size_t *buffer_sizes; // [n_buffers]
struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
int n_buffers;
@ -371,7 +513,7 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
GGML_ASSERT(galloc->bufts != NULL);
galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
galloc->buffers = calloc(n_bufs, sizeof(struct vbuffer *));
GGML_ASSERT(galloc->buffers != NULL);
galloc->buffer_sizes = calloc(n_bufs, sizeof(size_t));
@ -394,7 +536,8 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
if (galloc->buf_tallocs[i] == NULL) {
size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
size_t max_size = ggml_backend_buft_get_max_size(bufts[i]);
galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment, max_size);
}
}
galloc->n_buffers = n_bufs;
@ -422,7 +565,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
}
}
if (!freed) {
ggml_backend_buffer_free(galloc->buffers[i]);
ggml_vbuffer_free(galloc->buffers[i]);
}
}
if (galloc->buf_tallocs != NULL) {
@ -472,7 +615,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
hn->allocated = true;
assert(hn->offset == 0);
assert(hn->addr.offset == 0);
// try to reuse a parent's buffer (inplace)
if (ggml_op_can_inplace(node->op)) {
@ -506,9 +649,9 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
assert(view_src_hn->offset == p_hn->offset);
assert(view_src_hn->addr.chunk == p_hn->addr.chunk && view_src_hn->addr.offset == p_hn->addr.offset);
hn->buffer_id = p_hn->buffer_id;
hn->offset = p_hn->offset;
hn->addr = p_hn->addr;
p_hn->allocated = false; // avoid freeing the parent
view_src_hn->allocated = false;
return;
@ -516,7 +659,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
} else {
AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
hn->buffer_id = p_hn->buffer_id;
hn->offset = p_hn->offset;
hn->addr = p_hn->addr;
p_hn->allocated = false; // avoid freeing the parent
return;
}
@ -527,9 +670,8 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
size_t size = ggml_backend_buft_get_alloc_size(buft, node);
size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node);
hn->buffer_id = buffer_id;
hn->offset = offset;
hn->addr = ggml_dyn_tallocr_alloc(alloc, size, node);
}
}
@ -541,12 +683,11 @@ static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * n
}
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
size_t offset = hn->offset;
int buffer_id = hn->buffer_id;
struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
size_t size = ggml_backend_buft_get_alloc_size(buft, node);
ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
ggml_dyn_tallocr_free_tensor(alloc, hn->addr, size, node);
hn->allocated = false;
}
@ -697,24 +838,24 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
struct node_alloc * node_alloc = &galloc->node_allocs[i];
if (node->view_src || node->data) {
node_alloc->dst.buffer_id = -1;
node_alloc->dst.offset = SIZE_MAX;
node_alloc->dst.addr = GGML_BUFFER_ADDRESS_INVALID;
node_alloc->dst.size_max = 0;
} else {
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
node_alloc->dst.buffer_id = hn->buffer_id;
node_alloc->dst.offset = hn->offset;
node_alloc->dst.addr = hn->addr;
node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
}
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * src = node->src[j];
if (!src || src->view_src || src->data) {
node_alloc->src[j].buffer_id = -1;
node_alloc->src[j].offset = SIZE_MAX;
node_alloc->src[j].addr = GGML_BUFFER_ADDRESS_INVALID;
node_alloc->src[j].size_max = 0;
} else {
struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
node_alloc->src[j].buffer_id = hn->buffer_id;
node_alloc->src[j].offset = hn->offset;
node_alloc->src[j].addr = hn->addr;
node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
}
}
@ -730,11 +871,11 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
if (leaf->view_src || leaf->data) {
galloc->leaf_allocs[i].leaf.buffer_id = -1;
galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
galloc->leaf_allocs[i].leaf.addr = GGML_BUFFER_ADDRESS_INVALID;
galloc->leaf_allocs[i].leaf.size_max = 0;
} else {
galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id;
galloc->leaf_allocs[i].leaf.offset = hn->offset;
galloc->leaf_allocs[i].leaf.addr = hn->addr;
galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
}
}
@ -751,7 +892,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
}
}
size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0;
size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
// even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
@ -760,18 +901,17 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
#endif
ggml_backend_buffer_free(galloc->buffers[i]);
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
ggml_vbuffer_free(galloc->buffers[i]);
galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
if (galloc->buffers[i]) {
galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
galloc->buffer_sizes[i] = ggml_vbuffer_size(galloc->buffers[i]);
} else {
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
galloc->buffer_sizes[i] = new_size;
success = false;
}
} else {
galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
galloc->buffer_sizes[i] = ggml_vbuffer_size(galloc->buffers[i]);
}
}
@ -784,11 +924,11 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
int buffer_id = tensor_alloc->buffer_id;
assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
assert(tensor->data || tensor->view_src || ggml_backend_buft_get_alloc_size(galloc->bufts[buffer_id], tensor) <= tensor_alloc->size_max);
if (tensor->view_src != NULL) {
if (tensor->buffer == NULL) {
assert(tensor_alloc->offset == SIZE_MAX);
assert(tensor_alloc->addr.offset == SIZE_MAX);
if (tensor->view_src->buffer == NULL) {
// this tensor was allocated without ggml-backend
return;
@ -797,11 +937,9 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
}
} else {
if (tensor->data == NULL) {
assert(tensor_alloc->offset != SIZE_MAX);
assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
void * addr = (char *)base + tensor_alloc->offset;
ggml_backend_tensor_alloc(galloc->buffers[buffer_id], tensor, addr);
assert(tensor_alloc->addr.offset != SIZE_MAX);
assert(ggml_backend_buft_get_alloc_size(galloc->bufts[buffer_id], tensor) <= tensor_alloc->size_max);
ggml_vbuffer_tensor_alloc(galloc->buffers[buffer_id], tensor, tensor_alloc->addr);
} else {
if (tensor->buffer == NULL) {
// this tensor was allocated without ggml-backend
@ -886,7 +1024,7 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
// reset buffers
for (int i = 0; i < galloc->n_buffers; i++) {
if (galloc->buffers[i] != NULL) {
ggml_backend_buffer_reset(galloc->buffers[i]);
ggml_vbuffer_reset(galloc->buffers[i]);
}
}
@ -929,7 +1067,7 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
}
}
return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
return ggml_vbuffer_size(galloc->buffers[buffer_id]);
}
size_t ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) {

View File

@ -8,7 +8,7 @@
extern "C" {
#endif
#define GGML_BACKEND_API_VERSION 1
#define GGML_BACKEND_API_VERSION 2
//
// Backend buffer type
@ -121,6 +121,9 @@ extern "C" {
// wait for an event on on a different stream
void (*event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
// (optional) sort/optimize the nodes in the graph
void (*graph_optimize) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
// (optional) reserves intermediate buffers needed for the compution
// if alloc is true, memory is actually allocated, otherwise the required amount is just returned by buffer_size
enum ggml_status (*graph_reserve) (ggml_backend_t backend, struct ggml_cgraph * cgraph, bool alloc);

View File

@ -49,6 +49,10 @@
#include "ggml-webgpu.h"
#endif
#ifdef GGML_USE_ZDNN
#include "ggml-zdnn.h"
#endif
#ifdef GGML_USE_OPENCL
#include "ggml-opencl.h"
#endif
@ -131,6 +135,10 @@ static void * dl_get_sym(dl_handle * handle, const char * name) {
return p;
}
static const char * dl_error() {
return "";
}
#else
using dl_handle = void;
@ -151,6 +159,11 @@ static void * dl_get_sym(dl_handle * handle, const char * name) {
return dlsym(handle, name);
}
static const char * dl_error() {
const char *rslt = dlerror();
return rslt != nullptr ? rslt : "";
}
#endif
using dl_handle_ptr = std::unique_ptr<dl_handle, dl_handle_deleter>;
@ -180,6 +193,9 @@ struct ggml_backend_registry {
#ifdef GGML_USE_WEBGPU
register_backend(ggml_backend_webgpu_reg());
#endif
#ifdef GGML_USE_ZDNN
register_backend(ggml_backend_zdnn_reg());
#endif
#ifdef GGML_USE_OPENCL
register_backend(ggml_backend_opencl_reg());
#endif
@ -238,7 +254,7 @@ struct ggml_backend_registry {
dl_handle_ptr handle { dl_load_library(path) };
if (!handle) {
if (!silent) {
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_str(path).c_str());
GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path_str(path).c_str(), dl_error());
}
return nullptr;
}
@ -398,9 +414,8 @@ ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const
ggml_backend_t ggml_backend_init_best(void) {
ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
if (!dev) {
dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
}
dev = dev ? dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU);
dev = dev ? dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
if (!dev) {
return nullptr;
}
@ -529,7 +544,7 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
if (filename.native().find(file_prefix) == 0 && ext == file_extension) {
dl_handle_ptr handle { dl_load_library(entry) };
if (!handle && !silent) {
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_str(entry.path()).c_str());
GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path_str(entry.path()).c_str(), dl_error());
}
if (handle) {
auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");

View File

@ -19,9 +19,8 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <string>
#include <vector>
#include <algorithm>
#include <vector>
#ifdef __APPLE__
#include <sys/types.h>
@ -32,6 +31,7 @@
// backend buffer type
const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
GGML_ASSERT(buft);
return buft->iface.get_name(buft);
}
@ -54,14 +54,17 @@ ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t
return buf;
}
GGML_ASSERT(buft);
return buft->iface.alloc_buffer(buft, size);
}
size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
GGML_ASSERT(buft);
return buft->iface.get_alignment(buft);
}
size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
GGML_ASSERT(buft);
// get_max_size is optional, defaults to SIZE_MAX
if (buft->iface.get_max_size) {
return buft->iface.get_max_size(buft);
@ -70,6 +73,7 @@ size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
}
size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) {
GGML_ASSERT(buft);
// get_alloc_size is optional, defaults to ggml_nbytes
if (buft->iface.get_alloc_size) {
size_t size = buft->iface.get_alloc_size(buft, tensor);
@ -80,6 +84,7 @@ size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const s
}
bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
GGML_ASSERT(buft);
if (buft->iface.is_host) {
return buft->iface.is_host(buft);
}
@ -87,6 +92,7 @@ bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
}
ggml_backend_dev_t ggml_backend_buft_get_device(ggml_backend_buffer_type_t buft) {
GGML_ASSERT(buft);
return buft->device;
}
@ -124,10 +130,12 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
}
size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
GGML_ASSERT(buffer);
return buffer->size;
}
void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
GGML_ASSERT(buffer);
// get_base is optional if the buffer is zero-sized
if (buffer->size == 0) {
return NULL;
@ -147,6 +155,7 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
}
enum ggml_status ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
GGML_ASSERT(buffer);
// init_tensor is optional
if (buffer->iface.init_tensor) {
return buffer->iface.init_tensor(buffer, tensor);
@ -155,6 +164,7 @@ enum ggml_status ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, s
}
void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
GGML_ASSERT(buffer);
// clear is optional if the buffer is zero-sized
if (buffer->size == 0) {
return;
@ -180,6 +190,7 @@ bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
}
void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
GGML_ASSERT(buffer);
buffer->usage = usage;
// FIXME: add a generic callback to the buffer interface
@ -189,14 +200,17 @@ void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backe
}
enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage(ggml_backend_buffer_t buffer) {
GGML_ASSERT(buffer);
return buffer->usage;
}
ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
GGML_ASSERT(buffer);
return buffer->buft;
}
void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
GGML_ASSERT(buffer);
if (buffer->iface.reset) {
buffer->iface.reset(buffer);
}
@ -235,6 +249,7 @@ void ggml_backend_free(ggml_backend_t backend) {
}
ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
GGML_ASSERT(backend);
return ggml_backend_dev_buffer_type(backend->device);
}
@ -251,6 +266,8 @@ size_t ggml_backend_get_max_size(ggml_backend_t backend) {
}
void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
GGML_ASSERT(backend);
GGML_ASSERT(tensor);
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
@ -262,6 +279,8 @@ void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor *
}
void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
GGML_ASSERT(backend);
GGML_ASSERT(tensor);
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
@ -303,6 +322,7 @@ void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, siz
}
void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
GGML_ASSERT(tensor);
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
if (size == 0) {
@ -318,6 +338,7 @@ void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size
}
void ggml_backend_synchronize(ggml_backend_t backend) {
GGML_ASSERT(backend);
if (backend->iface.synchronize == NULL) {
return;
}
@ -326,18 +347,21 @@ void ggml_backend_synchronize(ggml_backend_t backend) {
}
ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
GGML_ASSERT(backend);
GGML_ASSERT(backend->iface.graph_plan_create != NULL);
return backend->iface.graph_plan_create(backend, cgraph);
}
void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
GGML_ASSERT(backend);
GGML_ASSERT(backend->iface.graph_plan_free != NULL);
backend->iface.graph_plan_free(backend, plan);
}
enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
GGML_ASSERT(backend);
GGML_ASSERT(backend->iface.graph_plan_compute != NULL);
return backend->iface.graph_plan_compute(backend, plan);
@ -350,22 +374,27 @@ enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_
}
enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
GGML_ASSERT(backend);
return backend->iface.graph_compute(backend, cgraph);
}
bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
GGML_ASSERT(backend);
return ggml_backend_dev_supports_op(backend->device, op);
}
bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
GGML_ASSERT(backend);
return ggml_backend_dev_supports_buft(backend->device, buft);
}
bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
GGML_ASSERT(backend);
return ggml_backend_dev_offload_op(backend->device, op);
}
ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
GGML_ASSERT(backend);
return backend->device;
}
@ -401,6 +430,7 @@ void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t b
return;
}
GGML_ASSERT(backend_dst);
if (backend_dst->iface.cpy_tensor_async != NULL) {
if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
return;
@ -432,38 +462,52 @@ void ggml_backend_event_free(ggml_backend_event_t event) {
}
void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend) {
GGML_ASSERT(backend);
GGML_ASSERT(backend->iface.event_record != NULL);
backend->iface.event_record(backend, event);
}
void ggml_backend_event_synchronize(ggml_backend_event_t event) {
GGML_ASSERT(event);
GGML_ASSERT(event->device->iface.event_synchronize);
event->device->iface.event_synchronize(event->device, event);
}
void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
GGML_ASSERT(backend);
GGML_ASSERT(backend->iface.event_wait != NULL);
backend->iface.event_wait(backend, event);
}
static void ggml_backend_graph_optimize(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
GGML_ASSERT(backend);
if (backend->iface.graph_optimize != NULL) {
backend->iface.graph_optimize(backend, cgraph);
}
}
// Backend device
const char * ggml_backend_dev_name(ggml_backend_dev_t device) {
GGML_ASSERT(device);
return device->iface.get_name(device);
}
const char * ggml_backend_dev_description(ggml_backend_dev_t device) {
GGML_ASSERT(device);
return device->iface.get_description(device);
}
void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
GGML_ASSERT(device);
device->iface.get_memory(device, free, total);
}
enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) {
GGML_ASSERT(device);
return device->iface.get_type(device);
}
@ -473,10 +517,12 @@ void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_d
}
ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device) {
GGML_ASSERT(device);
return device->reg;
}
ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params) {
GGML_ASSERT(device);
return device->iface.init_backend(device, params);
}
@ -489,10 +535,12 @@ void ggml_backend_dev_reset(ggml_backend_dev_t device) {
}
ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) {
GGML_ASSERT(device);
return device->iface.get_buffer_type(device);
}
ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device) {
GGML_ASSERT(device);
if (device->iface.get_host_buffer_type == NULL) {
return NULL;
}
@ -501,18 +549,22 @@ ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t
}
ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) {
GGML_ASSERT(device);
return device->iface.buffer_from_host_ptr(device, ptr, size, max_tensor_size);
}
bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
GGML_ASSERT(device);
return device->iface.supports_op(device, op);
}
bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft) {
GGML_ASSERT(device);
return device->iface.supports_buft(device, buft);
}
bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
GGML_ASSERT(device);
if (device->iface.offload_op != NULL) {
return device->iface.offload_op(device, op);
}
@ -523,18 +575,22 @@ bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_te
// Backend (reg)
const char * ggml_backend_reg_name(ggml_backend_reg_t reg) {
GGML_ASSERT(reg);
return reg->iface.get_name(reg);
}
size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg) {
GGML_ASSERT(reg);
return reg->iface.get_device_count(reg);
}
ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index) {
GGML_ASSERT(reg);
return reg->iface.get_device(reg, index);
}
void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
GGML_ASSERT(reg);
if (!reg->iface.get_proc_address) {
return NULL;
}
@ -549,6 +605,7 @@ struct ggml_backend_multi_buffer_context {
};
static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
GGML_ASSERT(buffer);
ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
for (size_t i = 0; i < ctx->n_buffers; i++) {
ggml_backend_buffer_free(ctx->buffers[i]);
@ -560,6 +617,7 @@ static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer)
}
static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
GGML_ASSERT(buffer);
ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
for (size_t i = 0; i < ctx->n_buffers; i++) {
ggml_backend_buffer_clear(ctx->buffers[i], value);
@ -595,10 +653,12 @@ ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer
}
bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
GGML_ASSERT(buffer);
return buffer->iface.free_buffer == ggml_backend_multi_buffer_free_buffer;
}
void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
GGML_ASSERT(buffer);
GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer));
ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
for (size_t i = 0; i < ctx->n_buffers; i++) {
@ -626,7 +686,7 @@ static bool ggml_is_view_op(enum ggml_op op) {
#endif
#ifndef GGML_SCHED_MAX_SPLIT_INPUTS
#define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC
#define GGML_SCHED_MAX_SPLIT_INPUTS 30
#endif
#ifndef GGML_SCHED_MAX_COPIES
@ -883,7 +943,7 @@ static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, stru
}
// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
// reset splits
sched->n_splits = 0;
sched->n_graph_inputs = 0;
@ -1279,6 +1339,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
struct ggml_backend_sched_split * split = &sched->splits[i];
split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
// Optimize this split of the graph. This needs to happen before we make graph_copy,
// so they are in sync.
ggml_backend_graph_optimize(sched->backends[split->backend_id], &split->graph);
// add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
for (int j = 0; j < split->n_inputs; j++) {
assert(graph_copy->size > (graph_copy->n_nodes + 1));
@ -1384,17 +1448,22 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
}
static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
GGML_ASSERT(sched);
struct ggml_backend_sched_split * splits = sched->splits;
for (int i = 0; i < sched->n_splits; i++) {
struct ggml_backend_sched_split * split = &splits[i];
ggml_tensor * prev_ids_tensor = nullptr;
std::vector<int32_t> ids;
std::vector<ggml_bitset_t> used_ids;
for (int split_id = 0; split_id < sched->n_splits; split_id++) {
struct ggml_backend_sched_split * split = &splits[split_id];
int split_backend_id = split->backend_id;
ggml_backend_t split_backend = sched->backends[split_backend_id];
// copy the input tensors to the split backend
for (int j = 0; j < split->n_inputs; j++) {
ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
struct ggml_tensor * input = split->inputs[j];
for (int input_id = 0; input_id < split->n_inputs; input_id++) {
ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[input_id]);
struct ggml_tensor * input = split->inputs[input_id];
struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);
if (input->flags & GGML_TENSOR_FLAG_INPUT) {
@ -1412,6 +1481,93 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
} else {
ggml_backend_synchronize(split_backend);
}
// when offloading MoE weights, we can reduce the amount of data copied by copying only the experts that are used
ggml_tensor * node = split->graph.nodes[0];
if (split->graph.n_nodes > 0 &&
ggml_backend_buffer_get_usage(input->buffer) == GGML_BACKEND_BUFFER_USAGE_WEIGHTS &&
ggml_backend_buffer_is_host(input->buffer) && (
(node->src[0] == input_cpy && node->op == GGML_OP_MUL_MAT_ID)
//|| (node->src[1] == input_cpy && node->op == GGML_OP_ADD_ID) /* GGML_OP_ADD_ID weights are small and not worth splitting */
)) {
const int64_t n_expert = node->op == GGML_OP_MUL_MAT_ID ? input->ne[2] : input->ne[1];
const size_t expert_size = node->op == GGML_OP_MUL_MAT_ID ? input->nb[2] : input->nb[1];
ggml_backend_synchronize(input_backend);
// get the ids
ggml_tensor * ids_tensor = node->src[2];
ggml_backend_t ids_backend = split_backend;
// if the ids tensor is also an input of the split, it may not have been copied yet to the split backend
// in that case, we use the original ids tensor
for (int i = input_id + 1; i < split->n_inputs; i++) {
if (ids_tensor == tensor_copy(split->inputs[i], split_backend_id, sched->cur_copy)) {
ids_tensor = split->inputs[i];
ids_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[i]);
break;
}
}
if (ids_tensor != prev_ids_tensor) {
ids.resize(ggml_nbytes(ids_tensor) / sizeof(int32_t));
ggml_backend_tensor_get_async(ids_backend, ids_tensor, ids.data(), 0, ggml_nbytes(ids_tensor));
ggml_backend_synchronize(ids_backend);
// find the used experts
used_ids.clear();
used_ids.resize(ggml_bitset_size(n_expert));
for (int64_t i1 = 0; i1 < ids_tensor->ne[1]; i1++) {
for (int64_t i0 = 0; i0 < ids_tensor->ne[0]; i0++) {
int32_t id = ids[i1 * ids_tensor->nb[1]/sizeof(int32_t) + i0 * ids_tensor->nb[0]/sizeof(int32_t)];
GGML_ASSERT(id >= 0 && id < n_expert);
ggml_bitset_set(used_ids.data(), id);
}
}
prev_ids_tensor = ids_tensor;
}
// group consecutive experts and copy them together
auto copy_experts = [&](int32_t first_id, int32_t last_id) {
const size_t expert_offset = first_id * expert_size;
const size_t expert_size_copy = (last_id - first_id + 1) * expert_size;
const size_t padding = std::min<size_t>(expert_size, 512);
const size_t padding_end = last_id < n_expert - 1 ? padding : 0;
ggml_backend_tensor_set_async(split_backend,
input_cpy,
(const uint8_t *)input->data + expert_offset, expert_offset,
// copy a bit extra at the to ensure there are no NaNs in the padding of the last expert
// this is necessary for MMQ in the CUDA backend
expert_size_copy + padding_end);
};
int id = 0;
while (!ggml_bitset_get(used_ids.data(), id)) {
id++;
}
int32_t first_id = id;
int32_t last_id = first_id;
for (++id; id < n_expert; ++id) {
if (!ggml_bitset_get(used_ids.data(), id)) {
continue;
}
if (id == last_id + 1) {
last_id = id;
continue;
}
copy_experts(first_id, last_id);
first_id = id;
last_id = id;
}
copy_experts(first_id, last_id);
} else {
// try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
// TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
@ -1425,6 +1581,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
}
}
}
}
if (!sched->callback_eval) {
enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
@ -1578,6 +1735,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
}
void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
GGML_ASSERT(sched);
// reset state for the next run
if (!sched->is_reset) {
ggml_hash_set_reset(&sched->hash_set);
@ -1589,8 +1747,11 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
}
bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
GGML_ASSERT(sched);
GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
ggml_backend_sched_reset(sched);
ggml_backend_sched_synchronize(sched);
ggml_backend_sched_split_graph(sched, measure_graph);
@ -1623,6 +1784,7 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
}
bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
GGML_ASSERT(sched);
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
GGML_ASSERT(!sched->is_alloc);
@ -1647,6 +1809,7 @@ enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, st
}
enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
GGML_ASSERT(sched);
if (!sched->is_reset && !sched->is_alloc) {
ggml_backend_sched_reset(sched);
}
@ -1661,6 +1824,7 @@ enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sch
}
void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
GGML_ASSERT(sched);
for (int i = 0; i < sched->n_backends; i++) {
ggml_backend_synchronize(sched->backends[i]);
}
@ -1673,28 +1837,42 @@ void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
}
void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
GGML_ASSERT(sched);
sched->callback_eval = callback;
sched->callback_eval_user_data = user_data;
}
int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
GGML_ASSERT(sched);
return sched->n_splits;
}
int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
GGML_ASSERT(sched);
return sched->n_copies;
}
int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched) {
GGML_ASSERT(sched);
return sched->n_backends;
}
ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i) {
GGML_ASSERT(sched);
GGML_ASSERT(i >= 0 && i < sched->n_backends);
return sched->backends[i];
}
ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend) {
GGML_ASSERT(sched);
int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
return sched->bufts[backend_index];
}
size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
GGML_ASSERT(sched);
int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
@ -1715,6 +1893,7 @@ size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched,
}
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
GGML_ASSERT(sched);
int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
tensor_backend_id(node) = backend_index;
@ -1723,6 +1902,7 @@ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct gg
}
ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
GGML_ASSERT(sched);
int backend_index = tensor_backend_id(node);
if (backend_index == -1) {
return NULL;
@ -1733,6 +1913,7 @@ ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched,
// utils
enum ggml_status ggml_backend_view_init(struct ggml_tensor * tensor) {
GGML_ASSERT(tensor);
GGML_ASSERT(tensor->buffer == NULL);
GGML_ASSERT(tensor->view_src != NULL);
GGML_ASSERT(tensor->view_src->buffer != NULL);
@ -1744,6 +1925,7 @@ enum ggml_status ggml_backend_view_init(struct ggml_tensor * tensor) {
}
enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
GGML_ASSERT(tensor);
GGML_ASSERT(tensor->buffer == NULL);
GGML_ASSERT(tensor->data == NULL);
GGML_ASSERT(tensor->view_src == NULL);
@ -1817,6 +1999,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_
}
struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
GGML_ASSERT(graph);
struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size);
struct ggml_tensor ** node_copies = (ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0]));
@ -1961,6 +2144,7 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
// CPU backend - buffer
static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
GGML_ASSERT(buffer);
uintptr_t data = (uintptr_t)buffer->context;
// align the buffer
@ -1972,6 +2156,7 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
}
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
GGML_ASSERT(buffer);
ggml_aligned_free(buffer->context, buffer->size);
delete buffer;
}
@ -1981,24 +2166,28 @@ static void ggml_backend_cpu_ptr_buffer_free_buffer(ggml_backend_buffer_t buffer
}
static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
GGML_ASSERT(tensor);
memset((char *)tensor->data + offset, value, size);
GGML_UNUSED(buffer);
}
static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
GGML_ASSERT(tensor);
memcpy((char *)tensor->data + offset, data, size);
GGML_UNUSED(buffer);
}
static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
GGML_ASSERT(tensor);
memcpy(data, (const char *)tensor->data + offset, size);
GGML_UNUSED(buffer);
}
static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
GGML_ASSERT(src);
if (ggml_backend_buffer_is_host(src->buffer)) {
memcpy(dst->data, src->data, ggml_nbytes(src));
return true;
@ -2009,6 +2198,7 @@ static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con
}
static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
GGML_ASSERT(buffer);
memset(buffer->context, value, buffer->size);
}

View File

@ -74,7 +74,7 @@ if (BLAS_FOUND)
target_compile_options(ggml-blas PRIVATE ${BLAS_LINKER_FLAGS})
if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${GGML_BLAS_VENDOR} MATCHES "Generic" OR ${GGML_BLAS_VENDOR} MATCHES "Intel"))
if ("${BLAS_INCLUDE_DIRS}" MATCHES "mkl" AND (${GGML_BLAS_VENDOR} MATCHES "Generic" OR ${GGML_BLAS_VENDOR} MATCHES "Intel"))
add_compile_definitions(GGML_BLAS_USE_MKL)
endif()

View File

@ -270,6 +270,7 @@ static struct ggml_backend_i blas_backend_i = {
/* .graph_compute = */ ggml_backend_blas_graph_compute,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
/* .graph_optimize = */ NULL,
};
static ggml_guid_t ggml_backend_blas_guid(void) {

View File

@ -224,8 +224,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC SME)
string(FIND "${ARM_FEATURE}" "__ARM_FEATURE_${feature} 1" feature_pos)
if (NOT ${feature_pos} EQUAL -1)
# Special handling for MATMUL_INT8 when machine doesn't support i8mm
if ("${feature}" STREQUAL "MATMUL_INT8" AND GGML_MACHINE_SUPPORTS_noi8mm)
message(STATUS "ARM feature ${feature} detected but unsetting due to machine not supporting i8mm")
list(APPEND ARCH_FLAGS -U__ARM_FEATURE_MATMUL_INT8)
else()
message(STATUS "ARM feature ${feature} enabled")
endif()
endif()
endforeach()
endif()
endif()
@ -433,15 +439,31 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
ggml-cpu/arch/riscv/quants.c
ggml-cpu/arch/riscv/repack.cpp
)
if (GGML_RVV)
if (GGML_CPU_RISCV64_SPACEMIT)
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_RISCV64_SPACEMIT ${RISCV64_SPACEMIT_IME_SPEC})
list(APPEND GGML_CPU_SOURCES
ggml-cpu/spacemit/ime.cpp
ggml-cpu/spacemit/ime.h
ggml-cpu/spacemit/ime1_kernels.cpp
ggml-cpu/spacemit/ime_kernels.h
)
endif()
set(MARCH_STR "rv64gc")
if (GGML_RV_ZFH)
string(APPEND MARCH_STR "_zfh")
endif()
if (GGML_XTHEADVECTOR)
list(APPEND ARCH_FLAGS -march=rv64gc_xtheadvector -mabi=lp64d)
elseif (GGML_RV_ZFH)
list(APPEND ARCH_FLAGS -march=rv64gcv_zfhmin -mabi=lp64d)
else()
list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
string(APPEND MARCH_STR "_xtheadvector")
elseif (GGML_RVV)
string(APPEND MARCH_STR "_v")
if (GGML_RV_ZVFH)
string(APPEND MARCH_STR "_zvfh")
endif()
endif()
if (GGML_RV_ZICBOP)
string(APPEND MARCH_STR "_zicbop")
endif()
list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d)
elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
message(STATUS "s390x detected")
list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/s390/quants.c)
@ -450,7 +472,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
# TODO: Separation to determine activation of VX/VXE/VXE2
if (${S390X_M} MATCHES "8561|8562")
set(GGML_NNPA OFF)
message(STATUS "z15 target")
list(APPEND ARCH_FLAGS -march=z15)
elseif (${S390X_M} MATCHES "3931")
@ -460,7 +481,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
# NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version.
# binutils must also be updated to the latest for the -march=z17 flag to work. Otherwise, use -march=arch15.
message(STATUS "z17 target")
list(APPEND ARCH_FLAGS -march=z17)
list(APPEND ARCH_FLAGS -march=arch15)
else()
message(STATUS "Unknown target")
message(WARNING "Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF.")
@ -472,11 +493,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
list(APPEND ARCH_FLAGS -mvx -mzvector)
list(APPEND ARCH_DEFINITIONS GGML_VXE)
endif()
if (GGML_NNPA)
message(STATUS "NNPA enabled")
list(APPEND ARCH_DEFINITIONS GGML_NNPA)
endif()
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm")
message(STATUS "Wasm detected")
list (APPEND GGML_CPU_SOURCES ggml-cpu/arch/wasm/quants.c)
@ -497,9 +513,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
# Fetch KleidiAI sources:
include(FetchContent)
set(KLEIDIAI_COMMIT_TAG "v1.11.0")
set(KLEIDIAI_COMMIT_TAG "v1.14.0")
set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
set(KLEIDIAI_ARCHIVE_MD5 "3fe9e5ab964c375c53839296eb71eaa2")
set(KLEIDIAI_ARCHIVE_MD5 "45e110675d93f99f82c23a1afcca76bc")
if (POLICY CMP0135)
cmake_policy(SET CMP0135 NEW)
@ -555,6 +571,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
list(APPEND GGML_KLEIDIAI_SOURCES
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p4x8sb_f32_neon.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c)
@ -575,8 +592,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa_asm.S
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c)
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c
${KLEIDIAI_SRC}/kai/kai_common_sme_asm.S)
set(PRIVATE_ARCH_FLAGS "-fno-tree-vectorize;${PRIVATE_ARCH_FLAGS}+sve+sve2")
endif()

View File

@ -7,7 +7,7 @@
#include "ggml-cpu.h"
#include "traits.h"
#if defined(__gnu_linux__)
#if defined(__linux__)
#include <sys/syscall.h>
#include <unistd.h>
#endif
@ -186,7 +186,7 @@ static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_ty
#define XFEATURE_XTILEDATA 18
static bool ggml_amx_init() {
#if defined(__gnu_linux__)
#if defined(__linux__)
if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
fprintf(stderr, "AMX is not ready to be used!\n");
return false;
@ -194,6 +194,8 @@ static bool ggml_amx_init() {
return true;
#elif defined(_WIN32)
return true;
#else
return false;
#endif
}

View File

@ -40,18 +40,22 @@
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
// repack.cpp
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
// repack.cpp
@ -69,7 +73,6 @@
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
@ -80,12 +83,14 @@
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#elif defined(__loongarch64)
// quants.c
#define quantize_row_q8_K_generic quantize_row_q8_K
@ -103,12 +108,14 @@
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#elif defined(__riscv)
// quants.c
#define quantize_row_q8_K_generic quantize_row_q8_K
@ -133,16 +140,16 @@
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#elif defined(__s390x__)
// quants.c
#define quantize_row_q8_K_generic quantize_row_q8_K
#define ggml_vec_dot_q5_0_q8_0_generic ggml_vec_dot_q5_0_q8_0
#define ggml_vec_dot_q5_1_q8_1_generic ggml_vec_dot_q5_1_q8_1
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
#define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
@ -153,7 +160,6 @@
#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
@ -164,12 +170,14 @@
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#elif defined(__wasm__)
// quants.c
#define ggml_vec_dot_q4_1_q8_1_generic ggml_vec_dot_q4_1_q8_1
@ -195,10 +203,12 @@
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#endif

File diff suppressed because it is too large Load Diff

View File

@ -28,6 +28,14 @@ static inline float bf16_to_f32(ggml_bf16_t x) {
return GGML_BF16_TO_FP32(x);
}
static inline float i32_to_f32(int32_t x) {
return x;
}
static inline int32_t f32_to_i32(float x) {
return x;
}
static inline float f32_to_f32(float x) {
return x;
}
@ -54,6 +62,12 @@ struct type_conversion_table<ggml_bf16_t> {
static constexpr ggml_bf16_t (*from_f32)(float) = f32_to_bf16;
};
template <>
struct type_conversion_table<int32_t> {
static constexpr float (*to_f32)(int32_t) = i32_to_f32;
static constexpr int32_t (*from_f32)(float) = f32_to_i32;
};
static std::pair<int64_t, int64_t> get_thread_range(const struct ggml_compute_params * params, const struct ggml_tensor * src0) {
const int64_t ith = params->ith;
const int64_t nth = params->nth;

View File

@ -68,12 +68,6 @@ struct ggml_compute_params {
#endif // __VXE2__
#endif // __s390x__ && __VEC__
#if defined(__s390x__) && defined(GGML_NNPA)
#ifndef __NNPA__
#define __NNPA__
#endif // __NNPA__
#endif // __s390x__ && GGML_NNPA
#if defined(__ARM_FEATURE_SVE)
#include <sys/prctl.h>
#endif
@ -486,6 +480,19 @@ inline static int16x8_t vec_padd_s16(int16x8_t a, int16x8_t b) {
return v_abo + v_abe;
}
/**
* @see https://github.com/ggml-org/llama.cpp/pull/14037
*/
inline static float vec_hsum_f32x4(float32x4_t v) {
float32x4_t v_temp = v + vec_reve(v);
return v_temp[0] + v_temp[1];
}
inline static int32_t vec_hsum_i32x4(int32x4_t v) {
int32x4_t v_temp = v + vec_reve(v);
return v_temp[0] + v_temp[1];
}
inline static int32x4_t ggml_vec_dot(int32x4_t acc, int8x16_t a, int8x16_t b) {
const int16x8_t p = vec_mule(a, b) + vec_mulo(a, b);
return acc + (vec_unpackh(p) + vec_unpackl(p));

View File

@ -375,6 +375,9 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
},
[GGML_TYPE_I32] = {
.from_float = (ggml_from_float_t) ggml_cpu_fp32_to_i32,
},
};
const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) {
@ -472,10 +475,10 @@ struct ggml_threadpool {
struct ggml_compute_state {
#ifndef GGML_USE_OPENMP
ggml_thread_t thrd;
bool cpumask[GGML_MAX_N_THREADS];
int last_graph;
bool pending;
#endif
bool cpumask[GGML_MAX_N_THREADS];
struct ggml_threadpool * threadpool;
int ith;
};
@ -1878,10 +1881,18 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_im2col_back_f32(params, tensor);
} break;
case GGML_OP_IM2COL_3D:
{
ggml_compute_forward_im2col_3d(params, tensor);
} break;
case GGML_OP_CONV_2D:
{
ggml_compute_forward_conv_2d(params, tensor);
} break;
case GGML_OP_CONV_3D:
{
ggml_compute_forward_conv_3d(params, tensor);
} break;
case GGML_OP_CONV_2D_DW:
{
ggml_compute_forward_conv_2d_dw(params, tensor);
@ -2024,6 +2035,11 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
ggml_compute_forward_opt_step_adamw(params, tensor);
}
break;
case GGML_OP_OPT_STEP_SGD:
{
ggml_compute_forward_opt_step_sgd(params, tensor);
}
break;
case GGML_OP_NONE:
{
// nop
@ -2248,7 +2264,9 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
} break;
case GGML_OP_IM2COL:
case GGML_OP_IM2COL_BACK:
case GGML_OP_IM2COL_3D:
case GGML_OP_CONV_2D:
case GGML_OP_CONV_3D:
case GGML_OP_CONV_2D_DW:
case GGML_OP_CONV_TRANSPOSE_1D:
case GGML_OP_CONV_TRANSPOSE_2D:
@ -2327,6 +2345,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_OP_CROSS_ENTROPY_LOSS:
case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
case GGML_OP_OPT_STEP_ADAMW:
case GGML_OP_OPT_STEP_SGD:
{
n_tasks = n_threads;
} break;
@ -2682,7 +2701,10 @@ struct ggml_cplan ggml_graph_plan(
if (ggml_is_quantized(node->type) ||
// F16 -> BF16 and BF16 -> F16 copies go through intermediate F32
(node->src[0]->type == GGML_TYPE_F16 && node->src[1] && node->src[1]->type == GGML_TYPE_BF16) ||
(node->src[0]->type == GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == GGML_TYPE_F16)) {
(node->src[0]->type == GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == GGML_TYPE_F16) ||
// conversion between F32 and I32
(node->src[0]->type == GGML_TYPE_F32 && node->src[1] && node->src[1]->type == GGML_TYPE_I32) ||
(node->src[0]->type == GGML_TYPE_I32 && node->src[1] && node->src[1]->type == GGML_TYPE_F32)) {
cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
}
} break;
@ -2769,6 +2791,7 @@ struct ggml_cplan ggml_graph_plan(
}
} break;
case GGML_OP_CONV_2D:
case GGML_OP_CONV_3D:
{
cur = GGML_IM2COL_WORK_SIZE;
} break;
@ -3064,7 +3087,14 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
threadpool->workers = workers;
#ifndef GGML_USE_OPENMP
#ifdef GGML_USE_OPENMP
int32_t cpumask_iter = 0;
// Compute CPU masks for each thread
for (int j = 0; j < tpp->n_threads; j++) {
ggml_thread_cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter);
}
#else // GGML_USE_OPENMP
ggml_mutex_init(&threadpool->mutex);
ggml_cond_init(&threadpool->cond);
@ -3137,7 +3167,14 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
}
ggml_graph_compute_thread(&threadpool->workers[omp_get_thread_num()]);
// Apply thread CPU mask and priority
int ith = omp_get_thread_num();
ggml_thread_apply_priority(threadpool->prio);
if (ggml_thread_cpumask_is_valid(threadpool->workers[ith].cpumask)) {
ggml_thread_apply_affinity(threadpool->workers[ith].cpumask);
}
ggml_graph_compute_thread(&threadpool->workers[ith]);
}
} else {
atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed);
@ -3200,20 +3237,12 @@ void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
__m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
_mm_storel_epi64((__m128i *)(y + i), y_vec);
}
#elif defined(__NNPA__)
for (; i + 7 < n; i += 8) {
float32x4_t v_xh = vec_xl(0, (const float *)(x + i + 0));
float32x4_t v_xl = vec_xl(0, (const float *)(x + i + 4));
uint16x8_t v_yd = vec_round_from_fp32(v_xh, v_xl, 0);
uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
}
for (; i + 3 < n; i += 4) {
float32x4_t v_x = vec_xl(0, (const float *)(x + i));
float32x4_t v_zero = vec_splats(0.0f);
uint16x8_t v_yd = vec_round_from_fp32(v_x, v_zero, 0);
uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
#elif defined(__riscv_zvfh)
for (int vl; i < n; i += vl) {
vl = __riscv_vsetvl_e32m2(n - i);
vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], vl);
vfloat16m1_t vy = __riscv_vfncvt_f_f_w_f16m1(vx, vl);
__riscv_vse16_v_f16m1((_Float16 *)&y[i], vy, vl);
}
#endif
for (; i < n; ++i) {
@ -3241,21 +3270,6 @@ void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
__m128 y_vec = _mm_cvtph_ps(x_vec);
_mm_storeu_ps(y + i, y_vec);
}
#elif defined(__NNPA__)
for (; i + 7 < n; i += 8) {
uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i));
uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
float32x4_t v_yl = vec_extend_to_fp32_lo(v_yd, 0);
vec_xst(v_yh, 0, (float *)(y + i + 0));
vec_xst(v_yl, 0, (float *)(y + i + 4));
}
for (; i + 3 < n; i += 4) {
uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i));
uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
vec_xst(v_yh, 0, (float *)(y + i));
}
#endif
for (; i < n; ++i) {
@ -3270,6 +3284,13 @@ void ggml_cpu_fp32_to_bf16(const float * x, ggml_bf16_t * y, int64_t n) {
}
}
void ggml_cpu_fp32_to_i32(const float * x, int32_t * y, int64_t n) {
int64_t i = 0;
for (; i < n; ++i) {
y[i] = x[i];
}
}
void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) {
int64_t i = 0;
#if defined(__AVX2__)
@ -3459,14 +3480,6 @@ int ggml_cpu_has_vxe(void) {
#endif
}
int ggml_cpu_has_nnpa(void) {
#if defined(GGML_NNPA)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_neon(void) {
#if defined(__ARM_ARCH) && defined(__ARM_NEON)
return 1;

Some files were not shown because too many files have changed in this diff Show More