Mirror of https://github.com/zebrajr/ollama.git, synced 2025-12-06 12:19:56 +01:00

Update GGML to b6646 (#12245)

Notable EOLs with this change:
- MacOS v12 and v13 are no longer supported (v14+ required)
- AMD gfx900 and gfx906 are no longer supported

This commit is contained in: parent fdb109469f, commit c68f367ef6
@@ -89,9 +89,9 @@ if(CMAKE_CUDA_COMPILER)
     )
   endif()
 
-  set(WINDOWS_AMDGPU_TARGETS_EXCLUDE_REGEX "^gfx(906|908|90a|1200|1201):xnack[+-]$"
+  set(WINDOWS_AMDGPU_TARGETS_EXCLUDE_REGEX "^gfx(908|90a|1200|1201):xnack[+-]$"
     CACHE STRING
-    "Regular expression describing AMDGPU_TARGETS not supported on Windows. Override to force building these targets. Default \"^gfx(906|908|90a|1200|1201):xnack[+-]$\"."
+    "Regular expression describing AMDGPU_TARGETS not supported on Windows. Override to force building these targets. Default \"^gfx(908|90a|1200|1201):xnack[+-]$\"."
   )
 
 check_language(HIP)

@@ -100,7 +100,7 @@ if(CMAKE_HIP_COMPILER)
 
   if(NOT AMDGPU_TARGETS)
     find_package(hip REQUIRED)
-    list(FILTER AMDGPU_TARGETS INCLUDE REGEX "^gfx(900|94[012]|101[02]|1030|110[012]|120[01])$")
+    list(FILTER AMDGPU_TARGETS INCLUDE REGEX "^gfx(94[012]|101[02]|1030|110[012]|120[01])$")
   endif()
 
   if(WIN32 AND WINDOWS_AMDGPU_TARGETS_EXCLUDE_REGEX)
@@ -68,7 +68,7 @@
       "inherits": [ "ROCm" ],
       "cacheVariables": {
         "CMAKE_HIP_FLAGS": "-parallel-jobs=4",
-        "AMDGPU_TARGETS": "gfx900;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-"
+        "AMDGPU_TARGETS": "gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-"
       }
     }
   ],
@@ -1,6 +1,6 @@
 UPSTREAM=https://github.com/ggml-org/llama.cpp.git
 WORKDIR=llama/vendor
-FETCH_HEAD=e54d41befcc1575f4c898c5ff4ef43970cead75f
+FETCH_HEAD=364a7a6d4a786e98947c8a90430ea581213c0ba9
 
 .PHONY: help
 help:
docs/gpu.md (12 changes)

@@ -52,13 +52,13 @@ Ollama supports the following AMD GPUs:
 
 ### Linux Support
 | Family | Cards and accelerators |
-| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
-| AMD Radeon RX | `7900 XTX` `7900 XT` `7900 GRE` `7800 XT` `7700 XT` `7600 XT` `7600` `6950 XT` `6900 XTX` `6900XT` `6800 XT` `6800` `Vega 64` `Vega 56` |
-| AMD Radeon PRO | `W7900` `W7800` `W7700` `W7600` `W7500` `W6900X` `W6800X Duo` `W6800X` `W6800` `V620` `V420` `V340` `V320` `Vega II Duo` `Vega II` `VII` `SSG` |
-| AMD Instinct | `MI300X` `MI300A` `MI300` `MI250X` `MI250` `MI210` `MI200` `MI100` `MI60` `MI50` |
+| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------- |
+| AMD Radeon RX | `7900 XTX` `7900 XT` `7900 GRE` `7800 XT` `7700 XT` `7600 XT` `7600` `6950 XT` `6900 XTX` `6900XT` `6800 XT` `6800` `Vega 64` |
+| AMD Radeon PRO | `W7900` `W7800` `W7700` `W7600` `W7500` `W6900X` `W6800X Duo` `W6800X` `W6800` `V620` `V420` `V340` `V320` `Vega II Duo` `Vega II` `SSG` |
+| AMD Instinct | `MI300X` `MI300A` `MI300` `MI250X` `MI250` `MI210` `MI200` `MI100` `MI60` |
 
 ### Windows Support
-With ROCm v6.1, the following GPUs are supported on Windows.
+With ROCm v6.2, the following GPUs are supported on Windows.
 
 | Family | Cards and accelerators |
 | -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |

@@ -88,8 +88,6 @@ At this time, the known supported GPU types on linux are the following LLVM Targets:
 This table shows some example GPUs that map to these LLVM targets:
 | **LLVM Target** | **An Example GPU** |
 |-----------------|---------------------|
-| gfx900 | Radeon RX Vega 56 |
-| gfx906 | Radeon Instinct MI50 |
 | gfx908 | Radeon Instinct MI100 |
 | gfx90a | Radeon Instinct MI210 |
 | gfx940 | Radeon Instinct MI300 |
@@ -2,7 +2,7 @@
 
 ## System Requirements
 
-* MacOS Monterey (v12) or newer
+* MacOS Sonoma (v14) or newer
 * Apple M series (CPU and GPU support) or x86 (CPU only)
 
 
@@ -36,7 +36,7 @@ func TestLongInputContext(t *testing.T) {
 	if err := PullIfMissing(ctx, client, req.Model); err != nil {
 		t.Fatalf("PullIfMissing failed: %v", err)
 	}
-	DoGenerate(ctx, t, client, req, []string{"russia", "germany", "france", "england", "austria", "prussia", "europe", "individuals", "coalition", "conflict"}, 120*time.Second, 10*time.Second)
+	DoGenerate(ctx, t, client, req, []string{"russia", "german", "france", "england", "austria", "prussia", "europe", "individuals", "coalition", "conflict"}, 120*time.Second, 10*time.Second)
 }
 
 func TestContextExhaustion(t *testing.T) {
llama/build-info.cpp (2 changes, generated, vendored)

@@ -1,4 +1,4 @@
 int LLAMA_BUILD_NUMBER = 0;
-char const *LLAMA_COMMIT = "e54d41befcc1575f4c898c5ff4ef43970cead75f";
+char const *LLAMA_COMMIT = "364a7a6d4a786e98947c8a90430ea581213c0ba9";
 char const *LLAMA_COMPILER = "";
 char const *LLAMA_BUILD_TARGET = "";
llama/llama.cpp/common/common.cpp (110 changes, vendored)

@@ -14,6 +14,7 @@
 #include <climits>
 #include <cmath>
 #include <codecvt>
+#include <chrono>
 #include <cstdarg>
 #include <cstring>
 #include <ctime>

@@ -41,6 +42,7 @@
 #endif
 #include <locale>
 #include <windows.h>
+#include <string.h>
 #include <fcntl.h>
 #include <io.h>
 #else

@@ -49,6 +51,11 @@
 #include <unistd.h>
 #endif
 
+#if defined(__linux__)
+#include <sys/types.h>
+#include <pwd.h>
+#endif
+
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
@@ -557,13 +564,6 @@ std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
 
         auto detokenized = common_token_to_piece(ctx, token);
 
-        detokenized.erase(
-            std::remove_if(
-                detokenized.begin(),
-                detokenized.end(),
-                [](const unsigned char c) { return !std::isprint(c); }),
-            detokenized.end());
-
         buf << "'" << detokenized << "'"
             << ":" << std::to_string(token);
     }

@@ -588,13 +588,6 @@ std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch) {
 
         auto detokenized = common_token_to_piece(ctx, batch.token[i]);
 
-        detokenized.erase(
-            std::remove_if(
-                detokenized.begin(),
-                detokenized.end(),
-                [](const unsigned char c) { return !std::isprint(c); }),
-            detokenized.end());
-
         buf << "\n" << std::to_string(i)
             << ", token '" << detokenized << "'"
             << ", pos " << std::to_string(batch.pos[i])
@@ -877,8 +870,20 @@ std::string fs_get_cache_directory() {
 #if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) || defined(__OpenBSD__)
         if (std::getenv("XDG_CACHE_HOME")) {
             cache_directory = std::getenv("XDG_CACHE_HOME");
-        } else {
+        } else if (std::getenv("HOME")) {
             cache_directory = std::getenv("HOME") + std::string("/.cache/");
+        } else {
+#if defined(__linux__)
+            /* no $HOME is defined, fallback to getpwuid */
+            struct passwd *pw = getpwuid(getuid());
+            if ((!pw) || (!pw->pw_dir)) {
+                throw std::runtime_error("Failed to find $HOME directory");
+            }
+
+            cache_directory = std::string(pw->pw_dir) + std::string("/.cache/");
+#else /* defined(__linux__) */
+            throw std::runtime_error("Failed to find $HOME directory");
+#endif /* defined(__linux__) */
         }
 #elif defined(__APPLE__)
         cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
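For context, a minimal standalone sketch of the lookup order the hunk above implements (illustration only, not the vendored code; assumes a POSIX system):

```cpp
#include <cstdlib>
#include <stdexcept>
#include <string>
#include <sys/types.h>
#include <pwd.h>
#include <unistd.h>

// Resolve the cache directory: XDG_CACHE_HOME, then $HOME/.cache/,
// then (new in this update) the passwd database when $HOME is unset.
static std::string cache_dir() {
    if (const char * xdg = std::getenv("XDG_CACHE_HOME")) {
        return std::string(xdg) + "/";
    }
    if (const char * home = std::getenv("HOME")) {
        return std::string(home) + "/.cache/";
    }
    struct passwd * pw = getpwuid(getuid()); // fallback: home dir from passwd entry
    if (!pw || !pw->pw_dir) {
        throw std::runtime_error("Failed to find $HOME directory");
    }
    return std::string(pw->pw_dir) + "/.cache/";
}
```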
@@ -914,7 +919,8 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
     if (model == NULL) {
-        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
+        LOG_ERR("%s: failed to load model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
+                __func__, params.model.path.c_str());
         return iparams;
     }
 

@@ -924,7 +930,8 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     llama_context * lctx = llama_init_from_model(model, cparams);
     if (lctx == NULL) {
-        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
+        LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
+                __func__, params.model.path.c_str());
         llama_model_free(model);
         return iparams;
     }

@@ -971,15 +978,13 @@ struct common_init_result common_init_from_params(common_params & params) {
 
         bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
         bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
+        bool has_rerank_prompt = llama_model_chat_template(model, "rerank") != NULL;
 
-        if (!has_eos && !has_sep) {
-            LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
+        if (!has_eos && !has_sep && !has_rerank_prompt) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token, SEP token, or rerank prompt. Reranking will not work\n", __func__);
             ok = false;
         } else if (!has_eos) {
             LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
-        } else if (!has_sep) {
-            LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
-            ok = false;
         }
 
         if (!ok) {
@@ -1001,7 +1006,12 @@ struct common_init_result common_init_from_params(common_params & params) {
             return iparams;
         }
 
+        char buf[1024];
         la.ptr = lora.get();
+        llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
+        la.task_name = buf;
+        llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
+        la.prompt_prefix = buf;
         iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
     }
 
@@ -1165,11 +1175,10 @@ struct llama_context_params common_context_params_to_llama(const common_params & params) {
     cparams.yarn_orig_ctx = params.yarn_orig_ctx;
     cparams.pooling_type = params.pooling_type;
     cparams.attention_type = params.attention_type;
-    cparams.defrag_thold = params.defrag_thold;
+    cparams.flash_attn_type = params.flash_attn_type;
     cparams.cb_eval = params.cb_eval;
     cparams.cb_eval_user_data = params.cb_eval_user_data;
     cparams.offload_kqv = !params.no_kv_offload;
-    cparams.flash_attn = params.flash_attn;
     cparams.no_perf = params.no_perf;
     cparams.op_offload = !params.no_op_offload;
     cparams.swa_full = params.swa_full;
@@ -1565,3 +1574,56 @@ ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride) {
 
     return result;
 }
+
+ggml_opt_optimizer_params common_opt_lr_pars(void * userdata) {
+    ggml_opt_optimizer_params result = ggml_opt_get_default_optimizer_params(nullptr);
+    const lr_opt & d = *(lr_opt *) userdata;
+    result.adamw.alpha = result.sgd.alpha = d.get_lr(d.epoch);
+    result.sgd.wd = result.adamw.wd = d.wd;
+    return result;
+}
+
+// TODO make all command line args case-insensitive
+static inline bool eq_case_insensitive(char const* a, char const* b) {
+    return !
+#if defined(_MSC_VER)
+        _stricmp
+#else
+        strcasecmp
+#endif // defined(_MSC_VER)
+        (a, b);
+}
+
+enum ggml_opt_optimizer_type common_opt_get_optimizer(const char * n) {
+    if (eq_case_insensitive("adamw", n)) {
+        return GGML_OPT_OPTIMIZER_TYPE_ADAMW;
+    }
+    if (eq_case_insensitive("sgd", n)) {
+        return GGML_OPT_OPTIMIZER_TYPE_SGD;
+    }
+    return GGML_OPT_OPTIMIZER_TYPE_COUNT;
+}
+
+// TODO simplify to use just log and exp
+static float const k_log_2 = std::log(2.f);
+
+void lr_opt::init() {
+    if (lr_min > 0 && lr_min < lr0) {
+        float nhalf = std::log(lr0 / lr_min) / k_log_2;
+        float e = epochs;
+        if (decay_epochs > 0 && decay_epochs < e) {
+            e = decay_epochs;
+        } else {
+            decay_epochs = e;
+        }
+        scale_epoch = nhalf / e;
+    }
+}
+
+float lr_opt::get_lr(float epoch) const {
+    float r = lr_min <= 0 ? lr0 :
+        epoch >= decay_epochs ? lr_min :
+            lr0 * std::pow(0.5f, epoch * scale_epoch);
+    LOG_INF("epoch %.2g lr=%.2g\n", epoch, r);
+    return r;
+}
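The lr_opt schedule added above is a half-life decay. Restated as a formula, with E_d = decay_epochs (same quantities as the code, nothing new):

$$
\mathrm{lr}(e)=\begin{cases}\mathrm{lr}_0\cdot 0.5^{\,e\cdot\log_2(\mathrm{lr}_0/\mathrm{lr}_{\min})/E_d}=\mathrm{lr}_0\left(\tfrac{\mathrm{lr}_{\min}}{\mathrm{lr}_0}\right)^{e/E_d}, & e<E_d\\[2pt]\mathrm{lr}_{\min}, & e\ge E_d\end{cases}
$$

so the rate halves every $E_d/\log_2(\mathrm{lr}_0/\mathrm{lr}_{\min})$ epochs, reaching exactly lr_min at e = E_d; if lr_min <= 0 the rate stays constant at lr0.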
llama/llama.cpp/common/common.h (77 changes, vendored)

@@ -2,14 +2,17 @@
 
 #pragma once
 
-#include "llama-cpp.h"
-
 #include <set>
+#include <sstream>
 #include <string>
 #include <string_view>
 #include <vector>
 #include <map>
 #include <sstream>
+#include <cmath>
+
+#include "ggml-opt.h"
+#include "llama-cpp.h"
 
 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
@@ -31,6 +34,9 @@ struct common_adapter_lora_info {
     std::string path;
     float scale;
 
+    std::string task_name;
+    std::string prompt_prefix;
+
     struct llama_adapter_lora * ptr;
 };
 

@@ -82,6 +88,7 @@ enum llama_example {
     LLAMA_EXAMPLE_PARALLEL,
     LLAMA_EXAMPLE_TTS,
     LLAMA_EXAMPLE_DIFFUSION,
+    LLAMA_EXAMPLE_FINETUNE,
 
     LLAMA_EXAMPLE_COUNT,
 };

@@ -190,6 +197,7 @@ struct common_params_model {
     std::string url = ""; // model url to download // NOLINT
     std::string hf_repo = ""; // HF repo // NOLINT
     std::string hf_file = ""; // HF file // NOLINT
+    std::string docker_repo = ""; // Docker repo // NOLINT
 };
 
 struct common_params_speculative {

@@ -202,6 +210,7 @@ struct common_params_speculative {
     float p_split = 0.1f; // speculative decoding split probability
     float p_min = 0.75f; // minimum speculative decoding probability (greedy)
     std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
+    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
 
     ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
     ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
@@ -234,14 +243,36 @@ struct common_params_diffusion {
     bool add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0
 };
 
+// reasoning API response format (not to be confused as chat template's reasoning format)
 enum common_reasoning_format {
     COMMON_REASONING_FORMAT_NONE,
-    COMMON_REASONING_FORMAT_AUTO,
+    COMMON_REASONING_FORMAT_AUTO, // Same as deepseek, using `message.reasoning_content`
     COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
     COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
-    COMMON_REASONING_FORMAT_GRANITE, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
+    // do not extend this enum unless you absolutely have to
+    // in most cases, use COMMON_REASONING_FORMAT_AUTO
+    // see: https://github.com/ggml-org/llama.cpp/pull/15408
 };
 
+struct lr_opt {
+    float lr0 = 1e-5; // learning rate at first epoch
+    float lr_min = -1;
+    float decay_epochs = -1; // if >0, the learning rate starts at lr0 and decays to lr_min after this many epochs
+    float scale_epoch = 0;
+    float wd = 0;
+    unsigned epochs = 2;
+
+    unsigned epoch; // set by optimizer outer (epochs) loop
+    // learning rate decay - constant LR per epoch only for now
+    float get_lr(float e) const;
+    float get_lr() const { return get_lr(epoch); }
+    // must call after arg parse, before get_lr
+    void init();
+};
+
+struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);
+
 struct common_params {
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_ctx = 4096; // context size
@@ -257,11 +288,10 @@ struct common_params {
     float rope_freq_base = 0.0f; // RoPE base frequency
     float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
     float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
-    float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
-    float yarn_beta_fast = 32.0f; // YaRN low correction dim
-    float yarn_beta_slow = 1.0f; // YaRN high correction dim
+    float yarn_attn_factor = -1.0f; // YaRN magnitude scaling factor
+    float yarn_beta_fast = -1.0f; // YaRN low correction dim
+    float yarn_beta_slow = -1.0f; // YaRN high correction dim
     int32_t yarn_orig_ctx = 0; // YaRN original context length
-    float defrag_thold = 0.1f; // KV cache defragmentation threshold
 
     // offload params
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

@@ -283,6 +313,7 @@ struct common_params {
     enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
+    enum llama_flash_attn_type flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; // whether to use Flash Attention
 
     struct common_params_sampling sampling;
     struct common_params_speculative speculative;

@@ -346,9 +377,8 @@ struct common_params {
     bool multiline_input = false; // reverse the usage of `\`
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
-    bool flash_attn = false; // flash attention
     bool no_perf = false; // disable performance metrics
-    bool ctx_shift = true; // context shift on inifinite text generation
+    bool ctx_shift = false; // context shift on infinite text generation
     bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
     bool kv_unified = false; // enable unified KV cache

@@ -376,6 +406,11 @@ struct common_params {
     bool no_mmproj = false; // explicitly disable multimodal model
     std::vector<std::string> image; // path to image file(s)
 
+    // finetune
+    struct lr_opt lr;
+    enum ggml_opt_optimizer_type optimizer = GGML_OPT_OPTIMIZER_TYPE_ADAMW;
+    float val_split = 0.05f; // fraction of the data used for the validation set
+
     // embedding
     bool embedding = false; // get only sentence embedding
     int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)

@@ -389,6 +424,7 @@ struct common_params {
     int32_t timeout_write = timeout_read; // http write timeout in seconds
     int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
     int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
+    int32_t n_swa_checkpoints = 3; // max number of SWA checkpoints per slot
 
     std::string hostname = "127.0.0.1";
     std::string public_path = ""; // NOLINT

@@ -409,7 +445,7 @@ struct common_params {
 
     // "advanced" endpoints are disabled by default for better security
     bool webui = true;
-    bool endpoint_slots = false;
+    bool endpoint_slots = true;
     bool endpoint_props = false; // only control POST requests, not GET
     bool endpoint_metrics = false;
 

@@ -417,7 +453,7 @@ struct common_params {
 
     std::string slot_save_path;
 
-    float slot_prompt_similarity = 0.5f;
+    float slot_prompt_similarity = 0.1f;
 
     // batched-bench params
     bool is_pp_shared = false;

@@ -698,8 +734,25 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
 
 }
 
+//
+// MoE utils
+//
+
+const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_(ch|)exps";
+
+static std::string llm_ffn_exps_block_regex(int idx) {
+    return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
+}
+
+static llama_model_tensor_buft_override llm_ffn_exps_cpu_override() {
+    return { LLM_FFN_EXPS_REGEX, ggml_backend_cpu_buffer_type() };
+}
+
 //
 // training utils
 //
 
 ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);
+
+// "adamw" or "sgd" (case insensitive)
+enum ggml_opt_optimizer_type common_opt_get_optimizer(const char *);
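The new MoE helpers above pair with the tensor_buft_overrides vector added to common_params_speculative. A hypothetical usage sketch; it assumes llama_model_params exposes a null-terminated tensor_buft_overrides list, which is not shown in this diff:

```cpp
// Hypothetical sketch: pin MoE expert tensors to CPU memory so only the dense
// layers are offloaded. The override regex matches ".ffn_(up|down|gate)_(ch|)exps".
std::vector<llama_model_tensor_buft_override> overrides;
overrides.push_back(llm_ffn_exps_cpu_override());
overrides.push_back({ nullptr, nullptr }); // terminator expected by the loader

llama_model_params mparams = llama_model_default_params();
mparams.tensor_buft_overrides = overrides.data();
```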
@@ -257,12 +257,13 @@ std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
 };
 
 static bool is_reserved_name(const std::string & name) {
-    static std::unordered_set<std::string> RESERVED_NAMES;
-    if (RESERVED_NAMES.empty()) {
-        RESERVED_NAMES.insert("root");
-        for (const auto &p : PRIMITIVE_RULES) RESERVED_NAMES.insert(p.first);
-        for (const auto &p : STRING_FORMAT_RULES) RESERVED_NAMES.insert(p.first);
-    }
+    static const std::unordered_set<std::string> RESERVED_NAMES = [] {
+        std::unordered_set<std::string> s;
+        s.insert("root");
+        for (const auto & p : PRIMITIVE_RULES) s.insert(p.first);
+        for (const auto & p : STRING_FORMAT_RULES) s.insert(p.first);
+        return s;
+    }();
     return RESERVED_NAMES.find(name) != RESERVED_NAMES.end();
 }
 

@@ -843,9 +844,10 @@ public:
                 _build_object_rule(
                     properties, required, name,
                     schema.contains("additionalProperties") ? schema["additionalProperties"] : json()));
-        } else if ((schema_type.is_null() || schema_type == "object") && schema.contains("allOf")) {
+        } else if ((schema_type.is_null() || schema_type == "object" || schema_type == "string") && schema.contains("allOf")) {
             std::unordered_set<std::string> required;
             std::vector<std::pair<std::string, json>> properties;
+            std::map<std::string, size_t> enum_values;
             std::string hybrid_name = name;
             std::function<void(const json &, bool)> add_component = [&](const json & comp_schema, bool is_required) {
                 if (comp_schema.contains("$ref")) {

@@ -857,6 +859,14 @@ public:
                             required.insert(prop.key());
                         }
                     }
+                } else if (comp_schema.contains("enum")) {
+                    for (const auto & v : comp_schema["enum"]) {
+                        const auto rule = _generate_constant_rule(v);
+                        if (enum_values.find(rule) == enum_values.end()) {
+                            enum_values[rule] = 0;
+                        }
+                        enum_values[rule] += 1;
+                    }
                 } else {
                     // todo warning
                 }

@@ -870,6 +880,17 @@ public:
                     add_component(t, true);
                 }
             }
+            if (!enum_values.empty()) {
+                std::vector<std::string> enum_intersection;
+                for (const auto & p : enum_values) {
+                    if (p.second == schema["allOf"].size()) {
+                        enum_intersection.push_back(p.first);
+                    }
+                }
+                if (!enum_intersection.empty()) {
+                    return _add_rule(rule_name, "(" + string_join(enum_intersection, " | ") + ") space");
+                }
+            }
             return _add_rule(rule_name, _build_object_rule(properties, required, hybrid_name, json()));
         } else if ((schema_type.is_null() || schema_type == "array") && (schema.contains("items") || schema.contains("prefixItems"))) {
             json items = schema.contains("items") ? schema["items"] : schema["prefixItems"];
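The new enum branch means that when every allOf component is a bare enum, the generated grammar is the intersection of their values rather than an object rule. An illustration; the expected output rule is my reading of the code above, not taken from the diff:

```cpp
// Schema whose allOf members are all enums:
const char * schema = R"({
  "allOf": [
    { "enum": ["red", "green", "blue"] },
    { "enum": ["green", "blue"] }
  ]
})";
// enum_values counts each constant rule per component; only values seen in
// every component (count == allOf.size()) survive, so the emitted rule is
// roughly: root ::= ("\"green\"" | "\"blue\"") space
```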
llama/llama.cpp/common/log.cpp (55 changes, vendored)

@@ -4,17 +4,52 @@
 #include <condition_variable>
 #include <cstdarg>
 #include <cstdio>
+#include <cstdlib>
+#include <cstring>
 #include <mutex>
 #include <sstream>
 #include <thread>
 #include <vector>
 
+#if defined(_WIN32)
+#    include <io.h>
+#    include <windows.h>
+#    define isatty _isatty
+#    define fileno _fileno
+#else
+#    include <unistd.h>
+#endif // defined(_WIN32)
+
 int common_log_verbosity_thold = LOG_DEFAULT_LLAMA;
 
 void common_log_set_verbosity_thold(int verbosity) {
     common_log_verbosity_thold = verbosity;
 }
 
+// Auto-detect if colors should be enabled based on terminal and environment
+static bool common_log_should_use_colors_auto() {
+    // Check NO_COLOR environment variable (https://no-color.org/)
+    if (const char * no_color = std::getenv("NO_COLOR")) {
+        if (no_color[0] != '\0') {
+            return false;
+        }
+    }
+
+    // Check TERM environment variable
+    if (const char * term = std::getenv("TERM")) {
+        if (std::strcmp(term, "dumb") == 0) {
+            return false;
+        }
+    }
+
+    // Check if stdout and stderr are connected to a terminal
+    // We check both because log messages can go to either
+    bool stdout_is_tty = isatty(fileno(stdout));
+    bool stderr_is_tty = isatty(fileno(stderr));
+
+    return stdout_is_tty || stderr_is_tty;
+}
+
 static int64_t t_us() {
     return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
 }

@@ -353,6 +388,11 @@ struct common_log * common_log_init() {
 
 struct common_log * common_log_main() {
     static struct common_log log;
+    static std::once_flag init_flag;
+    std::call_once(init_flag, [&]() {
+        // Set default to auto-detect colors
+        log.set_colors(common_log_should_use_colors_auto());
+    });
 
     return &log;
 }

@@ -380,8 +420,19 @@ void common_log_set_file(struct common_log * log, const char * file) {
     log->set_file(file);
 }
 
-void common_log_set_colors(struct common_log * log, bool colors) {
-    log->set_colors(colors);
+void common_log_set_colors(struct common_log * log, log_colors colors) {
+    if (colors == LOG_COLORS_AUTO) {
+        log->set_colors(common_log_should_use_colors_auto());
+        return;
+    }
+
+    if (colors == LOG_COLORS_DISABLED) {
+        log->set_colors(false);
+        return;
+    }
+
+    GGML_ASSERT(colors == LOG_COLORS_ENABLED);
+    log->set_colors(true);
 }
 
 void common_log_set_prefix(struct common_log * log, bool prefix) {
llama/llama.cpp/common/log.h (8 changes, vendored)

@@ -24,6 +24,12 @@
 #define LOG_DEFAULT_DEBUG 1
 #define LOG_DEFAULT_LLAMA 0
 
+enum log_colors {
+    LOG_COLORS_AUTO = -1,
+    LOG_COLORS_DISABLED = 0,
+    LOG_COLORS_ENABLED = 1,
+};
+
 // needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
 // set via common_log_set_verbosity()
 extern int common_log_verbosity_thold;

@@ -66,7 +72,7 @@ void common_log_add(struct common_log * log, enum ggml_log_level level, const char * fmt, ...);
 //
 
 void common_log_set_file      (struct common_log * log, const char * file);       // not thread-safe
-void common_log_set_colors    (struct common_log * log, bool colors);             // not thread-safe
+void common_log_set_colors    (struct common_log * log, log_colors colors);       // not thread-safe
 void common_log_set_prefix    (struct common_log * log, bool prefix);             // whether to output prefix to each log
 void common_log_set_timestamps(struct common_log * log, bool timestamps);         // whether to output timestamps in the prefix
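A short usage sketch of the new tri-state color API (assumed caller code, not part of the diff); LOG_COLORS_AUTO applies the NO_COLOR / TERM=dumb / isatty heuristic from common_log_should_use_colors_auto():

```cpp
common_log_set_colors(common_log_main(), LOG_COLORS_AUTO);     // honor NO_COLOR, TERM, and tty state
common_log_set_colors(common_log_main(), LOG_COLORS_DISABLED); // force monochrome output
common_log_set_colors(common_log_main(), LOG_COLORS_ENABLED);  // force colors on
```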
llama/llama.cpp/common/sampling.cpp (26 changes, vendored)

@@ -332,6 +332,7 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) {
     }
     if (ctx) {
         llama_perf_context_print(ctx);
+        llama_memory_breakdown_print(ctx);
     }
 }
 

@@ -426,8 +427,29 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
 
 // helpers
 
-llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) {
-    return &gsmpl->cur_p;
+llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort) {
+    auto * res = &gsmpl->cur_p;
+
+    if (do_sort && !res->sorted) {
+        // remember the selected token before sorting
+        const llama_token id = res->data[res->selected].id;
+
+        std::sort(res->data, res->data + res->size, [](const llama_token_data & a, const llama_token_data & b) {
+            return a.p > b.p;
+        });
+
+        // restore the selected token after sorting
+        for (size_t i = 0; i < res->size; ++i) {
+            if (res->data[i].id == id) {
+                res->selected = i;
+                break;
+            }
+        }
+
+        res->sorted = true;
+    }
+
+    return res;
 }
 
 llama_token common_sampler_last(const struct common_sampler * gsmpl) {
llama/llama.cpp/common/sampling.h (4 changes, vendored)

@@ -86,7 +86,9 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
 // helpers
 
 // access the internal list of current candidate tokens
-llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl);
+// if do_sort == true, the candidates are guaranteed to be sorted afterwards (in descending order of probability)
+// the .sorted flag of the result indicates whether the returned candidates are sorted
+llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort);
 
 // get the last accepted token
 llama_token common_sampler_last(const struct common_sampler * gsmpl);
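A hypothetical caller sketch for the new signature; note the diff's own comment that the .sorted flag must be checked unless do_sort is passed:

```cpp
// request the candidates pre-sorted by probability (descending) and print the top five
llama_token_data_array * cur_p = common_sampler_get_candidates(gsmpl, /*do_sort=*/true);
for (size_t i = 0; i < cur_p->size && i < 5; ++i) {
    printf("token %-6d p=%.4f\n", cur_p->data[i].id, cur_p->data[i].p);
}
```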
llama/llama.cpp/include/llama.h (197 changes, vendored)

@@ -64,8 +64,6 @@ extern "C" {
 
     typedef struct llama_memory_i * llama_memory_t;
 
-    struct llama_kv_cache; // DEPRECATED (use llama_memory instead)
-
     typedef int32_t llama_pos;
     typedef int32_t llama_token;
     typedef int32_t llama_seq_id;

@@ -181,6 +179,14 @@ extern "C" {
         LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1,
     };
 
+    enum llama_flash_attn_type {
+        LLAMA_FLASH_ATTN_TYPE_AUTO = -1,
+        LLAMA_FLASH_ATTN_TYPE_DISABLED = 0,
+        LLAMA_FLASH_ATTN_TYPE_ENABLED = 1,
+    };
+
+    LLAMA_API const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type);
+
     enum llama_split_mode {
         LLAMA_SPLIT_MODE_NONE  = 0, // single GPU
         LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs

@@ -200,7 +206,7 @@ extern "C" {
         llama_token_data * data;
         size_t size;
         int64_t selected; // this is the index in the data array (i.e. not the token id)
-        bool sorted;
+        bool sorted; // note: do not assume the data is sorted - always check this flag
     } llama_token_data_array;
 
     typedef bool (*llama_progress_callback)(float progress, void * user_data);
@@ -305,6 +311,7 @@ extern "C" {
         enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
         enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
         enum llama_attention_type attention_type; // attention type to use for embeddings
+        enum llama_flash_attn_type flash_attn_type; // when to enable Flash Attention
 
         // ref: https://github.com/ggml-org/llama.cpp/pull/2054
         float rope_freq_base; // RoPE base frequency, 0 = from model

@@ -314,7 +321,7 @@ extern "C" {
         float yarn_beta_fast; // YaRN low correction dim
         float yarn_beta_slow; // YaRN high correction dim
         uint32_t yarn_orig_ctx; // YaRN original context size
-        float defrag_thold; // defragment the KV cache if holes/size > thold, <= 0 disabled (default)
+        float defrag_thold; // [DEPRECATED] defragment the KV cache if holes/size > thold, <= 0 disabled (default)
 
         ggml_backend_sched_eval_callback cb_eval;
         void * cb_eval_user_data;

@@ -331,7 +338,6 @@ extern "C" {
         // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
         bool embeddings; // if true, extract embeddings (together with logits)
         bool offload_kqv; // offload the KQV ops (including the KV cache) to GPU
-        bool flash_attn; // use flash attention [EXPERIMENTAL]
         bool no_perf; // measure performance timings
         bool op_offload; // offload host tensor operations to device
         bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
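A small sketch of configuring the new tri-state flash-attention field (assumed caller code; llama_context_default_params is the existing default-initializer in this header):

```cpp
llama_context_params cparams = llama_context_default_params();
cparams.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; // let the runtime pick per backend/model
printf("flash attention mode: %s\n", llama_flash_attn_type_name(cparams.flash_attn_type));
```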
@@ -469,8 +475,6 @@ extern "C" {
     LLAMA_API llama_memory_t llama_get_memory (const struct llama_context * ctx);
     LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type
 
-    DEPRECATED(LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx), "use llama_get_memory instead");
-
     LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
     LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model);

@@ -557,10 +561,32 @@ extern "C" {
             struct llama_model * model,
             const char * path_lora);
 
+    // Functions to access the adapter's GGUF metadata scalar values
+    // - The functions return the length of the string on success, or -1 on failure
+    // - The output string is always null-terminated and cleared on failure
+    // - When retrieving a string, an extra byte must be allocated to account for the null terminator
+    // - GGUF array values are not supported by these functions
+
+    // Get metadata value as a string by key name
+    LLAMA_API int32_t llama_adapter_meta_val_str(const struct llama_adapter_lora * adapter, const char * key, char * buf, size_t buf_size);
+
+    // Get the number of metadata key/value pairs
+    LLAMA_API int32_t llama_adapter_meta_count(const struct llama_adapter_lora * adapter);
+
+    // Get metadata key name by index
+    LLAMA_API int32_t llama_adapter_meta_key_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);
+
+    // Get metadata value as a string by index
+    LLAMA_API int32_t llama_adapter_meta_val_str_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);
+
     // Manually free a LoRA adapter
     // Note: loaded adapters will be free when the associated model is deleted
     LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
 
+    // Get the invocation tokens if the current lora is an alora
+    LLAMA_API uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter);
+    LLAMA_API const llama_token * llama_adapter_get_alora_invocation_tokens (const struct llama_adapter_lora * adapter);
+
     // The following functions operate on a llama_context, hence the naming: llama_verb_...
 
     // Add a loaded LoRA adapter to given context
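A sketch of enumerating adapter metadata with the accessors declared above (assumed caller code; `adapter` is a loaded llama_adapter_lora, and buffer sizes are illustrative):

```cpp
char key[256];
char val[1024];
const int32_t n = llama_adapter_meta_count(adapter);
for (int32_t i = 0; i < n; ++i) {
    // both calls return the string length on success, -1 on failure
    if (llama_adapter_meta_key_by_index(adapter, i, key, sizeof(key)) >= 0 &&
        llama_adapter_meta_val_str_by_index(adapter, i, val, sizeof(val)) >= 0) {
        printf("%s = %s\n", key, val);
    }
}
```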
@@ -667,111 +693,6 @@ extern "C" {
     // Check if the memory supports shifting
     LLAMA_API bool llama_memory_can_shift(llama_memory_t mem);
 
-    //
-    // KV cache for self-attention (TODO: deprecate in favor of llama_memory)
-    //
-
-    // Returns the number of tokens in the KV cache (slow, use only for debug)
-    // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
-    DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx),
-        "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
-
-    // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
-    DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx),
-        "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
-
-    // Clear the KV cache - both cell info is erased and KV data is zeroed
-    DEPRECATED(LLAMA_API void llama_kv_self_clear(
-            struct llama_context * ctx),
-        "Use llama_memory_clear() instead");
-
-    // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
-    // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
-    // seq_id < 0 : match any sequence
-    // p0 < 0     : [0,  p1]
-    // p1 < 0     : [p0, inf)
-    DEPRECATED(LLAMA_API bool llama_kv_self_seq_rm(
-            struct llama_context * ctx,
-            llama_seq_id seq_id,
-            llama_pos p0,
-            llama_pos p1),
-        "Use llama_memory_seq_rm() instead");
-
-    // Copy all tokens that belong to the specified sequence to another sequence
-    // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
-    // p0 < 0 : [0,  p1]
-    // p1 < 0 : [p0, inf)
-    DEPRECATED(LLAMA_API void llama_kv_self_seq_cp(
-            struct llama_context * ctx,
-            llama_seq_id seq_id_src,
-            llama_seq_id seq_id_dst,
-            llama_pos p0,
-            llama_pos p1),
-        "Use llama_memory_seq_cp() instead");
-
-    // Removes all tokens that do not belong to the specified sequence
-    DEPRECATED(LLAMA_API void llama_kv_self_seq_keep(
-            struct llama_context * ctx,
-            llama_seq_id seq_id),
-        "Use llama_memory_seq_keep() instead");
-
-    // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
-    // If the KV cache is RoPEd, the KV data is updated accordingly:
-    //   - lazily on next llama_decode()
-    // p0 < 0 : [0,  p1]
-    // p1 < 0 : [p0, inf)
-    DEPRECATED(LLAMA_API void llama_kv_self_seq_add(
-            struct llama_context * ctx,
-            llama_seq_id seq_id,
-            llama_pos p0,
-            llama_pos p1,
-            llama_pos delta),
-        "Use llama_memory_seq_add() instead");
-
-    // Integer division of the positions by factor of `d > 1`
-    // If the KV cache is RoPEd, the KV data is updated accordingly:
-    //   - lazily on next llama_decode()
-    // p0 < 0 : [0,  p1]
-    // p1 < 0 : [p0, inf)
-    DEPRECATED(LLAMA_API void llama_kv_self_seq_div(
-            struct llama_context * ctx,
-            llama_seq_id seq_id,
-            llama_pos p0,
-            llama_pos p1,
-            int d),
-        "Use llama_memory_seq_div() instead");
-
-    // Returns the smallest position present in the KV cache for the specified sequence
-    // This is typically non-zero only for SWA caches
-    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
-    // Return -1 if the sequence is empty
-    DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_min(
-            struct llama_context * ctx,
-            llama_seq_id seq_id),
-        "Use llama_memory_seq_pos_min() instead");
-
-    // Returns the largest position present in the KV cache for the specified sequence
-    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
-    // Return -1 if the sequence is empty
-    DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_max(
-            struct llama_context * ctx,
-            llama_seq_id seq_id),
-        "Use llama_memory_seq_pos_max() instead");
-
-    // Defragment the KV cache
-    // This will be applied:
-    //   - lazily on next llama_decode()
-    DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx),
-        "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'");
-
-    // Check if the context supports KV cache shifting
-    DEPRECATED(LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx),
-        "use llama_memory_can_shift() instead");
-
-    // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
-    DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx),
-        "simply remove this call, updates are applied lazily on the next llama_decode()");
-
     //
     // State / sessions
     //
@@ -870,6 +791,29 @@ extern "C" {
                           size_t   n_token_capacity,
                           size_t * n_token_count_out);

+    #define LLAMA_STATE_SEQ_FLAGS_SWA_ONLY 1
+
+    typedef uint32_t llama_state_seq_flags;
+
+    LLAMA_API size_t llama_state_seq_get_size_ext(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id,
+           llama_state_seq_flags   flags);
+
+    LLAMA_API size_t llama_state_seq_get_data_ext(
+            struct llama_context * ctx,
+                         uint8_t * dst,
+                          size_t   size,
+                    llama_seq_id   seq_id,
+           llama_state_seq_flags   flags);
+
+    LLAMA_API size_t llama_state_seq_set_data_ext(
+            struct llama_context * ctx,
+                   const uint8_t * src,
+                          size_t   size,
+                    llama_seq_id   dest_seq_id,
+           llama_state_seq_flags   flags);
+
     //
     // Decoding
     //
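Usage note (not part of the vendored diff): the `_ext` entry points mirror the existing `llama_state_seq_get_size`/`get_data`/`set_data` functions but take a `llama_state_seq_flags` word, with `LLAMA_STATE_SEQ_FLAGS_SWA_ONLY` the only flag defined above. A minimal C++ sketch, assuming an initialized `llama_context * ctx` and abbreviated error handling:

    // snapshot sequence 0 with no flags, then restore it into sequence 1
    std::vector<uint8_t> buf(llama_state_seq_get_size_ext(ctx, 0, 0));
    const size_t written = llama_state_seq_get_data_ext(ctx, buf.data(), buf.size(), 0, 0);
    if (written > 0) {
        llama_state_seq_set_data_ext(ctx, buf.data(), written, 1, 0);
    }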
@@ -1216,11 +1160,6 @@ extern "C" {
     LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
     LLAMA_API struct llama_sampler * llama_sampler_init_dist  (uint32_t seed);

-    /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
-    /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
-    DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax(void),
-        "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");
-
     /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
     /// Setting k <= 0 makes this a noop
     LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
@@ -1390,24 +1329,25 @@ extern "C" {
     //
     // Performance utils
     //
-    // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
+    // NOTE: Used by llama.cpp examples/tools, avoid using in third-party apps. Instead, do your own performance measurements.
     //

     struct llama_perf_context_data {
-        double t_start_ms;
-        double t_load_ms;
-        double t_p_eval_ms;
-        double t_eval_ms;
+        // ms == milliseconds
+        double t_start_ms;  // absolute start time
+        double t_load_ms;   // time needed for loading the model
+        double t_p_eval_ms; // time needed for processing the prompt
+        double t_eval_ms;   // time needed for generating tokens

-        int32_t n_p_eval;
-        int32_t n_eval;
+        int32_t n_p_eval; // number of prompt tokens
+        int32_t n_eval;   // number of generated tokens
         int32_t n_reused; // number of times a ggml compute graph had been reused
     };

     struct llama_perf_sampler_data {
-        double t_sample_ms;
+        double t_sample_ms; // time needed for sampling in ms

-        int32_t n_sample;
+        int32_t n_sample; // number of sampled tokens
     };

     LLAMA_API struct llama_perf_context_data llama_perf_context (const struct llama_context * ctx);
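Usage note (not part of the vendored diff): the struct layout is unchanged here, only documentation comments were added, so the counters read as before. A sketch of deriving throughput from them, assuming an initialized `llama_context * ctx`:

    // prompt-processing and generation rates from the documented fields
    const llama_perf_context_data pd = llama_perf_context(ctx);
    if (pd.t_p_eval_ms > 0 && pd.t_eval_ms > 0) {
        printf("prompt: %.2f tok/s, gen: %.2f tok/s, graphs reused: %d\n",
               1e3 * pd.n_p_eval / pd.t_p_eval_ms,
               1e3 * pd.n_eval   / pd.t_eval_ms,
               pd.n_reused);
    }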
@@ -1419,6 +1359,9 @@ extern "C" {
     LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
     LLAMA_API void llama_perf_sampler_reset(      struct llama_sampler * chain);

+    // print a breakdown of per-device memory use via LLAMA_LOG:
+    LLAMA_API void llama_memory_breakdown_print(const struct llama_context * ctx);
+
     //
     // training
     //

@@ -1437,6 +1380,8 @@ extern "C" {
         ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
         void * get_opt_pars_ud;                     // userdata for calculating optimizer parameters

+        enum ggml_opt_optimizer_type optimizer_type;
     };

     LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params);
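Usage note (not part of the vendored diff): the new diagnostic is a single call; a sketch assuming an initialized `llama_context * ctx`:

    // prints a per-device memory-use breakdown via LLAMA_LOG
    llama_memory_breakdown_print(ctx);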

llama/llama.cpp/src/llama-adapter.cpp (vendored): 105 changed lines
@@ -6,6 +6,7 @@

 #include <map>
 #include <cassert>
+#include <sstream>
 #include <stdexcept>

 // vec
@@ -163,13 +164,38 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_

     // check metadata
     {
+        const gguf_context * gguf_ctx = ctx_gguf.get();
+
+        LLAMA_LOG_INFO("%s: Dumping metadata keys/values.\n", __func__);
+
+        // get metadata as string
+        for (int i = 0; i < gguf_get_n_kv(gguf_ctx); i++) {
+            gguf_type type = gguf_get_kv_type(gguf_ctx, i);
+            const std::string type_name =
+                type == GGUF_TYPE_ARRAY
+                ? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(gguf_ctx, i)), gguf_get_arr_n(gguf_ctx, i))
+                : gguf_type_name(type);
+            const char * name       = gguf_get_key(gguf_ctx, i);
+            const std::string value = gguf_kv_to_str(gguf_ctx, i);
+
+            if (type != GGUF_TYPE_ARRAY) {
+                adapter.gguf_kv.emplace(name, value);
+            }
+
+            const size_t MAX_VALUE_LEN = 40;
+            std::string print_value = value.size() > MAX_VALUE_LEN ? format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str()) : value;
+            replace_all(print_value, "\n", "\\n");
+
+            LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), print_value.c_str());
+        }
+
         auto get_kv_str = [&](const std::string & key) -> std::string {
-            int id = gguf_find_key(ctx_gguf.get(), key.c_str());
-            return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf.get(), id));
+            int id = gguf_find_key(gguf_ctx, key.c_str());
+            return id < 0 ? "" : std::string(gguf_get_val_str(gguf_ctx, id));
         };
         auto get_kv_f32 = [&](const std::string & key) -> float {
-            int id = gguf_find_key(ctx_gguf.get(), key.c_str());
-            return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf.get(), id);
+            int id = gguf_find_key(gguf_ctx, key.c_str());
+            return id < 0 ? 0.0f : gguf_get_val_f32(gguf_ctx, id);
         };
         LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
@@ -190,6 +216,26 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
         }

         adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA));
+
+        // parse alora invocation sequence vector
+        const auto & key = llm_kv(LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS);
+        const int kid = gguf_find_key(ctx_gguf.get(), key.c_str());
+        if (kid >= 0) {
+            if (gguf_get_kv_type(ctx_gguf.get(), kid) != GGUF_TYPE_ARRAY) {
+                throw std::runtime_error("invalid gguf type for " + key);
+            }
+            const auto arr_type = gguf_get_arr_type(ctx_gguf.get(), kid);
+            if (arr_type != GGUF_TYPE_UINT32) {
+                throw std::runtime_error("invalid gguf element type for " + key);
+            }
+            const size_t seq_len = gguf_get_arr_n(ctx_gguf.get(), kid);
+            const void * data = gguf_get_arr_data(ctx_gguf.get(), kid);
+            adapter.alora_invocation_tokens.resize(seq_len);
+            std::copy(
+                (const llama_token *)data,
+                (const llama_token *)data + seq_len,
+                adapter.alora_invocation_tokens.begin());
+        }
     }

     int n_tensors = gguf_get_n_tensors(ctx_gguf.get());
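Note (not part of the vendored diff): the invocation sequence parsed here is exposed through the two getters added at the end of this file. A sketch of client-side use, assuming a loaded `llama_adapter_lora * adapter`:

    // fetch the aLoRA invocation marker tokens, if the adapter carries any
    const uint64_t n_inv = llama_adapter_get_alora_n_invocation_tokens(adapter);
    if (n_inv > 0) {
        const llama_token * inv = llama_adapter_get_alora_invocation_tokens(adapter);
        // e.g. locate this token subsequence in the prompt before activating the adapter
    }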
@@ -383,6 +429,57 @@ llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * p
         return nullptr;
     }

+int32_t llama_adapter_meta_val_str(const llama_adapter_lora * adapter, const char * key, char * buf, size_t buf_size) {
+    const auto & it = adapter->gguf_kv.find(key);
+    if (it == adapter->gguf_kv.end()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    return snprintf(buf, buf_size, "%s", it->second.c_str());
+}
+
+int32_t llama_adapter_meta_count(const llama_adapter_lora * adapter) {
+    return (int)adapter->gguf_kv.size();
+}
+
+int32_t llama_adapter_meta_key_by_index(const llama_adapter_lora * adapter, int i, char * buf, size_t buf_size) {
+    if (i < 0 || i >= (int)adapter->gguf_kv.size()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    auto it = adapter->gguf_kv.begin();
+    std::advance(it, i);
+    return snprintf(buf, buf_size, "%s", it->first.c_str());
+}
+
+int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size) {
+    if (i < 0 || i >= (int)adapter->gguf_kv.size()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    auto it = adapter->gguf_kv.begin();
+    std::advance(it, i);
+    return snprintf(buf, buf_size, "%s", it->second.c_str());
+}
+
 void llama_adapter_lora_free(llama_adapter_lora * adapter) {
     delete adapter;
 }
+
+uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter) {
+    if (!adapter) {
+        return 0;
+    }
+    return adapter->alora_invocation_tokens.size();
+}
+
+const llama_token * llama_adapter_get_alora_invocation_tokens(const llama_adapter_lora * adapter) {
+    GGML_ASSERT(adapter);
+    return adapter->alora_invocation_tokens.data();
+}
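Note (not part of the vendored diff): together these functions give read-only access to the GGUF metadata captured in `gguf_kv`. A sketch enumerating all key/value pairs, assuming a loaded `llama_adapter_lora * adapter`:

    char key[128], val[256];
    const int32_t n = llama_adapter_meta_count(adapter);
    for (int32_t i = 0; i < n; i++) {
        if (llama_adapter_meta_key_by_index(adapter, i, key, sizeof(key)) >= 0 &&
            llama_adapter_meta_val_str_by_index(adapter, i, val, sizeof(val)) >= 0) {
            printf("%s = %s\n", key, val);
        }
    }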

llama/llama.cpp/src/llama-adapter.h (vendored): 6 changed lines

@@ -67,6 +67,12 @@ struct llama_adapter_lora {

     float alpha;

+    // gguf metadata
+    std::unordered_map<std::string, std::string> gguf_kv;
+
+    // activated lora (aLoRA)
+    std::vector<llama_token> alora_invocation_tokens;
+
     llama_adapter_lora() = default;
     ~llama_adapter_lora() = default;

llama/llama.cpp/src/llama-arch.cpp (vendored): 151 changed lines

@@ -22,6 +22,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_NOMIC_BERT_MOE,  "nomic-bert-moe"  },
     { LLM_ARCH_NEO_BERT,        "neo-bert"        },
     { LLM_ARCH_JINA_BERT_V2,    "jina-bert-v2"    },
+    { LLM_ARCH_JINA_BERT_V3,    "jina-bert-v3"    },
     { LLM_ARCH_BLOOM,           "bloom"           },
     { LLM_ARCH_STABLELM,        "stablelm"        },
     { LLM_ARCH_QWEN,            "qwen"            },

@@ -44,6 +45,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_GEMMA2,          "gemma2"          },
     { LLM_ARCH_GEMMA3,          "gemma3"          },
     { LLM_ARCH_GEMMA3N,         "gemma3n"         },
+    { LLM_ARCH_GEMMA_EMBEDDING, "gemma-embedding" },
     { LLM_ARCH_STARCODER2,      "starcoder2"      },
     { LLM_ARCH_MAMBA,           "mamba"           },
     { LLM_ARCH_MAMBA2,          "mamba2"          },

@@ -68,6 +70,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_T5ENCODER,       "t5encoder"       },
     { LLM_ARCH_JAIS,            "jais"            },
     { LLM_ARCH_NEMOTRON,        "nemotron"        },
+    { LLM_ARCH_NEMOTRON_H,      "nemotron_h"      },
     { LLM_ARCH_EXAONE,          "exaone"          },
     { LLM_ARCH_EXAONE4,         "exaone4"         },
     { LLM_ARCH_RWKV6,           "rwkv6"           },

@@ -94,6 +97,9 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_DREAM,           "dream"           },
     { LLM_ARCH_SMALLTHINKER,    "smallthinker"    },
     { LLM_ARCH_LLADA,           "llada"           },
+    { LLM_ARCH_LLADA_MOE,       "llada-moe"       },
+    { LLM_ARCH_SEED_OSS,        "seed_oss"        },
+    { LLM_ARCH_GROVEMOE,        "grovemoe"        },
     { LLM_ARCH_UNKNOWN,         "(unknown)"       },
 };
@@ -121,6 +127,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_FEED_FORWARD_LENGTH,               "%s.feed_forward_length"               },
     { LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        "%s.expert_feed_forward_length"        },
     { LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, "%s.expert_shared_feed_forward_length" },
+    { LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH,  "%s.expert_chunk_feed_forward_length"  },
     { LLM_KV_USE_PARALLEL_RESIDUAL,             "%s.use_parallel_residual"             },
     { LLM_KV_TENSOR_DATA_LAYOUT,                "%s.tensor_data_layout"                },
     { LLM_KV_EXPERT_COUNT,                      "%s.expert_count"                      },

@@ -129,12 +136,16 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_WEIGHTS_SCALE,              "%s.expert_weights_scale"              },
     { LLM_KV_EXPERT_WEIGHTS_NORM,               "%s.expert_weights_norm"               },
     { LLM_KV_EXPERT_GATING_FUNC,                "%s.expert_gating_func"                },
+    { LLM_KV_EXPERT_GROUP_SCALE,                "%s.expert_group_scale"                },
+    { LLM_KV_EXPERTS_PER_GROUP,                 "%s.experts_per_group"                 },
     { LLM_KV_MOE_EVERY_N_LAYERS,                "%s.moe_every_n_layers"                },
     { LLM_KV_NEXTN_PREDICT_LAYERS,              "%s.nextn_predict_layers"              },
     { LLM_KV_POOLING_TYPE,                      "%s.pooling_type"                      },
     { LLM_KV_LOGIT_SCALE,                       "%s.logit_scale"                       },
     { LLM_KV_DECODER_START_TOKEN_ID,            "%s.decoder_start_token_id"            },
+    { LLM_KV_DECODER_BLOCK_COUNT,               "%s.decoder_block_count"               },
     { LLM_KV_ATTN_LOGIT_SOFTCAPPING,            "%s.attn_logit_softcapping"            },
+    { LLM_KV_ROUTER_LOGIT_SOFTCAPPING,          "%s.router_logit_softcapping"          },
     { LLM_KV_FINAL_LOGIT_SOFTCAPPING,           "%s.final_logit_softcapping"           },
     { LLM_KV_SWIN_NORM,                         "%s.swin_norm"                         },
     { LLM_KV_RESCALE_EVERY_N_LAYERS,            "%s.rescale_every_n_layers"            },

@@ -165,6 +176,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,  "%s.attention.relative_buckets_count"  },
     { LLM_KV_ATTENTION_SLIDING_WINDOW,          "%s.attention.sliding_window"          },
     { LLM_KV_ATTENTION_SCALE,                   "%s.attention.scale"                   },
+    { LLM_KV_ATTENTION_OUTPUT_SCALE,            "%s.attention.output_scale"            },
+    { LLM_KV_ATTENTION_TEMPERATURE_LENGTH,      "%s.attention.temperature_length"      },
     { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,   "%s.attention.block_skip_connection"   },
     { LLM_KV_ATTENTION_KEY_LENGTH_MLA,          "%s.attention.key_length_mla"          },
     { LLM_KV_ATTENTION_VALUE_LENGTH_MLA,        "%s.attention.value_length_mla"        },

@@ -179,6 +192,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,         "%s.rope.scaling.original_context_length" },
     { LLM_KV_ROPE_SCALING_FINETUNED,            "%s.rope.scaling.finetuned"            },
     { LLM_KV_ROPE_SCALING_YARN_LOG_MUL,         "%s.rope.scaling.yarn_log_multiplier"  },
+    { LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR,      "%s.rope.scaling.yarn_ext_factor"      },
+    { LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR,     "%s.rope.scaling.yarn_attn_factor"     },
+    { LLM_KV_ROPE_SCALING_YARN_BETA_FAST,       "%s.rope.scaling.yarn_beta_fast"       },
+    { LLM_KV_ROPE_SCALING_YARN_BETA_SLOW,       "%s.rope.scaling.yarn_beta_slow"       },

     { LLM_KV_SPLIT_NO,                          "split.no"                             },
     { LLM_KV_SPLIT_COUNT,                       "split.count"                          },

@@ -237,6 +254,9 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {

     { LLM_KV_ADAPTER_TYPE,                      "adapter.type"                         },
     { LLM_KV_ADAPTER_LORA_ALPHA,                "adapter.lora.alpha"                   },
+    { LLM_KV_ADAPTER_LORA_TASK_NAME,            "adapter.lora.task_name"               },
+    { LLM_KV_ADAPTER_LORA_PROMPT_PREFIX,        "adapter.lora.prompt_prefix"           },
+    { LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS,   "adapter.alora.invocation_tokens"      },

     // deprecated
     { LLM_KV_TOKENIZER_PREFIX_ID,               "tokenizer.ggml.prefix_token_id"       },
@@ -392,12 +412,16 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
         { LLM_TENSOR_ATTN_ROT_EMBD,  "blk.%d.attn_rot_embd" },
         { LLM_TENSOR_FFN_GATE_INP,   "blk.%d.ffn_gate_inp" },
         { LLM_TENSOR_FFN_NORM,       "blk.%d.ffn_norm" },
+        { LLM_TENSOR_FFN_GATE,       "blk.%d.ffn_gate" },
+        { LLM_TENSOR_FFN_DOWN,       "blk.%d.ffn_down" },
+        { LLM_TENSOR_FFN_UP,         "blk.%d.ffn_up" },
         { LLM_TENSOR_FFN_GATE_EXP,   "blk.%d.ffn_gate.%d" },
         { LLM_TENSOR_FFN_DOWN_EXP,   "blk.%d.ffn_down.%d" },
         { LLM_TENSOR_FFN_UP_EXP,     "blk.%d.ffn_up.%d" },
         { LLM_TENSOR_FFN_GATE_EXPS,  "blk.%d.ffn_gate_exps" },
         { LLM_TENSOR_FFN_DOWN_EXPS,  "blk.%d.ffn_down_exps" },
         { LLM_TENSOR_FFN_UP_EXPS,    "blk.%d.ffn_up_exps" },
+        { LLM_TENSOR_FFN_POST_NORM,  "blk.%d.post_ffw_norm" },
         { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
         { LLM_TENSOR_ATTN_OUT_NORM,  "blk.%d.attn_output_norm" },
     },

@@ -576,6 +600,20 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_CLS,             "cls" },
         },
     },
+    {
+        LLM_ARCH_JINA_BERT_V3,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_TOKEN_TYPES,     "token_types" },
+            { LLM_TENSOR_ATTN_OUT_NORM,   "blk.%d.attn_output_norm" },
+            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_LAYER_OUT_NORM,  "blk.%d.layer_output_norm" },
+        },
+    },
     {
         LLM_ARCH_BLOOM,
         {

@@ -689,6 +727,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
             { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
             { LLM_TENSOR_OUTPUT,      "output" },
+            { LLM_TENSOR_CLS_OUT,     "cls.output" },
             { LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
             { LLM_TENSOR_ATTN_Q,      "blk.%d.attn_q" },
             { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },

@@ -1021,6 +1060,27 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_LAUREL_POST_NORM, "blk.%d.laurel_post_norm" },
         },
     },
+    {
+        LLM_ARCH_GEMMA_EMBEDDING,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,     "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,    "output_norm" },
+            { LLM_TENSOR_OUTPUT,         "output" },
+            { LLM_TENSOR_ATTN_NORM,      "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,         "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM,    "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K,         "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM,    "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V,         "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,       "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_FFN_NORM,       "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,       "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,       "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,         "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_POST_NORM,  "blk.%d.post_ffw_norm" },
+        },
+    },
     {
         LLM_ARCH_STARCODER2,
         {

@@ -1534,6 +1594,31 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP,   "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_NEMOTRON_H,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT,      "output" },
+            { LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
+            // mamba(2) ssm layers
+            { LLM_TENSOR_SSM_IN,      "blk.%d.ssm_in" },
+            { LLM_TENSOR_SSM_CONV1D,  "blk.%d.ssm_conv1d" },
+            { LLM_TENSOR_SSM_DT,      "blk.%d.ssm_dt" },
+            { LLM_TENSOR_SSM_A,       "blk.%d.ssm_a" },
+            { LLM_TENSOR_SSM_D,       "blk.%d.ssm_d" },
+            { LLM_TENSOR_SSM_NORM,    "blk.%d.ssm_norm" },
+            { LLM_TENSOR_SSM_OUT,     "blk.%d.ssm_out" },
+            // attention layers
+            { LLM_TENSOR_ATTN_Q,      "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,      "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,      "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,    "blk.%d.attn_output" },
+            // dense FFN
+            { LLM_TENSOR_FFN_DOWN,    "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,      "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_EXAONE,
         {

@@ -2030,6 +2115,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_SHORTCONV_OUTPROJ, "blk.%d.shortconv.out_proj" },
             { LLM_TENSOR_TOKEN_EMBD,        "token_embd" },
             { LLM_TENSOR_TOKEN_EMBD_NORM,   "token_embd_norm" },
+            { LLM_TENSOR_OUTPUT,            "output" },
         }
     },
     {

@@ -2087,6 +2173,66 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP,   "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_LLADA_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,    "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,   "output_norm" },
+            { LLM_TENSOR_OUTPUT,        "output" },
+            { LLM_TENSOR_ATTN_NORM,     "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,        "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM,   "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K,        "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM,   "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V,        "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,      "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,      "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP,  "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,   "blk.%d.ffn_up_exps" },
+        },
+    },
+    {
+        LLM_ARCH_SEED_OSS,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,     "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,    "output_norm" },
+            { LLM_TENSOR_OUTPUT,         "output" },
+            { LLM_TENSOR_ATTN_NORM,      "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,         "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,         "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,         "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,       "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_FFN_GATE,       "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,       "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,         "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_GROVEMOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP,    "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS,   "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,   "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_CHEXPS, "blk.%d.ffn_gate_chexps" },
+            { LLM_TENSOR_FFN_DOWN_CHEXPS, "blk.%d.ffn_down_chexps" },
+            { LLM_TENSOR_FFN_UP_CHEXPS,   "blk.%d.ffn_up_chexps" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
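Note (not part of the vendored diff): each entry pairs a tensor enum with a printf-style pattern in which `%d` is the layer index (the per-expert `*_EXP` names take a second `%d` for the expert index). For example, the new GroveMoE chunk-expert names expand as:

    // hypothetical expansion for layer 3
    char name[64];
    snprintf(name, sizeof(name), "blk.%d.ffn_gate_chexps", 3); // -> "blk.3.ffn_gate_chexps"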
@@ -2219,6 +2365,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_FFN_DOWN_EXPS,        {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
     {LLM_TENSOR_FFN_GATE_EXPS,        {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
     {LLM_TENSOR_FFN_UP_EXPS,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
+    {LLM_TENSOR_FFN_DOWN_CHEXPS,      {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
+    {LLM_TENSOR_FFN_GATE_CHEXPS,      {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
+    {LLM_TENSOR_FFN_UP_CHEXPS,        {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
     {LLM_TENSOR_FFN_EXP_PROBS_B,      {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     // altup / laurel (gemma 3n)
     {LLM_TENSOR_PER_LAYER_TOKEN_EMBD, {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_GET_ROWS}},

@@ -2340,6 +2489,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
         case LLM_ARCH_PLAMO2:
         case LLM_ARCH_GRANITE_HYBRID:
        case LLM_ARCH_LFM2:
+        case LLM_ARCH_NEMOTRON_H:
             return true;
         default:
             return false;

@@ -2350,6 +2500,7 @@ bool llm_arch_is_diffusion(const llm_arch & arch) {
     switch (arch) {
         case LLM_ARCH_DREAM:
         case LLM_ARCH_LLADA:
+        case LLM_ARCH_LLADA_MOE:
             return true;
         default:
             return false;

llama/llama.cpp/src/llama-arch.h (vendored): 23 changed lines

@@ -26,6 +26,7 @@ enum llm_arch {
     LLM_ARCH_NOMIC_BERT_MOE,
     LLM_ARCH_NEO_BERT,
     LLM_ARCH_JINA_BERT_V2,
+    LLM_ARCH_JINA_BERT_V3,
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
     LLM_ARCH_QWEN,

@@ -48,6 +49,7 @@ enum llm_arch {
     LLM_ARCH_GEMMA2,
     LLM_ARCH_GEMMA3,
     LLM_ARCH_GEMMA3N,
+    LLM_ARCH_GEMMA_EMBEDDING,
     LLM_ARCH_STARCODER2,
     LLM_ARCH_MAMBA,
     LLM_ARCH_MAMBA2,

@@ -72,6 +74,7 @@ enum llm_arch {
     LLM_ARCH_T5ENCODER,
     LLM_ARCH_JAIS,
     LLM_ARCH_NEMOTRON,
+    LLM_ARCH_NEMOTRON_H,
     LLM_ARCH_EXAONE,
     LLM_ARCH_EXAONE4,
     LLM_ARCH_RWKV6,

@@ -98,6 +101,9 @@ enum llm_arch {
     LLM_ARCH_DREAM,
     LLM_ARCH_SMALLTHINKER,
     LLM_ARCH_LLADA,
+    LLM_ARCH_LLADA_MOE,
+    LLM_ARCH_SEED_OSS,
+    LLM_ARCH_GROVEMOE,
     LLM_ARCH_UNKNOWN,
 };

@@ -125,6 +131,7 @@ enum llm_kv {
     LLM_KV_FEED_FORWARD_LENGTH,
     LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
     LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH,
+    LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH,
     LLM_KV_USE_PARALLEL_RESIDUAL,
     LLM_KV_TENSOR_DATA_LAYOUT,
     LLM_KV_EXPERT_COUNT,

@@ -133,12 +140,16 @@ enum llm_kv {
     LLM_KV_EXPERT_WEIGHTS_SCALE,
     LLM_KV_EXPERT_WEIGHTS_NORM,
     LLM_KV_EXPERT_GATING_FUNC,
+    LLM_KV_EXPERT_GROUP_SCALE,
+    LLM_KV_EXPERTS_PER_GROUP,
     LLM_KV_MOE_EVERY_N_LAYERS,
     LLM_KV_NEXTN_PREDICT_LAYERS,
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
     LLM_KV_DECODER_START_TOKEN_ID,
+    LLM_KV_DECODER_BLOCK_COUNT,
     LLM_KV_ATTN_LOGIT_SOFTCAPPING,
+    LLM_KV_ROUTER_LOGIT_SOFTCAPPING,
     LLM_KV_FINAL_LOGIT_SOFTCAPPING,
     LLM_KV_SWIN_NORM,
     LLM_KV_RESCALE_EVERY_N_LAYERS,

@@ -169,6 +180,8 @@ enum llm_kv {
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
     LLM_KV_ATTENTION_SLIDING_WINDOW,
     LLM_KV_ATTENTION_SCALE,
+    LLM_KV_ATTENTION_OUTPUT_SCALE,
+    LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
     LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
     LLM_KV_ATTENTION_KEY_LENGTH_MLA,
     LLM_KV_ATTENTION_VALUE_LENGTH_MLA,

@@ -183,6 +196,10 @@ enum llm_kv {
     LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
     LLM_KV_ROPE_SCALING_FINETUNED,
     LLM_KV_ROPE_SCALING_YARN_LOG_MUL,
+    LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR,
+    LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR,
+    LLM_KV_ROPE_SCALING_YARN_BETA_FAST,
+    LLM_KV_ROPE_SCALING_YARN_BETA_SLOW,

     LLM_KV_SPLIT_NO,
     LLM_KV_SPLIT_COUNT,

@@ -231,6 +248,9 @@ enum llm_kv {

     LLM_KV_ADAPTER_TYPE,
     LLM_KV_ADAPTER_LORA_ALPHA,
+    LLM_KV_ADAPTER_LORA_TASK_NAME,
+    LLM_KV_ADAPTER_LORA_PROMPT_PREFIX,
+    LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS,

     LLM_KV_POSNET_EMBEDDING_LENGTH,
     LLM_KV_POSNET_BLOCK_COUNT,

@@ -287,6 +307,9 @@ enum llm_tensor {
     LLM_TENSOR_FFN_DOWN_SHEXP,
     LLM_TENSOR_FFN_GATE_SHEXP,
     LLM_TENSOR_FFN_UP_SHEXP,
+    LLM_TENSOR_FFN_DOWN_CHEXPS,
+    LLM_TENSOR_FFN_GATE_CHEXPS,
+    LLM_TENSOR_FFN_UP_CHEXPS,
     LLM_TENSOR_FFN_EXP_PROBS_B,
     LLM_TENSOR_ATTN_Q_NORM,
     LLM_TENSOR_ATTN_K_NORM,

llama/llama.cpp/src/llama-batch.cpp (vendored): 2 changed lines

@@ -477,7 +477,7 @@ llama_ubatch llama_batch_allocr::split_simple(uint32_t n_ubatch) {

 llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch, bool sequential) {
     if (sequential && has_cpl) {
-        LLAMA_LOG_ERROR("%s: sequential split is not supported when there are coupled sequences in the input batch\n", __func__);
+        LLAMA_LOG_ERROR("%s: sequential split is not supported when there are coupled sequences in the input batch (you may need to use the -kvu flag)\n", __func__);

         return {};
     }

llama/llama.cpp/src/llama-chat.cpp (vendored): 32 changed lines

@@ -16,10 +16,10 @@
 static std::string trim(const std::string & str) {
     size_t start = 0;
     size_t end = str.size();
-    while (start < end && isspace(str[start])) {
+    while (start < end && isspace(static_cast<unsigned char>(str[start]))) {
         start += 1;
     }
-    while (end > start && isspace(str[end - 1])) {
+    while (end > start && isspace(static_cast<unsigned char>(str[end - 1]))) {
         end -= 1;
     }
     return str.substr(start, end - start);
@@ -69,6 +69,8 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "gpt-oss",       LLM_CHAT_TEMPLATE_OPENAI_MOE    },
     { "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE },
     { "kimi-k2",       LLM_CHAT_TEMPLATE_KIMI_K2       },
+    { "seed_oss",      LLM_CHAT_TEMPLATE_SEED_OSS      },
+    { "grok-2",        LLM_CHAT_TEMPLATE_GROK_2        },
 };

 llm_chat_template llm_chat_template_from_str(const std::string & name) {

@@ -201,6 +203,10 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE;
     } else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
         return LLM_CHAT_TEMPLATE_KIMI_K2;
+    } else if (tmpl_contains("<seed:bos>")) {
+        return LLM_CHAT_TEMPLATE_SEED_OSS;
+    } else if (tmpl_contains("'Assistant: ' + message['content'] + '<|separator|>")) {
+        return LLM_CHAT_TEMPLATE_GROK_2;
     }
     return LLM_CHAT_TEMPLATE_UNKNOWN;
 }
@@ -752,6 +758,28 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|im_assistant|>assistant<|im_middle|>";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_SEED_OSS) {
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<seed:bos>" << role << "\n" << (role == "assistant" ? trim(message->content) : message->content) << "<seed:eos>";
+        }
+        if (add_ass) {
+            ss << "<seed:bos>assistant\n";
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_GROK_2) {
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << "System: " << trim(message->content) << "<|separator|>\n\n";
+            } else if (role == "user") {
+                ss << "Human: " << trim(message->content) << "<|separator|>\n\n";
+            } else if (role == "assistant") {
+                ss << "Assistant: " << message->content << "<|separator|>\n\n";
+            }
+        }
+        if (add_ass) {
+            ss << "Assistant:";
+        }
     } else {
         // template not supported
         return -1;
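Note (not part of the vendored diff): the Seed-OSS branch above can be exercised through the public template API by name. A sketch of the resulting formatting:

    // render a short conversation with the new "seed_oss" template
    llama_chat_message msgs[] = {
        { "system", "You are a helpful assistant." },
        { "user",   "Hello" },
    };
    char buf[1024];
    const int32_t n = llama_chat_apply_template("seed_oss", msgs, 2, /*add_ass=*/true, buf, (int32_t) sizeof(buf));
    // buf: "<seed:bos>system\nYou are a helpful assistant.<seed:eos><seed:bos>user\nHello<seed:eos><seed:bos>assistant\n"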

llama/llama.cpp/src/llama-chat.h (vendored): 2 changed lines

@@ -49,6 +49,8 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_OPENAI_MOE,
     LLM_CHAT_TEMPLATE_HUNYUAN_DENSE,
     LLM_CHAT_TEMPLATE_KIMI_K2,
+    LLM_CHAT_TEMPLATE_SEED_OSS,
+    LLM_CHAT_TEMPLATE_GROK_2,
     LLM_CHAT_TEMPLATE_UNKNOWN,
 };

llama/llama.cpp/src/llama-context.cpp (vendored): 556 changed lines

@@ -35,14 +35,12 @@ llama_context::llama_context(

     cparams.n_threads        = params.n_threads;
     cparams.n_threads_batch  = params.n_threads_batch;
-    cparams.yarn_ext_factor  = params.yarn_ext_factor;
-    cparams.yarn_attn_factor = params.yarn_attn_factor;
-    cparams.yarn_beta_fast   = params.yarn_beta_fast;
-    cparams.yarn_beta_slow   = params.yarn_beta_slow;
-    cparams.defrag_thold     = params.defrag_thold;
+    cparams.yarn_ext_factor  = params.yarn_ext_factor  >= 0.0f ? params.yarn_ext_factor  : hparams.yarn_ext_factor;
+    cparams.yarn_attn_factor = params.yarn_attn_factor >= 0.0f ? params.yarn_attn_factor : hparams.yarn_attn_factor;
+    cparams.yarn_beta_fast   = params.yarn_beta_fast   >= 0.0f ? params.yarn_beta_fast   : hparams.yarn_beta_fast;
+    cparams.yarn_beta_slow   = params.yarn_beta_slow   >= 0.0f ? params.yarn_beta_slow   : hparams.yarn_beta_slow;
     cparams.embeddings       = params.embeddings;
     cparams.offload_kqv      = params.offload_kqv;
-    cparams.flash_attn       = params.flash_attn;
     cparams.no_perf          = params.no_perf;
     cparams.pooling_type     = params.pooling_type;
     cparams.warmup           = false;

@@ -87,13 +85,15 @@ llama_context::llama_context(
         cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL;
     }

+    cparams.flash_attn = params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED;
+
     // with causal attention, the batch size is limited by the context size
     cparams.n_batch = cparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;

     // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
     // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
     // ref: https://github.com/ggerganov/llama.cpp/pull/5021
-    // TODO: this padding is not needed for the cache-less context so we should probably move it to llama_context_kv_self
+    // TODO: this padding is not needed for the cache-less context so we should probably move it to llama_memory
     if (cparams.n_batch < GGML_KQ_MASK_PAD) {
         LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
         cparams.n_batch = GGML_KQ_MASK_PAD;
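Note (not part of the vendored diff): with this change a negative YaRN parameter means "inherit the model's hparams value", and Flash Attention is requested through `flash_attn_type` instead of the removed boolean. A sketch:

    // negative values defer to the GGUF-provided hparams resolved above
    llama_context_params cp = llama_context_default_params();
    cp.yarn_ext_factor = -1.0f;                      // taken from hparams.yarn_ext_factor
    cp.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; // resolved at context creation (see below)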
@@ -103,16 +103,6 @@ llama_context::llama_context(
     cparams.op_offload = params.op_offload;
     cparams.kv_unified = params.kv_unified;

-    {
-        const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
-        supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : supports_set_rows;
-
-        if (!supports_set_rows && !cparams.kv_unified) {
-            LLAMA_LOG_WARN("%s: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n", __func__);
-            cparams.kv_unified = true;
-        }
-    }
-
     {
         const char * LLAMA_GRAPH_REUSE_DISABLE = getenv("LLAMA_GRAPH_REUSE_DISABLE");
         graph_reuse_disable = LLAMA_GRAPH_REUSE_DISABLE ? (atoi(LLAMA_GRAPH_REUSE_DISABLE) != 0) : graph_reuse_disable;

@@ -130,7 +120,7 @@ llama_context::llama_context(
     LLAMA_LOG_INFO("%s: n_batch       = %u\n",   __func__, cparams.n_batch);
     LLAMA_LOG_INFO("%s: n_ubatch      = %u\n",   __func__, cparams.n_ubatch);
     LLAMA_LOG_INFO("%s: causal_attn   = %d\n",   __func__, cparams.causal_attn);
-    LLAMA_LOG_INFO("%s: flash_attn    = %d\n",   __func__, cparams.flash_attn);
+    LLAMA_LOG_INFO("%s: flash_attn    = %s\n",   __func__, llama_flash_attn_type_name(params.flash_attn_type));
     LLAMA_LOG_INFO("%s: kv_unified    = %s\n",   __func__, cparams.kv_unified ? "true" : "false");
     LLAMA_LOG_INFO("%s: freq_base     = %.1f\n", __func__, cparams.rope_freq_base);
     LLAMA_LOG_INFO("%s: freq_scale    = %g\n",   __func__, cparams.rope_freq_scale);

@@ -145,11 +135,6 @@ llama_context::llama_context(
             __func__, n_ctx_per_seq, hparams.n_ctx_train);
     }

-    if (!params.swa_full && cparams.n_seq_max > 1 && hparams.is_swa_any()) {
-        LLAMA_LOG_WARN("%s: requested n_seq_max (%u) > 1, but swa_full is not enabled -- performance may be degraded: %s\n",
-                __func__, cparams.n_seq_max, "https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573");
-    }
-
     if (!hparams.vocab_only) {
         // GPU backends
         for (auto * dev : model.devices) {

@@ -196,7 +181,7 @@ llama_context::llama_context(
     // graph outputs buffer
     {
         // resized during inference when a batch uses more outputs
-        if ((uint32_t) output_reserve(params.n_seq_max) < params.n_seq_max) {
+        if (output_reserve(params.n_seq_max) < params.n_seq_max) {
             throw std::runtime_error("failed to reserve initial output buffer");
         }
@@ -285,28 +270,75 @@ llama_context::llama_context(
         }
     }

-    // reserve worst-case graph
-    if (!hparams.vocab_only && memory) {
+    if (!hparams.vocab_only) {
+        llama_memory_context_ptr mctx;
+        if (memory) {
+            LLAMA_LOG_DEBUG("%s: reserving full memory module\n", __func__);
+            mctx = memory->init_full();
+            if (!mctx) {
+                throw std::runtime_error("failed to initialize memory module");
+            }
+        }
+
+        cross.v_embd.clear();
+
         const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
         const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);

+        // avoid reserving graphs with zero outputs - assume one output per sequence
+        n_outputs = n_seqs;
+
         LLAMA_LOG_DEBUG("%s: worst-case: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);

+        // resolve automatic Flash Attention use
+        if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO) {
+            auto * gf = graph_reserve(1, n_seqs, n_outputs, mctx.get(), true);
+            if (!gf) {
+                throw std::runtime_error("failed to split graph for Flash Attention check");
+            }
+
+            const size_t prefix_len = strlen(LLAMA_TENSOR_NAME_FATTN) + 1;
+            bool fa_device_mismatch = false;
+            for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
+                ggml_tensor * n = ggml_graph_node(gf, i);
+                if (n->op != GGML_OP_FLASH_ATTN_EXT) {
+                    continue;
+                }
+                ggml_backend_dev_t device_fa = ggml_backend_get_device(
+                    ggml_backend_sched_get_tensor_backend(sched.get(), n));
+
+                // TODO: instead of the tensor names, use a map to keep track of which (FA) tensors belong to which layer
+                GGML_ASSERT(strncmp(n->name, LLAMA_TENSOR_NAME_FATTN "-", prefix_len) == 0);
+                const int il = std::stoi(n->name + prefix_len);
+                ggml_backend_dev_t device_kv = model.dev_layer(il);
+                if (device_fa != device_kv) {
+                    LLAMA_LOG_WARN("%s: layer %d is assigned to device %s but the Flash Attention tensor "
+                                   "is assigned to device %s (usually due to missing support)\n",
+                                   __func__, il, ggml_backend_dev_name(device_kv), ggml_backend_dev_name(device_fa));
+                    // FIXME: fa_device_mismatch logic is wrong for --no-kv-offload, but this is broken anyways
+                    fa_device_mismatch = true;
+                    break;
+                }
+            }
+            if (fa_device_mismatch) {
+                cparams.flash_attn = false;
+                LLAMA_LOG_WARN("%s: Flash Attention was auto, set to disabled\n", __func__);
+                if (ggml_is_quantized(params.type_v)) {
+                    throw std::runtime_error("quantized V cache was requested, but this requires Flash Attention");
+                }
+            } else {
+                cparams.flash_attn = true;
+                LLAMA_LOG_INFO("%s: Flash Attention was auto, set to enabled\n", __func__);
+            }
+        }
+
+        // reserve worst-case graph
         int n_splits_pp = -1;
         int n_nodes_pp  = -1;

         int n_splits_tg = -1;
         int n_nodes_tg  = -1;

-        // simulate full KV cache
-        const auto mctx = memory->init_full();
-        if (!mctx) {
-            throw std::runtime_error("failed to initialize KV cache");
-        }
-
-        cross.v_embd.clear();
-
         // reserve pp (prompt processing) graph first so that buffers are only allocated once
         {
             auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
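Note (not part of the vendored diff): in AUTO mode the constructor reserves a probe graph and only enables Flash Attention if every `GGML_OP_FLASH_ATTN_EXT` node was scheduled on the same device that holds that layer's KV data. A caller opts in like this:

    llama_context_params cp = llama_context_default_params();
    cp.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
    llama_context * ctx = llama_init_from_model(model, cp);
    // the log then reports "Flash Attention was auto, set to enabled" (or "disabled")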
@@ -444,26 +476,12 @@ llama_memory_t llama_context::get_memory() const {
     return memory.get();
 }

-// deprecated
-void llama_context::kv_self_defrag_sched() {
-    if (!memory) {
-        return;
-    }
-
-    memory_force_optimize = true;
-}
-
-// deprecated
-bool llama_context::kv_self_update(bool optimize) {
+bool llama_context::memory_update(bool optimize) {
     if (!memory) {
         return false;
     }

     {
-        // TODO: remove in the future
-        optimize |= memory_force_optimize;
-        memory_force_optimize = false;
-
         const auto mctx = memory->init_update(this, optimize);
         switch (mctx->get_status()) {
             case LLAMA_MEMORY_STATUS_SUCCESS:
@@ -908,12 +926,6 @@ int llama_context::encode(const llama_batch & batch_inp) {
         }
     }

-    if (!supports_set_rows) {
-        // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
-        // overlap with device computation.
-        ggml_backend_sched_reset(sched.get());
-    }
-
     // TODO: hacky solution
     if (model.arch == LLM_ARCH_T5 && t_embd) {
         //cross.t_embd = t_embd;

@@ -996,8 +1008,8 @@ int llama_context::decode(const llama_batch & batch_inp) {

     bool did_optimize = false;

-    // handle any pending defrags/shifts
-    kv_self_update(false);
+    // handle any pending shifts/copies
+    memory_update(false);

     llama_memory_context_ptr mctx;

@@ -1022,7 +1034,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
                 if (!did_optimize) {
                     did_optimize = true;

-                    if (kv_self_update(true)) {
+                    if (memory_update(true)) {
                         LLAMA_LOG_DEBUG("%s: retrying batch size %d after cache optimization\n", __func__, balloc->get_n_tokens());

                         continue;

@@ -1075,7 +1087,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
         const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status);

         if (!res) {
-            // the last ubatch failed or was aborted -> remove all positions of that ubatch from the KV cache
+            // the last ubatch failed or was aborted -> remove all positions of that ubatch from the memory module
             llama_pos pos_min[LLAMA_MAX_SEQ];
             for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
                 pos_min[s] = std::numeric_limits<llama_pos>::max();

@@ -1092,7 +1104,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
                     continue;
                 }

-                LLAMA_LOG_WARN("%s: removing KV cache entries for seq_id = %d, pos = [%d, +inf)\n", __func__, s, pos_min[s]);
+                LLAMA_LOG_WARN("%s: removing memory module entries for seq_id = %d, pos = [%d, +inf)\n", __func__, s, pos_min[s]);

                 memory->seq_rm(s, pos_min[s], -1);
             }

@@ -1243,12 +1255,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
     // wait for the computation to finish (automatically done when obtaining the model output)
     //synchronize();

-    if (!supports_set_rows) {
-        // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
-        // overlap with device computation.
-        ggml_backend_sched_reset(sched.get());
-    }
-
     return 0;
 }
@ -1362,8 +1368,9 @@ llm_graph_result * llama_context::get_gf_res_reserve() const {
|
||||||
return static_cast<llm_graph_result *>(gf_res_reserve.get());
|
return static_cast<llm_graph_result *>(gf_res_reserve.get());
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx) {
|
ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only) {
|
||||||
LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs);
|
LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs);
|
||||||
|
GGML_ASSERT(n_outputs >= 1);
|
||||||
|
|
||||||
if (n_tokens % n_seqs != 0) {
|
if (n_tokens % n_seqs != 0) {
|
||||||
n_tokens = ((n_tokens + (n_seqs - 1)) / n_seqs) * n_seqs; // round to next multiple of n_seqs
|
n_tokens = ((n_tokens + (n_seqs - 1)) / n_seqs) * n_seqs; // round to next multiple of n_seqs
|
||||||
|
|
@@ -1397,7 +1404,9 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
     this->n_outputs = save_n_outputs;

     // initialize scheduler with the specified graph
-    if (!ggml_backend_sched_reserve(sched.get(), gf)) {
+    if (split_only) {
+        ggml_backend_sched_split_graph(sched.get(), gf);
+    } else if (!ggml_backend_sched_reserve(sched.get(), gf)) {
         LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
         return nullptr;
     }
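The new split_only path above runs only the scheduler's graph partitioning step and skips compute-buffer allocation, so a worst-case graph can be inspected cheaply. A sketch of how a caller with access to these internals might use it (the surrounding variables are assumed, not taken from the patch):

// Sketch: partition a worst-case graph without allocating compute buffers.
ggml_cgraph * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx, /*split_only =*/ true);
if (gf) {
    // ggml_backend_sched_get_n_splits() reports how many backend splits resulted
    const int n_splits = ggml_backend_sched_get_n_splits(sched.get());
    LLAMA_LOG_DEBUG("worst-case graph uses %d splits\n", n_splits);
}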
@@ -1437,8 +1446,10 @@ ggml_status llama_context::graph_compute(
     if (backend_cpu != nullptr) {
         auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_cpu));
         auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool");
+        if (set_threadpool_fn) {
             set_threadpool_fn(backend_cpu, tp);
+        }
     }

     // set the number of threads for all the backends
     for (const auto & set_n_threads_fn : set_n_threads_fns) {

@@ -1656,30 +1667,30 @@ size_t llama_context::state_set_data(const uint8_t * src, size_t size) {
     }
 }

-size_t llama_context::state_seq_get_size(llama_seq_id seq_id) {
+size_t llama_context::state_seq_get_size(llama_seq_id seq_id, llama_state_seq_flags flags) {
     llama_io_write_dummy io;
     try {
-        return state_seq_write_data(io, seq_id);
+        return state_seq_write_data(io, seq_id, flags);
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what());
         return 0;
     }
 }

-size_t llama_context::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) {
+size_t llama_context::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size, llama_state_seq_flags flags) {
     llama_io_write_buffer io(dst, size);
     try {
-        return state_seq_write_data(io, seq_id);
+        return state_seq_write_data(io, seq_id, flags);
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what());
         return 0;
     }
 }

-size_t llama_context::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) {
+size_t llama_context::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size, llama_state_seq_flags flags) {
     llama_io_read_buffer io(src, size);
     try {
-        return state_seq_read_data(io, seq_id);
+        return state_seq_read_data(io, seq_id, flags);
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what());
         return 0;
@@ -1777,7 +1788,7 @@ size_t llama_context::state_seq_load_file(llama_seq_id seq_id, const char * file
     {
         const size_t state_size = file.size() - file.tell();
         llama_io_read_file io(&file);
-        const size_t nread = state_seq_read_data(io, seq_id);
+        const size_t nread = state_seq_read_data(io, seq_id, 0);
         if (!nread) {
             LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__);
             return 0;

@@ -1801,7 +1812,7 @@ size_t llama_context::state_seq_save_file(llama_seq_id seq_id, const char * file

     // save the context state using stream saving
     llama_io_write_file io(&file);
-    state_seq_write_data(io, seq_id);
+    state_seq_write_data(io, seq_id, 0);

     const size_t res = file.tell();
     GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + io.n_bytes());

@@ -1876,7 +1887,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
     }

     if (memory != nullptr) {
-        LLAMA_LOG_DEBUG("%s: - writing KV self\n", __func__);
+        LLAMA_LOG_DEBUG("%s: - writing memory module\n", __func__);
         memory->state_write(io);
     }

@@ -1962,7 +1973,7 @@ size_t llama_context::state_read_data(llama_io_read_i & io) {
     }

     if (memory) {
-        LLAMA_LOG_DEBUG("%s: - reading KV self\n", __func__);
+        LLAMA_LOG_DEBUG("%s: - reading memory module\n", __func__);

         memory->state_read(io);
     }

@@ -1970,21 +1981,21 @@
     return io.n_bytes();
 }

-size_t llama_context::state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id) {
+size_t llama_context::state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
     GGML_UNUSED(seq_id);

     if (memory) {
-        memory->state_write(io, seq_id);
+        memory->state_write(io, seq_id, flags);
     }

     return io.n_bytes();
 }

-size_t llama_context::state_seq_read_data(llama_io_read_i & io, llama_seq_id seq_id) {
+size_t llama_context::state_seq_read_data(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
     GGML_UNUSED(seq_id);

     if (memory) {
-        memory->state_read(io, seq_id);
+        memory->state_read(io, seq_id, flags);
     }

     return io.n_bytes();
@@ -2015,6 +2026,21 @@ void llama_context::perf_reset() {
     n_reused = 0;
 }

+std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> llama_context::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> ret;
+    for (const auto & buft_size : model.memory_breakdown()) {
+        ret[buft_size.first].model += buft_size.second;
+    }
+    for (const auto & buft_size : memory->memory_breakdown()) {
+        ret[buft_size.first].context += buft_size.second;
+    }
+    for (const auto & backend_ptr : backends) {
+        ggml_backend_t backend = backend_ptr.get();
+        ret[ggml_backend_sched_get_buffer_type(sched.get(), backend)].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend);
+    }
+    return ret;
+}
+
 //
 // training
 //
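memory_breakdown() above buckets bytes per ggml buffer type into three pools. A short illustration of how the pools combine (standalone helper, not part of the patch):

// The "self" figure printed by the reporting code later in this diff is
// just the sum of the three pools:
static size_t memory_breakdown_self(const llama_memory_breakdown_data & mb) {
    return mb.model + mb.context + mb.compute; // weights + context state + scratch
}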
@@ -2047,7 +2073,7 @@ void llama_context::opt_init(struct llama_model * model, struct llama_opt_params
     opt_params.opt_period      = n_batch / n_ubatch;
     opt_params.get_opt_pars    = lopt_params.get_opt_pars;
     opt_params.get_opt_pars_ud = lopt_params.get_opt_pars_ud;
+    opt_params.optimizer       = lopt_params.optimizer_type;

     opt_ctx = ggml_opt_init(opt_params);

     llama_opt_param_filter param_filter = lopt_params.param_filter;

@@ -2247,12 +2273,13 @@ llama_context_params llama_context_default_params() {
         /*.rope_scaling_type    =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
         /*.pooling_type         =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
         /*.attention_type       =*/ LLAMA_ATTENTION_TYPE_UNSPECIFIED,
+        /*.flash_attn_type      =*/ LLAMA_FLASH_ATTN_TYPE_AUTO,
         /*.rope_freq_base       =*/ 0.0f,
         /*.rope_freq_scale      =*/ 0.0f,
         /*.yarn_ext_factor      =*/ -1.0f,
-        /*.yarn_attn_factor     =*/ 1.0f,
-        /*.yarn_beta_fast       =*/ 32.0f,
-        /*.yarn_beta_slow       =*/ 1.0f,
+        /*.yarn_attn_factor     =*/ -1.0f,
+        /*.yarn_beta_fast       =*/ -1.0f,
+        /*.yarn_beta_slow       =*/ -1.0f,
         /*.yarn_orig_ctx        =*/ 0,
         /*.defrag_thold         =*/ -1.0f,
         /*.cb_eval              =*/ nullptr,
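With the flash_attn boolean replaced by flash_attn_type defaulting to LLAMA_FLASH_ATTN_TYPE_AUTO, callers that want the old always-off behavior must now say so explicitly. A minimal sketch (model loading elided; `model` is assumed to be a loaded llama_model pointer):

// Opt out of flash-attention auto-selection at context creation time.
llama_context_params cparams = llama_context_default_params();
cparams.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED; // default is ..._AUTO
llama_context * ctx = llama_init_from_model(model, cparams);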
@@ -2263,7 +2290,6 @@ llama_context_params llama_context_default_params() {
         /*.abort_callback_data  =*/ nullptr,
         /*.embeddings           =*/ false,
         /*.offload_kqv          =*/ true,
-        /*.flash_attn           =*/ false,
         /*.no_perf              =*/ true,
         /*.op_offload           =*/ true,
         /*.swa_full             =*/ true,

@@ -2291,12 +2317,30 @@ llama_context * llama_init_from_model(
         return nullptr;
     }

-    if (params.flash_attn && model->arch == LLM_ARCH_GROK) {
+    if (params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED && model->arch == LLM_ARCH_GROK) {
         LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
-        params.flash_attn = false;
+        params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
     }

-    if (ggml_is_quantized(params.type_v) && !params.flash_attn) {
+    if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO && ggml_is_quantized(params.type_k)) {
+        const uint32_t blck_size = ggml_blck_size(params.type_k);
+        if (model->hparams.n_embd_head_k % blck_size != 0) {
+            LLAMA_LOG_ERROR("%s: K cache type %s with block size %u does not divide n_embd_head_k=%u\n",
+                            __func__, ggml_type_name(params.type_k), blck_size, model->hparams.n_embd_head_k);
+            return nullptr;
+        }
+    }
+
+    if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO && ggml_is_quantized(params.type_v)) {
+        const uint32_t blck_size = ggml_blck_size(params.type_v);
+        if (model->hparams.n_embd_head_v % blck_size != 0) {
+            LLAMA_LOG_ERROR("%s: V cache type %s with block size %u does not divide n_embd_head_k=%u\n",
+                            __func__, ggml_type_name(params.type_v), blck_size, model->hparams.n_embd_head_v);
+            return nullptr;
+        }
+    }
+
+    if (ggml_is_quantized(params.type_v) && params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_DISABLED) {
         LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
         return nullptr;
     }
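The divisibility checks above guard the quantized-KV auto path: the head dimension must hold a whole number of quantization blocks. A worked example (Q4_0's block size of 32 is a ggml fact; the head sizes are illustrative):

// ggml_blck_size(GGML_TYPE_Q4_0) == 32:
//   n_embd_head_k = 128 -> 128 % 32 == 0 -> check passes
//   n_embd_head_k =  72 ->  72 % 32 == 8 -> llama_init_from_model returns nullptr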
@@ -2342,16 +2386,6 @@ const llama_model * llama_get_model(const llama_context * ctx) {
     return &ctx->get_model();
 }

-// deprecated
-llama_kv_cache * llama_get_kv_self(llama_context * ctx) {
-    return dynamic_cast<llama_kv_cache *>(ctx->get_memory());
-}
-
-// deprecated
-void llama_kv_self_update(llama_context * ctx) {
-    ctx->kv_self_update(false);
-}
-
 enum llama_pooling_type llama_pooling_type(const llama_context * ctx) {
     return ctx->pooling_type();
 }
@@ -2569,168 +2603,6 @@ bool llama_memory_can_shift(llama_memory_t mem) {
     return mem->get_can_shift();
 }

-//
-// kv cache
-//
-
-// deprecated
-int32_t llama_kv_self_n_tokens(const llama_context * ctx) {
-    const auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return 0;
-    }
-
-    int32_t res = 0;
-
-    for (uint32_t s = 0; s < ctx->get_cparams().n_seq_max; s++) {
-        const llama_pos p0 = kv->seq_pos_min(s);
-        const llama_pos p1 = kv->seq_pos_max(s);
-
-        if (p0 >= 0) {
-            res += (p1 - p0) + 1;
-        }
-    }
-
-    return res;
-}
-
-// deprecated
-// note: this is the same as above - will be removed anyway, so it's ok
-int32_t llama_kv_self_used_cells(const llama_context * ctx) {
-    const auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return 0;
-    }
-
-    int32_t res = 0;
-
-    for (uint32_t s = 0; s < ctx->get_cparams().n_seq_max; s++) {
-        const llama_pos p0 = kv->seq_pos_min(s);
-        const llama_pos p1 = kv->seq_pos_max(s);
-
-        if (p0 >= 0) {
-            res += (p1 - p0) + 1;
-        }
-    }
-
-    return res;
-}
-
-// deprecated
-void llama_kv_self_clear(llama_context * ctx) {
-    auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return;
-    }
-
-    llama_memory_clear(kv, true);
-}
-
-// deprecated
-bool llama_kv_self_seq_rm(
-        llama_context * ctx,
-         llama_seq_id   seq_id,
-            llama_pos   p0,
-            llama_pos   p1) {
-    auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return true;
-    }
-
-    return llama_memory_seq_rm(kv, seq_id, p0, p1);
-}
-
-// deprecated
-void llama_kv_self_seq_cp(
-        llama_context * ctx,
-         llama_seq_id   seq_id_src,
-         llama_seq_id   seq_id_dst,
-            llama_pos   p0,
-            llama_pos   p1) {
-    auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return;
-    }
-
-    llama_memory_seq_cp(kv, seq_id_src, seq_id_dst, p0, p1);
-}
-
-// deprecated
-void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) {
-    auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return;
-    }
-
-    llama_memory_seq_keep(kv, seq_id);
-}
-
-// deprecated
-void llama_kv_self_seq_add(
-        llama_context * ctx,
-         llama_seq_id   seq_id,
-            llama_pos   p0,
-            llama_pos   p1,
-            llama_pos   delta) {
-    auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return;
-    }
-
-    llama_memory_seq_add(kv, seq_id, p0, p1, delta);
-}
-
-// deprecated
-void llama_kv_self_seq_div(
-        llama_context * ctx,
-         llama_seq_id   seq_id,
-            llama_pos   p0,
-            llama_pos   p1,
-                  int   d) {
-    auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return;
-    }
-
-    llama_memory_seq_div(kv, seq_id, p0, p1, d);
-}
-
-// deprecated
-llama_pos llama_kv_self_seq_pos_min(llama_context * ctx, llama_seq_id seq_id) {
-    auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return -1;
-    }
-
-    return llama_memory_seq_pos_min(kv, seq_id);
-}
-
-// deprecated
-llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) {
-    auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return -1;
-    }
-
-    return llama_memory_seq_pos_max(kv, seq_id);
-}
-
-// deprecated
-void llama_kv_self_defrag(llama_context * ctx) {
-    // force defrag
-    ctx->kv_self_defrag_sched();
-}
-
-// deprecated
-bool llama_kv_self_can_shift(const llama_context * ctx) {
-    auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return false;
-    }
-
-    return llama_memory_can_shift(kv);
-}
-
 // llama state API

 // deprecated
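Every llama_kv_self_* wrapper removed above was a thin shim over the llama_memory_* API, which stays. A migration sketch built only from calls visible in the removed shims (the sequence id and positions are hypothetical):

llama_memory_t mem = llama_get_memory(ctx);
if (mem) {
    const llama_seq_id seq_id = 0;            // hypothetical sequence
    llama_memory_seq_rm(mem, seq_id, 32, -1); // was: llama_kv_self_seq_rm(ctx, 0, 32, -1)
    llama_memory_clear(mem, true);            // was: llama_kv_self_clear(ctx)
}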
@@ -2800,19 +2672,31 @@ bool llama_state_save_file(llama_context * ctx, const char * path_session, const
 }

 size_t llama_state_seq_get_size(llama_context * ctx, llama_seq_id seq_id) {
-    return ctx->state_seq_get_size(seq_id);
+    return llama_state_seq_get_size_ext(ctx, seq_id, 0);
 }

 size_t llama_state_seq_get_data(llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id) {
-    ctx->synchronize();
-
-    return ctx->state_seq_get_data(seq_id, dst, size);
+    return llama_state_seq_get_data_ext(ctx, dst, size, seq_id, 0);
 }

 size_t llama_state_seq_set_data(llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id seq_id) {
+    return llama_state_seq_set_data_ext(ctx, src, size, seq_id, 0);
+}
+
+size_t llama_state_seq_get_size_ext(llama_context * ctx, llama_seq_id seq_id, llama_state_seq_flags flags) {
+    return ctx->state_seq_get_size(seq_id, flags);
+}
+
+size_t llama_state_seq_get_data_ext(llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id, llama_state_seq_flags flags) {
     ctx->synchronize();

-    return ctx->state_seq_set_data(seq_id, src, size);
+    return ctx->state_seq_get_data(seq_id, dst, size, flags);
+}
+
+size_t llama_state_seq_set_data_ext(llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id seq_id, llama_state_seq_flags flags) {
+    ctx->synchronize();
+
+    return ctx->state_seq_set_data(seq_id, src, size, flags);
 }

 size_t llama_state_seq_save_file(llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
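The pre-existing entry points above now forward to the new *_ext variants with flags = 0, so existing callers keep their behavior while new callers can pass llama_state_seq_flags. A round-trip sketch (seq_id and the buffer are assumptions for illustration):

std::vector<uint8_t> buf(llama_state_seq_get_size_ext(ctx, seq_id, 0));
const size_t written = llama_state_seq_get_data_ext(ctx, buf.data(), buf.size(), seq_id, 0);
// ... later, restore into the same or a compatible context:
const size_t read = llama_state_seq_set_data_ext(ctx, buf.data(), written, seq_id, 0);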
@@ -2895,6 +2779,142 @@ void llama_perf_context_reset(llama_context * ctx) {
     ctx->perf_reset();
 }

+void llama_memory_breakdown_print(const struct llama_context * ctx) {
+    const std::vector<ggml_backend_dev_t> & devices = ctx->get_model().devices;
+
+    std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();
+
+    std::vector<std::array<std::string, 9>> table_data;
+    table_data.reserve(devices.size());
+    const std::string template_header = "%s: | %s | %s %s %s %s %s %s %s |\n";
+    const std::string template_gpu    = "%s: | %s | %s = %s + (%s = %s + %s + %s) + %s |\n";
+    const std::string template_other  = "%s: | %s | %s %s %s = %s + %s + %s %s |\n";
+
+    table_data.push_back({template_header, "memory breakdown [MiB]", "total", "free", "self", "model", "context", "compute", "unaccounted"});
+
+    constexpr size_t MiB = 1024 * 1024;
+    const std::vector<std::string> desc_prefixes_strip = {"NVIDIA ", "GeForce ", "Tesla ", "AMD ", "Radeon ", "Instinct "};
+
+    // track seen buffer types to avoid double counting:
+    std::set<ggml_backend_buffer_type_t> seen_buffer_types;
+
+    // accumulative memory breakdown for each device and for host:
+    std::vector<llama_memory_breakdown_data> mb_dev(devices.size());
+    llama_memory_breakdown_data mb_host;
+
+    for (const auto & buft_mb : memory_breakdown) {
+        ggml_backend_buffer_type_t buft = buft_mb.first;
+        const llama_memory_breakdown_data & mb = buft_mb.second;
+        if (ggml_backend_buft_is_host(buft)) {
+            mb_host.model   += mb.model;
+            mb_host.context += mb.context;
+            mb_host.compute += mb.compute;
+            seen_buffer_types.insert(buft);
+            continue;
+        }
+        ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
+        if (dev) {
+            int i_dev = -1;
+            for (size_t i = 0; i < devices.size(); i++) {
+                if (devices[i] == dev) {
+                    i_dev = i;
+                    break;
+                }
+            }
+            if (i_dev != -1) {
+                mb_dev[i_dev].model   += mb.model;
+                mb_dev[i_dev].context += mb.context;
+                mb_dev[i_dev].compute += mb.compute;
+                seen_buffer_types.insert(buft);
+                continue;
+            }
+        }
+    }
+
+    // print memory breakdown for each device:
+    for (size_t i = 0; i < devices.size(); i++) {
+        ggml_backend_dev_t dev = devices[i];
+        llama_memory_breakdown_data mb = mb_dev[i];
+
+        const std::string name = ggml_backend_dev_name(dev);
+        std::string desc = ggml_backend_dev_description(dev);
+        for (const std::string & prefix : desc_prefixes_strip) {
+            if (desc.length() >= prefix.length() && desc.substr(0, prefix.length()) == prefix) {
+                desc = desc.substr(prefix.length());
+            }
+        }
+
+        size_t free, total;
+        ggml_backend_dev_memory(dev, &free, &total);
+
+        const size_t self = mb.model + mb.context + mb.compute;
+        const size_t unaccounted = total - self - free;
+
+        table_data.push_back({
+            template_gpu,
+            " - " + name + " (" + desc + ")",
+            std::to_string(total / MiB),
+            std::to_string(free / MiB),
+            std::to_string(self / MiB),
+            std::to_string(mb.model / MiB),
+            std::to_string(mb.context / MiB),
+            std::to_string(mb.compute / MiB),
+            std::to_string(unaccounted / MiB)});
+    }
+
+    // print memory breakdown for host:
+    {
+        const size_t self = mb_host.model + mb_host.context + mb_host.compute;
+        table_data.push_back({
+            template_other,
+            " - Host",
+            "", // total
+            "", // free
+            std::to_string(self / MiB),
+            std::to_string(mb_host.model / MiB),
+            std::to_string(mb_host.context / MiB),
+            std::to_string(mb_host.compute / MiB),
+            ""}); // unaccounted
+    }
+
+    // print memory breakdown for all remaining buffer types:
+    for (const auto & buft_mb : memory_breakdown) {
+        ggml_backend_buffer_type_t buft = buft_mb.first;
+        const llama_memory_breakdown_data & mb = buft_mb.second;
+        if (seen_buffer_types.count(buft) == 1) {
+            continue;
+        }
+        const std::string name = ggml_backend_buft_name(buft);
+        const size_t self = mb.model + mb.context + mb.compute;
+        table_data.push_back({
+            template_other,
+            " - " + name,
+            "", // total
+            "", // free
+            std::to_string(self / MiB),
+            std::to_string(mb.model / MiB),
+            std::to_string(mb.context / MiB),
+            std::to_string(mb.compute / MiB),
+            ""}); // unaccounted
+        seen_buffer_types.insert(buft);
+    }
+
+    for (size_t j = 1; j < table_data[0].size(); j++) {
+        size_t max_len = 0;
+        for (const auto & td : table_data) {
+            max_len = std::max(max_len, td[j].length());
+        }
+        for (auto & td : table_data) {
+            td[j].insert(j == 1 ? td[j].length() : 0, max_len - td[j].length(), ' ');
+        }
+    }
+    for (const auto & td : table_data) {
+        LLAMA_LOG_INFO(td[0].c_str(),
+            __func__, td[1].c_str(), td[2].c_str(), td[3].c_str(), td[4].c_str(), td[5].c_str(),
+            td[6].c_str(), td[7].c_str(), td[8].c_str());
+    }
+}
+
 //
 // training
 //
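Usage of the new reporting entry point is a single call after context creation; it logs one row per device plus rows for host memory and any leftover buffer types, all in MiB:

llama_context * ctx = llama_init_from_model(model, cparams);
if (ctx) {
    llama_memory_breakdown_print(ctx); // added in the hunk above
}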
llama/llama.cpp/src/llama-context.h (vendored, 36 changes)
@@ -17,9 +17,17 @@ class llama_batch_allocr;
 class llama_io_read_i;
 class llama_io_write_i;

+// "memory" as in abstract memory for the context
 struct llama_memory_i;
 struct llama_memory_context_i;

+// "memory" as in physical memory for a buffer type, in bytes
+struct llama_memory_breakdown_data {
+    size_t model   = 0; // memory allocated for the model
+    size_t context = 0; // memory allocated for the context
+    size_t compute = 0; // memory allocated for temporary compute buffers
+};
+
 struct llama_context {
     // init scheduler and compute buffers, reserve worst-case graphs
     llama_context(
@@ -46,10 +54,8 @@ struct llama_context {

     llama_memory_t get_memory() const;

-    // return true of the KV cache was updated
-    // TODO: remove
-    bool kv_self_update(bool optimize);
-    void kv_self_defrag_sched();
+    // return true if the memory was updated
+    bool memory_update(bool optimize);

     enum llama_pooling_type pooling_type() const;

@@ -111,9 +117,9 @@ struct llama_context {
     size_t state_get_data(      uint8_t * dst, size_t size);
     size_t state_set_data(const uint8_t * src, size_t size);

-    size_t state_seq_get_size(llama_seq_id seq_id);
-    size_t state_seq_get_data(llama_seq_id seq_id,       uint8_t * dst, size_t size);
-    size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size);
+    size_t state_seq_get_size(llama_seq_id seq_id, llama_state_seq_flags flags);
+    size_t state_seq_get_data(llama_seq_id seq_id,       uint8_t * dst, size_t size, llama_state_seq_flags flags);
+    size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size, llama_state_seq_flags flags);

     bool state_load_file(
             const char * filepath,

@@ -146,12 +152,15 @@ struct llama_context {
     llama_perf_context_data perf_get_data() const;
     void perf_reset();

+    std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown() const;
+
     //
     // training
     //

     void opt_init(struct llama_model * model, struct llama_opt_params lopt_params);

+    // TODO: more flexible combinations of logical/physical batch size and context size
     void opt_epoch(
             ggml_opt_dataset_t dataset,
             ggml_opt_result_t  result_train,
@@ -197,7 +206,7 @@ public:
     ggml_status graph_compute(ggml_cgraph * gf, bool batched);

     // reserve a graph with a dummy ubatch of the specified size
-    ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx);
+    ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false);

 private:
     llm_graph_params graph_params(

@@ -212,8 +221,8 @@ private:
     size_t state_write_data(llama_io_write_i & io);
     size_t state_read_data (llama_io_read_i  & io);

-    size_t state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id);
-    size_t state_seq_read_data (llama_io_read_i  & io, llama_seq_id seq_id);
+    size_t state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags);
+    size_t state_seq_read_data (llama_io_read_i  & io, llama_seq_id seq_id, llama_state_seq_flags flags);

     //
     // members
@@ -229,9 +238,6 @@ private:

     std::unique_ptr<llama_memory_i> memory;

-    // TODO: temporary, until the llama_kv_self_defrag() API is removed
-    bool memory_force_optimize = false;
-
     // decode output (2-dimensional array: [n_outputs][n_vocab])
     size_t logits_size = 0; // capacity (of floats) for logits
     float * logits     = nullptr;

@@ -287,10 +293,6 @@ private:

     bool has_evaluated_once = false;

-    // env: LLAMA_SET_ROWS (temporary)
-    // ref: https://github.com/ggml-org/llama.cpp/pull/14285
-    bool supports_set_rows = true;
-
     // env: LLAMA_GRAPH_REUSE_DISABLE
     bool graph_reuse_disable = false;
llama/llama.cpp/src/llama-cparams.h (vendored, 3 changes)
@@ -4,7 +4,7 @@

 #include <cstdint>

-#define LLAMA_MAX_SEQ 64
+#define LLAMA_MAX_SEQ 256

 struct llama_cparams {
     uint32_t n_ctx; // context size used during inference

@@ -24,7 +24,6 @@ struct llama_cparams {
     float yarn_attn_factor;
     float yarn_beta_fast;
     float yarn_beta_slow;
-    float defrag_thold;

     bool embeddings;
     bool causal_attn;
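Raising LLAMA_MAX_SEQ from 64 to 256 widens every fixed-size per-sequence structure compiled against the macro, for example the position-tracking array seen in the decode hunk earlier:

llama_pos pos_min[LLAMA_MAX_SEQ]; // now 256 entries, previously 64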
llama/llama.cpp/src/llama-graph.cpp (vendored, 224 changes)
@@ -4,8 +4,8 @@
 #include "llama-batch.h"
 #include "llama-cparams.h"

-#include "llama-kv-cache-unified.h"
-#include "llama-kv-cache-unified-iswa.h"
+#include "llama-kv-cache.h"
+#include "llama-kv-cache-iswa.h"
 #include "llama-memory-hybrid.h"
 #include "llama-memory-recurrent.h"

@@ -204,7 +204,10 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
     std::vector<int> target_pos(n_seqs_unq, -1);
     std::vector<int> target_row(n_seqs_unq, -1);

-    bool last = cparams.pooling_type == LLAMA_POOLING_TYPE_LAST;
+    const bool last = (
+        cparams.pooling_type == LLAMA_POOLING_TYPE_LAST ||
+        (cparams.pooling_type == LLAMA_POOLING_TYPE_RANK && arch == LLM_ARCH_QWEN3) // qwen3 reranking & embedding models use last token
+    );

     for (int i = 0; i < n_tokens; ++i) {
         const llama_pos pos = ubatch->pos[i];
@@ -258,6 +261,36 @@ void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
     }
 }

+static void print_mask(float * data, int64_t n_tokens, int64_t n_kv, int64_t n_swa, llama_swa_type swa_type) {
+    LLAMA_LOG_DEBUG("%s: === Attention mask ===\n", __func__);
+    const char * swa_type_str = (swa_type == LLAMA_SWA_TYPE_NONE) ? "LLAMA_SWA_TYPE_NONE" :
+                                (swa_type == LLAMA_SWA_TYPE_STANDARD) ? "LLAMA_SWA_TYPE_STANDARD" :
+                                (swa_type == LLAMA_SWA_TYPE_CHUNKED) ? "LLAMA_SWA_TYPE_CHUNKED" :
+                                (swa_type == LLAMA_SWA_TYPE_SYMMETRIC) ? "LLAMA_SWA_TYPE_SYMMETRIC" : "unknown";
+    LLAMA_LOG_DEBUG("%s: n_swa : %d, n_kv: %d, swq_type: %s\n", __func__, (int)n_swa, (int)n_kv, swa_type_str);
+    LLAMA_LOG_DEBUG("%s: '0' = can attend, '∞' = masked\n", __func__);
+    LLAMA_LOG_DEBUG("%s: Rows = query tokens, Columns = key/value tokens\n\n", __func__);
+
+    LLAMA_LOG_DEBUG("    ");
+    for (int j = 0; j < std::min((int64_t)20, n_kv); ++j) {
+        LLAMA_LOG_DEBUG("%2d", j);
+    }
+    LLAMA_LOG_DEBUG("\n");
+
+    for (int i = 0; i < std::min((int64_t)20, n_tokens); ++i) {
+        LLAMA_LOG_DEBUG(" %2d ", i);
+        for (int j = 0; j < std::min((int64_t)20, n_kv); ++j) {
+            float val = data[i * n_kv + j];
+            if (val == -INFINITY) {
+                LLAMA_LOG_DEBUG(" ∞");
+            } else {
+                LLAMA_LOG_DEBUG(" 0");
+            }
+        }
+        LLAMA_LOG_DEBUG("\n");
+    }
+}
+
 void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
     const int64_t n_kv     = ubatch->n_tokens;
     const int64_t n_tokens = ubatch->n_tokens;
@@ -267,6 +300,9 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {

     float * data = (float *) kq_mask->data;

+    // [TAG_NO_CACHE_ISWA]
+    GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "TODO: implement");
+
     for (int h = 0; h < 1; ++h) {
         for (int i1 = 0; i1 < n_tokens; ++i1) {
             const llama_seq_id s1 = ubatch->seq_id[i1][0];

@@ -277,32 +313,44 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
                 for (int s = 0; s < ubatch->n_seq_id[i0]; ++s) {
                     const llama_seq_id s0 = ubatch->seq_id[i0][0];

+                    if (s0 != s1) {
+                        continue; // skip different sequences
+                    }
+
+                    if (cparams.causal_attn && ubatch->pos[i0] > ubatch->pos[i1]) {
+                        continue; // skip future tokens for causal attention
+                    }
+
+                    // TODO: this does not take into account that some layers are SWA and others are note (i.e. iSWA) [TAG_NO_CACHE_ISWA]
+                    //if (hparams.is_masked_swa(ubatch->pos[i0], ubatch->pos[i1])) {
+                    //    continue; // skip masked tokens for SWA
+                    //}
+
                     // TODO: reimplement this like in llama_kv_cache_unified
-                    if (s0 == s1 && (!cparams.causal_attn || ubatch->pos[i0] <= ubatch->pos[i1])) {
                     if (hparams.use_alibi) {
                         f = -std::abs(ubatch->pos[i0] - ubatch->pos[i1]);
                     } else {
                         f = 0.0f;
                     }
-                        break;
-                    }
                 }

                 data[h*(n_kv*n_tokens) + i1*n_kv + i0] = f;
             }
         }
     }
+
+    if (debug) {
+        print_mask(data, n_tokens, n_kv, hparams.n_swa, hparams.swa_type);
+    }
 }

-void llm_graph_input_attn_kv_unified::set_input(const llama_ubatch * ubatch) {
+void llm_graph_input_attn_kv::set_input(const llama_ubatch * ubatch) {
     mctx->set_input_k_idxs(self_k_idxs, ubatch);
     mctx->set_input_v_idxs(self_v_idxs, ubatch);

     mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
 }

-bool llm_graph_input_attn_kv_unified::can_reuse(const llm_graph_params & params) {
-    const auto * mctx = static_cast<const llama_kv_cache_unified_context *>(params.mctx);
+bool llm_graph_input_attn_kv::can_reuse(const llm_graph_params & params) {
+    const auto * mctx = static_cast<const llama_kv_cache_context *>(params.mctx);

     this->mctx = mctx;
@@ -314,12 +362,10 @@ bool llm_graph_input_attn_kv_unified::can_reuse(const llm_graph_params & params)
     res &= self_kq_mask->ne[0] == mctx->get_n_kv();
     res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);

-    res &= mctx->get_supports_set_rows(); // TODO: tmp
-
     return res;
 }

-void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch) {
+void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) {
     mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch);
     mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch);

@@ -331,8 +377,8 @@ void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch
     mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
 }

-bool llm_graph_input_attn_kv_unified_iswa::can_reuse(const llm_graph_params & params) {
-    const auto * mctx = static_cast<const llama_kv_cache_unified_iswa_context *>(params.mctx);
+bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
+    const auto * mctx = static_cast<const llama_kv_cache_iswa_context *>(params.mctx);

     this->mctx = mctx;

@@ -350,8 +396,6 @@ bool llm_graph_input_attn_kv_unified_iswa::can_reuse(const llm_graph_params & pa
     res &= self_kq_mask_swa->ne[0] == mctx->get_swa()->get_n_kv();
     res &= self_kq_mask_swa->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);

-    res &= mctx->get_base()->get_supports_set_rows(); // TODO: tmp
-
     return res;
 }
@@ -879,15 +923,29 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         selection_probs = logits;
     }

+    if (arch == LLM_ARCH_GROVEMOE) {
+        selection_probs = ggml_sigmoid(ctx0, logits); // [n_expert, n_tokens]
+        cb(selection_probs, "ffn_moe_probs_biased", il);
+    }
+
     // select experts
     ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
     cb(selected_experts->src[0], "ffn_moe_argsort", il);
     cb(selected_experts, "ffn_moe_topk", il);

-    ggml_tensor * weights = ggml_get_rows(ctx0,
-            ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
+    if (arch == LLM_ARCH_GROVEMOE && n_expert != hparams.n_expert) {
+        // TODO: Use scalar div instead when/if implemented
+        ggml_tensor * f_sel = ggml_cast(ctx0, selected_experts, GGML_TYPE_F32);
+        selected_experts = ggml_cast(ctx0, ggml_scale(ctx0, f_sel, 1.0f / float(hparams.n_group_experts)), GGML_TYPE_I32);
+        probs = ggml_reshape_3d(ctx0, probs, 1, hparams.n_expert, n_tokens);
+    } else {
+        probs = ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens);
+    }
+
+    ggml_tensor * weights = ggml_get_rows(ctx0, probs, selected_experts); // [1, n_expert_used, n_tokens]
     cb(weights, "ffn_moe_weights", il);

     if (gating_op == LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT) {
         weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);
         weights = ggml_soft_max(ctx0, weights); // [n_expert_used, n_tokens]

@@ -911,6 +969,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         cb(weights, "ffn_moe_weights_scaled", il);
     }

+    //call early so that topk-moe can be used
+    ggml_build_forward_expand(gf, weights);
+
     cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);

     if (weight_before_ffn) {
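Worked example of the GroveMoE index remapping above, assuming the F32 -> I32 cast truncates toward zero (matching the integer division the TODO says it emulates) and taking hparams.n_group_experts == 4 for illustration:

// selected expert id 10 -> 10 * (1/4) = 2.5  -> group row 2
// selected expert id  3 ->  3 * (1/4) = 0.75 -> group row 0
// The top-k indices over the full expert set are folded onto the smaller set
// of expert groups before ggml_get_rows() gathers the routing weights.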
@@ -1136,7 +1197,7 @@ ggml_tensor * llm_graph_context::build_inp_mean() const {
 }

 ggml_tensor * llm_graph_context::build_inp_cls() const {
-    auto inp = std::make_unique<llm_graph_input_cls>(cparams);
+    auto inp = std::make_unique<llm_graph_input_cls>(cparams, arch);

     auto & cur = inp->cls;

@@ -1186,7 +1247,7 @@ ggml_tensor * llm_graph_context::build_inp_pos_bucket_enc() const {
 }

 ggml_tensor * llm_graph_context::build_inp_pos_bucket_dec() const {
-    const auto * mctx_cur = static_cast<const llama_kv_cache_unified_context *>(mctx);
+    const auto * mctx_cur = static_cast<const llama_kv_cache_context *>(mctx);

     auto inp = std::make_unique<llm_graph_input_pos_bucket_kv>(hparams, mctx_cur);
@@ -1223,15 +1284,16 @@ ggml_tensor * llm_graph_context::build_attn_mha(
         ggml_tensor * v,
         ggml_tensor * kq_b,
         ggml_tensor * kq_mask,
-        ggml_tensor * v_mla,
         ggml_tensor * sinks,
-        float         kq_scale) const {
+        ggml_tensor * v_mla,
+        float         kq_scale,
+        int           il) const {
     const bool v_trans = v->nb[1] > v->nb[2];

     // split the batch into streams if needed
     const auto n_stream = k->ne[3];

-    q = ggml_reshape_4d(ctx0, q, q->ne[0], q->ne[1], q->ne[2]/n_stream, n_stream);
+    q = ggml_view_4d(ctx0, q, q->ne[0], q->ne[1], q->ne[2]/n_stream, n_stream, q->nb[1], q->nb[2], q->nb[3]/n_stream, 0);

     q = ggml_permute(ctx0, q, 0, 2, 1, 3);
     k = ggml_permute(ctx0, k, 0, 2, 1, 3);

@@ -1260,6 +1322,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(

         cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias,
                                   hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
+        cb(cur, LLAMA_TENSOR_NAME_FATTN, il);

         ggml_flash_attn_ext_add_sinks(cur, sinks);
         ggml_flash_attn_ext_set_prec (cur, GGML_PREC_F32);

@@ -1275,6 +1338,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
             // The permutations are noops and only change how the tensor data is interpreted.
             cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
             cur = ggml_mul_mat(ctx0, v_mla, cur);
+            cb(cur, "fattn_mla", il);
             cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
             cur = ggml_cont(ctx0, cur); // Needed because ggml_reshape_2d expects contiguous inputs.
 #endif

@@ -1283,6 +1347,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
         cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*cur->ne[1], cur->ne[2]*cur->ne[3]);
     } else {
         ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
+        cb(kq, "kq", il);

         // note: this op tends to require high floating point range
         // while for some models F16 is enough, for others it is not, so we default to F32 here

@@ -1290,38 +1355,48 @@ ggml_tensor * llm_graph_context::build_attn_mha(

         if (arch == LLM_ARCH_GROK) {
             // need to do the following:
-            // multiply by attn_output_multiplyer of 0.08838834764831845
+            // multiply by attn_output_multiplier
             // and then :
             // kq = 30 * tanh(kq / 30)
             // before the softmax below

-            kq = ggml_tanh(ctx0, ggml_scale(ctx0, kq, 0.08838834764831845f/30.0f));
-            kq = ggml_scale(ctx0, kq, 30);
+            kq = ggml_tanh(ctx0, ggml_scale(ctx0, kq, hparams.f_attn_out_scale / hparams.f_attn_logit_softcapping));
+            cb(kq, "kq_tanh", il);
+            kq = ggml_scale(ctx0, kq, hparams.f_attn_logit_softcapping);
+            cb(kq, "kq_scaled", il);
         }

         if (hparams.attn_soft_cap) {
             kq = ggml_scale(ctx0, kq, 1.0f / hparams.f_attn_logit_softcapping);
+            cb(kq, "kq_scaled_1", il);
             kq = ggml_tanh (ctx0, kq);
+            cb(kq, "kq_tanh", il);
             kq = ggml_scale(ctx0, kq, hparams.f_attn_logit_softcapping);
+            cb(kq, "kq_scaled_2", il);
         }

         if (kq_b) {
             kq = ggml_add(ctx0, kq, kq_b);
+            cb(kq, "kq_plus_kq_b", il);
         }

         kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
         ggml_soft_max_add_sinks(kq, sinks);
+        cb(kq, "kq_soft_max", il);

         if (!v_trans) {
             // note: avoid this branch
             v = ggml_cont(ctx0, ggml_transpose(ctx0, v));
+            cb(v, "v_cont", il);
         }

         ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);
+        cb(kqv, "kqv", il);

         // for MLA with the absorption optimization, we need to "decompress" from MQA back to MHA
         if (v_mla) {
             kqv = ggml_mul_mat(ctx0, v_mla, kqv);
+            cb(kqv, "kqv_mla", il);
         }

         cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
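For reference, the soft-capping sequence instrumented above computes, with cap = hparams.f_attn_logit_softcapping:

// kq = cap * tanh(kq / cap)
// which squashes the attention logits smoothly into (-cap, +cap); for Grok the
// logits are additionally pre-scaled by hparams.f_attn_out_scale before the tanh.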
@ -1360,6 +1435,7 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||||
ggml_tensor * k_cur,
|
ggml_tensor * k_cur,
|
||||||
ggml_tensor * v_cur,
|
ggml_tensor * v_cur,
|
||||||
ggml_tensor * kq_b,
|
ggml_tensor * kq_b,
|
||||||
|
ggml_tensor * sinks,
|
||||||
ggml_tensor * v_mla,
|
ggml_tensor * v_mla,
|
||||||
float kq_scale,
|
float kq_scale,
|
||||||
int il) const {
|
int il) const {
|
||||||
|
|
@@ -1375,13 +1451,14 @@ ggml_tensor * llm_graph_context::build_attn(

// [TAG_NO_CACHE_PAD]
// TODO: if ubatch.equal_seqs() == true, we can split the three tensors below into ubatch.n_seqs_unq streams
-assert(!ubatch.equal_seqs());
+// but it might not be worth it: https://github.com/ggml-org/llama.cpp/pull/15636
+//assert(!ubatch.equal_seqs() || (k_cur->ne[3] == 1 && k_cur->ne[3] == ubatch.n_seqs_unq));

ggml_tensor * q = q_cur;
ggml_tensor * k = k_cur;
ggml_tensor * v = v_cur;

-ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, nullptr, kq_scale);
+ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
cb(cur, "kqv_out", il);

if (wo) {

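Note: ggml_soft_max_add_sinks attaches an extra per-head "sink" logit that participates in the softmax normalization without producing an output value, letting a head park probability mass on a virtual position instead of real tokens. A rough scalar sketch of the idea (not the vendored ggml code):

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // softmax over one row of scores with an extra sink logit in the denominator
    static std::vector<float> softmax_with_sink(const std::vector<float> & s, float sink) {
        float mx = sink;
        for (float v : s) mx = std::max(mx, v);        // subtract max for numerical stability
        float denom = std::exp(sink - mx);             // the sink only adds to the denominator
        for (float v : s) denom += std::exp(v - mx);
        std::vector<float> p(s.size());
        for (size_t i = 0; i < s.size(); ++i) p[i] = std::exp(s[i] - mx) / denom;
        return p;
    }
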
@@ -1399,17 +1476,17 @@ ggml_tensor * llm_graph_context::build_attn(
return cur;
}

-static std::unique_ptr<llm_graph_input_attn_kv_unified> build_attn_inp_kv_unified_impl(
+static std::unique_ptr<llm_graph_input_attn_kv> build_attn_inp_kv_impl(
ggml_context * ctx0,
const llama_ubatch & ubatch,
const llama_hparams & hparams,
const llama_cparams & cparams,
-const llama_kv_cache_unified_context * mctx_cur) {
+const llama_kv_cache_context * mctx_cur) {

-auto inp = std::make_unique<llm_graph_input_attn_kv_unified>(hparams, cparams, mctx_cur);
+auto inp = std::make_unique<llm_graph_input_attn_kv>(hparams, cparams, mctx_cur);

{
-GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_unified_iswa for SWA");
+GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_iswa for SWA");

const auto n_kv = mctx_cur->get_n_kv();
const auto n_tokens = ubatch.n_tokens;

@@ -1427,22 +1504,23 @@ static std::unique_ptr<llm_graph_input_attn_kv_unified> build_attn_inp_kv_unifie
return inp;
}

-llm_graph_input_attn_kv_unified * llm_graph_context::build_attn_inp_kv_unified() const {
-const auto * mctx_cur = static_cast<const llama_kv_cache_unified_context *>(mctx);
+llm_graph_input_attn_kv * llm_graph_context::build_attn_inp_kv() const {
+const auto * mctx_cur = static_cast<const llama_kv_cache_context *>(mctx);

-auto inp = build_attn_inp_kv_unified_impl(ctx0, ubatch, hparams, cparams, mctx_cur);
+auto inp = build_attn_inp_kv_impl(ctx0, ubatch, hparams, cparams, mctx_cur);

-return (llm_graph_input_attn_kv_unified *) res->add_input(std::move(inp));
+return (llm_graph_input_attn_kv *) res->add_input(std::move(inp));
}

ggml_tensor * llm_graph_context::build_attn(
-llm_graph_input_attn_kv_unified * inp,
+llm_graph_input_attn_kv * inp,
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * q_cur,
ggml_tensor * k_cur,
ggml_tensor * v_cur,
ggml_tensor * kq_b,
+ggml_tensor * sinks,
ggml_tensor * v_mla,
float kq_scale,
int il) const {

@@ -1469,7 +1547,7 @@ ggml_tensor * llm_graph_context::build_attn(
ggml_tensor * k = mctx_cur->get_k(ctx0, il);
ggml_tensor * v = mctx_cur->get_v(ctx0, il);

-ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, nullptr, kq_scale);
+ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
cb(cur, "kqv_out", il);

if (wo) {

@@ -1488,40 +1566,15 @@ ggml_tensor * llm_graph_context::build_attn(
}

ggml_tensor * llm_graph_context::build_attn(
-llm_graph_input_attn_kv_unified_iswa * inp,
+llm_graph_input_attn_kv_iswa * inp,
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * q_cur,
ggml_tensor * k_cur,
ggml_tensor * v_cur,
ggml_tensor * kq_b,
-ggml_tensor * v_mla,
-float kq_scale,
-int il) const {
-return build_attn_with_sinks(
-inp,
-wo,
-wo_b,
-q_cur,
-k_cur,
-v_cur,
-kq_b,
-v_mla,
-nullptr,
-kq_scale,
-il);
-}
-
-ggml_tensor * llm_graph_context::build_attn_with_sinks(
-llm_graph_input_attn_kv_unified_iswa * inp,
-ggml_tensor * wo,
-ggml_tensor * wo_b,
-ggml_tensor * q_cur,
-ggml_tensor * k_cur,
-ggml_tensor * v_cur,
-ggml_tensor * kq_b,
-ggml_tensor * v_mla,
ggml_tensor * sinks,
+ggml_tensor * v_mla,
float kq_scale,
int il) const {
// these nodes are added to the graph together so that they are not reordered

@@ -1561,7 +1614,7 @@ ggml_tensor * llm_graph_context::build_attn_with_sinks(
ggml_tensor * k = mctx_cur->get_k(ctx0, il);
ggml_tensor * v = mctx_cur->get_v(ctx0, il);

-ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, sinks, kq_scale);
+ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
cb(cur, "kqv_out", il);

if (wo) {

@@ -1600,6 +1653,7 @@ ggml_tensor * llm_graph_context::build_attn(
ggml_tensor * k_cur,
ggml_tensor * v_cur,
ggml_tensor * kq_b,
+ggml_tensor * sinks,
ggml_tensor * v_mla,
float kq_scale,
int il) const {

@@ -1615,7 +1669,7 @@ ggml_tensor * llm_graph_context::build_attn(
ggml_tensor * k = k_cur;
ggml_tensor * v = v_cur;

-ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, nullptr, kq_scale);
+ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
cb(cur, "kqv_out", il);

if (wo) {

@@ -1636,10 +1690,10 @@ ggml_tensor * llm_graph_context::build_attn(
// TODO: maybe separate the inner implementation into a separate function
// like with the non-sliding window equivalent
// once sliding-window hybrid caches are a thing.
-llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unified_iswa() const {
-const auto * mctx_cur = static_cast<const llama_kv_cache_unified_iswa_context *>(mctx);
+llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const {
+const auto * mctx_cur = static_cast<const llama_kv_cache_iswa_context *>(mctx);

-auto inp = std::make_unique<llm_graph_input_attn_kv_unified_iswa>(hparams, cparams, mctx_cur);
+auto inp = std::make_unique<llm_graph_input_attn_kv_iswa>(hparams, cparams, mctx_cur);

const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq;

@@ -1656,7 +1710,7 @@ llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unif
}

{
-GGML_ASSERT(hparams.swa_type != LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_unified for non-SWA");
+GGML_ASSERT(hparams.swa_type != LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache for non-SWA");

const auto n_kv = mctx_cur->get_swa()->get_n_kv();

@@ -1669,7 +1723,7 @@ llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unif
inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
}

-return (llm_graph_input_attn_kv_unified_iswa *) res->add_input(std::move(inp));
+return (llm_graph_input_attn_kv_iswa *) res->add_input(std::move(inp));
}

ggml_tensor * llm_graph_context::build_rs(

@@ -1792,7 +1846,7 @@ llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx);

auto inp_rs = build_rs_inp_impl(ctx0, ubatch, mctx_cur->get_recr());
-auto inp_attn = build_attn_inp_kv_unified_impl(ctx0, ubatch, hparams, cparams, mctx_cur->get_attn());
+auto inp_attn = build_attn_inp_kv_impl(ctx0, ubatch, hparams, cparams, mctx_cur->get_attn());

auto inp = std::make_unique<llm_graph_input_mem_hybrid>(std::move(inp_attn), std::move(inp_rs), mctx_cur);

@@ -1843,34 +1897,32 @@ void llm_graph_context::build_pooling(
case LLAMA_POOLING_TYPE_RANK:
{
ggml_tensor * inp_cls = build_inp_cls();
-inp = ggml_get_rows(ctx0, inp, inp_cls);
+cur = ggml_get_rows(ctx0, inp, inp_cls);

-if (cls) {
// classification head
// https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
-cur = ggml_mul_mat(ctx0, cls, inp);
+if (cls) {
+cur = ggml_mul_mat(ctx0, cls, cur);
if (cls_b) {
cur = ggml_add(ctx0, cur, cls_b);
}
cur = ggml_tanh(ctx0, cur);
+}

// some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
// https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
+// Single layer classification head (direct projection)
+// https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476
if (cls_out) {
cur = ggml_mul_mat(ctx0, cls_out, cur);
if (cls_out_b) {
cur = ggml_add(ctx0, cur, cls_out_b);
}
}
-} else if (cls_out) {
-// Single layer classification head (direct projection)
-// https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476
-cur = ggml_mul_mat(ctx0, cls_out, inp);
-if (cls_out_b) {
-cur = ggml_add(ctx0, cur, cls_out_b);
-}
-} else {
-GGML_ABORT("RANK pooling requires either cls+cls_b or cls_out+cls_out_b");
+// softmax for qwen3 reranker
+if (arch == LLM_ARCH_QWEN3) {
+cur = ggml_soft_max(ctx0, cur);
}
} break;
default:

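Note: after this change the RANK pooling branch keeps one running tensor: rows are gathered at the CLS indices, optionally passed through the RoBERTa-style head (dense + bias + tanh), then through the cls_out projection, and for the Qwen3 reranker a final softmax turns the logits into a score. A self-contained scalar sketch of that head, with W1/b1 standing in for cls/cls_b and W2/b2 for cls_out/cls_out_b (names are illustrative, not from this diff):

    #include <cmath>
    #include <vector>

    static std::vector<float> rank_head(
            const std::vector<std::vector<float>> & W1, const std::vector<float> & b1,
            const std::vector<std::vector<float>> & W2, const std::vector<float> & b2,
            const std::vector<float> & x) {
        std::vector<float> h(W1.size());
        for (size_t i = 0; i < W1.size(); ++i) {       // dense + tanh (cls head)
            float acc = b1[i];
            for (size_t j = 0; j < x.size(); ++j) acc += W1[i][j] * x[j];
            h[i] = std::tanh(acc);
        }
        std::vector<float> y(W2.size());
        for (size_t i = 0; i < W2.size(); ++i) {       // cls_out projection
            float acc = b2[i];
            for (size_t j = 0; j < h.size(); ++j) acc += W2[i][j] * h[j];
            y[i] = acc;
        }
        return y;
    }
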
llama/llama.cpp/src/llama-graph.h (vendored, 78 changes)

@@ -19,8 +19,8 @@ struct llama_cparams;

struct llama_memory_context_i;

-class llama_kv_cache_unified_context;
-class llama_kv_cache_unified_iswa_context;
+class llama_kv_cache_context;
+class llama_kv_cache_iswa_context;
class llama_memory_recurrent_context;
class llama_memory_hybrid_context;

@@ -78,6 +78,11 @@ struct llm_graph_params;

class llm_graph_input_i {
public:
+llm_graph_input_i() {
+const char * LLAMA_GRAPH_INPUT_DEBUG = getenv("LLAMA_GRAPH_INPUT_DEBUG");
+debug = LLAMA_GRAPH_INPUT_DEBUG ? atoi(LLAMA_GRAPH_INPUT_DEBUG) : 0;
+}

virtual ~llm_graph_input_i() = default;

virtual void set_input(const llama_ubatch * ubatch) = 0;

@@ -90,6 +95,9 @@ public:
GGML_UNUSED(params);
return false;
}
+protected:
+// env: LLAMA_GRAPH_INPUT_DEBUG
+int debug = 0;
};

using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>;

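Note: the added constructor is the usual getenv/atoi pattern for an opt-in debug level; each graph input reads LLAMA_GRAPH_INPUT_DEBUG once at construction and stores it in the protected debug field. The same idiom in isolation:

    #include <cstdlib>

    // read an integer debug level from the environment, defaulting to 0 when unset
    static int env_debug_level(const char * name) {
        const char * v = std::getenv(name);
        return v ? std::atoi(v) : 0;
    }

    // e.g. int debug = env_debug_level("LLAMA_GRAPH_INPUT_DEBUG");
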
@@ -152,7 +160,7 @@ class llm_graph_input_pos_bucket_kv : public llm_graph_input_i {
public:
llm_graph_input_pos_bucket_kv(
const llama_hparams & hparams,
-const llama_kv_cache_unified_context * mctx) : hparams(hparams), mctx(mctx) {}
+const llama_kv_cache_context * mctx) : hparams(hparams), mctx(mctx) {}
virtual ~llm_graph_input_pos_bucket_kv() = default;

void set_input(const llama_ubatch * ubatch) override;

@@ -161,7 +169,7 @@ public:

const llama_hparams hparams;

-const llama_kv_cache_unified_context * mctx;
+const llama_kv_cache_context * mctx;
};

class llm_graph_input_out_ids : public llm_graph_input_i {

@@ -198,7 +206,7 @@ public:

class llm_graph_input_cls : public llm_graph_input_i {
public:
-llm_graph_input_cls(const llama_cparams & cparams) : cparams(cparams) {}
+llm_graph_input_cls(const llama_cparams & cparams, const llm_arch arch) : cparams(cparams), arch(arch) {}
virtual ~llm_graph_input_cls() = default;

void set_input(const llama_ubatch * ubatch) override;

@@ -206,6 +214,7 @@ public:
ggml_tensor * cls; // I32 [n_batch]

const llama_cparams cparams;
+const llm_arch arch;
};

class llm_graph_input_rs : public llm_graph_input_i {

@@ -257,17 +266,17 @@ public:
const llama_cparams cparams;
};

-class llm_graph_input_attn_kv_unified : public llm_graph_input_i {
+class llm_graph_input_attn_kv : public llm_graph_input_i {
public:
-llm_graph_input_attn_kv_unified(
+llm_graph_input_attn_kv(
const llama_hparams & hparams,
const llama_cparams & cparams,
-const llama_kv_cache_unified_context * mctx) :
+const llama_kv_cache_context * mctx) :
hparams(hparams),
cparams(cparams),
mctx(mctx) {
}
-~llm_graph_input_attn_kv_unified() = default;
+~llm_graph_input_attn_kv() = default;

void set_input(const llama_ubatch * ubatch) override;

@@ -290,20 +299,20 @@ public:
const llama_hparams hparams;
const llama_cparams cparams;

-const llama_kv_cache_unified_context * mctx;
+const llama_kv_cache_context * mctx;
};

-class llm_graph_input_attn_kv_unified_iswa : public llm_graph_input_i {
+class llm_graph_input_attn_kv_iswa : public llm_graph_input_i {
public:
-llm_graph_input_attn_kv_unified_iswa(
+llm_graph_input_attn_kv_iswa(
const llama_hparams & hparams,
const llama_cparams & cparams,
-const llama_kv_cache_unified_iswa_context * mctx) :
+const llama_kv_cache_iswa_context * mctx) :
hparams(hparams),
cparams(cparams),
mctx(mctx) {
}
-~llm_graph_input_attn_kv_unified_iswa() = default;
+~llm_graph_input_attn_kv_iswa() = default;

void set_input(const llama_ubatch * ubatch) override;

@@ -330,7 +339,7 @@ public:
const llama_hparams hparams;
const llama_cparams cparams;

-const llama_kv_cache_unified_iswa_context * mctx;
+const llama_kv_cache_iswa_context * mctx;
};

class llm_graph_input_attn_cross : public llm_graph_input_i {

@@ -351,7 +360,7 @@ public:
class llm_graph_input_mem_hybrid : public llm_graph_input_i {
public:
llm_graph_input_mem_hybrid(
-std::unique_ptr<llm_graph_input_attn_kv_unified> inp_attn,
+std::unique_ptr<llm_graph_input_attn_kv> inp_attn,
std::unique_ptr<llm_graph_input_rs> inp_rs,
const llama_memory_hybrid_context * mctx) :
inp_attn(std::move(inp_attn)),

@@ -361,10 +370,10 @@ public:

void set_input(const llama_ubatch * ubatch) override;

-std::unique_ptr<llm_graph_input_attn_kv_unified> inp_attn;
+std::unique_ptr<llm_graph_input_attn_kv> inp_attn;
std::unique_ptr<llm_graph_input_rs> inp_rs;

-llm_graph_input_attn_kv_unified * get_attn() const { return inp_attn.get(); }
+llm_graph_input_attn_kv * get_attn() const { return inp_attn.get(); }
llm_graph_input_rs * get_recr() const { return inp_rs.get(); }

const llama_memory_hybrid_context * mctx;

@@ -685,9 +694,10 @@ struct llm_graph_context {
ggml_tensor * v, // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false)
ggml_tensor * kq_b,
ggml_tensor * kq_mask,
-ggml_tensor * sinks,
+ggml_tensor * sinks, // [n_head_q]
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
-float kq_scale) const;
+float kq_scale,
+int il) const;

llm_graph_input_attn_no_cache * build_attn_inp_no_cache() const;

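Note: with the sink tensor and layer index threaded through build_attn_mha itself, every build_attn overload below gains the same (sinks, v_mla, kq_scale, il) tail, which is what allows the separate build_attn_with_sinks entry point to be deleted further down. An illustrative call-site fragment under the new signature (tensor names are placeholders, not taken from this diff):

    // hypothetical caller inside a model graph; sinks is nullptr for models without attention sinks
    ggml_tensor * cur = build_attn(inp_attn,
            model.layers[il].wo, model.layers[il].bo,
            Qcur, Kcur, Vcur,
            /*kq_b =*/ nullptr, /*sinks =*/ nullptr, /*v_mla =*/ nullptr,
            1.0f/sqrtf(float(n_embd_head)), il);
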
@@ -699,50 +709,39 @@ struct llm_graph_context {
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
ggml_tensor * kq_b,
+ggml_tensor * sinks, // [n_head_q]
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
float kq_scale,
int il) const;

-llm_graph_input_attn_kv_unified * build_attn_inp_kv_unified() const;
+llm_graph_input_attn_kv * build_attn_inp_kv() const;

ggml_tensor * build_attn(
-llm_graph_input_attn_kv_unified * inp,
+llm_graph_input_attn_kv * inp,
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
ggml_tensor * kq_b,
+ggml_tensor * sinks, // [n_head_q]
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
float kq_scale,
int il) const;

-llm_graph_input_attn_kv_unified_iswa * build_attn_inp_kv_unified_iswa() const;
+llm_graph_input_attn_kv_iswa * build_attn_inp_kv_iswa() const;

// note: if k_cur or v_cur are not provided, they will not be stored in the memory
ggml_tensor * build_attn(
-llm_graph_input_attn_kv_unified_iswa * inp,
+llm_graph_input_attn_kv_iswa * inp,
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] optional
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] optional
ggml_tensor * kq_b,
-ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
-float kq_scale,
-int il) const;
-
-// TODO: temporary to keep the diff small. after the code is public will refactor to simplify this
-ggml_tensor * build_attn_with_sinks(
-llm_graph_input_attn_kv_unified_iswa * inp,
-ggml_tensor * wo,
-ggml_tensor * wo_b,
-ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
-ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] optional
-ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] optional
-ggml_tensor * kq_b,
-ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
ggml_tensor * sinks, // [n_head_q]
+ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
float kq_scale,
int il) const;

@@ -756,6 +755,7 @@ struct llm_graph_context {
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
ggml_tensor * kq_b,
+ggml_tensor * sinks, // [n_head_q]
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
float kq_scale,
int il) const;

@@ -765,7 +765,7 @@ struct llm_graph_context {
//

// TODO: move this implementation to llama_memory_recurrent.
-// this is analogous to llama_kv_cache_unified::cpy_k / cpy_v
+// this is analogous to llama_kv_cache::cpy_k / cpy_v
// when moving, avoid passing `ggml_cgraph` - only pass `ggml_context`. would likely need to split the
// implementation in 2 separate methods. the goal is to avoid calling `ggml_build_forward_expand` in
// `llama_memory_recurrent`

llama/llama.cpp/src/llama-hparams.cpp (vendored, 62 changes)

@@ -1,6 +1,7 @@
#include "llama-hparams.h"

#include "ggml.h"
+#include <cassert>

void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
if (dense_first) {

@@ -161,3 +162,64 @@ bool llama_hparams::is_swa(uint32_t il) const {

GGML_ABORT("fatal error");
}
+
+bool llama_hparams::has_kv(uint32_t il) const {
+if (n_layer_kv_from_start >= 0) {
+if (il < (uint32_t) n_layer_kv_from_start) {
+return true;
+}
+
+return false;
+}
+
+// by default, all layers have kv
+return true;
+}
+
+uint32_t llama_hparams::n_layer_kv() const {
+uint32_t res = 0;
+
+for (uint32_t il = 0; il < n_layer; ++il) {
+if (has_kv(il)) {
+res++;
+}
+}
+
+return res;
+}
+
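Note: n_layer_kv_from_start lets a model declare that only its leading layers carry a KV cache (layers past that point are treated as attention-free), and n_layer_kv() simply counts the layers for which has_kv() holds. Illustrative values, not from this diff:

    llama_hparams hp = {};
    hp.n_layer = 32;
    hp.n_layer_kv_from_start = 4;
    // hp.has_kv(il) is true for il in 0..3 and false for 4..31; hp.n_layer_kv() == 4
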
+bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1) {
+assert(p0 >= 0 && p1 >= 0);
+
+switch (swa_type) {
+case LLAMA_SWA_TYPE_NONE:
+{
+} break;
+case LLAMA_SWA_TYPE_STANDARD:
+{
+if (p1 - p0 >= (int32_t) n_swa) {
+return true;
+}
+} break;
+case LLAMA_SWA_TYPE_CHUNKED:
+{
+const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
+
+if (p0 < pos_chunk_start) {
+return true;
+}
+} break;
+case LLAMA_SWA_TYPE_SYMMETRIC:
+{
+const int32_t half_n_swa = (int32_t) n_swa / 2;
+const int32_t pos_diff = p1 - p0;
+
+// Mask if outside the symmetric window
+if (pos_diff < -half_n_swa || pos_diff > half_n_swa) {
+return true;
+}
+} break;
+}
+
+return false;
+}

llama/llama.cpp/src/llama-hparams.h (vendored, 30 changes)

@@ -19,6 +19,7 @@ enum llama_swa_type {
LLAMA_SWA_TYPE_NONE = 0,
LLAMA_SWA_TYPE_STANDARD = 1,
LLAMA_SWA_TYPE_CHUNKED = 2,
+LLAMA_SWA_TYPE_SYMMETRIC = 3,
};

struct llama_hparams_posnet {

@@ -41,6 +42,7 @@ struct llama_hparams {
uint32_t n_embd;
uint32_t n_embd_features = 0;
uint32_t n_layer;
+int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
uint32_t n_rot;
uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head

@@ -69,10 +71,13 @@ struct llama_hparams {
uint32_t n_lora_kv = 0;
uint32_t n_ff_exp = 0;
uint32_t n_ff_shexp = 0;
+uint32_t n_ff_chexp = 0;
uint32_t n_expert_shared = 0;
uint32_t n_norm_groups = 0;
+uint32_t n_group_experts = 0;

-float expert_weights_scale = 0.0;
+float expert_group_scale = 0.05f;
+float expert_weights_scale = 0.0f;
bool expert_weights_norm = false;
uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
uint32_t moe_every_n_layers = 0;

@@ -83,6 +88,7 @@ struct llama_hparams {
float f_norm_group_eps;

float f_attn_logit_softcapping = 50.0f;
+float f_router_logit_softcapping = 30.0f;
float f_final_logit_softcapping = 30.0f;

// for RWKV

@@ -104,6 +110,11 @@ struct llama_hparams {
uint32_t n_ctx_orig_yarn;
float rope_yarn_log_mul = 0.0f;

+float yarn_ext_factor = -1.0f;
+float yarn_attn_factor = 1.0f;
+float yarn_beta_fast = 32.0f;
+float yarn_beta_slow = 1.0f;
+
std::array<int, 4> rope_sections;

// Sliding Window Attention (SWA)

|
||||||
float f_embedding_scale = 0.0f;
|
float f_embedding_scale = 0.0f;
|
||||||
float f_attention_scale = 0.0f;
|
float f_attention_scale = 0.0f;
|
||||||
|
|
||||||
|
// grok-2
|
||||||
|
float f_attn_out_scale = 0.0f;
|
||||||
|
uint32_t attn_temp_length = 0;
|
||||||
|
|
||||||
bool causal_attn = true;
|
bool causal_attn = true;
|
||||||
bool use_alibi = false;
|
bool use_alibi = false;
|
||||||
bool attn_soft_cap = false;
|
bool attn_soft_cap = false;
|
||||||
bool use_kq_norm = true;
|
bool use_kq_norm = false;
|
||||||
|
|
||||||
// for Classifiers
|
// for Classifiers
|
||||||
uint32_t n_cls_out = 1;
|
uint32_t n_cls_out = 1;
|
||||||
|
|
@ -159,6 +174,7 @@ struct llama_hparams {
|
||||||
// needed by encoder-decoder models (e.g. T5, FLAN-T5)
|
// needed by encoder-decoder models (e.g. T5, FLAN-T5)
|
||||||
// ref: https://github.com/ggerganov/llama.cpp/pull/8141
|
// ref: https://github.com/ggerganov/llama.cpp/pull/8141
|
||||||
llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
|
llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
|
||||||
|
uint32_t dec_n_layer = 0;
|
||||||
|
|
||||||
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
|
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
|
||||||
enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
|
enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
|
||||||
|
|
@ -226,6 +242,16 @@ struct llama_hparams {
|
||||||
bool n_bskcn(uint32_t n, uint32_t il) const;
|
bool n_bskcn(uint32_t n, uint32_t il) const;
|
||||||
|
|
||||||
bool is_swa(uint32_t il) const;
|
bool is_swa(uint32_t il) const;
|
||||||
|
|
||||||
|
bool has_kv(uint32_t il) const;
|
||||||
|
|
||||||
|
// number of layers for which has_kv() returns true
|
||||||
|
uint32_t n_layer_kv() const;
|
||||||
|
|
||||||
|
// note that this function uses different SWA parameters from those in the hparams
|
||||||
|
// TODO: think of a better place for this function
|
||||||
|
// TODO: pack the SWA params in a struct?
|
||||||
|
static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1);
|
||||||
};
|
};
|
||||||
|
|
||||||
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
|
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
|
||||||
|
|
|
||||||
2
llama/llama.cpp/src/llama-impl.h
vendored
2
llama/llama.cpp/src/llama-impl.h
vendored
|
|
@ -59,3 +59,5 @@ std::string llama_format_tensor_shape(const std::vector<int64_t> & ne);
|
||||||
std::string llama_format_tensor_shape(const struct ggml_tensor * t);
|
std::string llama_format_tensor_shape(const struct ggml_tensor * t);
|
||||||
|
|
||||||
std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i);
|
std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i);
|
||||||
|
|
||||||
|
#define LLAMA_TENSOR_NAME_FATTN "__fattn__"
|
||||||
|
|
|
||||||
|
|
@@ -1,4 +1,4 @@
-#include "llama-kv-cache-unified-iswa.h"
+#include "llama-kv-cache-iswa.h"

#include "llama-impl.h"
#include "llama-batch.h"

@@ -8,10 +8,10 @@
#include <cassert>

//
-// llama_kv_cache_unified_iswa
+// llama_kv_cache_iswa
//

-llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
+llama_kv_cache_iswa::llama_kv_cache_iswa(
const llama_model & model,
ggml_type type_k,
ggml_type type_v,

@@ -22,9 +22,26 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
uint32_t kv_size,
uint32_t n_seq_max,
uint32_t n_ubatch,
-uint32_t n_pad) : hparams(model.hparams), unified(unified) {
-llama_kv_cache_unified::layer_filter_cb filter_base = [&](int32_t il) { return !model.hparams.is_swa(il); };
-llama_kv_cache_unified::layer_filter_cb filter_swa = [&](int32_t il) { return model.hparams.is_swa(il); };
+uint32_t n_pad,
+const layer_filter_cb & filter,
+const layer_reuse_cb & reuse) : hparams(model.hparams), unified(unified) {
+
+// chain filters
+const layer_filter_cb filter_base = [&](int32_t il) {
+if (filter && !filter(il)) {
+return false;
+}
+
+return !model.hparams.is_swa(il);
+};
+
+const layer_filter_cb filter_swa = [&](int32_t il) {
+if (filter && !filter(il)) {
+return false;
+}
+
+return model.hparams.is_swa(il);
+};

const uint32_t size_base = kv_size;

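Note: the constructor now accepts a caller-supplied layer filter and composes it with the SWA split instead of hard-coding the two lambdas: a layer lands in the base cache only when the outer filter (if any) accepts it and it is not a SWA layer, and in the SWA cache in the mirrored case. The same predicate-chaining idiom in isolation:

    #include <cstdint>
    #include <functional>

    using layer_filter_cb = std::function<bool(int32_t)>;

    // accept a layer only if both the optional outer filter and the inner predicate do
    static layer_filter_cb chain(layer_filter_cb outer, layer_filter_cb inner) {
        return [outer, inner](int32_t il) {
            return (!outer || outer(il)) && inner(il);
        };
    }
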
@@ -40,25 +57,25 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(

LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, size_base);

-kv_base = std::make_unique<llama_kv_cache_unified>(
-model, std::move(filter_base), type_k, type_v,
+kv_base = std::make_unique<llama_kv_cache>(
+model, type_k, type_v,
v_trans, offload, unified, size_base, n_seq_max, n_pad,
-0, LLAMA_SWA_TYPE_NONE);
+0, LLAMA_SWA_TYPE_NONE, filter_base, reuse);

LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa);

-kv_swa = std::make_unique<llama_kv_cache_unified>(
-model, std::move(filter_swa), type_k, type_v,
+kv_swa = std::make_unique<llama_kv_cache>(
+model, type_k, type_v,
v_trans, offload, unified, size_swa, n_seq_max, n_pad,
-hparams.n_swa, hparams.swa_type);
+hparams.n_swa, hparams.swa_type, filter_swa, reuse);
}

-void llama_kv_cache_unified_iswa::clear(bool data) {
+void llama_kv_cache_iswa::clear(bool data) {
kv_base->clear(data);
kv_swa ->clear(data);
}

-bool llama_kv_cache_unified_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+bool llama_kv_cache_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
bool res = true;

res = res & kv_base->seq_rm(seq_id, p0, p1);

@@ -67,36 +84,44 @@ bool llama_kv_cache_unified_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llam
return res;
}

-void llama_kv_cache_unified_iswa::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
+void llama_kv_cache_iswa::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
kv_base->seq_cp(seq_id_src, seq_id_dst, p0, p1);
kv_swa ->seq_cp(seq_id_src, seq_id_dst, p0, p1);
}

-void llama_kv_cache_unified_iswa::seq_keep(llama_seq_id seq_id) {
+void llama_kv_cache_iswa::seq_keep(llama_seq_id seq_id) {
kv_base->seq_keep(seq_id);
kv_swa ->seq_keep(seq_id);
}

-void llama_kv_cache_unified_iswa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
+void llama_kv_cache_iswa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
kv_base->seq_add(seq_id, p0, p1, shift);
kv_swa ->seq_add(seq_id, p0, p1, shift);
}

-void llama_kv_cache_unified_iswa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
+void llama_kv_cache_iswa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
kv_base->seq_div(seq_id, p0, p1, d);
kv_swa ->seq_div(seq_id, p0, p1, d);
}

-llama_pos llama_kv_cache_unified_iswa::seq_pos_min(llama_seq_id seq_id) const {
+llama_pos llama_kv_cache_iswa::seq_pos_min(llama_seq_id seq_id) const {
// the base cache is a superset of the SWA cache, so we can just check the SWA cache
return kv_swa->seq_pos_min(seq_id);
}

-llama_pos llama_kv_cache_unified_iswa::seq_pos_max(llama_seq_id seq_id) const {
+llama_pos llama_kv_cache_iswa::seq_pos_max(llama_seq_id seq_id) const {
return kv_swa->seq_pos_max(seq_id);
}

-llama_memory_context_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
+std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache_iswa::memory_breakdown() const {
+std::map<ggml_backend_buffer_type_t, size_t> mb = kv_base->memory_breakdown();
+for (const auto & buft_size : kv_swa->memory_breakdown()) {
+mb[buft_size.first] += buft_size.second;
+}
+return mb;
+}
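Note: the new memory_breakdown() override merges the two child caches' per-buffer-type byte counts. std::map::operator[] value-initializes a missing entry to zero, so the accumulation needs no explicit existence check; the same merging idiom generically (a and b are placeholder maps):

    std::map<ggml_backend_buffer_type_t, size_t> total = a;
    for (const auto & kv : b) total[kv.first] += kv.second;
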
+
+llama_memory_context_ptr llama_kv_cache_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
GGML_UNUSED(embd_all);

// first try simple split

@@ -136,7 +161,7 @@ llama_memory_context_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_all

assert(sinfos_base.size() == sinfos_swa.size());

-return std::make_unique<llama_kv_cache_unified_iswa_context>(
+return std::make_unique<llama_kv_cache_iswa_context>(
this, std::move(sinfos_base), std::move(sinfos_swa), std::move(ubatches));
} while (false);

@@ -172,61 +197,67 @@ llama_memory_context_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_all

assert(sinfos_base.size() == sinfos_swa.size());

-return std::make_unique<llama_kv_cache_unified_iswa_context>(
+return std::make_unique<llama_kv_cache_iswa_context>(
this, std::move(sinfos_base), std::move(sinfos_swa), std::move(ubatches));
} while (false);

// TODO: if we fail again, we should attempt different splitting strategies
// but to do that properly, we first have to refactor the batches to be more flexible

-return std::make_unique<llama_kv_cache_unified_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
+return std::make_unique<llama_kv_cache_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
}

-llama_memory_context_ptr llama_kv_cache_unified_iswa::init_full() {
-return std::make_unique<llama_kv_cache_unified_iswa_context>(this);
+llama_memory_context_ptr llama_kv_cache_iswa::init_full() {
+return std::make_unique<llama_kv_cache_iswa_context>(this);
}

-llama_memory_context_ptr llama_kv_cache_unified_iswa::init_update(llama_context * lctx, bool optimize) {
-return std::make_unique<llama_kv_cache_unified_iswa_context>(this, lctx, optimize);
+llama_memory_context_ptr llama_kv_cache_iswa::init_update(llama_context * lctx, bool optimize) {
+return std::make_unique<llama_kv_cache_iswa_context>(this, lctx, optimize);
}

-bool llama_kv_cache_unified_iswa::get_can_shift() const {
+bool llama_kv_cache_iswa::get_can_shift() const {
return kv_base->get_size() == kv_swa->get_size();
}

-void llama_kv_cache_unified_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
-kv_base->state_write(io, seq_id);
-kv_swa ->state_write(io, seq_id);
+void llama_kv_cache_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
+if ((flags & LLAMA_STATE_SEQ_FLAGS_SWA_ONLY) == 0) {
+kv_base->state_write(io, seq_id, flags);
+}
+
+kv_swa->state_write(io, seq_id, flags);
}

-void llama_kv_cache_unified_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
-kv_base->state_read(io, seq_id);
-kv_swa ->state_read(io, seq_id);
+void llama_kv_cache_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
+if ((flags & LLAMA_STATE_SEQ_FLAGS_SWA_ONLY) == 0) {
+kv_base->state_read(io, seq_id, flags);
+}
+
+kv_swa->state_read(io, seq_id, flags);
}
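Note: LLAMA_STATE_SEQ_FLAGS_SWA_ONLY lets callers serialize only the sliding-window half of the cache: when the flag is set, state_write/state_read skip kv_base and touch kv_swa alone. A hypothetical caller (the flag and its type come from the public header, not this hunk):

    // write only the SWA portion of sequence 0's state
    kv.state_write(io, /*seq_id =*/ 0, LLAMA_STATE_SEQ_FLAGS_SWA_ONLY);
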
|
llama_kv_cache * llama_kv_cache_iswa::get_base() const {
|
||||||
return kv_base.get();
|
return kv_base.get();
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_swa() const {
|
llama_kv_cache * llama_kv_cache_iswa::get_swa() const {
|
||||||
return kv_swa.get();
|
return kv_swa.get();
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// llama_kv_cache_unified_iswa_context
|
// llama_kv_cache_iswa_context
|
||||||
//
|
//
|
||||||
|
|
||||||
llama_kv_cache_unified_iswa_context::llama_kv_cache_unified_iswa_context(llama_memory_status status) : status(status) {}
|
llama_kv_cache_iswa_context::llama_kv_cache_iswa_context(llama_memory_status status) : status(status) {}
|
||||||
|
|
||||||
llama_kv_cache_unified_iswa_context::llama_kv_cache_unified_iswa_context(
|
llama_kv_cache_iswa_context::llama_kv_cache_iswa_context(
|
||||||
llama_kv_cache_unified_iswa * kv) :
|
llama_kv_cache_iswa * kv) :
|
||||||
ctx_base(kv->get_base()->init_full()),
|
ctx_base(kv->get_base()->init_full()),
|
||||||
ctx_swa (kv->get_swa ()->init_full()),
|
ctx_swa (kv->get_swa ()->init_full()),
|
||||||
status(llama_memory_status_combine(ctx_base->get_status(), ctx_swa->get_status())) {
|
status(llama_memory_status_combine(ctx_base->get_status(), ctx_swa->get_status())) {
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_kv_cache_unified_iswa_context::llama_kv_cache_unified_iswa_context(
|
llama_kv_cache_iswa_context::llama_kv_cache_iswa_context(
|
||||||
llama_kv_cache_unified_iswa * kv,
|
llama_kv_cache_iswa * kv,
|
||||||
llama_context * lctx,
|
llama_context * lctx,
|
||||||
bool optimize) :
|
bool optimize) :
|
||||||
ctx_base(kv->get_base()->init_update(lctx, optimize)),
|
ctx_base(kv->get_base()->init_update(lctx, optimize)),
|
||||||
|
|
@ -234,21 +265,21 @@ llama_kv_cache_unified_iswa_context::llama_kv_cache_unified_iswa_context(
|
||||||
status(llama_memory_status_combine(ctx_base->get_status(), ctx_swa->get_status())) {
|
status(llama_memory_status_combine(ctx_base->get_status(), ctx_swa->get_status())) {
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_kv_cache_unified_iswa_context::llama_kv_cache_unified_iswa_context(
|
llama_kv_cache_iswa_context::llama_kv_cache_iswa_context(
|
||||||
llama_kv_cache_unified_iswa * kv,
|
llama_kv_cache_iswa * kv,
|
||||||
slot_info_vec_t sinfos_base,
|
slot_info_vec_t sinfos_base,
|
||||||
slot_info_vec_t sinfos_swa,
|
slot_info_vec_t sinfos_swa,
|
||||||
std::vector<llama_ubatch> ubatches) :
|
std::vector<llama_ubatch> ubatches) :
|
||||||
ubatches(std::move(ubatches)),
|
ubatches(std::move(ubatches)),
|
||||||
// note: here we copy the ubatches. not sure if this is ideal
|
// note: here we copy the ubatches. not sure if this is ideal
|
||||||
ctx_base(new llama_kv_cache_unified_context(kv->get_base(), std::move(sinfos_base), this->ubatches)),
|
ctx_base(new llama_kv_cache_context(kv->get_base(), std::move(sinfos_base), this->ubatches)),
|
||||||
ctx_swa (new llama_kv_cache_unified_context(kv->get_swa (), std::move(sinfos_swa), this->ubatches)),
|
ctx_swa (new llama_kv_cache_context(kv->get_swa (), std::move(sinfos_swa), this->ubatches)),
|
||||||
status(llama_memory_status_combine(ctx_base->get_status(), ctx_swa->get_status())) {
|
status(llama_memory_status_combine(ctx_base->get_status(), ctx_swa->get_status())) {
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_kv_cache_unified_iswa_context:: ~llama_kv_cache_unified_iswa_context() = default;
|
llama_kv_cache_iswa_context:: ~llama_kv_cache_iswa_context() = default;
|
||||||
|
|
||||||
bool llama_kv_cache_unified_iswa_context::next() {
|
bool llama_kv_cache_iswa_context::next() {
|
||||||
assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
|
assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
|
||||||
|
|
||||||
ctx_base->next();
|
ctx_base->next();
|
||||||
|
|
@ -261,7 +292,7 @@ bool llama_kv_cache_unified_iswa_context::next() {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool llama_kv_cache_unified_iswa_context::apply() {
|
bool llama_kv_cache_iswa_context::apply() {
|
||||||
assert(!llama_memory_status_is_fail(status));
|
assert(!llama_memory_status_is_fail(status));
|
||||||
|
|
||||||
bool res = true;
|
bool res = true;
|
||||||
|
|
@ -272,24 +303,24 @@ bool llama_kv_cache_unified_iswa_context::apply() {
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_memory_status llama_kv_cache_unified_iswa_context::get_status() const {
|
llama_memory_status llama_kv_cache_iswa_context::get_status() const {
|
||||||
return status;
|
return status;
|
||||||
}
|
}
|
||||||
|
|
||||||
const llama_ubatch & llama_kv_cache_unified_iswa_context::get_ubatch() const {
|
const llama_ubatch & llama_kv_cache_iswa_context::get_ubatch() const {
|
||||||
assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
|
assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
|
||||||
|
|
||||||
return ubatches[i_next];
|
return ubatches[i_next];
|
||||||
}
|
}
|
||||||
|
|
||||||
const llama_kv_cache_unified_context * llama_kv_cache_unified_iswa_context::get_base() const {
|
const llama_kv_cache_context * llama_kv_cache_iswa_context::get_base() const {
|
||||||
assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
|
assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
|
||||||
|
|
||||||
return static_cast<const llama_kv_cache_unified_context *>(ctx_base.get());
|
return static_cast<const llama_kv_cache_context *>(ctx_base.get());
|
||||||
}
|
}
|
||||||
|
|
||||||
const llama_kv_cache_unified_context * llama_kv_cache_unified_iswa_context::get_swa() const {
|
const llama_kv_cache_context * llama_kv_cache_iswa_context::get_swa() const {
|
||||||
assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
|
assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
|
||||||
|
|
||||||
return static_cast<const llama_kv_cache_unified_context *>(ctx_swa.get());
|
return static_cast<const llama_kv_cache_context *>(ctx_swa.get());
|
||||||
}
|
}
|
||||||
|
|
@ -1,19 +1,19 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include "llama-kv-cache-unified.h"
|
#include "llama-kv-cache.h"
|
||||||
|
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
//
|
//
|
||||||
// llama_kv_cache_unified_iswa
|
// llama_kv_cache_iswa
|
||||||
//
|
//
|
||||||
|
|
||||||
// utilizes two instances of llama_kv_cache_unified
|
// utilizes two instances of llama_kv_cache
|
||||||
// the first instance is for the non-SWA layers of the model and the second instance is for the SWA layers
|
// the first instance is for the non-SWA layers of the model and the second instance is for the SWA layers
|
||||||
|
|
||||||
class llama_kv_cache_unified_iswa : public llama_memory_i {
|
+class llama_kv_cache_iswa : public llama_memory_i {
 public:
-    llama_kv_cache_unified_iswa(
+    llama_kv_cache_iswa(
             const llama_model & model,
                     ggml_type   type_k,
                     ggml_type   type_v,

@@ -24,9 +24,11 @@ public:
                      uint32_t   kv_size,
                      uint32_t   n_seq_max,
                      uint32_t   n_ubatch,
-                     uint32_t   n_pad);
+                     uint32_t   n_pad,
+        const layer_filter_cb & filter,
+         const layer_reuse_cb & reuse);

-    ~llama_kv_cache_unified_iswa() = default;
+    ~llama_kv_cache_iswa() = default;

     //
     // llama_memory_i

@@ -54,52 +56,54 @@ public:
     llama_pos seq_pos_min(llama_seq_id seq_id) const override;
     llama_pos seq_pos_max(llama_seq_id seq_id) const override;

+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
     // state write/load

-    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
-    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1) override;
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
+    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;

     //
-    // llama_kv_cache_unified_iswa specific API
+    // llama_kv_cache_iswa specific API
     //

-    llama_kv_cache_unified * get_base() const;
-    llama_kv_cache_unified * get_swa () const;
+    llama_kv_cache * get_base() const;
+    llama_kv_cache * get_swa () const;

 private:
     const llama_hparams & hparams;

     const bool unified;

-    std::unique_ptr<llama_kv_cache_unified> kv_base;
-    std::unique_ptr<llama_kv_cache_unified> kv_swa;
+    std::unique_ptr<llama_kv_cache> kv_base;
+    std::unique_ptr<llama_kv_cache> kv_swa;
 };

-class llama_kv_cache_unified_iswa_context : public llama_memory_context_i {
+class llama_kv_cache_iswa_context : public llama_memory_context_i {
 public:
-    using slot_info_vec_t = llama_kv_cache_unified::slot_info_vec_t;
+    using slot_info_vec_t = llama_kv_cache::slot_info_vec_t;

     // used for errors
-    llama_kv_cache_unified_iswa_context(llama_memory_status status);
+    llama_kv_cache_iswa_context(llama_memory_status status);

     // used to create a full-cache context
-    llama_kv_cache_unified_iswa_context(
-            llama_kv_cache_unified_iswa * kv);
+    llama_kv_cache_iswa_context(
+            llama_kv_cache_iswa * kv);

     // used to create an update context
-    llama_kv_cache_unified_iswa_context(
-            llama_kv_cache_unified_iswa * kv,
+    llama_kv_cache_iswa_context(
+            llama_kv_cache_iswa * kv,
             llama_context * lctx,
             bool optimize);

     // used to create a batch processing context from a batch
-    llama_kv_cache_unified_iswa_context(
-            llama_kv_cache_unified_iswa * kv,
+    llama_kv_cache_iswa_context(
+            llama_kv_cache_iswa * kv,
             slot_info_vec_t sinfos_base,
             slot_info_vec_t sinfos_swa,
             std::vector<llama_ubatch> ubatches);

-    virtual ~llama_kv_cache_unified_iswa_context();
+    virtual ~llama_kv_cache_iswa_context();

     //
     // llama_memory_context_i

@@ -112,14 +116,14 @@ public:
     const llama_ubatch & get_ubatch() const override;

     //
-    // llama_kv_cache_unified_iswa_context specific API
+    // llama_kv_cache_iswa_context specific API
     //

-    const llama_kv_cache_unified_context * get_base() const;
-    const llama_kv_cache_unified_context * get_swa () const;
+    const llama_kv_cache_context * get_base() const;
+    const llama_kv_cache_context * get_swa () const;

 private:
-    //llama_kv_cache_unified_iswa * kv;
+    //llama_kv_cache_iswa * kv;

     // the index of the next ubatch to process
     size_t i_next = 0;
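The iswa constructor now takes its layer callbacks as trailing const-reference parameters instead of a leading rvalue filter. As a rough illustration of what a caller might pass, here is a self-contained sketch built only on the layer_filter_cb/layer_reuse_cb aliases that this commit adds to llama_memory_i; the lambdas are hypothetical, not taken from the commit:

    #include <cstdint>
    #include <functional>

    // aliases as declared on llama_memory_i later in this diff
    using layer_filter_cb = std::function<bool(int32_t il)>;
    using layer_reuse_cb  = std::function<int32_t(int32_t il)>;

    int main() {
        // hypothetical filter: keep every layer in the cache
        layer_filter_cb filter = [](int32_t /*il*/) { return true; };

        // hypothetical reuse rule: odd layers reuse the memory of the layer below,
        // a negative return value means "do not reuse"
        layer_reuse_cb reuse = [](int32_t il) { return il % 2 == 1 ? il - 1 : -1; };

        return (filter(0) && reuse(1) == 0 && reuse(2) < 0) ? 0 : 1;
    }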

File diff suppressed because it is too large

@@ -14,27 +14,13 @@ struct llama_model;
 struct llama_context;

 //
-// llama_kv_cache_unified
+// llama_kv_cache
 //

-class llama_kv_cache_unified : public llama_memory_i {
+class llama_kv_cache : public llama_memory_i {
 public:
     static uint32_t get_padding(const llama_cparams & cparams);

-    // this callback is used to filter out layers that should not be included in the cache
-    using layer_filter_cb = std::function<bool(int32_t il)>;
-
-    struct defrag_info {
-        bool empty() const {
-            return ids.empty();
-        }
-
-        // contains information about which cell moves where:
-        //  - cell i moves to ids[i]
-        //  - if ids[i] == i || ids[i] == ids.size(), then cell i is not moved
-        std::vector<uint32_t> ids;
-    };
-
     struct stream_copy_info {
         bool empty() const {
             assert(ssrc.size() == sdst.size());

@@ -52,8 +38,8 @@ public:
         using idx_vec_t = std::vector<uint32_t>;

         // number of streams: ns = s1 - s0 + 1
-        llama_seq_id s0;
-        llama_seq_id s1;
+        uint32_t s0;
+        uint32_t s1;

         std::vector<llama_seq_id> strm; // [ns]
         std::vector<idx_vec_t>    idxs; // [ns]

@@ -92,9 +78,8 @@ public:

     using slot_info_vec_t = std::vector<slot_info>;

-    llama_kv_cache_unified(
+    llama_kv_cache(
             const llama_model & model,
-              layer_filter_cb && filter,
                     ggml_type   type_k,
                     ggml_type   type_v,
                          bool   v_trans,

@@ -104,9 +89,11 @@ public:
                      uint32_t   n_seq_max,
                      uint32_t   n_pad,
                      uint32_t   n_swa,
-               llama_swa_type   swa_type);
+               llama_swa_type   swa_type,
+        const layer_filter_cb & filter,
+         const layer_reuse_cb & reuse);

-    ~llama_kv_cache_unified() = default;
+    ~llama_kv_cache() = default;

     //
     // llama_memory_i

@@ -134,13 +121,15 @@ public:
     llama_pos seq_pos_min(llama_seq_id seq_id) const override;
     llama_pos seq_pos_max(llama_seq_id seq_id) const override;

+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
     // state write/load

-    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
-    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1) override;
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
+    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;

     //
-    // llama_kv_cache_unified specific API
+    // llama_kv_cache specific API
     //

     uint32_t get_size() const;

@@ -152,10 +141,7 @@ public:
     // graph_build API
     //

-    uint32_t get_n_kv() const;
-
-    // TODO: temporary
-    bool get_supports_set_rows() const;
+    uint32_t get_n_kv(const slot_info & sinfo) const;

     // get views of the current state of the cache
     ggml_tensor * get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const;

@@ -173,7 +159,7 @@ public:
     // return empty vector on failure
     slot_info_vec_t prepare(const std::vector<llama_ubatch> & ubatches);

-    bool update(llama_context * lctx, bool do_shift, const defrag_info & dinfo, const stream_copy_info & sc_info);
+    bool update(llama_context * lctx, bool do_shift, const stream_copy_info & sc_info);

     // find a slot of kv cells that can hold the ubatch
     // if cont == true, then the slot must be continuous

@@ -228,10 +214,7 @@ private:
     // env: LLAMA_KV_CACHE_DEBUG
     int debug = 0;

-    // env: LLAMA_SET_ROWS (temporary)
-    // ref: https://github.com/ggml-org/llama.cpp/pull/14285
-    bool supports_set_rows = true;
-
+    // this is the SWA type of the cache - not to be confused with the model SWA type
     const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;

     std::vector<ggml_context_ptr> ctxs;

@@ -241,7 +224,7 @@ private:
     // note: this is not part of the KV state and it's only used to speed-up the find_slot() method
     std::vector<uint32_t> v_heads;

-    std::vector<llama_kv_cells_unified> v_cells;
+    std::vector<llama_kv_cells> v_cells;

     // maps from a sequence id to a stream id
     std::vector<uint32_t> seq_to_stream;

@@ -254,9 +237,6 @@ private:
     // model layer id -> KV cache layer id
     std::unordered_map<int32_t, int32_t> map_layer_ids;

-    // return non-empty vector if cells have been moved
-    defrag_info defrag_prepare(int32_t n_max_nodes) const;
-
     size_t total_size() const;

     size_t size_k_bytes() const;

@@ -277,11 +257,6 @@ private:
             llm_graph_result * res,
                llama_context * lctx) const;

-    ggml_cgraph * build_graph_defrag(
-            llm_graph_result * res,
-               llama_context * lctx,
-            const defrag_info & dinfo) const;
-
     struct cell_ranges_t {
         uint32_t strm;

@@ -295,35 +270,33 @@ private:
     bool state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count);
 };

-class llama_kv_cache_unified_context : public llama_memory_context_i {
+class llama_kv_cache_context : public llama_memory_context_i {
 public:
     // some shorthands
-    using slot_info_vec_t  = llama_kv_cache_unified::slot_info_vec_t;
-    using defrag_info      = llama_kv_cache_unified::defrag_info;
-    using stream_copy_info = llama_kv_cache_unified::stream_copy_info;
+    using slot_info_vec_t  = llama_kv_cache::slot_info_vec_t;
+    using stream_copy_info = llama_kv_cache::stream_copy_info;

     // used for errors
-    llama_kv_cache_unified_context(llama_memory_status status);
+    llama_kv_cache_context(llama_memory_status status);

     // used to create a full-cache context
-    llama_kv_cache_unified_context(
-            llama_kv_cache_unified * kv);
+    llama_kv_cache_context(
+            llama_kv_cache * kv);

     // used to create an update context
-    llama_kv_cache_unified_context(
-            llama_kv_cache_unified * kv,
+    llama_kv_cache_context(
+            llama_kv_cache * kv,
             llama_context * lctx,
             bool do_shift,
-            defrag_info dinfo,
             stream_copy_info sc_info);

     // used to create a batch procesing context from a batch
-    llama_kv_cache_unified_context(
-            llama_kv_cache_unified * kv,
+    llama_kv_cache_context(
+            llama_kv_cache * kv,
             slot_info_vec_t sinfos,
             std::vector<llama_ubatch> ubatches);

-    virtual ~llama_kv_cache_unified_context();
+    virtual ~llama_kv_cache_context();

     //
     // llama_memory_context_i

@@ -336,22 +309,27 @@ public:
     const llama_ubatch & get_ubatch() const override;

     //
-    // llama_kv_cache_unified_context specific API
+    // llama_kv_cache_context specific API
     //

     uint32_t get_n_kv() const;

-    // TODO: temporary
-    bool get_supports_set_rows() const;
-
     // get views of the current state of the cache
     ggml_tensor * get_k(ggml_context * ctx, int32_t il) const;
     ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;

     // store k_cur and v_cur in the cache based on the provided head location
+    // note: the heads in k_cur and v_cur should be layed out contiguously in memory
+    //  - k_cur [n_embd_head_k, n_head_k, n_tokens]
+    //  - k_idxs [n_tokens]
+    //  - v_cur [n_embd_head_v, n_head_v, n_tokens]
+    //  - v_idxs [n_tokens] or [n_tokens*n_embd_v_gqa] depending if V cache is transposed
     ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const;
     ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il) const;

+    // create destination indices for each head of the current batch for where it would be written in the KV cache
+    // the indices address the global KV cache (not per stream) - this is not relevant for the user of this API, but
+    // helps understand the implementation logic of cpy_k and cpy_v
     ggml_tensor * build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
     ggml_tensor * build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;

@@ -365,7 +343,7 @@ public:
 private:
     llama_memory_status status;

-    llama_kv_cache_unified * kv;
+    llama_kv_cache * kv;
     llama_context * lctx;

     //

@@ -374,8 +352,6 @@ private:

     bool do_shift = false;

-    defrag_info dinfo;
-
     stream_copy_info sc_info;

     //
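For the stream_copy_info record above, the comment ns = s1 - s0 + 1 ties the three members together: the copy involves ns streams, with one stream id and one cell-index list per stream in the range. A minimal sketch of a consistent record; the field shapes mirror the declaration, the values are made up:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // shape of the record, mirroring the header above (illustrative only)
    struct stream_copy_info {
        uint32_t s0, s1;                          // the copy involves streams [s0, s1]
        std::vector<int32_t> strm;                // one stream id per entry, [ns]
        std::vector<std::vector<uint32_t>> idxs;  // one cell-index list per entry, [ns]
    };

    int main() {
        stream_copy_info sc;
        sc.s0   = 2;
        sc.s1   = 3;                  // ns = s1 - s0 + 1 = 2
        sc.strm = {0, 1};
        sc.idxs = {{0, 1, 2}, {5}};
        assert(sc.strm.size() == sc.s1 - sc.s0 + 1);
        assert(sc.idxs.size() == sc.strm.size());
        return 0;
    }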

llama/llama.cpp/src/llama-kv-cells.h (vendored, 42 lines changed)
@@ -11,7 +11,7 @@

 // meta information about KV cells that can be part of multiple sequences at the same time
 // TODO: add unit tests
-class llama_kv_cells_unified {
+class llama_kv_cells {
 public:
     void reset() {
         for (uint32_t i = 0; i < pos.size(); ++i) {

@@ -77,30 +77,30 @@ public:
     }

     // move cell isrc to idst (used during defrag)
-    void mv(uint32_t isrc, uint32_t idst) {
-        assert(isrc < pos.size());
-        assert(idst < pos.size());
-
-        assert(pos[idst] == -1);
-        assert(pos[isrc] != -1);
-
-        pos  [idst] = pos  [isrc];
-        shift[idst] = shift[isrc];
-        seq  [idst] = seq  [isrc];
-
-        pos  [isrc] = -1;
-        shift[isrc] = 0;
-        seq  [isrc].reset();
-
-        used.erase (isrc);
-        used.insert(idst);
-    }
+    //void mv(uint32_t isrc, uint32_t idst) {
+    //    assert(isrc < pos.size());
+    //    assert(idst < pos.size());
+
+    //    assert(pos[idst] == -1);
+    //    assert(pos[isrc] != -1);
+
+    //    pos  [idst] = pos  [isrc];
+    //    shift[idst] = shift[isrc];
+    //    seq  [idst] = seq  [isrc];
+
+    //    pos  [isrc] = -1;
+    //    shift[isrc] = 0;
+    //    seq  [isrc].reset();
+
+    //    used.erase (isrc);
+    //    used.insert(idst);
+    //}

     // copy the state of cells [i, i + n) (used for save/restore the state of the cells)
-    llama_kv_cells_unified cp(uint32_t i, uint32_t n) const {
+    llama_kv_cells cp(uint32_t i, uint32_t n) const {
         assert(i + n <= pos.size());

-        llama_kv_cells_unified res;
+        llama_kv_cells res;

         res.resize(n);

@@ -117,8 +117,8 @@ public:
     }

     // copy the state of cells [idxs[0], idxs[1], ..., idxs[idxs.size() - 1])
-    llama_kv_cells_unified cp(const std::vector<uint32_t> & idxs) const {
-        llama_kv_cells_unified res;
+    llama_kv_cells cp(const std::vector<uint32_t> & idxs) const {
+        llama_kv_cells res;

         res.resize(idxs.size());

@@ -135,7 +135,7 @@ public:
     }

     // set the state of cells [i, i + other.pos.size()) (used for save/restore the state of the cells)
-    void set(uint32_t i, const llama_kv_cells_unified & other) {
+    void set(uint32_t i, const llama_kv_cells & other) {
         assert(i + other.pos.size() <= pos.size());

         for (uint32_t j = 0; j < other.pos.size(); ++j) {

@@ -165,7 +165,7 @@ public:
     }

     // set the state of cells [idxs[0], idxs[1], ..., idxs[idxs.size() - 1])
-    void set(const std::vector<uint32_t> & idxs, const llama_kv_cells_unified & other) {
+    void set(const std::vector<uint32_t> & idxs, const llama_kv_cells & other) {
         assert(idxs.size() == other.pos.size());

         for (uint32_t j = 0; j < other.pos.size(); ++j) {

llama/llama.cpp/src/llama-memory-hybrid.cpp (vendored, 47 lines changed)
@@ -27,14 +27,11 @@ llama_memory_hybrid::llama_memory_hybrid(
                        bool   offload,
                        bool   unified,
     /* layer filters */
-          layer_filter_cb && filter_attn,
-          layer_filter_cb && filter_recr) :
+    const layer_filter_cb &  filter_attn,
+    const layer_filter_cb &  filter_recr) :
     hparams(model.hparams),
-    mem_attn(new llama_kv_cache_unified(
+    mem_attn(new llama_kv_cache(
         model,
-        filter_attn == nullptr ?
-            [&](int32_t il) { return !hparams.is_recurrent(il); }
-            : filter_attn,
         type_k,
         type_v,
         v_trans,

@@ -44,18 +41,22 @@ llama_memory_hybrid::llama_memory_hybrid(
         n_seq_max,
         n_pad,
         n_swa,
-        swa_type
+        swa_type,
+        filter_attn == nullptr ?
+            [&](int32_t il) { return !hparams.is_recurrent(il); }
+            : filter_attn,
+        nullptr
     )),
     mem_recr(new llama_memory_recurrent(
         model,
-        filter_recr == nullptr ?
-            [&](int32_t il) { return hparams.is_recurrent(il); }
-            : filter_recr,
         type_r,
         type_s,
         offload,
         rs_size,
-        n_seq_max
+        n_seq_max,
+        filter_recr == nullptr ?
+            [&](int32_t il) { return hparams.is_recurrent(il); }
+            : filter_recr
     )) {}

 llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {

@@ -165,17 +166,29 @@ llama_pos llama_memory_hybrid::seq_pos_max(llama_seq_id seq_id) const {
     return std::min(mem_attn->seq_pos_max(seq_id), mem_recr->seq_pos_max(seq_id));
 }

+std::map<ggml_backend_buffer_type_t, size_t> llama_memory_hybrid::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> mb = mem_attn->memory_breakdown();
+    for (const auto & buft_size : mem_recr->memory_breakdown()) {
+        mb[buft_size.first] += buft_size.second;
+    }
+    return mb;
+}
+
-void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
+void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
+    GGML_UNUSED(flags);
+
     mem_attn->state_write(io, seq_id);
     mem_recr->state_write(io, seq_id);
 }

-void llama_memory_hybrid::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
+void llama_memory_hybrid::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
+    GGML_UNUSED(flags);
+
     mem_attn->state_read(io, seq_id);
     mem_recr->state_read(io, seq_id);
 }

-llama_kv_cache_unified * llama_memory_hybrid::get_mem_attn() const {
+llama_kv_cache * llama_memory_hybrid::get_mem_attn() const {
     return mem_attn.get();
 }

@@ -206,7 +219,7 @@ llama_memory_hybrid_context::llama_memory_hybrid_context(
         std::vector<llama_ubatch> ubatches) :
     ubatches(std::move(ubatches)),
     // note: here we copy the ubatches. not sure if this is ideal
-    ctx_attn(new llama_kv_cache_unified_context(mem->get_mem_attn(), std::move(sinfos_attn), this->ubatches)),
+    ctx_attn(new llama_kv_cache_context(mem->get_mem_attn(), std::move(sinfos_attn), this->ubatches)),
     ctx_recr(new llama_memory_recurrent_context(mem->get_mem_recr(), this->ubatches)),
     status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
 }

@@ -244,8 +257,8 @@ const llama_ubatch & llama_memory_hybrid_context::get_ubatch() const {
     return ubatches[i_next];
 }

-const llama_kv_cache_unified_context * llama_memory_hybrid_context::get_attn() const {
-    return static_cast<const llama_kv_cache_unified_context *>(ctx_attn.get());
+const llama_kv_cache_context * llama_memory_hybrid_context::get_attn() const {
+    return static_cast<const llama_kv_cache_context *>(ctx_attn.get());
 }

 const llama_memory_recurrent_context * llama_memory_hybrid_context::get_recr() const {
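The hybrid memory's new memory_breakdown() merges the two child breakdowns by adding sizes per buffer type, and a caller can then sum the map for a total. The same merge-and-total pattern in a standalone sketch, with a plain int key standing in for ggml_backend_buffer_type_t:

    #include <cstddef>
    #include <cstdio>
    #include <map>

    int main() {
        // stand-ins for std::map<ggml_backend_buffer_type_t, size_t>
        std::map<int, size_t> attn = {{0, 1024}, {1, 4096}};
        std::map<int, size_t> recr = {{1, 512}};

        // the same merge the hybrid memory performs: add the recurrent
        // sizes into a copy of the attention map, keyed by buffer type
        std::map<int, size_t> mb = attn;
        for (const auto & kv : recr) {
            mb[kv.first] += kv.second;
        }

        size_t total = 0;
        for (const auto & kv : mb) {
            total += kv.second;
        }
        std::printf("total KV memory: %zu bytes\n", total); // prints 5632
        return 0;
    }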

llama/llama.cpp/src/llama-memory-hybrid.h (vendored, 26 lines changed)
@@ -2,7 +2,7 @@

 #include "llama-batch.h"
 #include "llama-graph.h"
-#include "llama-kv-cache-unified.h"
+#include "llama-kv-cache.h"
 #include "llama-memory.h"
 #include "llama-memory-recurrent.h"

@@ -13,15 +13,11 @@
 // llama_memory_hybrid
 //

-// utilizes instances of llama_memory_recurrent and llama_kv_cache_unified to
+// utilizes instances of llama_memory_recurrent and llama_kv_cache to
 // support models where each layer may be either attention-based or recurrent

 class llama_memory_hybrid : public llama_memory_i {
 public:
-    // this callback is used to filter out layers that should not be included in the cache
-    using layer_filter_cb = std::function<bool(int32_t il)>;
-
     llama_memory_hybrid(
         const llama_model & model,
         /* attn */

@@ -41,8 +37,8 @@ public:
                        bool   offload,
                        bool   unified,
         /* layer filters */
-             layer_filter_cb && filter_attn = nullptr,
-             layer_filter_cb && filter_recr = nullptr);
+       const layer_filter_cb & filter_attn = nullptr,
+       const layer_filter_cb & filter_recr = nullptr);

     ~llama_memory_hybrid() = default;

@@ -72,28 +68,30 @@ public:
     llama_pos seq_pos_min(llama_seq_id seq_id) const override;
     llama_pos seq_pos_max(llama_seq_id seq_id) const override;

+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
     // state write/load

-    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
-    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1) override;
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
+    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;

     //
     // llama_memory_hybrid specific API
     //

-    llama_kv_cache_unified * get_mem_attn() const;
+    llama_kv_cache * get_mem_attn() const;
     llama_memory_recurrent * get_mem_recr() const;

 private:
     const llama_hparams & hparams;

-    const std::unique_ptr<llama_kv_cache_unified> mem_attn;
+    const std::unique_ptr<llama_kv_cache> mem_attn;
     const std::unique_ptr<llama_memory_recurrent> mem_recr;
 };

 class llama_memory_hybrid_context : public llama_memory_context_i {
 public:
-    using slot_info_vec_t = llama_kv_cache_unified::slot_info_vec_t;
+    using slot_info_vec_t = llama_kv_cache::slot_info_vec_t;

     // init failure
     explicit llama_memory_hybrid_context(llama_memory_status status);

@@ -125,7 +123,7 @@ public:
     // llama_memory_hybrid_context
     //

-    const llama_kv_cache_unified_context * get_attn() const;
+    const llama_kv_cache_context * get_attn() const;
     const llama_memory_recurrent_context * get_recr() const;

 private:

llama/llama.cpp/src/llama-memory-recurrent.cpp (vendored, 20 lines changed)
@@ -17,12 +17,12 @@

 llama_memory_recurrent::llama_memory_recurrent(
         const llama_model & model,
-          layer_filter_cb && filter,
                 ggml_type   type_r,
                 ggml_type   type_s,
                      bool   offload,
                  uint32_t   mem_size,
-                 uint32_t   n_seq_max) : hparams(model.hparams), n_seq_max(n_seq_max) {
+                 uint32_t   n_seq_max,
+    const layer_filter_cb & filter) : hparams(model.hparams), n_seq_max(n_seq_max) {
     const int32_t n_layer = hparams.n_layer;

     head = 0;

@@ -359,6 +359,14 @@ llama_pos llama_memory_recurrent::seq_pos_max(llama_seq_id seq_id) const {
     return result;
 }

+std::map<ggml_backend_buffer_type_t, size_t> llama_memory_recurrent::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> ret;
+    for (const ggml_backend_buffer_ptr & buf_ptr : bufs) {
+        ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
+    }
+    return ret;
+}
+
 llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
     do {
         balloc.split_reset();

@@ -680,7 +688,9 @@ size_t llama_memory_recurrent::size_s_bytes() const {
     return size_s_bytes;
 }

-void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
+void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
+    GGML_UNUSED(flags);
+
     std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
     uint32_t cell_count = 0;

@@ -718,7 +728,9 @@ void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq
     state_write_data(io, cell_ranges);
 }

-void llama_memory_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
+void llama_memory_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
+    GGML_UNUSED(flags);
+
     uint32_t cell_count;
     io.read_to(&cell_count, sizeof(cell_count));

llama/llama.cpp/src/llama-memory-recurrent.h (vendored, 17 lines changed)
@@ -4,6 +4,7 @@
 #include "llama-graph.h"
 #include "llama-memory.h"

+#include <map>
 #include <set>
 #include <vector>

@@ -12,21 +13,17 @@
 //

 // TODO: extract the cache state used for graph computation into llama_memory_recurrent_context_i
-// see the implementation of llama_kv_cache_unified_context_i for an example how to do it
+// see the implementation of llama_kv_cache_context_i for an example how to do it
 class llama_memory_recurrent : public llama_memory_i {
 public:
-    // this callback is used to filter out layers that should not be included in the cache
-    using layer_filter_cb = std::function<bool(int32_t il)>;
-
     llama_memory_recurrent(
             const llama_model & model,
-              layer_filter_cb && filter,
                     ggml_type   type_r,
                     ggml_type   type_s,
                          bool   offload,
                      uint32_t   mem_size,
-                     uint32_t   n_seq_max);
+                     uint32_t   n_seq_max,
+        const layer_filter_cb & filter);

     ~llama_memory_recurrent() = default;

@@ -54,6 +51,8 @@ public:
     llama_pos seq_pos_min(llama_seq_id seq_id) const override;
     llama_pos seq_pos_max(llama_seq_id seq_id) const override;

+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
     bool prepare(const std::vector<llama_ubatch> & ubatches);

     // find a contiguous slot of memory cells and emplace the ubatch there

@@ -63,8 +62,8 @@ public:
     // state write/load

-    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
-    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1) override;
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
+    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;

     uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot())
     uint32_t size = 0; // total number of cells, shared across all sequences

llama/llama.cpp/src/llama-memory.h (vendored, 26 lines changed)
@@ -2,7 +2,9 @@

 #include "llama.h"

+#include <map>
 #include <memory>
+#include <functional>

 struct llama_ubatch;

@@ -36,8 +38,8 @@ bool llama_memory_status_is_fail(llama_memory_status status);

 // the interface for managing the memory context during batch processing
 // this interface is implemented per memory type. see:
-//  - llama_kv_cache_unified_context
-//  - llama_kv_cache_unified_iswa_context
+//  - llama_kv_cache_context
+//  - llama_kv_cache_iswa_context
 //  ...
 //
 // the only method that should mutate the memory and the memory context is llama_memory_i::apply()

@@ -64,6 +66,13 @@ using llama_memory_context_ptr = std::unique_ptr<llama_memory_context_i>;
 // general concept of LLM memory
 // the KV cache is a type of LLM memory, but there can be other types
 struct llama_memory_i {
+    // this callback is used to filter out layers that should not be included in the cache
+    using layer_filter_cb = std::function<bool(int32_t il)>;
+
+    // this callback is used to specify which layers should reuse memory from other layers
+    // return negative value to indicate that the layer il should not reuse memory
+    using layer_reuse_cb = std::function<int32_t(int32_t il)>;
+
     virtual ~llama_memory_i() = default;

     // split the input batch into a set of ubatches and verify that they can fit into the cache

@@ -77,7 +86,7 @@ struct llama_memory_i {
     // simulate full cache, used for allocating worst-case compute buffers
     virtual llama_memory_context_ptr init_full() = 0;

-    // prepare for any pending memory updates, such as shifts, defrags, etc.
+    // prepare for any pending memory updates, such as shifts, copies, etc.
     // status == LLAMA_MEMORY_STATUS_NO_UPDATE if there is nothing to update
     virtual llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) = 0;

@@ -100,17 +109,14 @@ struct llama_memory_i {
     virtual llama_pos seq_pos_min(llama_seq_id seq_id) const = 0;
     virtual llama_pos seq_pos_max(llama_seq_id seq_id) const = 0;

+    virtual std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const = 0;
+
     //
     // state write/read
     //

-    virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const = 0;
-    virtual void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1) = 0;
+    virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const = 0;
+    virtual void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) = 0;
 };

 using llama_memory_ptr = std::unique_ptr<llama_memory_i>;

-// TODO: temporary until the llama_kv_cache is removed from the public API
-struct llama_kv_cache : public llama_memory_i {
-    virtual ~llama_kv_cache() = default;
-};

llama/llama.cpp/src/llama-model-loader.cpp (vendored, 1 line changed)
@@ -789,6 +789,7 @@ const struct ggml_tensor * llama_model_loader::check_tensor_dims(const std::stri
 }

 struct ggml_tensor * llama_model_loader::create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags) {
+    LLAMA_LOG_DEBUG("%s: loading tensor %s\n", __func__, name.c_str());
     const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));

     if (cur == NULL) {

llama/llama.cpp/src/llama-model.cpp (vendored, 1795 lines changed)
File diff suppressed because it is too large

llama/llama.cpp/src/llama-model.h (vendored, 17 lines changed)
@@ -7,6 +7,7 @@
 #include "llama-memory.h"
 #include "llama-vocab.h"

+#include <map>
 #include <memory>
 #include <string>
 #include <unordered_map>

@@ -28,6 +29,7 @@ enum llm_type {
     LLM_TYPE_80M,
     LLM_TYPE_109M,
     LLM_TYPE_137M,
+    LLM_TYPE_140M,
     LLM_TYPE_160M,
     LLM_TYPE_190M,
     LLM_TYPE_220M,

@@ -36,12 +38,15 @@ enum llm_type {
     LLM_TYPE_270M,
     LLM_TYPE_335M,
     LLM_TYPE_350M,
+    LLM_TYPE_360M,
     LLM_TYPE_410M,
     LLM_TYPE_450M,
     LLM_TYPE_475M,
+    LLM_TYPE_558M,
     LLM_TYPE_700M,
     LLM_TYPE_770M,
     LLM_TYPE_780M,
+    LLM_TYPE_950M,
     LLM_TYPE_0_3B,
     LLM_TYPE_0_5B,
     LLM_TYPE_0_6B,

@@ -54,6 +59,7 @@ enum llm_type {
     LLM_TYPE_1_7B,
     LLM_TYPE_1_8B,
     LLM_TYPE_2B,
+    LLM_TYPE_2_6B,
     LLM_TYPE_2_8B,
     LLM_TYPE_2_9B,
     LLM_TYPE_3B,

@@ -76,9 +82,11 @@ enum llm_type {
     LLM_TYPE_32B,
     LLM_TYPE_34B,
     LLM_TYPE_35B,
+    LLM_TYPE_36B,
     LLM_TYPE_40B,
     LLM_TYPE_65B,
     LLM_TYPE_70B,
+    LLM_TYPE_120B,
     LLM_TYPE_142B,
     LLM_TYPE_236B,
     LLM_TYPE_290B,

@@ -268,6 +276,11 @@ struct llama_layer {
     struct ggml_tensor * ffn_down_shexp = nullptr;
     struct ggml_tensor * ffn_up_shexp   = nullptr;

+    // ff adjugate experts (chexps)
+    struct ggml_tensor * ffn_gate_chexps = nullptr;
+    struct ggml_tensor * ffn_down_chexps = nullptr;
+    struct ggml_tensor * ffn_up_chexps   = nullptr;
+
     // ff bias
     struct ggml_tensor * ffn_gate_b = nullptr;
     struct ggml_tensor * ffn_down_b = nullptr; // b2

@@ -449,10 +462,12 @@ struct llama_model {

     std::string desc() const;

-    size_t size() const;
+    size_t size() const; // file size
     size_t n_tensors() const;
     size_t n_devices() const;

+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const;
+
     // total number of parameters in the model
     uint64_t n_elements() const;

llama/llama.cpp/src/llama-quant.cpp (vendored, 10 lines changed)
@@ -725,7 +725,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     // attention layers have a non-zero number of kv heads
     int32_t n_attn_layer = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0);
     if (llama_model_has_encoder(&model)) {
-        n_attn_layer *= 3;
+        // now n_attn_layer is the number of attention layers in the encoder
+        // for each decoder block, there are 2 attention layers
+        n_attn_layer += 2 * model.hparams.dec_n_layer;
     }
     GGML_ASSERT((qs.n_attention_wv == n_attn_layer - pruned_attention_w) && "n_attention_wv is unexpected");
 }

@@ -920,7 +922,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
             new_type = tensor->type;
             new_data = tensor->data;
             new_size = ggml_nbytes(tensor);
-            LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
+            LLAMA_LOG_INFO("size = %8.3f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0);
         } else {
             const int64_t nelements = ggml_nelements(tensor);

@@ -1037,8 +1039,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     }
     close_ofstream();

-    LLAMA_LOG_INFO("%s: model size  = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
-    LLAMA_LOG_INFO("%s: quant size  = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
+    LLAMA_LOG_INFO("%s: model size  = %8.2f MiB\n", __func__, total_size_org/1024.0/1024.0);
+    LLAMA_LOG_INFO("%s: quant size  = %8.2f MiB\n", __func__, total_size_new/1024.0/1024.0);

     if (qs.n_fallback > 0) {
         LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n",
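The old n_attn_layer *= 3 implicitly assumed a decoder as deep as the encoder, with each decoder block contributing two attention layers (self-attention plus cross-attention); the new form counts the decoder explicitly via dec_n_layer. A quick sanity check that the two agree in the symmetric case, using hypothetical layer counts:

    #include <cassert>
    #include <cstdint>

    int main() {
        // hypothetical T5-style model: 12 encoder layers, 12 decoder blocks
        const int32_t n_layer     = 12; // encoder attention layers
        const int32_t dec_n_layer = 12; // each decoder block adds self- + cross-attention

        const int32_t old_count = n_layer * 3;               // 36
        const int32_t new_count = n_layer + 2 * dec_n_layer; // also 36

        assert(old_count == new_count); // equal only when dec_n_layer == n_layer
        return 0;
    }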

llama/llama.cpp/src/llama-sampling.cpp (vendored, 346 lines changed)
@@ -128,6 +128,89 @@ struct ring_buffer {
     std::vector<T> data;
 };

+// writes result in res, does not mutate cur
+static void llama_token_data_array_partial_sort(const llama_token_data_array & cur, int npartial, std::vector<llama_token_data> & res) {
+    static const auto comp = [](const llama_token_data & a, const llama_token_data & b) {
+        return a.logit > b.logit;
+    };
+
+    constexpr int   nbuckets     = 128;
+    constexpr float bucket_low   = -10.0f;
+    constexpr float bucket_high  =  10.0f;
+    constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
+    constexpr float bucket_inter = -bucket_low * bucket_scale;
+
+    std::vector<int> bucket_idx;
+    std::vector<int> histo(nbuckets, 0);
+
+    std::vector<llama_token_data*> bucket_ptrs;
+
+    bucket_idx.reserve(cur.size);
+
+    for (int i = 0; i < (int)cur.size; ++i) {
+        const float val = cur.data[i].logit;
+        int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
+        ib = std::max(0, std::min(nbuckets - 1, ib));
+        bucket_idx.push_back(ib);
+        ++histo[ib];
+    }
+    int nhave = 0;
+    int ib = nbuckets - 1;
+    for ( ; ib >= 0; --ib) {
+        nhave += histo[ib];
+        if (nhave >= npartial) {
+            break;
+        }
+    }
+    res.resize(nhave);
+    auto * ptr = res.data();
+    bucket_ptrs.reserve(nbuckets - ib);
+    for (int j = nbuckets - 1; j >= ib; --j) {
+        bucket_ptrs.push_back(ptr);
+        ptr += histo[j];
+    }
+    for (int i = 0; i < (int)cur.size; ++i) {
+        int j = bucket_idx[i];
+        if (j >= ib) {
+            *bucket_ptrs[nbuckets - 1 - j]++ = cur.data[i];
+        }
+    }
+
+    ptr = res.data();
+    int ndone = 0;
+    for (int j = nbuckets - 1; j > ib; --j) {
+        std::sort(ptr, ptr + histo[j], comp);
+        ptr += histo[j];
+        ndone += histo[j];
+    }
+    std::partial_sort(ptr, ptr + npartial - ndone, ptr + histo[ib], comp);
+}
+
+// reduces the size of cur_p to npartial, keeping only the top npartial elements
+static void llama_token_data_array_partial_sort_inplace(llama_token_data_array * cur_p, int npartial) {
+    static const auto comp = [](const llama_token_data & a, const llama_token_data & b) {
+        return a.logit > b.logit;
+    };
+
+    if (npartial <= 128) {
+        std::partial_sort(cur_p->data, cur_p->data + npartial, cur_p->data + cur_p->size, comp);
+
+        cur_p->size   = npartial;
+        cur_p->sorted = true;
+
+        return;
+    }
+
+    std::vector<llama_token_data> tmp;
+
+    llama_token_data_array_partial_sort(*cur_p, npartial, tmp);
+
+    std::copy(tmp.data(), tmp.data() + npartial, cur_p->data);
+
+    cur_p->size   = npartial;
+    cur_p->sorted = true;
+}
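The helper above bins logits into 128 buckets over [-10, 10], walks buckets from the top until at least npartial candidates have been collected, fully sorts every bucket above the cutoff, and only partial-sorts the cutoff bucket. A stripped-down sketch of the same idea on plain floats; this is illustrative and not the commit's code:

    #include <algorithm>
    #include <cstdio>
    #include <functional>
    #include <vector>

    // top-k by bucketing: O(n) binning, then sort only the buckets that matter
    static std::vector<float> bucket_top_k(const std::vector<float> & vals, int k) {
        constexpr int   nb = 8;                 // few buckets for the demo; llama.cpp uses 128
        constexpr float lo = -10.0f, hi = 10.0f;

        std::vector<std::vector<float>> buckets(nb);
        for (float v : vals) {
            int ib = int(nb * (v - lo) / (hi - lo));
            ib = std::max(0, std::min(nb - 1, ib));
            buckets[ib].push_back(v);
        }

        // collect from the highest bucket down until we have at least k values
        std::vector<float> res;
        for (int ib = nb - 1; ib >= 0 && (int)res.size() < k; --ib) {
            std::sort(buckets[ib].begin(), buckets[ib].end(), std::greater<float>());
            res.insert(res.end(), buckets[ib].begin(), buckets[ib].end());
        }
        res.resize(k); // extra elements from the cutoff bucket are discarded
        return res;
    }

    int main() {
        const std::vector<float> logits = {0.3f, 7.2f, -1.5f, 9.9f, 2.0f, 7.1f, -9.0f};
        for (float v : bucket_top_k(logits, 3)) {
            std::printf("%.1f ", v); // prints: 9.9 7.2 7.1
        }
        std::printf("\n");
        return 0;
    }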

 static int llama_sample_dist(llama_token_data_array * cur_p, std::mt19937 & rng) {
     // iterator for the probabilities
 #ifdef __GNUC__

@@ -200,18 +283,21 @@ static void llama_sampler_temp_impl(llama_token_data_array * cur_p, float temp)
     }
 }

-static void llama_sampler_softmax_impl(llama_token_data_array * cur_p) {
+static void llama_sampler_softmax_impl(llama_token_data_array * cur_p, bool do_sort) {
     GGML_ASSERT(cur_p->size > 0);

-    // Sort the logits in descending order
-    if (!cur_p->sorted) {
-        std::sort(cur_p->data, cur_p->data + cur_p->size, [](const llama_token_data & a, const llama_token_data & b) {
-            return a.logit > b.logit;
-        });
-        cur_p->sorted = true;
+    // Sort the logits in descending order if requested
+    if (do_sort && !cur_p->sorted) {
+        llama_token_data_array_partial_sort_inplace(cur_p, cur_p->size);
     }

     float max_l = cur_p->data[0].logit;
+    if (!cur_p->sorted) {
+        for (size_t i = 1; i < cur_p->size; ++i) {
+            max_l = std::max(max_l, cur_p->data[i].logit);
+        }
+    }

     float cum_sum = 0.0f;

     for (size_t i = 0; i < cur_p->size; ++i) {

@@ -226,7 +312,6 @@ static void llama_sampler_softmax_impl(llama_token_data_array * cur_p) {
 }

 static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) {
-    // TODO: move bucket sort to separate function so that top_p/typical/softmax first is equally fast
     // if (k >= (int32_t)cur_p->size) {
     //     return;
     // }

@@ -239,64 +324,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)

     // Sort scores in descending order
     if (!cur_p->sorted) {
-        auto comp = [](const llama_token_data & a, const llama_token_data & b) {
-            return a.logit > b.logit;
-        };
-        if (k <= 128) {
-            std::partial_sort(cur_p->data, cur_p->data + k, cur_p->data + cur_p->size, comp);
-        } else {
-            constexpr int   nbuckets     = 128;
-            constexpr float bucket_low   = -10.0f;
-            constexpr float bucket_high  =  10.0f;
-            constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
-            constexpr float bucket_inter = -bucket_low * bucket_scale;
-
-            std::vector<int> bucket_idx(cur_p->size);
-            std::vector<int> histo(nbuckets, 0);
-
-            for (int i = 0; i < (int)cur_p->size; ++i) {
-                const float val = cur_p->data[i].logit;
-                int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
-                ib = std::max(0, std::min(nbuckets - 1, ib));
-                bucket_idx[i] = ib;
-                ++histo[ib];
-            }
-            int nhave = 0;
-            int ib = nbuckets - 1;
-            for ( ; ib >= 0; --ib) {
-                nhave += histo[ib];
-                if (nhave >= k) {
-                    break;
-                }
-            }
-            std::vector<llama_token_data> tmp_tokens(nhave);
-            auto * ptr = tmp_tokens.data();
-            std::vector<llama_token_data*> bucket_ptrs;
-            bucket_ptrs.reserve(nbuckets - ib);
-            for (int j = nbuckets - 1; j >= ib; --j) {
-                bucket_ptrs.push_back(ptr);
-                ptr += histo[j];
-            }
-            for (int i = 0; i < (int)cur_p->size; ++i) {
-                int j = bucket_idx[i];
-                if (j >= ib) {
-                    *bucket_ptrs[nbuckets - 1 - j]++ = cur_p->data[i];
-                }
-            }
-
-            ptr = tmp_tokens.data();
-            int ndone = 0;
-            for (int j = nbuckets - 1; j > ib; --j) {
-                std::sort(ptr, ptr + histo[j], comp);
-                ptr += histo[j];
-                ndone += histo[j];
-            }
-            std::partial_sort(ptr, ptr + k - ndone, ptr + histo[ib], comp);
-
-            std::memcpy(cur_p->data, tmp_tokens.data(), k*sizeof(llama_token_data));
-        }
-        cur_p->sorted = true;
+        llama_token_data_array_partial_sort_inplace(cur_p, k);
     }

     cur_p->size = k;
@@ -576,9 +604,73 @@ static const char * llama_sampler_dist_name(const struct llama_sampler * /*smpl*/) {
 
 static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     auto * ctx = (llama_sampler_dist *) smpl->ctx;
 
-    llama_sampler_softmax_impl(cur_p);
+    // edge cases
+    if (cur_p->size == 0) {
+        cur_p->selected = -1;
+        return;
+    }
+
+    cur_p->selected = 0;
+
+    if (cur_p->size == 1) {
+        cur_p->data[0].p = 1.0f;
+        return;
+    }
+
+    // max logit for numerical stability
+    float max_l = cur_p->data[0].logit;
+    if (!cur_p->sorted) {
+        for (size_t i = 1; i < cur_p->size; ++i) {
+            max_l = std::max(max_l, cur_p->data[i].logit);
+        }
+    }
+
+    // apply softmax to obtain the probabilities
+    double sum_cum = 0.0f;
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        float p = expf(cur_p->data[i].logit - max_l);
+        cur_p->data[i].p = p;
+        sum_cum += p;
+    }
+
+#if 1
+    // sample from the obtained probabilities and normalize the probs in a single pass
+    // this is ~3x faster on Mac with full gpt-oss vocab than the version below
+    //
+    std::uniform_real_distribution<double> dist(0.0f, 1.0f);
+    const double rnd = dist(ctx->rng);
+
+    double sum_run = 0.0f;
+    const double sum_tgt = sum_cum*rnd;
+
+    bool found = false;
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        if (!found) {
+            // accumulate probs until we reach the target sum
+            sum_run += cur_p->data[i].p;
+            if (sum_run >= sum_tgt) {
+                cur_p->selected = i;
+                found = true;
+            }
+        }
+
+        // normalize probs
+        cur_p->data[i].p /= sum_cum;
+    }
+
+    // fallback to the last token (don't think this can happen)
+    assert(found);
+    if (!found) {
+        cur_p->selected = cur_p->size - 1;
+    }
+#else
+    // for clarity, this is the same as above but does one pass for normalization and one extra pass for sampling
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        cur_p->data[i].p /= sum_cum;
+    }
 
     cur_p->selected = llama_sample_dist(cur_p, ctx->rng);
+#endif
 }
 
 static struct llama_sampler * llama_sampler_dist_clone(const struct llama_sampler * smpl) {
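The rewritten dist sampler folds normalization into the draw itself: rather than dividing every probability by the sum before sampling, it scales the random threshold by the unnormalized sum and walks the running total once. A minimal standalone sketch of that trick (illustrative only, not the vendored code; names are made up):

#include <cassert>
#include <cstddef>
#include <random>
#include <vector>

// Draw an index proportionally to unnormalized weights in a single pass:
// pick r in [0, sum) and return the first index where the running sum crosses r.
static size_t sample_unnormalized(const std::vector<float> & w, std::mt19937 & rng) {
    double sum = 0.0;
    for (float x : w) {
        sum += x;
    }
    assert(sum > 0.0);

    std::uniform_real_distribution<double> dist(0.0, 1.0);
    const double target = sum * dist(rng);

    double run = 0.0;
    for (size_t i = 0; i < w.size(); ++i) {
        run += w[i];
        if (run >= target) {
            return i;
        }
    }
    return w.size() - 1; // floating-point round-off guard
}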
@@ -626,32 +718,6 @@ struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
     );
 }
 
-// softmax
-
-static const char * llama_sampler_softmax_name(const struct llama_sampler * /*smpl*/) {
-    return "softmax";
-}
-
-static void llama_sampler_softmax_apply(struct llama_sampler * /*smpl*/, llama_token_data_array * cur_p) {
-    llama_sampler_softmax_impl(cur_p);
-}
-
-static struct llama_sampler_i llama_sampler_softmax_i = {
-    /* .name   = */ llama_sampler_softmax_name,
-    /* .accept = */ nullptr,
-    /* .apply  = */ llama_sampler_softmax_apply,
-    /* .reset  = */ nullptr,
-    /* .clone  = */ nullptr,
-    /* .free   = */ nullptr,
-};
-
-struct llama_sampler * llama_sampler_init_softmax() {
-    return llama_sampler_init(
-        /* .iface = */ &llama_sampler_softmax_i,
-        /* .ctx   = */ nullptr
-    );
-}
-
 // top-k
 
 struct llama_sampler_top_k {
@@ -663,7 +729,7 @@ static const char * llama_sampler_top_k_name(const struct llama_sampler * /*smpl*/) {
 }
 
 static void llama_sampler_top_k_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
-    const auto * ctx = (llama_sampler_top_k *) smpl->ctx;
+    auto * ctx = (llama_sampler_top_k *) smpl->ctx;
     llama_sampler_top_k_impl(cur_p, ctx->k);
 }
 
@@ -699,6 +765,8 @@ struct llama_sampler * llama_sampler_init_top_k(int32_t k) {
 struct llama_sampler_top_p {
     const float  p;
     const size_t min_keep;
+
+    std::vector<llama_token_data> buf_sort;
 };
 
 static const char * llama_sampler_top_p_name(const struct llama_sampler * /*smpl*/) {
@@ -706,20 +774,35 @@ static const char * llama_sampler_top_p_name(const struct llama_sampler * /*smpl*/) {
 }
 
 static void llama_sampler_top_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
-    const auto * ctx = (llama_sampler_top_p *) smpl->ctx;
+    auto * ctx = (llama_sampler_top_p *) smpl->ctx;
 
     if (ctx->p >= 1.0f) {
         return;
     }
 
-    llama_sampler_softmax_impl(cur_p);
+    llama_sampler_softmax_impl(cur_p, false);
+
+    size_t k = cur_p->size;
+    auto * pdata = cur_p->data;
+
+    auto & buf_sort = ctx->buf_sort;
+
+    // if not sorted, try adaptive top-k sorting
+    if (!cur_p->sorted && cur_p->size > 1024) {
+        k = std::min<size_t>(256, cur_p->size);
+        llama_token_data_array_partial_sort(*cur_p, k, buf_sort);
+        pdata = buf_sort.data();
+    } else if (!cur_p->sorted) {
+        // small candidates -> sort inplace
+        llama_token_data_array_partial_sort_inplace(cur_p, k);
+    }
 
     // Compute the cumulative probabilities
     float cum_sum = 0.0f;
     size_t last_idx = cur_p->size;
 
     for (size_t i = 0; i < cur_p->size; ++i) {
-        cum_sum += cur_p->data[i].p;
+        cum_sum += pdata[i].p;
 
         // Check if the running sum is at least p or if we have kept at least min_keep tokens
         // we set the last index to i+1 to indicate that the current iterate should be included in the set
@@ -727,9 +810,21 @@ static void llama_sampler_top_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
             last_idx = i + 1;
             break;
         }
+
+        // we exceeded the current top-k heuristic -> increase k and continue
+        if (!cur_p->sorted && i == k - 1) {
+            k = cur_p->size;
+            llama_token_data_array_partial_sort(*cur_p, k, buf_sort);
+            pdata = buf_sort.data();
+        }
     }
 
     // Resize the output vector to keep only the top-p tokens
+    if (!cur_p->sorted) {
+        std::copy(buf_sort.data(), buf_sort.data() + last_idx, cur_p->data);
+        cur_p->sorted = true;
+    }
+
     cur_p->size = last_idx;
 }
 
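The top-p rework is an adaptive partial sort: for large candidate sets it first orders only the top 256 tokens, and widens to a full sort only if the probability mass target is not reached inside that prefix. A rough standalone sketch of the pattern (hypothetical helper, not the vendored llama_token_data_array_partial_sort API):

#include <algorithm>
#include <cstddef>
#include <functional>
#include <vector>

// How many of the largest probabilities are needed to reach `mass`?
// Partial-sort a 256-element prefix first; widen only if it proves too small.
static size_t count_for_mass(std::vector<float> probs, float mass) {
    size_t k = std::min<size_t>(256, probs.size());
    std::partial_sort(probs.begin(), probs.begin() + k, probs.end(), std::greater<float>());

    float cum = 0.0f;
    for (size_t i = 0; i < probs.size(); ++i) {
        cum += probs[i];
        if (cum >= mass) {
            return i + 1;
        }
        if (i + 1 == k && k < probs.size()) {
            // the prefix was too small -> order the remaining tail as well
            std::sort(probs.begin() + k, probs.end(), std::greater<float>());
            k = probs.size();
        }
    }
    return probs.size();
}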
@@ -757,6 +852,7 @@ struct llama_sampler * llama_sampler_init_top_p(float p, size_t min_keep) {
         /* .ctx   = */ new llama_sampler_top_p {
             /* .p        = */ p,
             /* .min_keep = */ min_keep,
+            /* .buf_sort = */ {},
         }
     );
 }
@@ -773,7 +869,7 @@ static const char * llama_sampler_min_p_name(const struct llama_sampler * /*smpl*/) {
 }
 
 static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
-    const auto * ctx = (llama_sampler_min_p *) smpl->ctx;
+    auto * ctx = (llama_sampler_min_p *) smpl->ctx;
 
     if (ctx->p <= 0.0f || !cur_p->size) {
         return;
@@ -799,7 +895,7 @@ static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
 
     // if we have enough values the operation was a success
     if (!filtered_tokens.empty() && filtered_tokens.size() >= ctx->min_keep) {
-        memcpy(cur_p->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data));
+        std::copy(filtered_tokens.begin(), filtered_tokens.end(), cur_p->data);
         cur_p->size = filtered_tokens.size();
         min_p_applied = true;
     }
@@ -809,10 +905,7 @@ static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     if (!min_p_applied) {
         // Sort the logits in descending order
         if (!cur_p->sorted) {
-            std::sort(cur_p->data, cur_p->data + cur_p->size, [](const llama_token_data & a, const llama_token_data & b) {
-                return a.logit > b.logit;
-            });
-            cur_p->sorted = true;
+            llama_token_data_array_partial_sort_inplace(cur_p, cur_p->size);
         }
 
         const float min_logit = cur_p->data[0].logit + logf(ctx->p); // min logit for p_i >= p * p_max
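The min-p fallback path above never needs probabilities at all: because softmax is monotone and shift-invariant in the logits, p_i >= p * p_max is equivalent to logit_i >= logit_max + log(p), which is exactly what the `min_logit` line computes. A one-line check capturing that identity (sketch, not the vendored code):

#include <cmath>

// softmax is monotone and shift-invariant in the logits, so
//   p_i >= p * p_max  <=>  logit_i >= logit_max + log(p)
// which lets min-p filter on raw logits, skipping the softmax entirely.
static bool passes_min_p(float logit, float logit_max, float p) {
    return logit >= logit_max + std::log(p);
}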
@@ -869,7 +962,7 @@ static const char * llama_sampler_typical_name(const struct llama_sampler * /*smpl*/) {
 }
 
 static void llama_sampler_typical_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
-    const auto * ctx = (llama_sampler_typical *) smpl->ctx;
+    auto * ctx = (llama_sampler_typical *) smpl->ctx;
 
     // Reference implementation:
     // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr
@@ -878,7 +971,7 @@ static void llama_sampler_typical_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     }
 
     // Compute the softmax of logits and calculate entropy
-    llama_sampler_softmax_impl(cur_p);
+    llama_sampler_softmax_impl(cur_p, true);
 
     float entropy = 0.0f;
     for (size_t i = 0; i < cur_p->size; ++i) {
@@ -1012,7 +1105,7 @@ static const char * llama_sampler_temp_ext_name(const struct llama_sampler * /*smpl*/) {
 }
 
 static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
-    const auto * ctx = (llama_sampler_temp_ext *) smpl->ctx;
+    auto * ctx = (llama_sampler_temp_ext *) smpl->ctx;
     if (ctx->delta > 0) {
         const float min_temp = std::max(0.0f, ctx->temp - ctx->delta);
         const float max_temp = ctx->temp + ctx->delta;
@@ -1027,7 +1120,7 @@ static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     // Calculate maximum possible entropy
     float max_entropy = -logf(1.0f / cur_p->size);
 
-    llama_sampler_softmax_impl(cur_p);
+    llama_sampler_softmax_impl(cur_p, true);
 
     // Calculate entropy of the softmax probabilities
     float entropy = 0.0f;
@@ -1139,17 +1232,20 @@ static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
 
     std::uniform_real_distribution<float> distribution(0.0f, 1.0f);
     float chance = distribution(ctx->rng);
-    if (chance > ctx->probability) return;
+    if (chance > ctx->probability) {
+        return;
+    }
 
-    // in case it's not sorted/recalculated yet
-    llama_sampler_softmax_impl(cur_p);
+    llama_sampler_softmax_impl(cur_p, true);
 
     int pos_last = 0;
 
     for (size_t i = 0; i < cur_p->size; ++i) {
         if (cur_p->data[i].p >= ctx->threshold) {
             pos_last = i;
-        } else break;
+        } else {
+            break;
+        }
     }
 
     if (cur_p->size - pos_last >= ctx->min_keep && pos_last > 0) {
@@ -1231,7 +1327,7 @@ static const char * llama_sampler_mirostat_name(const struct llama_sampler * /*smpl*/) {
 static void llama_sampler_mirostat_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     auto * ctx = (llama_sampler_mirostat *) smpl->ctx;
 
-    llama_sampler_softmax_impl(cur_p);
+    llama_sampler_softmax_impl(cur_p, true);
 
     // Estimate s_hat using the most probable m tokens
     float s_hat = 0.0;
@@ -1250,7 +1346,8 @@ static void llama_sampler_mirostat_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     float k = powf((epsilon_hat * powf(2, ctx->mu)) / (1 - powf(ctx->n_vocab, -epsilon_hat)), 1 / s_hat);
 
     llama_sampler_top_k_impl(cur_p, std::max(int(k), 1));
-    llama_sampler_softmax_impl(cur_p);
+
+    llama_sampler_softmax_impl(cur_p, true);
 
     const int idx = llama_sample_dist(cur_p, ctx->rng);
 
@@ -1336,7 +1433,7 @@ static const char * llama_sampler_mirostat_v2_name(const struct llama_sampler * /*smpl*/) {
 static void llama_sampler_mirostat_v2_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     auto * ctx = (llama_sampler_mirostat_v2 *) smpl->ctx;
 
-    llama_sampler_softmax_impl(cur_p);
+    llama_sampler_softmax_impl(cur_p, true);
 
     // Truncate the words with surprise values greater than mu
     cur_p->size = std::distance(cur_p->data, std::find_if(cur_p->data, cur_p->data + cur_p->size, [&](const llama_token_data & candidate) {
@@ -1348,7 +1445,7 @@ static void llama_sampler_mirostat_v2_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     }
 
     // Normalize the probabilities of the remaining words
-    llama_sampler_softmax_impl(cur_p);
+    llama_sampler_softmax_impl(cur_p, true);
 
     const int idx = llama_sample_dist(cur_p, ctx->rng);
 
@@ -1540,7 +1637,7 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
             trigger_pattern += std::regex_replace(trigger_words[i], special_chars, "\\$0");
         }
         trigger_pattern += ")[\\s\\S]*";
-        auto trigger_pattern_c = trigger_pattern.c_str();
+        const auto * trigger_pattern_c = trigger_pattern.c_str();
         trigger_patterns = &trigger_pattern_c;
         num_trigger_patterns = 1;
     }
@@ -1748,7 +1845,7 @@ static const char * llama_sampler_top_n_sigma_name(const struct llama_sampler * /*smpl*/) {
 }
 
 static void llama_sampler_top_n_sigma_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
-    const auto * ctx = (llama_sampler_top_n_sigma *) smpl->ctx;
+    auto * ctx = (llama_sampler_top_n_sigma *) smpl->ctx;
 
     if (ctx->n <= 0.0f || cur_p->size <= 1) {
         return;
@@ -1786,7 +1883,8 @@ static void llama_sampler_top_n_sigma_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
             cur_p->data[i].logit = -INFINITY;
         }
     }
-    llama_sampler_softmax_impl(cur_p);
+
+    llama_sampler_softmax_impl(cur_p, true);
 }
 
 static struct llama_sampler * llama_sampler_top_n_sigma_clone(const struct llama_sampler * smpl) {
@@ -1991,7 +2089,9 @@ static void llama_sampler_dry_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
 
     {
         const int last = last_n_repeat - 1;
-        int rt = 0, lt = 0;
+
+        int rt = 0;
+        int lt = 0;
 
         for (int k = 1; k < last_n_repeat; ++k) {
             if (k > rt) {
@@ -2135,8 +2235,8 @@ static struct llama_sampler_i llama_sampler_dry_i = {
     /* .free   = */ llama_sampler_dry_free,
 };
 
-struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab, int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
-    int32_t effective_dry_penalty_last_n = (dry_penalty_last_n == -1) ? context_size : std::max(dry_penalty_last_n, 0);
+struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab, int32_t n_ctx_train, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
+    int32_t effective_dry_penalty_last_n = (dry_penalty_last_n == -1) ? n_ctx_train : std::max(dry_penalty_last_n, 0);
     std::unordered_multimap<llama_token, std::vector<llama_token>> processed_breakers;
     const int MAX_CHAR_LEN = 40;
     const int MAX_SEQ_LEN = 20;
@@ -2169,7 +2269,7 @@ struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab,
     return llama_sampler_init(
         /* .iface = */ &llama_sampler_dry_i,
         /* .ctx   = */ new llama_sampler_dry {
-            /* .total_context_size     = */ context_size,
+            /* .total_context_size     = */ n_ctx_train,
            /* .dry_multiplier         = */ dry_multiplier,
            /* .dry_base               = */ dry_base,
            /* .dry_allowed_length     = */ dry_allowed_length,
@@ -2308,7 +2408,7 @@ static const char * llama_sampler_infill_name(const struct llama_sampler * /*smpl*/) {
 static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     auto * ctx = (llama_sampler_infill *) smpl->ctx;
 
-    llama_sampler_softmax_impl(cur_p);
+    llama_sampler_softmax_impl(cur_p, true);
 
 #if defined(GGML_DEBUG_SAMPLER_INFILL)
 #define LOG_DBG_CUR LLAMA_LOG_DEBUG
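Throughout this file, llama_sampler_softmax_impl now takes a second boolean argument. Judging from the call sites, true preserves the old sort-then-normalize behaviour, while false (used only by top-p) appears to skip the descending sort so the caller can apply its own adaptive partial sort; the parameter's actual name is not visible in this diff. A sketch of that split under those assumptions:

#include <algorithm>
#include <cmath>
#include <vector>

struct token_data { int id; float logit; float p; };

// Softmax over logits with an optional descending sort. Callers that run
// their own adaptive partial sort (e.g. top-p) would pass do_sort = false.
static void softmax_impl(std::vector<token_data> & cur, bool do_sort) {
    if (cur.empty()) {
        return;
    }
    if (do_sort) {
        std::sort(cur.begin(), cur.end(),
                [](const token_data & a, const token_data & b) { return a.logit > b.logit; });
    }
    // max logit for numerical stability (cur[0] is the max only after sorting)
    float max_l = cur[0].logit;
    for (const auto & t : cur) {
        max_l = std::max(max_l, t.logit);
    }
    double sum = 0.0;
    for (auto & t : cur) {
        t.p = std::exp(t.logit - max_l);
        sum += t.p;
    }
    for (auto & t : cur) {
        t.p = (float)(t.p / sum);
    }
}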

llama/llama.cpp/src/llama-vocab.cpp (vendored, 21 changed lines)

@@ -434,6 +434,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_GROK_2:
+                regex_exprs = {
+                    // original regex from tokenizer.json
+                    // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {
@@ -1763,7 +1770,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             const size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx);
             const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
             precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
-#ifdef IS_BIG_ENDIAN
+#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
             // correct endiannes of data in precompiled_charsmap binary blob
             uint32_t * xcda_blob_size = (uint32_t *) &precompiled_charsmap[0];
             *xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
@@ -1944,7 +1951,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION;
                 clean_spaces = false;
             } else if (
-                tokenizer_pre == "bailingmoe") {
+                tokenizer_pre == "bailingmoe" ||
+                tokenizer_pre == "llada-moe") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
                 clean_spaces = false;
             } else if (
@@ -1963,6 +1971,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "kimi-k2") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;
                 clean_spaces = false;
+            } else if (
+                tokenizer_pre == "grok-2") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_GROK_2;
+                clean_spaces = false;
             } else {
                 LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
                 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
@@ -2331,7 +2343,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
 
         // @ngxson : quick hack for gpt-oss, always render these tokens
         for (const auto & t : token_to_id) {
-            if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>") {
+            if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") {
                 id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
             }
         }
@@ -2378,6 +2390,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
 
             if (has_return && has_call && has_end) {
                 special_eog_ids.erase(end_id);
+                id_to_token[end_id].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
                 LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
             }
         }
@@ -2459,7 +2472,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
     // set attributes by model/tokenizer/architecture name
     if (false
         || _contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})
-        || _contains_any(general_arch, {"nomic-bert-moe"})
+        || _contains_any(general_arch, {"nomic-bert-moe", "jina-bert-v3"})
        ) {
         if (token_to_id.count("<mask>") == 0) {
             LLAMA_LOG_WARN("%s: Mask token is missing in vocab, please reconvert model!\n", __func__);

llama/llama.cpp/src/llama-vocab.h (vendored, 1 changed line)

@@ -47,6 +47,7 @@ enum llama_vocab_pre_type {
     LLAMA_VOCAB_PRE_TYPE_HUNYUAN        = 36,
     LLAMA_VOCAB_PRE_TYPE_KIMI_K2        = 37,
     LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE  = 38,
+    LLAMA_VOCAB_PRE_TYPE_GROK_2         = 39,
 };
 
 struct LLM_KV;

llama/llama.cpp/src/llama.cpp (vendored, 71 changed lines)

@@ -25,6 +25,18 @@
 // interface implementation
 //
 
+const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type) {
+    switch (flash_attn_type) {
+        case LLAMA_FLASH_ATTN_TYPE_AUTO:
+            return "auto";
+        case LLAMA_FLASH_ATTN_TYPE_DISABLED:
+            return "disabled";
+        case LLAMA_FLASH_ATTN_TYPE_ENABLED:
+            return "enabled";
+    }
+    GGML_ABORT("fatal error");
+}
+
 struct llama_sampler_chain_params llama_sampler_chain_default_params() {
     struct llama_sampler_chain_params result = {
         /*.no_perf =*/ true,
@@ -47,6 +59,7 @@ bool llama_supports_mlock(void) {
 
 bool llama_supports_gpu_offload(void) {
     return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
+           ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU) != nullptr ||
            llama_supports_rpc();
 }
 
@@ -71,9 +84,11 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
         GGML_ASSERT(dev && "CPU backend is not loaded");
         auto * reg = ggml_backend_dev_backend_reg(dev);
         auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init");
+        if (numa_init_fn) {
             numa_init_fn(numa);
+        }
     }
 }
 
 void llama_backend_free(void) {
     ggml_quantize_free();
@@ -170,8 +185,13 @@ static struct llama_model * llama_model_load_from_file_impl(
             model->devices.push_back(*dev);
         }
     } else {
+        // default device selection
+
+        // build list of available devices
+        std::vector<ggml_backend_dev_t> gpus;
+        std::vector<ggml_backend_dev_t> igpus;
         std::vector<ggml_backend_dev_t> rpc_servers;
-        // use all available devices
+
         for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
             ggml_backend_dev_t dev = ggml_backend_dev_get(i);
             switch (ggml_backend_dev_type(dev)) {
@@ -180,19 +200,51 @@ static struct llama_model * llama_model_load_from_file_impl(
                     // skip CPU backends since they are handled separately
                     break;
 
-                case GGML_BACKEND_DEVICE_TYPE_GPU:
+                case GGML_BACKEND_DEVICE_TYPE_GPU: {
                     ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
                     if (ggml_backend_reg_name(reg) == std::string("RPC")) {
                         rpc_servers.push_back(dev);
                     } else {
-                        model->devices.push_back(dev);
+                        // check if there is already a GPU with the same device id
+                        ggml_backend_dev_props props;
+                        ggml_backend_dev_get_props(dev, &props);
+                        auto it = std::find_if(gpus.begin(), gpus.end(), [&props](ggml_backend_dev_t d) {
+                            ggml_backend_dev_props d_props;
+                            ggml_backend_dev_get_props(d, &d_props);
+                            if (props.device_id && d_props.device_id) {
+                                return strcmp(props.device_id, d_props.device_id) == 0;
+                            }
+                            return false;
+                        });
+
+                        if (it != gpus.end()) {
+                            LLAMA_LOG_INFO("%s: skipping device %s (%s) with id %s - already using device %s (%s) with the same id\n",
+                                    __func__,
+                                    ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
+                                    props.device_id ? props.device_id : "unknown id",
+                                    ggml_backend_dev_name(*it), ggml_backend_dev_description(*it));
+                        } else {
+                            gpus.push_back(dev);
+                        }
                     }
                     break;
+                }
+
+                case GGML_BACKEND_DEVICE_TYPE_IGPU:
+                    igpus.push_back(dev);
+                    break;
             }
         }
-        // add RPC servers at the front of the list
-        if (!rpc_servers.empty()) {
+
+        // add RPC servers at the front of the list to minimize network transfers
         model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
+
+        // add GPUs
+        model->devices.insert(model->devices.end(), gpus.begin(), gpus.end());
+
+        // add integrated GPUs only if no other devices were found
+        if (model->devices.empty()) {
+            model->devices.insert(model->devices.end(), igpus.begin(), igpus.end());
         }
     }
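Default device selection now buckets devices into discrete GPUs, integrated GPUs, and RPC servers, and skips a GPU whose device_id matches one already kept, which guards against the same physical card being reported twice by different backends. A condensed standalone sketch of the dedup step (plain strings instead of backend handles):

#include <algorithm>
#include <string>
#include <vector>

// Keep the first device seen for each non-empty id; devices without an id
// are never considered duplicates of each other.
static std::vector<std::string> dedup_by_id(const std::vector<std::string> & ids) {
    std::vector<std::string> kept;
    for (const auto & id : ids) {
        const bool dup = !id.empty() && std::find(kept.begin(), kept.end(), id) != kept.end();
        if (!dup) {
            kept.push_back(id);
        }
    }
    return kept;
}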
@@ -213,9 +265,12 @@ static struct llama_model * llama_model_load_from_file_impl(
     }
 
     for (auto * dev : model->devices) {
-        size_t free, total; // NOLINT
-        ggml_backend_dev_memory(dev, &free, &total);
-        LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024);
+        ggml_backend_dev_props props;
+        ggml_backend_dev_get_props(dev, &props);
+        LLAMA_LOG_INFO("%s: using device %s (%s) (%s) - %zu MiB free\n", __func__,
+                ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
+                props.device_id ? props.device_id : "unknown id",
+                props.memory_free/1024/1024);
     }
 
     const int status = llama_model_load(path_model, splits, *model, params);

llama/llama.cpp/src/unicode.h (vendored, 43 changed lines)

@@ -4,6 +4,7 @@
 #include <string>
 #include <vector>
 
+// TODO: reimplement this structure in endian-independent way
 struct unicode_cpt_flags {
     enum {
         UNDEFINED       = 0x0001,
@@ -15,6 +16,10 @@ struct unicode_cpt_flags {
         SYMBOL          = 0x0040,  // regex: \p{S}
         CONTROL         = 0x0080,  // regex: \p{C}
         MASK_CATEGORIES = 0x00FF,
+        WHITESPACE      = 0x0100,
+        LOWERCASE       = 0x0200,
+        UPPERCASE       = 0x0400,
+        NFD             = 0x0800,
     };
 
     // codepoint type
@@ -34,11 +39,49 @@ struct unicode_cpt_flags {
 
     // decode from uint16
     inline unicode_cpt_flags(const uint16_t flags = 0) {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
         *reinterpret_cast<uint16_t*>(this) = flags;
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+        is_undefined   = (flags & UNDEFINED)   ? 1 : 0;
+        is_number      = (flags & NUMBER)      ? 1 : 0;
+        is_letter      = (flags & LETTER)      ? 1 : 0;
+        is_separator   = (flags & SEPARATOR)   ? 1 : 0;
+        is_accent_mark = (flags & ACCENT_MARK) ? 1 : 0;
+        is_punctuation = (flags & PUNCTUATION) ? 1 : 0;
+        is_symbol      = (flags & SYMBOL)      ? 1 : 0;
+        is_control     = (flags & CONTROL)     ? 1 : 0;
+        is_whitespace  = (flags & WHITESPACE)  ? 1 : 0;
+        is_lowercase   = (flags & LOWERCASE)   ? 1 : 0;
+        is_uppercase   = (flags & UPPERCASE)   ? 1 : 0;
+        is_nfd         = (flags & NFD)         ? 1 : 0;
+#else
+#error Unexpected or undefined __BYTE_ORDER__
+#endif
     }
 
     inline uint16_t as_uint() const {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
         return *reinterpret_cast<const uint16_t*>(this);
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+        uint16_t result =
+            is_undefined     * UNDEFINED
+            + is_number      * NUMBER
+            + is_letter      * LETTER
+            + is_separator   * SEPARATOR
+            + is_accent_mark * ACCENT_MARK
+            + is_punctuation * PUNCTUATION
+            + is_symbol      * SYMBOL
+            + is_control     * CONTROL
+            + is_whitespace  * WHITESPACE
+            + is_lowercase   * LOWERCASE
+            + is_uppercase   * UPPERCASE
+            + is_nfd         * NFD
+        ;
+
+        return result;
+#else
+#error Unexpected or undefined __BYTE_ORDER__
+#endif
     }
 
     inline uint16_t category_flag() const {
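On big-endian hosts the flags word can no longer be written through a uint16_t reinterpret-cast, because the bit order inside the word differs, so the constructor and as_uint unpack and repack each field explicitly, keeping the numeric encoding identical across hosts. A reduced self-check of that round-trip idea (four flags only, hypothetical struct):

#include <cassert>
#include <cstdint>

// Pack/unpack flags field-by-field so the numeric encoding is identical
// on every host, regardless of how the compiler lays out bitfields.
struct flags {
    enum : uint16_t { NUMBER = 0x0002, LETTER = 0x0004, WHITESPACE = 0x0100, NFD = 0x0800 };
    unsigned is_number : 1, is_letter : 1, is_whitespace : 1, is_nfd : 1;

    explicit flags(uint16_t v)
        : is_number((v & NUMBER) != 0), is_letter((v & LETTER) != 0),
          is_whitespace((v & WHITESPACE) != 0), is_nfd((v & NFD) != 0) {}

    uint16_t as_uint() const {
        return is_number * NUMBER + is_letter * LETTER
             + is_whitespace * WHITESPACE + is_nfd * NFD;
    }
};

int main() {
    for (uint16_t v : { uint16_t(0x0000), uint16_t(0x0002), uint16_t(0x0906) }) {
        // only bits covered by the four flags survive the round-trip
        assert(flags(v).as_uint() == (v & (0x0002 | 0x0004 | 0x0100 | 0x0800)));
    }
}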

llama/llama.cpp/tools/mtmd/clip-impl.h (vendored, 6 changed lines)

@@ -44,6 +44,7 @@
 #define KEY_WIN_ATTN_PATTERN    "clip.vision.n_wa_pattern"
 #define KEY_ATTN_WINDOW_SIZE    "clip.vision.window_size"
 #define KEY_MINICPMV_VERSION    "clip.minicpmv_version"
+#define KEY_MINICPMV_QUERY_NUM  "clip.minicpmv_query_num"
 
 // audio-specific
 #define KEY_A_NUM_MEL_BINS      "clip.audio.num_mel_bins"
@@ -81,6 +82,7 @@
 #define TN_MVLM_PROJ_PEG   "mm.model.peg.%d.%s"
 #define TN_IMAGE_NEWLINE   "model.image_newline"
 #define TN_MM_INP_NORM     "mm.input_norm.weight"
+#define TN_MM_INP_NORM_B   "mm.input_norm.bias"
 #define TN_MM_INP_PROJ     "mm.input_projection.weight" // gemma3
 #define TN_MM_SOFT_EMB_N   "mm.soft_emb_norm.weight"    // gemma3
 #define TN_MM_PROJECTOR    "mm.model.fc.weight"         // idefics3
@@ -132,6 +134,8 @@ enum projector_type {
     PROJECTOR_TYPE_QWEN2A,
     PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
     PROJECTOR_TYPE_VOXTRAL,
+    PROJECTOR_TYPE_LFM2,
+    PROJECTOR_TYPE_KIMIVL,
     PROJECTOR_TYPE_UNKNOWN,
 };
 
@@ -152,6 +156,8 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_QWEN2A,    "qwen2a"},
     { PROJECTOR_TYPE_QWEN25O,   "qwen2.5o"},
     { PROJECTOR_TYPE_VOXTRAL,   "voxtral"},
+    { PROJECTOR_TYPE_LFM2,      "lfm2"},
+    { PROJECTOR_TYPE_KIMIVL,    "kimivl"},
 };
 
 static projector_type clip_projector_type_from_string(const std::string & str) {

llama/llama.cpp/tools/mtmd/clip.cpp (vendored, 417 changed lines)

@@ -214,6 +214,7 @@ struct clip_hparams {
     // legacy
     bool has_llava_projector = false;
     int minicpmv_version = 0;
+    int32_t minicpmv_query_num = 0; // MiniCPM-V query number
 };
 
 struct clip_layer {
@@ -277,6 +278,7 @@ struct clip_model {
 
     // LLaVA projection
     ggml_tensor * mm_input_norm_w = nullptr;
+    ggml_tensor * mm_input_norm_b = nullptr;
     ggml_tensor * mm_0_w = nullptr;
     ggml_tensor * mm_0_b = nullptr;
     ggml_tensor * mm_2_w = nullptr;
@@ -417,6 +419,7 @@ struct clip_ctx {
         }
         if (!backend) {
            backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr);
+            backend = backend ? backend : ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU, nullptr);
        }
    }
 
@@ -500,11 +503,17 @@ struct clip_graph {
 
     ggml_cgraph * build_siglip() {
         ggml_tensor * inp = build_inp();
+
+        ggml_tensor * learned_pos_embd = model.position_embeddings;
+        if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) {
+            learned_pos_embd = resize_position_embeddings();
+        }
+
         ggml_tensor * cur = build_vit(
                                 inp, n_patches,
                                 NORM_TYPE_NORMAL,
                                 hparams.ffn_op,
-                                model.position_embeddings,
+                                learned_pos_embd,
                                 nullptr);
 
         if (ctx->proj_type() == PROJECTOR_TYPE_GEMMA3) {
@@ -513,8 +522,8 @@ struct clip_graph {
             const int patches_per_image = n_patches_x;
             const int kernel_size       = hparams.proj_scale_factor;
 
-            cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
-            cur = ggml_reshape_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size);
+            cur = ggml_transpose(ctx0, cur);
+            cur = ggml_cont_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size);
 
             // doing a pool2d to reduce the number of output tokens
             cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0);
@@ -531,29 +540,27 @@ struct clip_graph {
                         cur);
 
         } else if (ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3) {
+            // pixel_shuffle
             // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
-
             const int scale_factor = model.hparams.proj_scale_factor;
-            const int n_embd = cur->ne[0];
-            const int seq    = cur->ne[1];
-            const int bsz    = 1; // batch size, always 1 for now since we don't support batching
-            const int height = std::sqrt(seq);
-            const int width  = std::sqrt(seq);
-            GGML_ASSERT(scale_factor != 0);
-            cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height, bsz);
-            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
-            cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur),
-                n_embd * scale_factor * scale_factor,
-                height / scale_factor,
-                width / scale_factor,
-                bsz);
-            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
-            cur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, cur),
-                n_embd * scale_factor * scale_factor,
-                seq / (scale_factor * scale_factor),
-                bsz);
-
+            cur = build_patch_merge_permute(cur, scale_factor);
             cur = ggml_mul_mat(ctx0, model.projection, cur);
 
+        } else if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) {
+            // pixel unshuffle block
+            const int scale_factor = model.hparams.proj_scale_factor;
+            cur = build_patch_merge_permute(cur, scale_factor);
+
+            // projection
+            cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
+            cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
+            cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
+
+            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
+            cur = ggml_add(ctx0, cur, model.mm_1_b);
+            cur = ggml_gelu(ctx0, cur);
+            cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
+            cur = ggml_add(ctx0, cur, model.mm_2_b);
         } else {
             GGML_ABORT("SigLIP: Unsupported projector type");
         }
@@ -681,15 +688,15 @@ struct clip_graph {
             auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
             inp = ggml_add(ctx0, inp, inp_1);
 
-            inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3));  // [w, h, c, b] -> [c, w, h, b]
-            inp = ggml_reshape_4d(
+            inp = ggml_permute(ctx0, inp, 1, 2, 0, 3);  // [w, h, c, b] -> [c, w, h, b]
+            inp = ggml_cont_4d(
                 ctx0, inp,
                 n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
             inp = ggml_reshape_4d(
                 ctx0, inp,
                 n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
-            inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3));
-            inp = ggml_reshape_3d(
+            inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
+            inp = ggml_cont_3d(
                 ctx0, inp,
                 n_embd, n_patches_x * n_patches_y, batch_size);
         }
@@ -879,21 +886,8 @@ struct clip_graph {
         int n_embd = clip_n_mmproj_embd(ctx);
         const int d_head = 128;
         int n_head = n_embd/d_head;
-        int num_query = 96;
-        if (ctx->model.hparams.minicpmv_version == 2) {
-            // MiniCPM-V 2.5
-            num_query = 96;
-        } else if (ctx->model.hparams.minicpmv_version == 3) {
-            // MiniCPM-V 2.6
-            num_query = 64;
-        } else if (ctx->model.hparams.minicpmv_version == 4) {
-            // MiniCPM-o 2.6
-            num_query = 64;
-        } else if (ctx->model.hparams.minicpmv_version == 5) {
-            // MiniCPM-V 4.0
-            num_query = 64;
-        }
+        // Use actual config value if available, otherwise fall back to hardcoded values
+        int num_query = ctx->model.hparams.minicpmv_query_num;
 
         ggml_tensor * Q = ggml_add(ctx0,
             ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q),
             model.mm_model_attn_q_b);
@@ -967,14 +961,14 @@ struct clip_graph {
             GGML_ASSERT(scale_factor > 0);
             cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, height / scale_factor, width, bsz);
             cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
-            cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur),
+            cur = ggml_cont_4d(ctx0, cur,
                 n_embd * scale_factor * scale_factor,
                 height / scale_factor,
                 width / scale_factor,
                 bsz);
             cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
             // flatten to 2D
-            cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, cur),
+            cur = ggml_cont_2d(ctx0, cur,
                 n_embd * scale_factor * scale_factor,
                 cur->ne[1] * cur->ne[2]);
         }
@@ -1060,14 +1054,14 @@ struct clip_graph {
                 n_patches_y,
                 bsz);
             cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
-            cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur),
+            cur = ggml_cont_4d(ctx0, cur,
                 n_embd * scale_factor * scale_factor,
                 n_patches_x / scale_factor,
                 n_patches_y / scale_factor,
                 bsz);
-            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+            //cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
             // flatten to 2D
-            cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, cur),
+            cur = ggml_cont_2d(ctx0, cur,
                 n_embd * scale_factor * scale_factor,
                 n_patches / scale_factor / scale_factor);
             cb(cur, "pixel_shuffle", -1);
@@ -1092,6 +1086,67 @@ struct clip_graph {
         return gf;
     }
 
+    ggml_cgraph * build_kimivl() {
+        // 2D input positions
+        ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
+        ggml_set_name(pos_h, "pos_h");
+        ggml_set_input(pos_h);
+
+        ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
+        ggml_set_name(pos_w, "pos_w");
+        ggml_set_input(pos_w);
+
+        ggml_tensor * learned_pos_embd = resize_position_embeddings();
+
+        // build ViT with 2D position embeddings
+        auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
+            // first half is X axis and second half is Y axis
+            return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false);
+        };
+
+        ggml_tensor * inp = build_inp();
+        ggml_tensor * cur = build_vit(
+                                inp, n_patches,
+                                NORM_TYPE_NORMAL,
+                                hparams.ffn_op,
+                                learned_pos_embd,
+                                add_pos);
+
+        cb(cur, "vit_out", -1);
+
+        {
+            // patch_merger
+            const int scale_factor = model.hparams.proj_scale_factor;
+            cur = build_patch_merge_permute(cur, scale_factor);
+
+            // projection norm
+            int proj_inp_dim = cur->ne[0];
+            cur = ggml_view_2d(ctx0, cur,
+                n_embd, cur->ne[1] * scale_factor * scale_factor,
+                ggml_row_size(cur->type, n_embd), 0);
+            cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
+            cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
+            cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
+            cur = ggml_view_2d(ctx0, cur,
+                proj_inp_dim, cur->ne[1] / scale_factor / scale_factor,
+                ggml_row_size(cur->type, proj_inp_dim), 0);
+            cb(cur, "proj_inp_normed", -1);
+
+            // projection mlp
+            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
+            cur = ggml_add(ctx0, cur, model.mm_1_b);
+            cur = ggml_gelu(ctx0, cur);
+            cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
+            cur = ggml_add(ctx0, cur, model.mm_2_b);
+            cb(cur, "proj_out", -1);
+        }
+
+        // build the graph
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
     // this graph is used by llava, granite and glm
     // due to having embedding_stack (used by granite), we cannot reuse build_vit
     ggml_cgraph * build_llava() {
@@ -1300,8 +1355,8 @@ struct clip_graph {
         ggml_tensor * block_1 = nullptr;
         {
             // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24]
-            mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3));
-            mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
+            mlp_3 = ggml_permute(ctx0, mlp_3, 1, 0, 2, 3);
+            mlp_3 = ggml_cont_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
             // stride = 1, padding = 1, bias is nullptr
             block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
 
@@ -1406,9 +1461,9 @@ struct clip_graph {
         mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b);
         // mlp_2 ne = [2048, 576, 1, 1]
         // // AVG Pool Layer 2*2, strides = 2
-        mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 0, 2, 3));
+        mlp_2 = ggml_permute(ctx0, mlp_2, 1, 0, 2, 3);
         // mlp_2 ne = [576, 2048, 1, 1]
-        mlp_2 = ggml_reshape_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]);
+        mlp_2 = ggml_cont_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]);
         // mlp_2 ne [24, 24, 2048, 1]
         mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
         // weight ne = [3, 3, 2048, 1]
@@ -1428,8 +1483,8 @@ struct clip_graph {
     // glm projector
     else if (ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE) {
         size_t gridsz = (size_t)sqrt(embeddings->ne[1]);
-        embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings,1,0,2,3));
-        embeddings = ggml_reshape_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]);
+        embeddings = ggml_permute(ctx0,embeddings,1,0,2,3);
+        embeddings = ggml_cont_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]);
         embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1);
         embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size);
         embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3));
@@ -1585,6 +1640,29 @@ private:
         }
     }
 
+    // siglip2 naflex
+    ggml_tensor * resize_position_embeddings() {
+        ggml_tensor * pos_embd = model.position_embeddings;
+        const int height = img.ny / patch_size;
+        const int width  = img.nx / patch_size;
+        const uint32_t mode = GGML_SCALE_MODE_BILINEAR;
+        const int n_per_side = (int)std::sqrt(pos_embd->ne[1]);
+
+        GGML_ASSERT(pos_embd);
+
+        if (height == n_per_side && width == n_per_side) {
+            return pos_embd;
+        }
+
+        pos_embd = ggml_reshape_3d(ctx0, pos_embd, n_embd, n_per_side, n_per_side);  // -> (n_embd, n_per_side, n_per_side)
+        pos_embd = ggml_permute(ctx0, pos_embd, 2, 0, 1, 3);                         // -> (n_per_side, n_per_side, n_embd)
+        pos_embd = ggml_interpolate(ctx0, pos_embd, width, height, n_embd, 1, mode); // -> (width, height, n_embd)
+        pos_embd = ggml_permute(ctx0, pos_embd, 1, 2, 0, 3);                         // -> (n_embd, width, height)
+        pos_embd = ggml_cont_2d(ctx0, pos_embd, n_embd, width * height);             // -> (n_embd, width * height)
+
+        return pos_embd;
+    }
+
     // build vision transformer (ViT) cgraph
     // this function should cover most of the models
     // if your model has specific features, you should probably duplicate this function
@ -1963,7 +2041,6 @@ private:
|
||||||
ggml_row_size(cur->type, n_dim),
|
ggml_row_size(cur->type, n_dim),
|
||||||
ggml_row_size(cur->type, n_dim*n_head),
|
ggml_row_size(cur->type, n_dim*n_head),
|
||||||
n_dim/2 * ggml_element_size(cur));
|
n_dim/2 * ggml_element_size(cur));
|
||||||
second = ggml_cont(ctx0, second); // copy, because ggml_rope don't play well with non-contiguous tensors
|
|
||||||
second = ggml_rope_ext(
|
second = ggml_rope_ext(
|
||||||
ctx0,
|
ctx0,
|
||||||
second,
|
second,
|
||||||
|
|
@ -1980,6 +2057,39 @@ private:
|
||||||
return cur;
|
return cur;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL)
|
||||||
|
// support dynamic resolution
|
||||||
|
ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int scale_factor) {
|
||||||
|
GGML_ASSERT(scale_factor > 1);
|
||||||
|
|
||||||
|
const int n_embd = cur->ne[0];
|
||||||
|
int width = img.nx / patch_size;
|
||||||
|
int height = img.ny / patch_size;
|
||||||
|
|
||||||
|
// pad width and height to factor
|
||||||
|
const int64_t pad_width = CLIP_ALIGN(width, scale_factor) - width;
|
||||||
|
const int64_t pad_height = CLIP_ALIGN(height, scale_factor) - height;
|
||||||
|
cur = ggml_reshape_3d(ctx0, cur, n_embd, width, height);
|
||||||
|
if (pad_width || pad_height) {
|
||||||
|
cur = ggml_pad(ctx0, cur, 0, pad_width, pad_height, 0);
|
||||||
|
width += pad_width;
|
||||||
|
height += pad_height;
|
||||||
|
}
|
||||||
|
|
||||||
|
// unshuffle h
|
||||||
|
cur = ggml_reshape_3d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height);
|
||||||
|
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
|
||||||
|
|
||||||
|
// unshuffle w
|
||||||
|
cur = ggml_cont_3d(ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor, width / scale_factor);
|
||||||
|
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
|
||||||
|
|
||||||
|
cur = ggml_cont_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]);
|
||||||
|
cb(cur, "pixel_shuffle", -1);
|
||||||
|
|
||||||
|
return cur;
|
||||||
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
|
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
|
||||||
|
|
@ -1991,6 +2101,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||||
switch (ctx->proj_type()) {
|
switch (ctx->proj_type()) {
|
||||||
case PROJECTOR_TYPE_GEMMA3:
|
case PROJECTOR_TYPE_GEMMA3:
|
||||||
case PROJECTOR_TYPE_IDEFICS3:
|
case PROJECTOR_TYPE_IDEFICS3:
|
||||||
|
case PROJECTOR_TYPE_LFM2:
|
||||||
{
|
{
|
||||||
res = graph.build_siglip();
|
res = graph.build_siglip();
|
||||||
} break;
|
} break;
|
||||||
|
|
@ -2021,6 +2132,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||||
{
|
{
|
||||||
res = graph.build_whisper_enc();
|
res = graph.build_whisper_enc();
|
||||||
} break;
|
} break;
|
||||||
|
case PROJECTOR_TYPE_KIMIVL:
|
||||||
|
{
|
||||||
|
res = graph.build_kimivl();
|
||||||
|
} break;
|
||||||
default:
|
default:
|
||||||
{
|
{
|
||||||
res = graph.build_llava();
|
res = graph.build_llava();
|
||||||
|
|
@ -2151,7 +2266,21 @@ struct clip_model_loader {
|
||||||
get_u32(KEY_PATCH_SIZE, hparams.patch_size);
|
get_u32(KEY_PATCH_SIZE, hparams.patch_size);
|
||||||
get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
|
get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
|
||||||
get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy
|
get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy
|
||||||
|
get_u32(KEY_MINICPMV_QUERY_NUM, hparams.minicpmv_query_num, false);
|
||||||
|
if (hparams.minicpmv_query_num == 0) {
|
||||||
|
// Fallback to hardcoded values for legacy models
|
||||||
|
if (hparams.minicpmv_version == 3) {
|
||||||
|
hparams.minicpmv_query_num = 64;
|
||||||
|
} else if (hparams.minicpmv_version == 4) {
|
||||||
|
hparams.minicpmv_query_num = 64;
|
||||||
|
} else if (hparams.minicpmv_version == 5) {
|
||||||
|
hparams.minicpmv_query_num = 64;
|
||||||
|
} else if (hparams.minicpmv_version == 6) {
|
||||||
|
hparams.minicpmv_query_num = 64;
|
||||||
|
} else {
|
||||||
|
hparams.minicpmv_query_num = 96;
|
||||||
|
}
|
||||||
|
}
|
||||||
} else if (is_audio) {
|
} else if (is_audio) {
|
||||||
get_u32(KEY_A_NUM_MEL_BINS, hparams.n_mel_bins);
|
get_u32(KEY_A_NUM_MEL_BINS, hparams.n_mel_bins);
|
||||||
|
|
||||||
|
|
@ -2243,6 +2372,7 @@ struct clip_model_loader {
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
case PROJECTOR_TYPE_IDEFICS3:
|
case PROJECTOR_TYPE_IDEFICS3:
|
||||||
|
case PROJECTOR_TYPE_LFM2:
|
||||||
case PROJECTOR_TYPE_INTERNVL:
|
case PROJECTOR_TYPE_INTERNVL:
|
||||||
{
|
{
|
||||||
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
|
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
|
||||||
|
|
@ -2256,6 +2386,12 @@ struct clip_model_loader {
|
||||||
hparams.image_size = 1024;
|
hparams.image_size = 1024;
|
||||||
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size, false);
|
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size, false);
|
||||||
} break;
|
} break;
|
||||||
|
case PROJECTOR_TYPE_KIMIVL:
|
||||||
|
{
|
||||||
|
hparams.rope_theta = 10000.0f;
|
||||||
|
hparams.warmup_image_size = hparams.patch_size * 8;
|
||||||
|
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
|
||||||
|
} break;
|
||||||
case PROJECTOR_TYPE_GEMMA3:
|
case PROJECTOR_TYPE_GEMMA3:
|
||||||
{
|
{
|
||||||
// default value (used by all model sizes in gemma 3 family)
|
// default value (used by all model sizes in gemma 3 family)
|
||||||
|
|
@ -2420,7 +2556,20 @@ struct clip_model_loader {
|
||||||
|
|
||||||
// some models already exported with legacy (incorrect) naming which is quite messy, let's fix it here
|
// some models already exported with legacy (incorrect) naming which is quite messy, let's fix it here
|
||||||
// note: Qwen model converted from the old surgery script has n_ff = 0, so we cannot use n_ff to check!
|
// note: Qwen model converted from the old surgery script has n_ff = 0, so we cannot use n_ff to check!
|
||||||
if (layer.ff_up_w && layer.ff_down_w && layer.ff_down_w->ne[0] == hparams.n_embd) {
|
bool is_ffn_swapped = (
|
||||||
|
// only old models need this fix
|
||||||
|
model.proj_type == PROJECTOR_TYPE_MLP
|
||||||
|
|| model.proj_type == PROJECTOR_TYPE_MLP_NORM
|
||||||
|
|| model.proj_type == PROJECTOR_TYPE_LDP
|
||||||
|
|| model.proj_type == PROJECTOR_TYPE_LDPV2
|
||||||
|
|| model.proj_type == PROJECTOR_TYPE_QWEN2VL
|
||||||
|
|| model.proj_type == PROJECTOR_TYPE_QWEN25VL
|
||||||
|
|| model.proj_type == PROJECTOR_TYPE_GLM_EDGE
|
||||||
|
|| model.proj_type == PROJECTOR_TYPE_GEMMA3
|
||||||
|
|| model.proj_type == PROJECTOR_TYPE_IDEFICS3
|
||||||
|
|| model.proj_type == PROJECTOR_TYPE_MINICPMV
|
||||||
|
) && layer.ff_up_w && layer.ff_down_w && layer.ff_down_w->ne[0] == hparams.n_embd;
|
||||||
|
if (is_ffn_swapped) {
|
||||||
// swap up and down weights
|
// swap up and down weights
|
||||||
ggml_tensor * tmp = layer.ff_up_w;
|
ggml_tensor * tmp = layer.ff_up_w;
|
||||||
layer.ff_up_w = layer.ff_down_w;
|
layer.ff_up_w = layer.ff_down_w;
|
||||||
|
|
@ -2429,6 +2578,9 @@ struct clip_model_loader {
|
||||||
tmp = layer.ff_up_b;
|
tmp = layer.ff_up_b;
|
||||||
layer.ff_up_b = layer.ff_down_b;
|
layer.ff_up_b = layer.ff_down_b;
|
||||||
layer.ff_down_b = tmp;
|
layer.ff_down_b = tmp;
|
||||||
|
if (il == 0) {
|
||||||
|
LOG_WRN("%s: ffn up/down are swapped\n", __func__);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -2546,6 +2698,16 @@ struct clip_model_loader {
|
||||||
{
|
{
|
||||||
model.projection = get_tensor(TN_MM_PROJECTOR);
|
model.projection = get_tensor(TN_MM_PROJECTOR);
|
||||||
} break;
|
} break;
|
||||||
|
case PROJECTOR_TYPE_LFM2:
|
||||||
|
case PROJECTOR_TYPE_KIMIVL:
|
||||||
|
{
|
||||||
|
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM);
|
||||||
|
model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B);
|
||||||
|
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
|
||||||
|
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
|
||||||
|
model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
|
||||||
|
model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
|
||||||
|
} break;
|
||||||
case PROJECTOR_TYPE_PIXTRAL:
|
case PROJECTOR_TYPE_PIXTRAL:
|
||||||
{
|
{
|
||||||
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
|
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
|
||||||
|
|
@ -2944,7 +3106,7 @@ struct image_manipulation {
|
||||||
dst.buf.resize(3 * target_width * target_height);
|
dst.buf.resize(3 * target_width * target_height);
|
||||||
|
|
||||||
float Cc;
|
float Cc;
|
||||||
float C[5];
|
float C[5] = {};
|
||||||
float d0, d2, d3, a0, a1, a2, a3;
|
float d0, d2, d3, a0, a1, a2, a3;
|
||||||
int i, j, k, jj;
|
int i, j, k, jj;
|
||||||
int x, y;
|
int x, y;
|
||||||
|
|
@ -3467,6 +3629,45 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
|
||||||
res_imgs->grid_y = inst.grid_size.height;
|
res_imgs->grid_y = inst.grid_size.height;
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
|
} else if ( ctx->proj_type() == PROJECTOR_TYPE_LFM2
|
||||||
|
|| ctx->proj_type() == PROJECTOR_TYPE_KIMIVL
|
||||||
|
) {
|
||||||
|
GGML_ASSERT(params.proj_scale_factor);
|
||||||
|
|
||||||
|
// smart resize
|
||||||
|
const int width = img->nx;
|
||||||
|
const int height = img->ny;
|
||||||
|
const int total_factor = params.patch_size * params.proj_scale_factor;
|
||||||
|
constexpr int min_image_tokens = 64;
|
||||||
|
constexpr int max_image_tokens = 1024;
|
||||||
|
const float min_pixels = min_image_tokens * total_factor * total_factor;
|
||||||
|
const float max_pixels = max_image_tokens * total_factor * total_factor;
|
||||||
|
|
||||||
|
auto round_by_factor = [f = total_factor](float x) { return static_cast<int>(std::nearbyintf(x / static_cast<float>(f))) * f; };
|
||||||
|
auto ceil_by_factor = [f = total_factor](float x) { return static_cast<int>(std::ceil(x / static_cast<float>(f))) * f; };
|
||||||
|
auto floor_by_factor = [f = total_factor](float x) { return static_cast<int>(std::floor(x / static_cast<float>(f))) * f; };
|
||||||
|
|
||||||
|
int h_bar = std::max(total_factor, round_by_factor(height));
|
||||||
|
int w_bar = std::max(total_factor, round_by_factor(width));
|
||||||
|
|
||||||
|
if (h_bar * w_bar > max_pixels) {
|
||||||
|
const auto beta = std::sqrt((height * width) / max_pixels);
|
||||||
|
h_bar = std::max(total_factor, floor_by_factor(height / beta));
|
||||||
|
w_bar = std::max(total_factor, floor_by_factor(width / beta));
|
||||||
|
} else if (h_bar * w_bar < min_pixels) {
|
||||||
|
const auto beta = std::sqrt(min_pixels / (height * width));
|
||||||
|
h_bar = ceil_by_factor(height * beta);
|
||||||
|
w_bar = ceil_by_factor(width * beta);
|
||||||
|
}
|
||||||
|
|
||||||
|
const std::array<uint8_t, 3> pad_color = {122, 116, 104};
|
||||||
|
|
||||||
|
clip_image_u8 resized_img;
|
||||||
|
image_manipulation::resize_and_pad_image(*img, resized_img, clip_image_size{w_bar, h_bar}, pad_color);
|
||||||
|
clip_image_f32_ptr res(clip_image_f32_init());
|
||||||
|
normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std);
|
||||||
|
res_imgs->entries.push_back(std::move(res));
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
|
// the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
|
||||||
|
|
@ -3506,10 +3707,10 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
|
} else {
|
||||||
|
GGML_ABORT("Unknown image preprocessing type");
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_ASSERT(false && "Unknown image preprocessing type");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
|
ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
|
||||||
|
|
@ -3573,8 +3774,9 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
|
||||||
int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
|
int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
|
||||||
const auto & params = ctx->model.hparams;
|
const auto & params = ctx->model.hparams;
|
||||||
|
|
||||||
// only for models using fixed size square images
|
// for models with fixed size image, the input image is already pre-processed and resized to square
|
||||||
int n_patches_sq = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
|
int patch_size = params.patch_size;
|
||||||
|
int n_patches = (img->nx / patch_size) * (img->ny / patch_size);
|
||||||
|
|
||||||
projector_type proj = ctx->proj_type();
|
projector_type proj = ctx->proj_type();
|
||||||
|
|
||||||
|
|
@ -3588,89 +3790,97 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
||||||
case PROJECTOR_TYPE_LDPV2:
|
case PROJECTOR_TYPE_LDPV2:
|
||||||
case PROJECTOR_TYPE_GLM_EDGE:
|
case PROJECTOR_TYPE_GLM_EDGE:
|
||||||
{
|
{
|
||||||
n_patches_sq /= 4;
|
n_patches /= 4;
|
||||||
if (ctx->model.mm_glm_tok_boi) {
|
if (ctx->model.mm_glm_tok_boi) {
|
||||||
n_patches_sq += 2; // for BOI and EOI token embeddings
|
n_patches += 2; // for BOI and EOI token embeddings
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
case PROJECTOR_TYPE_MINICPMV:
|
case PROJECTOR_TYPE_MINICPMV:
|
||||||
{
|
{
|
||||||
|
// Use actual config value if available, otherwise fall back to hardcoded values
|
||||||
|
if (params.minicpmv_query_num > 0) {
|
||||||
|
n_patches = params.minicpmv_query_num;
|
||||||
|
} else {
|
||||||
|
// Fallback to hardcoded values for legacy models
|
||||||
if (params.minicpmv_version == 2) {
|
if (params.minicpmv_version == 2) {
|
||||||
// MiniCPM-V 2.5
|
n_patches = 96;
|
||||||
n_patches_sq = 96;
|
|
||||||
} else if (params.minicpmv_version == 3) {
|
} else if (params.minicpmv_version == 3) {
|
||||||
// MiniCPM-V 2.6
|
n_patches = 64;
|
||||||
n_patches_sq = 64;
|
|
||||||
} else if (params.minicpmv_version == 4) {
|
} else if (params.minicpmv_version == 4) {
|
||||||
// MiniCPM-o 2.6
|
n_patches = 64;
|
||||||
n_patches_sq = 64;
|
|
||||||
} else if (params.minicpmv_version == 5) {
|
} else if (params.minicpmv_version == 5) {
|
||||||
// MiniCPM-V 4.0
|
// MiniCPM-V 4.0
|
||||||
n_patches_sq = 64;
|
n_patches = 64;
|
||||||
|
} else if (params.minicpmv_version == 6) {
|
||||||
|
// MiniCPM-V 4.5
|
||||||
|
n_patches = 64;
|
||||||
} else {
|
} else {
|
||||||
GGML_ABORT("Unknown minicpmv version");
|
GGML_ABORT("Unknown minicpmv version");
|
||||||
}
|
}
|
||||||
|
}
|
||||||
} break;
|
} break;
|
||||||
case PROJECTOR_TYPE_QWEN2VL:
|
case PROJECTOR_TYPE_QWEN2VL:
|
||||||
case PROJECTOR_TYPE_QWEN25VL:
|
case PROJECTOR_TYPE_QWEN25VL:
|
||||||
{
|
{
|
||||||
// dynamic size
|
// dynamic size (2 conv, so double patch size)
|
||||||
int patch_size = params.patch_size * 2;
|
int patch_size = params.patch_size * 2;
|
||||||
int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
|
int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
|
||||||
int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
|
int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
|
||||||
n_patches_sq = x_patch * y_patch;
|
n_patches = x_patch * y_patch;
|
||||||
} break;
|
} break;
|
||||||
case PROJECTOR_TYPE_GEMMA3:
|
case PROJECTOR_TYPE_GEMMA3:
|
||||||
{
|
|
||||||
int n_per_side = params.image_size / params.patch_size;
|
|
||||||
int n_per_side_2d_pool = n_per_side / params.proj_scale_factor;
|
|
||||||
n_patches_sq = n_per_side_2d_pool * n_per_side_2d_pool;
|
|
||||||
} break;
|
|
||||||
case PROJECTOR_TYPE_IDEFICS3:
|
case PROJECTOR_TYPE_IDEFICS3:
|
||||||
case PROJECTOR_TYPE_INTERNVL:
|
case PROJECTOR_TYPE_INTERNVL:
|
||||||
|
case PROJECTOR_TYPE_LLAMA4:
|
||||||
{
|
{
|
||||||
// both W and H are divided by proj_scale_factor
|
// both X and Y are downscaled by the scale factor
|
||||||
n_patches_sq /= (params.proj_scale_factor * params.proj_scale_factor);
|
int scale_factor = ctx->model.hparams.proj_scale_factor;
|
||||||
|
n_patches /= (scale_factor * scale_factor);
|
||||||
|
} break;
|
||||||
|
case PROJECTOR_TYPE_LFM2:
|
||||||
|
case PROJECTOR_TYPE_KIMIVL:
|
||||||
|
{
|
||||||
|
// dynamic size
|
||||||
|
int scale_factor = ctx->model.hparams.proj_scale_factor;
|
||||||
|
int out_patch_size = params.patch_size * scale_factor;
|
||||||
|
int x_patch = CLIP_ALIGN(img->nx, out_patch_size) / out_patch_size;
|
||||||
|
int y_patch = CLIP_ALIGN(img->ny, out_patch_size) / out_patch_size;
|
||||||
|
n_patches = x_patch * y_patch;
|
||||||
} break;
|
} break;
|
||||||
case PROJECTOR_TYPE_PIXTRAL:
|
case PROJECTOR_TYPE_PIXTRAL:
|
||||||
{
|
{
|
||||||
// dynamic size
|
// dynamic size
|
||||||
int n_merge = params.spatial_merge_size;
|
int n_merge = params.spatial_merge_size;
|
||||||
int n_patches_x = img->nx / params.patch_size / (n_merge > 0 ? n_merge : 1);
|
int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1);
|
||||||
int n_patches_y = img->ny / params.patch_size / (n_merge > 0 ? n_merge : 1);
|
int n_patches_y = img->ny / patch_size / (n_merge > 0 ? n_merge : 1);
|
||||||
n_patches_sq = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
|
n_patches = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
|
||||||
} break;
|
|
||||||
case PROJECTOR_TYPE_LLAMA4:
|
|
||||||
{
|
|
||||||
int scale_factor = ctx->model.hparams.proj_scale_factor;
|
|
||||||
n_patches_sq /= (scale_factor * scale_factor);
|
|
||||||
} break;
|
} break;
|
||||||
case PROJECTOR_TYPE_VOXTRAL:
|
case PROJECTOR_TYPE_VOXTRAL:
|
||||||
case PROJECTOR_TYPE_ULTRAVOX:
|
case PROJECTOR_TYPE_ULTRAVOX:
|
||||||
case PROJECTOR_TYPE_QWEN2A:
|
case PROJECTOR_TYPE_QWEN2A:
|
||||||
{
|
{
|
||||||
n_patches_sq = img->nx;
|
n_patches = img->nx;
|
||||||
|
|
||||||
const int proj_stack_factor = ctx->model.hparams.proj_stack_factor;
|
const int proj_stack_factor = ctx->model.hparams.proj_stack_factor;
|
||||||
if (ctx->model.audio_has_stack_frames()) {
|
if (ctx->model.audio_has_stack_frames()) {
|
||||||
GGML_ASSERT(proj_stack_factor > 0);
|
GGML_ASSERT(proj_stack_factor > 0);
|
||||||
const int n_len = CLIP_ALIGN(n_patches_sq, proj_stack_factor);
|
const int n_len = CLIP_ALIGN(n_patches, proj_stack_factor);
|
||||||
n_patches_sq = n_len / proj_stack_factor;
|
n_patches = n_len / proj_stack_factor;
|
||||||
}
|
}
|
||||||
|
|
||||||
// whisper downscales input token by half after conv1d
|
// whisper downscales input token by half after conv1d
|
||||||
n_patches_sq /= 2;
|
n_patches /= 2;
|
||||||
|
|
||||||
if (ctx->model.audio_has_avgpool()) {
|
if (ctx->model.audio_has_avgpool()) {
|
||||||
// divide by 2 because of nn.AvgPool1d(2, stride=2)
|
// divide by 2 because of nn.AvgPool1d(2, stride=2)
|
||||||
n_patches_sq /= 2;
|
n_patches /= 2;
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
default:
|
default:
|
||||||
GGML_ABORT("unsupported projector type");
|
GGML_ABORT("unsupported projector type");
|
||||||
}
|
}
|
||||||
|
|
||||||
return n_patches_sq;
|
return n_patches;
|
||||||
}
|
}
|
||||||
|
|
||||||
static std::vector<std::vector<std::vector<float>>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector<std::vector<float>> & pos) {
|
static std::vector<std::vector<std::vector<float>>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector<std::vector<float>> & pos) {
|
||||||
|
|
@ -4019,6 +4229,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||||
set_input_i32("positions", positions);
|
set_input_i32("positions", positions);
|
||||||
} break;
|
} break;
|
||||||
case PROJECTOR_TYPE_PIXTRAL:
|
case PROJECTOR_TYPE_PIXTRAL:
|
||||||
|
case PROJECTOR_TYPE_KIMIVL:
|
||||||
{
|
{
|
||||||
// set the 2D positions
|
// set the 2D positions
|
||||||
int n_patches_per_col = image_size_width / patch_size;
|
int n_patches_per_col = image_size_width / patch_size;
|
||||||
|
|
@ -4070,6 +4281,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||||
case PROJECTOR_TYPE_INTERNVL:
|
case PROJECTOR_TYPE_INTERNVL:
|
||||||
case PROJECTOR_TYPE_QWEN2A:
|
case PROJECTOR_TYPE_QWEN2A:
|
||||||
case PROJECTOR_TYPE_ULTRAVOX:
|
case PROJECTOR_TYPE_ULTRAVOX:
|
||||||
|
case PROJECTOR_TYPE_LFM2:
|
||||||
case PROJECTOR_TYPE_VOXTRAL:
|
case PROJECTOR_TYPE_VOXTRAL:
|
||||||
{
|
{
|
||||||
// do nothing
|
// do nothing
|
||||||
|
|
@ -4141,7 +4353,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||||
}
|
}
|
||||||
|
|
||||||
int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
||||||
const auto & hparams = ctx->model.hparams;
|
|
||||||
switch (ctx->model.proj_type) {
|
switch (ctx->model.proj_type) {
|
||||||
case PROJECTOR_TYPE_LDP:
|
case PROJECTOR_TYPE_LDP:
|
||||||
return ctx->model.mm_model_block_1_block_2_1_b->ne[0];
|
return ctx->model.mm_model_block_1_block_2_1_b->ne[0];
|
||||||
|
|
@ -4153,20 +4364,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
||||||
case PROJECTOR_TYPE_MLP_NORM:
|
case PROJECTOR_TYPE_MLP_NORM:
|
||||||
return ctx->model.mm_3_b->ne[0];
|
return ctx->model.mm_3_b->ne[0];
|
||||||
case PROJECTOR_TYPE_MINICPMV:
|
case PROJECTOR_TYPE_MINICPMV:
|
||||||
if (hparams.minicpmv_version == 2) {
|
return ctx->model.mm_model_proj->ne[0];
|
||||||
// MiniCPM-V 2.5
|
|
||||||
return 4096;
|
|
||||||
} else if (hparams.minicpmv_version == 3) {
|
|
||||||
// MiniCPM-V 2.6
|
|
||||||
return 3584;
|
|
||||||
} else if (hparams.minicpmv_version == 4) {
|
|
||||||
// MiniCPM-o 2.6
|
|
||||||
return 3584;
|
|
||||||
} else if (hparams.minicpmv_version == 5) {
|
|
||||||
// MiniCPM-V 4.0
|
|
||||||
return 2560;
|
|
||||||
}
|
|
||||||
GGML_ABORT("Unknown minicpmv version");
|
|
||||||
case PROJECTOR_TYPE_GLM_EDGE:
|
case PROJECTOR_TYPE_GLM_EDGE:
|
||||||
return ctx->model.mm_model_mlp_3_w->ne[1];
|
return ctx->model.mm_model_mlp_3_w->ne[1];
|
||||||
case PROJECTOR_TYPE_QWEN2VL:
|
case PROJECTOR_TYPE_QWEN2VL:
|
||||||
|
|
@ -4185,6 +4383,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
||||||
return ctx->model.mm_model_proj->ne[1];
|
return ctx->model.mm_model_proj->ne[1];
|
||||||
case PROJECTOR_TYPE_QWEN2A:
|
case PROJECTOR_TYPE_QWEN2A:
|
||||||
return ctx->model.mm_fc_w->ne[1];
|
return ctx->model.mm_fc_w->ne[1];
|
||||||
|
case PROJECTOR_TYPE_LFM2:
|
||||||
|
case PROJECTOR_TYPE_KIMIVL:
|
||||||
|
return ctx->model.mm_2_w->ne[1];
|
||||||
default:
|
default:
|
||||||
GGML_ABORT("Unknown projector type");
|
GGML_ABORT("Unknown projector type");
|
||||||
}
|
}
|
||||||
|
|
|
||||||
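Note on the clip.cpp change above: clip_n_output_tokens now derives token counts from the actual image dimensions instead of a fixed square image_size. Below is a minimal standalone sketch of the dynamic-size arithmetic in the LFM2/Kimi-VL branch, assuming CLIP_ALIGN rounds its first argument up to the next multiple of the second (its conventional definition in clip.cpp); the example numbers are illustrative only, not taken from this diff.

    // Sketch, not ollama/llama.cpp code: mirrors the LFM2/KIMIVL case of clip_n_output_tokens.
    #include <cstdio>

    static int clip_align(int a, int b) { return (a + b - 1) / b * b; } // assumed CLIP_ALIGN semantics

    int dynamic_n_patches(int nx, int ny, int patch_size, int scale_factor) {
        const int out_patch_size = patch_size * scale_factor;                 // edge of one merged output patch
        const int x_patch = clip_align(nx, out_patch_size) / out_patch_size;  // columns, rounded up
        const int y_patch = clip_align(ny, out_patch_size) / out_patch_size;  // rows, rounded up
        return x_patch * y_patch;
    }

    int main() {
        // 336x448 image, 14 px patches, scale factor 2 -> 12 * 16 = 192 tokens
        printf("%d\n", dynamic_n_patches(336, 448, 14, 2));
    }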
llama/llama.cpp/tools/mtmd/clip.h (vendored, 5 lines changed)

@@ -82,11 +82,6 @@ struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch
  */
 void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);

-bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
-
-/** interpret bytes as an image file with length bytes_length, and use the result to populate img */
-bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
-
 /** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
 bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
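With clip_image_load_from_file and clip_image_load_from_bytes removed from clip.h, callers decode images themselves and hand raw RGB pixels to clip. A hedged sketch of the remaining flow follows; clip_image_u8_init and clip_image_u8_free are assumed to exist elsewhere in clip.h and are not part of this diff.

    // Sketch under stated assumptions; only clip_build_img_from_pixels and
    // clip_image_preprocess appear in the header diff above.
    #include "clip.h"

    bool preprocess_rgb(struct clip_ctx * ctx,
                        const unsigned char * rgb, int nx, int ny,
                        struct clip_image_f32_batch * out) {
        struct clip_image_u8 * img = clip_image_u8_init(); // assumed allocator
        clip_build_img_from_pixels(rgb, nx, ny, img);      // copy decoded pixels in
        bool ok = clip_image_preprocess(ctx, img, out);    // model-specific resize/normalize
        clip_image_u8_free(img);                           // assumed matching free
        return ok;
    }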
llama/llama.cpp/tools/mtmd/mtmd.cpp (vendored, 2 lines changed)

@@ -217,7 +217,7 @@ struct mtmd_context {
         tok_row_end_trail = false; // no trailing end-of-row token
         ov_img_first = true;

-    } else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5) {
+    } else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5 || minicpmv_version == 6) {
         // minicpmv 2.6 format:
         // <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
         slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_6;
llama/llama.cpp/vendor/miniaudio/miniaudio.h (vendored, 5695 lines changed; diff suppressed because it is too large)
@@ -116,7 +116,11 @@ func NewContextParams(numCtx int, batchSize int, numSeqMax int, threads int, fla
 	params.n_threads = C.int(threads)
 	params.n_threads_batch = params.n_threads
 	params.embeddings = C.bool(true)
-	params.flash_attn = C.bool(flashAttention)
+	if flashAttention {
+		params.flash_attn_type = C.LLAMA_FLASH_ATTN_TYPE_ENABLED
+	} else {
+		params.flash_attn_type = C.LLAMA_FLASH_ATTN_TYPE_DISABLED
+	}
 	params.type_k = kvCacheTypeFromStr(strings.ToLower(kvCacheType))
 	params.type_v = kvCacheTypeFromStr(strings.ToLower(kvCacheType))
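The Go hunk above tracks an upstream llama.cpp API change: the boolean flash_attn context field gives way to a flash_attn_type enum. The same mapping expressed against the C API, as a sketch; the field and constant names are taken from the diff itself and not verified beyond it.

    // Sketch: boolean flag -> llama_flash_attn_type, per the cgo constants above.
    #include "llama.h"

    llama_context_params make_ctx_params(bool flash_attention) {
        llama_context_params params = llama_context_default_params();
        params.flash_attn_type = flash_attention ? LLAMA_FLASH_ATTN_TYPE_ENABLED
                                                 : LLAMA_FLASH_ATTN_TYPE_DISABLED;
        return params;
    }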
@@ -15,18 +15,10 @@ problem.
  ggml/src/ggml-backend.cpp            | 9 +++++++--
  ggml/src/ggml-cann/ggml-cann.cpp     | 2 ++
  ggml/src/ggml-cuda/ggml-cuda.cu      | 3 +++
- ggml/src/ggml-metal/ggml-metal.m     | 1 +
+ ggml/src/ggml-metal/ggml-metal.cpp   | 2 ++
  ggml/src/ggml-opencl/ggml-opencl.cpp | 1 +
  ggml/src/ggml-rpc/ggml-rpc.cpp       | 1 +
  ggml/src/ggml-sycl/ggml-sycl.cpp     | 3 +++
  ggml/src/ggml-vulkan/ggml-vulkan.cpp | 2 ++
- 8 files changed, 20 insertions(+), 2 deletions(-)
+ 8 files changed, 21 insertions(+), 2 deletions(-)

 diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
-index 1b9d29e9..97f47abd 100644
+index ff9135fe..8ba86f82 100644
 --- a/ggml/src/ggml-backend.cpp
 +++ b/ggml/src/ggml-backend.cpp
-@@ -107,7 +107,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
+@@ -113,7 +113,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
      if (buffer->iface.free_buffer != NULL) {
          buffer->iface.free_buffer(buffer);
      }
@@ -34,7 +34,7 @@ index 1b9d29e9..97f47abd 100644
  }

  size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
-@@ -529,6 +528,7 @@ static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer)
+@@ -586,6 +585,7 @@ static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer)

      free(ctx->buffers);
      free(ctx);
@@ -42,9 +42,9 @@ index 1b9d29e9..97f47abd 100644
  }

  static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
-@@ -1890,6 +1890,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
+@@ -2075,6 +2075,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {

  static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+     GGML_ASSERT(buffer);
      ggml_aligned_free(buffer->context, buffer->size);
 +    delete buffer;
 +}
@@ -54,7 +54,7 @@ index 1b9d29e9..97f47abd 100644
  }

  static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
-@@ -1937,7 +1942,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
+@@ -2127,7 +2132,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
  };

  static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
@@ -64,10 +64,10 @@ index 1b9d29e9..97f47abd 100644
      /* .init_tensor = */ NULL, // no initialization required
      /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
 diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
-index cf575b36..ca1addfa 100755
+index b51b554e..3ba0f5a6 100755
 --- a/ggml/src/ggml-cann/ggml-cann.cpp
 +++ b/ggml/src/ggml-cann/ggml-cann.cpp
-@@ -826,6 +826,7 @@ static void ggml_backend_cann_buffer_free_buffer(
+@@ -843,6 +843,7 @@ static void ggml_backend_cann_buffer_free_buffer(
      ggml_backend_cann_buffer_context* ctx =
          (ggml_backend_cann_buffer_context*)buffer->context;
      delete ctx;
@@ -75,7 +75,7 @@ index cf575b36..ca1addfa 100755
  }

  /**
-@@ -1572,6 +1573,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf
+@@ -1630,6 +1631,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf
   */
  static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
      ACL_CHECK(aclrtFreeHost(buffer->context));
@@ -84,7 +84,7 @@ index cf575b36..ca1addfa 100755

  /**
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index d9110491..37ee2a6d 100644
+index b7e81b21..fdf8c63d 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
 @@ -567,6 +567,7 @@ struct ggml_backend_cuda_buffer_context {
@@ -111,23 +111,31 @@ index d9110491..37ee2a6d 100644
  }

  static void * ggml_cuda_host_malloc(size_t size) {
-diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
-index cb8eff4a..7bccc7bf 100644
---- a/ggml/src/ggml-metal/ggml-metal.m
-+++ b/ggml/src/ggml-metal/ggml-metal.m
-@@ -6032,6 +6032,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
+diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
+index e11555a7..909e17de 100644
+--- a/ggml/src/ggml-metal/ggml-metal.cpp
++++ b/ggml/src/ggml-metal/ggml-metal.cpp
+@@ -25,6 +25,7 @@ static void ggml_backend_metal_buffer_shared_free_buffer(ggml_backend_buffer_t b
+     GGML_ASSERT(ggml_metal_buffer_is_shared(ctx));
+
+     ggml_metal_buffer_free(ctx);
++    delete buffer;
  }

-     free(ctx);
-+    free(buffer);
+ static void * ggml_backend_metal_buffer_shared_get_base(ggml_backend_buffer_t buffer) {
+@@ -99,6 +100,7 @@ static void ggml_backend_metal_buffer_private_free_buffer(ggml_backend_buffer_t
+     GGML_ASSERT(!ggml_metal_buffer_is_shared(ctx));
+
+     ggml_metal_buffer_free(ctx);
++    delete buffer;
  }

-static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
+static void * ggml_backend_metal_buffer_private_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
-index 8ba1e00d..8163e8dc 100644
+index 0cf3b924..09d706b5 100644
 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp
 +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
-@@ -2745,6 +2745,7 @@ struct ggml_backend_opencl_buffer_context {
+@@ -3215,6 +3215,7 @@ struct ggml_backend_opencl_buffer_context {
  static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
      ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
      delete ctx;
@@ -136,10 +144,10 @@ index 8ba1e00d..8163e8dc 100644

  static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
-index df6ba540..2e395968 100644
+index f99681c8..59591770 100644
 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp
 +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
-@@ -486,6 +486,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+@@ -505,6 +505,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
      bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
      RPC_STATUS_ASSERT(status);
      delete ctx;
@@ -148,7 +156,7 @@ index df6ba540..2e395968 100644

  static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
-index 3992dad0..67503951 100644
+index 4ac919ea..447ea3c4 100644
 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp
 +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
 @@ -331,6 +331,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
@@ -176,10 +184,10 @@ index 3992dad0..67503951 100644

  static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
 diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index 4070e248..394a2839 100644
+index 2608cbd0..061cd078 100644
 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-@@ -10209,6 +10209,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+@@ -11603,6 +11603,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
      ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
      ggml_vk_destroy_buffer(ctx->dev_buffer);
      delete ctx;
@@ -187,7 +195,7 @@ index 4070e248..394a2839 100644
  }

  static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
-@@ -10352,6 +10353,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
+@@ -11746,6 +11747,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
  static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
      VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
      ggml_vk_host_free(vk_instance.devices[0], buffer->context);
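Every hunk in the patch above adds the same line: after the backend-specific context is released, the ggml_backend_buffer_t object itself is now deleted inside free_buffer rather than left to leak. A sketch of the resulting ownership pattern; "my_backend" is a hypothetical name, not a real backend.

    // Sketch of the pattern this patch enforces across backends.
    struct my_backend_buffer_context { /* backend-specific state */ };

    static void my_backend_buffer_free_buffer(ggml_backend_buffer_t buffer) {
        my_backend_buffer_context * ctx = (my_backend_buffer_context *) buffer->context;
        delete ctx;    // as before: free the backend-specific context
        delete buffer; // new: the buffer wrapper itself is released here too
    }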
@@ -10,10 +10,10 @@ logs instead of throwing an error
  1 file changed, 3 insertions(+), 11 deletions(-)

 diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
-index f7e03e70..8ebe11cf 100644
+index da938af0..2a38abf4 100644
 --- a/src/llama-vocab.cpp
 +++ b/src/llama-vocab.cpp
-@@ -1804,16 +1804,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
+@@ -1811,16 +1811,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
          if (type == LLAMA_VOCAB_TYPE_BPE) {
              add_space_prefix = false;
              clean_spaces = true;
@@ -31,8 +31,8 @@ index f7e03e70..8ebe11cf 100644
              pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
          } else if (
                  tokenizer_pre == "llama3" ||
-@@ -1975,7 +1966,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
+@@ -1987,7 +1978,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
-             pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;
+             pre_type = LLAMA_VOCAB_PRE_TYPE_GROK_2;
              clean_spaces = false;
          } else {
 -            throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
@@ -10,7 +10,7 @@ filesystems for paths that include wide characters
  1 file changed, 39 insertions(+)

 diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
-index 20c21733..f4f69cfc 100644
+index 210ecc88..355219a9 100644
 --- a/tools/mtmd/clip.cpp
 +++ b/tools/mtmd/clip.cpp
 @@ -28,6 +28,19 @@
@@ -33,7 +33,7 @@ index 20c21733..f4f69cfc 100644
  struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};

  enum ffn_op_type {
-@@ -2597,7 +2610,29 @@ struct clip_model_loader {
+@@ -2759,7 +2772,29 @@ struct clip_model_loader {
      {
          std::vector<uint8_t> read_buf;

@@ -63,7 +63,7 @@ index 20c21733..f4f69cfc 100644
      if (!fin) {
          throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
      }
-@@ -2624,7 +2659,11 @@ struct clip_model_loader {
+@@ -2786,7 +2821,11 @@ struct clip_model_loader {
      ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
          }
      }
@@ -15,10 +15,10 @@ adds support for the Solar Pro architecture
  7 files changed, 248 insertions(+)

 diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
-index 18dcc6dd..4b285646 100644
+index 4e8d54c4..f98a3574 100644
 --- a/src/llama-arch.cpp
 +++ b/src/llama-arch.cpp
-@@ -78,6 +78,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+@@ -81,6 +81,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
      { LLM_ARCH_GRANITE_MOE, "granitemoe" },
      { LLM_ARCH_GRANITE_HYBRID, "granitehybrid" },
      { LLM_ARCH_CHAMELEON, "chameleon" },
@@ -26,15 +26,15 @@ index 18dcc6dd..4b285646 100644
      { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
      { LLM_ARCH_PLM, "plm" },
      { LLM_ARCH_BAILINGMOE, "bailingmoe" },
-@@ -164,6 +165,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
-     { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
-     { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
+@@ -177,6 +178,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
      { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+     { LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
+     { LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
 +    { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
      { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
      { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },

-@@ -1794,6 +1796,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
+@@ -1879,6 +1881,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
          { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
      },
  },
@@ -59,7 +59,7 @@ index 18dcc6dd..4b285646 100644
  {
      LLM_ARCH_WAVTOKENIZER_DEC,
      {
-@@ -2219,6 +2239,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
+@@ -2368,6 +2388,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
      {LLM_TENSOR_LAUREL_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
      // this tensor is loaded for T5, but never used
      {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
@@ -68,10 +68,10 @@ index 18dcc6dd..4b285646 100644
      {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
      {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
 diff --git a/src/llama-arch.h b/src/llama-arch.h
-index 7af587e7..3ea994c7 100644
+index b5c6f3d7..aa8e0e7b 100644
 --- a/src/llama-arch.h
 +++ b/src/llama-arch.h
-@@ -82,6 +82,7 @@ enum llm_arch {
+@@ -85,6 +85,7 @@ enum llm_arch {
      LLM_ARCH_GRANITE_MOE,
      LLM_ARCH_GRANITE_HYBRID,
      LLM_ARCH_CHAMELEON,
@@ -79,15 +79,15 @@ index 7af587e7..3ea994c7 100644
      LLM_ARCH_WAVTOKENIZER_DEC,
      LLM_ARCH_PLM,
      LLM_ARCH_BAILINGMOE,
-@@ -168,6 +169,7 @@ enum llm_kv {
-     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
-     LLM_KV_ATTENTION_SLIDING_WINDOW,
+@@ -181,6 +182,7 @@ enum llm_kv {
      LLM_KV_ATTENTION_SCALE,
+     LLM_KV_ATTENTION_OUTPUT_SCALE,
+     LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
 +    LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
      LLM_KV_ATTENTION_KEY_LENGTH_MLA,
      LLM_KV_ATTENTION_VALUE_LENGTH_MLA,

-@@ -394,6 +396,7 @@ enum llm_tensor {
+@@ -417,6 +419,7 @@ enum llm_tensor {
      LLM_TENSOR_ENC_OUTPUT_NORM,
      LLM_TENSOR_CLS,
      LLM_TENSOR_CLS_OUT,
@@ -96,10 +96,10 @@ index 7af587e7..3ea994c7 100644
      LLM_TENSOR_CONVNEXT_DW,
      LLM_TENSOR_CONVNEXT_NORM,
 diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
-index 7a06368d..35fc054f 100644
+index c04ac58f..24a515a0 100644
 --- a/src/llama-hparams.cpp
 +++ b/src/llama-hparams.cpp
-@@ -146,6 +146,14 @@ uint32_t llama_hparams::n_pos_per_embd() const {
+@@ -147,6 +147,14 @@ uint32_t llama_hparams::n_pos_per_embd() const {
      return rope_type == LLAMA_ROPE_TYPE_MROPE ? 4 : 1;
  }

@@ -115,10 +115,10 @@ index 7a06368d..35fc054f 100644
      if (il < n_layer) {
          return swa_layers[il];
 diff --git a/src/llama-hparams.h b/src/llama-hparams.h
-index bd231224..29bd9056 100644
+index 0fe4b569..eb13709f 100644
 --- a/src/llama-hparams.h
 +++ b/src/llama-hparams.h
-@@ -62,6 +62,8 @@ struct llama_hparams {
+@@ -64,6 +64,8 @@ struct llama_hparams {
      std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
      std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;

@@ -127,7 +127,7 @@ index bd231224..29bd9056 100644
      uint32_t n_layer_dense_lead = 0;
      uint32_t n_lora_q = 0;
      uint32_t n_lora_kv = 0;
-@@ -220,6 +222,9 @@ struct llama_hparams {
+@@ -236,6 +238,9 @@ struct llama_hparams {

      uint32_t n_pos_per_embd() const;

@@ -135,10 +135,10 @@ index bd231224..29bd9056 100644
 +    bool n_bskcn(uint32_t n, uint32_t il) const;
 +
      bool is_swa(uint32_t il) const;
- };
+
+     bool has_kv(uint32_t il) const;
 diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
-index f71c40f8..7eab9b68 100644
+index 8182a9ad..daef900c 100644
 --- a/src/llama-model-loader.cpp
 +++ b/src/llama-model-loader.cpp
 @@ -465,6 +465,7 @@ namespace GGUFMeta {
@@ -150,10 +150,10 @@ index f71c40f8..7eab9b68 100644
  llama_model_loader::llama_model_loader(
      const std::string & fname,
 diff --git a/src/llama-model.cpp b/src/llama-model.cpp
-index 58ca7df7..280129e1 100644
+index 2470f878..0398b553 100644
 --- a/src/llama-model.cpp
 +++ b/src/llama-model.cpp
-@@ -1706,6 +1706,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
+@@ -1845,6 +1845,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                  default: type = LLM_TYPE_UNKNOWN;
              }
          } break;
@@ -175,7 +175,7 @@ index 58ca7df7..280129e1 100644
      case LLM_ARCH_WAVTOKENIZER_DEC:
          {
              ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-@@ -4793,6 +4808,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
+@@ -5113,6 +5128,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

                      layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

@@ -210,7 +210,7 @@ index 58ca7df7..280129e1 100644
                      layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                      layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                      layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-@@ -15495,6 +15538,165 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
+@@ -16273,6 +16316,165 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
  }
  };

@@ -229,7 +229,7 @@ index 58ca7df7..280129e1 100644
 +        struct ggml_tensor * inp_pos = build_inp_pos();
 +
 +        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-+        auto * inp_attn = build_attn_inp_kv_unified();
++        auto * inp_attn = build_attn_inp_kv();
 +
 +        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
 +
@@ -316,7 +316,7 @@ index 58ca7df7..280129e1 100644
 +
 +        cur = build_attn(inp_attn,
 +                model.layers[il].wo, model.layers[il].bo,
-+                Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
++                Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
 +        cb(cur, "attn_out", il);
 +    }
 +
@@ -376,7 +376,7 @@ index 58ca7df7..280129e1 100644
  // ref: https://github.com/facebookresearch/chameleon
  // based on the original build_llama() function, changes:
  // * qk-norm
-@@ -18439,6 +18641,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
+@@ -19552,6 +19754,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
      {
          llm = std::make_unique<llm_build_chameleon>(*this, params);
      } break;
@@ -387,7 +387,7 @@ index 58ca7df7..280129e1 100644
  case LLM_ARCH_WAVTOKENIZER_DEC:
      {
          llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
-@@ -18652,6 +18858,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
+@@ -19770,6 +19976,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
      case LLM_ARCH_GRANITE_MOE:
      case LLM_ARCH_GRANITE_HYBRID:
      case LLM_ARCH_CHAMELEON:
@@ -396,10 +396,10 @@ index 58ca7df7..280129e1 100644
      case LLM_ARCH_NEO_BERT:
      case LLM_ARCH_SMOLLM3:
 diff --git a/src/llama-model.h b/src/llama-model.h
-index 6fcd74d5..09964533 100644
+index d73ce969..c086f94e 100644
 --- a/src/llama-model.h
 +++ b/src/llama-model.h
-@@ -70,6 +70,7 @@ enum llm_type {
+@@ -76,6 +76,7 @@ enum llm_type {
      LLM_TYPE_15B,
      LLM_TYPE_16B,
      LLM_TYPE_20B,
@@ -407,7 +407,7 @@ index 6fcd74d5..09964533 100644
      LLM_TYPE_27B,
      LLM_TYPE_30B,
      LLM_TYPE_32B,
-@@ -367,6 +368,8 @@ struct llama_layer {
+@@ -380,6 +381,8 @@ struct llama_layer {
      // openai-moe
      struct ggml_tensor * attn_sinks = nullptr;
|
|
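The kq_scale line quoted in the hunk above encodes a small but easy-to-miss rule: a model may pin an explicit attention scale in its hyperparameters, and only when it does not is the usual inverse square root of the head dimension used. A minimal C++ restatement of that line (the helper name is ours, not the file's):

```cpp
#include <cmath>

// Scale applied to the QK^T logits, as in the quoted build functions:
// a nonzero f_attention_scale wins; otherwise fall back to 1/sqrt(d_head).
static float kq_scale(float f_attention_scale, int n_embd_head) {
    return f_attention_scale == 0.0f
        ? 1.0f / sqrtf((float) n_embd_head)
        : f_attention_scale;
}
```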
@ -12,7 +12,7 @@ regex
|
||||||
2 files changed, 22 insertions(+), 1 deletion(-)
|
2 files changed, 22 insertions(+), 1 deletion(-)
|
||||||
|
|
||||||
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
|
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
|
||||||
index 8ebe11cf..c011008f 100644
|
index 2a38abf4..26fa9fad 100644
|
||||||
--- a/src/llama-vocab.cpp
|
--- a/src/llama-vocab.cpp
|
||||||
+++ b/src/llama-vocab.cpp
|
+++ b/src/llama-vocab.cpp
|
||||||
@@ -299,7 +299,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
@@ -299,7 +299,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
||||||
|
|
|
||||||
|
|
@ -8,10 +8,10 @@ Subject: [PATCH] maintain ordering for rules for grammar
|
||||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||||
|
|
||||||
diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
|
diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
|
||||||
index 637891f5..98b8280f 100644
|
index db1f0b23..f4de7e34 100644
|
||||||
--- a/common/json-schema-to-grammar.cpp
|
--- a/common/json-schema-to-grammar.cpp
|
||||||
+++ b/common/json-schema-to-grammar.cpp
|
+++ b/common/json-schema-to-grammar.cpp
|
||||||
@@ -307,7 +307,7 @@ private:
|
@@ -308,7 +308,7 @@ private:
|
||||||
friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
|
friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
|
||||||
std::function<json(const std::string &)> _fetch_json;
|
std::function<json(const std::string &)> _fetch_json;
|
||||||
bool _dotall;
|
bool _dotall;
|
||||||
|
|
|
||||||
|
|
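The hunk above only shows context around the change, but the subject, "maintain ordering for rules for grammar", points at a common pitfall: storing grammar rules in a sorted container reorders them by name, while the emitted grammar should keep the order rules were created in. A sketch of an order-preserving store under that assumption (all names are illustrative, not the patch's):

```cpp
#include <map>
#include <string>
#include <utility>
#include <vector>

// Rules kept in insertion order for emission; the map only accelerates
// name lookups and never drives iteration order.
struct rule_store {
    std::vector<std::pair<std::string, std::string>> rules;
    std::map<std::string, size_t> index; // name -> position in 'rules'

    void add_rule(const std::string & name, const std::string & body) {
        auto it = index.find(name);
        if (it == index.end()) {
            index.emplace(name, rules.size());
            rules.emplace_back(name, body);
        } else {
            rules[it->second].second = body; // redefinition keeps its slot
        }
    }
};
```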
@ -11,10 +11,10 @@ with the fastest acceleration is loaded
|
||||||
1 file changed, 13 insertions(+), 8 deletions(-)
|
1 file changed, 13 insertions(+), 8 deletions(-)
|
||||||
|
|
||||||
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
|
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
|
||||||
index 6c315137..3040b2aa 100644
|
index 136afec7..f794d9cf 100644
|
||||||
--- a/ggml/src/ggml-backend-reg.cpp
|
--- a/ggml/src/ggml-backend-reg.cpp
|
||||||
+++ b/ggml/src/ggml-backend-reg.cpp
|
+++ b/ggml/src/ggml-backend-reg.cpp
|
||||||
@@ -162,7 +162,7 @@ struct ggml_backend_reg_entry {
|
@@ -175,7 +175,7 @@ struct ggml_backend_reg_entry {
|
||||||
|
|
||||||
struct ggml_backend_registry {
|
struct ggml_backend_registry {
|
||||||
std::vector<ggml_backend_reg_entry> backends;
|
std::vector<ggml_backend_reg_entry> backends;
|
||||||
|
|
@ -23,7 +23,7 @@ index 6c315137..3040b2aa 100644
|
||||||
|
|
||||||
ggml_backend_registry() {
|
ggml_backend_registry() {
|
||||||
#ifdef GGML_USE_CUDA
|
#ifdef GGML_USE_CUDA
|
||||||
@@ -207,7 +207,7 @@ struct ggml_backend_registry {
|
@@ -223,7 +223,7 @@ struct ggml_backend_registry {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -32,7 +32,7 @@ index 6c315137..3040b2aa 100644
|
||||||
if (!reg) {
|
if (!reg) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@@ -218,15 +218,20 @@ struct ggml_backend_registry {
|
@@ -234,15 +234,20 @@ struct ggml_backend_registry {
|
||||||
#endif
|
#endif
|
||||||
backends.push_back({ reg, std::move(handle) });
|
backends.push_back({ reg, std::move(handle) });
|
||||||
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
|
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
|
||||||
|
|
@ -56,7 +56,7 @@ index 6c315137..3040b2aa 100644
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_reg_t load_backend(const fs::path & path, bool silent) {
|
ggml_backend_reg_t load_backend(const fs::path & path, bool silent) {
|
||||||
@@ -270,7 +275,7 @@ struct ggml_backend_registry {
|
@@ -286,7 +291,7 @@ struct ggml_backend_registry {
|
||||||
|
|
||||||
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path_str(path).c_str());
|
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path_str(path).c_str());
|
||||||
|
|
||||||
|
|
@ -65,7 +65,7 @@ index 6c315137..3040b2aa 100644
|
||||||
|
|
||||||
return reg;
|
return reg;
|
||||||
}
|
}
|
||||||
@@ -293,7 +298,7 @@ struct ggml_backend_registry {
|
@@ -309,7 +314,7 @@ struct ggml_backend_registry {
|
||||||
// remove devices
|
// remove devices
|
||||||
devices.erase(
|
devices.erase(
|
||||||
std::remove_if(devices.begin(), devices.end(),
|
std::remove_if(devices.begin(), devices.end(),
|
||||||
|
|
@ -74,7 +74,7 @@ index 6c315137..3040b2aa 100644
|
||||||
devices.end());
|
devices.end());
|
||||||
|
|
||||||
// remove backend
|
// remove backend
|
||||||
@@ -351,7 +356,7 @@ size_t ggml_backend_dev_count() {
|
@@ -367,7 +372,7 @@ size_t ggml_backend_dev_count() {
|
||||||
|
|
||||||
ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
|
ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
|
||||||
GGML_ASSERT(index < ggml_backend_dev_count());
|
GGML_ASSERT(index < ggml_backend_dev_count());
|
||||||
|
|
|
||||||
|
|
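This ggml-backend-reg.cpp patch reorders registration so that devices from the backend with the fastest acceleration come first and are therefore probed first. A sketch of the idea under assumed names (the score field and struct are ours; the real code inserts into the registry's devices vector):

```cpp
#include <algorithm>
#include <string>
#include <vector>

struct device_entry {
    std::string name;
    int score; // higher = preferred (e.g. GPU backend over CPU)
};

// Insert each new device so the list stays sorted by descending score;
// equal scores keep their relative registration order.
static void register_devices(std::vector<device_entry> & devices,
                             const std::vector<device_entry> & new_devs) {
    for (const auto & d : new_devs) {
        auto pos = std::upper_bound(devices.begin(), devices.end(), d,
            [](const device_entry & a, const device_entry & b) {
                return a.score > b.score;
            });
        devices.insert(pos, d);
    }
}
```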
@ -8,10 +8,10 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants
|
||||||
1 file changed, 2 insertions(+)
|
1 file changed, 2 insertions(+)
|
||||||
|
|
||||||
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
|
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
|
||||||
index 177fb282..f5a5079a 100644
|
index c8f3d859..ff6229a0 100644
|
||||||
--- a/ggml/src/CMakeLists.txt
|
--- a/ggml/src/CMakeLists.txt
|
||||||
+++ b/ggml/src/CMakeLists.txt
|
+++ b/ggml/src/CMakeLists.txt
|
||||||
@@ -304,6 +304,7 @@ function(ggml_add_cpu_backend_variant tag_name)
|
@@ -307,6 +307,7 @@ function(ggml_add_cpu_backend_variant tag_name)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
ggml_add_cpu_backend_variant_impl(${tag_name})
|
ggml_add_cpu_backend_variant_impl(${tag_name})
|
||||||
|
|
@ -19,7 +19,7 @@ index 177fb282..f5a5079a 100644
|
||||||
endfunction()
|
endfunction()
|
||||||
|
|
||||||
ggml_add_backend(CPU)
|
ggml_add_backend(CPU)
|
||||||
@@ -314,6 +315,7 @@ if (GGML_CPU_ALL_VARIANTS)
|
@@ -317,6 +318,7 @@ if (GGML_CPU_ALL_VARIANTS)
|
||||||
elseif (GGML_CPU_ARM_ARCH)
|
elseif (GGML_CPU_ARM_ARCH)
|
||||||
message(FATAL_ERROR "Cannot use both GGML_CPU_ARM_ARCH and GGML_CPU_ALL_VARIANTS")
|
message(FATAL_ERROR "Cannot use both GGML_CPU_ARM_ARCH and GGML_CPU_ALL_VARIANTS")
|
||||||
endif()
|
endif()
|
||||||
|
|
|
||||||
|
|
@ -9,10 +9,10 @@ disable amx as it reduces performance on some systems
|
||||||
1 file changed, 4 deletions(-)
|
1 file changed, 4 deletions(-)
|
||||||
|
|
||||||
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
|
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
|
||||||
index f5a5079a..5158acd6 100644
|
index ff6229a0..33b3a15f 100644
|
||||||
--- a/ggml/src/CMakeLists.txt
|
--- a/ggml/src/CMakeLists.txt
|
||||||
+++ b/ggml/src/CMakeLists.txt
|
+++ b/ggml/src/CMakeLists.txt
|
||||||
@@ -324,10 +324,6 @@ if (GGML_CPU_ALL_VARIANTS)
|
@@ -327,10 +327,6 @@ if (GGML_CPU_ALL_VARIANTS)
|
||||||
ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
|
ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
|
||||||
ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
|
ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
|
||||||
ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
|
ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
|
||||||
|
|
|
||||||
|
|
@ -25,7 +25,7 @@ index 79ee2020..3efb22f0 100644
|
||||||
// get ith C string from array with given key_id
|
// get ith C string from array with given key_id
|
||||||
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int64_t key_id, size_t i);
|
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int64_t key_id, size_t i);
|
||||||
diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp
|
diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp
|
||||||
index 53504399..0f71d5f3 100644
|
index 8cc4ef1c..d950dbdf 100644
|
||||||
--- a/ggml/src/gguf.cpp
|
--- a/ggml/src/gguf.cpp
|
||||||
+++ b/ggml/src/gguf.cpp
|
+++ b/ggml/src/gguf.cpp
|
||||||
@@ -805,10 +805,14 @@ enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int64_t key_id
|
@@ -805,10 +805,14 @@ enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int64_t key_id
|
||||||
|
|
@ -53,10 +53,10 @@ index 53504399..0f71d5f3 100644
|
||||||
}
|
}
|
||||||
|
|
||||||
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
|
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
|
||||||
index c011008f..fa388b03 100644
|
index 26fa9fad..64c78a16 100644
|
||||||
--- a/src/llama-vocab.cpp
|
--- a/src/llama-vocab.cpp
|
||||||
+++ b/src/llama-vocab.cpp
|
+++ b/src/llama-vocab.cpp
|
||||||
@@ -1760,9 +1760,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
@@ -1767,9 +1767,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||||
const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
|
const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
|
||||||
if (precompiled_charsmap_keyidx != -1) {
|
if (precompiled_charsmap_keyidx != -1) {
|
||||||
const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);
|
const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);
|
||||||
|
|
@ -66,4 +66,4 @@ index c011008f..fa388b03 100644
|
||||||
+ const size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx);
|
+ const size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx);
|
||||||
const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
|
const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
|
||||||
precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
|
precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
|
||||||
#ifdef IS_BIG_ENDIAN
|
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
||||||
|
|
|
||||||
|
|
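The gguf.cpp half of this patch adds gguf_get_arr_data_n() so callers can size a raw array before copying it, which the llama-vocab.cpp half then uses for the precompiled charsmap. A sketch of that consumer pattern (error handling reduced; the key string is our recollection of what LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP maps to, not quoted from this diff):

```cpp
#include <vector>
#include "gguf.h"

// Copy a raw GGUF array into owned storage: find the key, size it with the
// patch-added gguf_get_arr_data_n(), then copy that many bytes.
static std::vector<char> read_precompiled_charsmap(const struct gguf_context * ctx) {
    const int64_t key = gguf_find_key(ctx, "tokenizer.ggml.precompiled_charsmap");
    if (key == -1) {
        return {};
    }
    const size_t n   = gguf_get_arr_data_n(ctx, key);
    const char * src = (const char *) gguf_get_arr_data(ctx, key);
    return std::vector<char>(src, src + n);
}
```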
@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor
|
||||||
1 file changed, 6 insertions(+)
|
1 file changed, 6 insertions(+)
|
||||||
|
|
||||||
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
|
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
|
||||||
index d89cd8f4..a5689c18 100644
|
index dbc07301..f8574d01 100644
|
||||||
--- a/ggml/src/ggml-cpu/ggml-cpu.c
|
--- a/ggml/src/ggml-cpu/ggml-cpu.c
|
||||||
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
|
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
|
||||||
@@ -15,6 +15,8 @@
|
@@ -15,6 +15,8 @@
|
||||||
|
|
@ -20,7 +20,7 @@ index d89cd8f4..a5689c18 100644
|
||||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||||
#include <malloc.h> // using malloc.h with MSC/MINGW
|
#include <malloc.h> // using malloc.h with MSC/MINGW
|
||||||
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
|
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
|
||||||
@@ -2858,6 +2860,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
@@ -2881,6 +2883,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||||
|
|
||||||
ggml_compute_forward(¶ms, node);
|
ggml_compute_forward(¶ms, node);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
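The ggml-cpu.c patch above wires a hook in after each ggml_compute_forward() call so intermediate tensors can be inspected. The hunk hides the hook's body, so the sketch below is only the general shape of such a hook; the environment variable and output format are invented for illustration and are not the actual ollama-debug implementation:

```cpp
#include <cstdio>
#include <cstdlib>
#include "ggml.h"

// Illustrative per-node debug hook: gate on an env var (name hypothetical),
// then dump the node's identity and shape after it has been computed.
static void debug_node(const struct ggml_tensor * node) {
    static const bool enabled = getenv("OLLAMA_DEBUG_TENSOR") != NULL;
    if (!enabled) {
        return;
    }
    fprintf(stderr, "%s: op=%s type=%s ne=[%lld,%lld,%lld,%lld]\n",
            node->name, ggml_op_name(node->op), ggml_type_name(node->type),
            (long long) node->ne[0], (long long) node->ne[1],
            (long long) node->ne[2], (long long) node->ne[3]);
}
```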
@ -184,10 +184,10 @@ index f8c291de..2a3a62db 100644
|
||||||
const char * grammar_root,
|
const char * grammar_root,
|
||||||
bool lazy,
|
bool lazy,
|
||||||
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
|
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
|
||||||
index bfbf5fa2..11f93f42 100644
|
index 2186f827..8fb86009 100644
|
||||||
--- a/src/llama-sampling.cpp
|
--- a/src/llama-sampling.cpp
|
||||||
+++ b/src/llama-sampling.cpp
|
+++ b/src/llama-sampling.cpp
|
||||||
@@ -1466,7 +1466,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
|
@@ -1563,7 +1563,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
|
||||||
trigger_patterns_c.push_back(trigger_pattern.pattern.c_str());
|
trigger_patterns_c.push_back(trigger_pattern.pattern.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -196,7 +196,7 @@ index bfbf5fa2..11f93f42 100644
|
||||||
ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(),
|
ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(),
|
||||||
ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size());
|
ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size());
|
||||||
|
|
||||||
@@ -1548,7 +1548,7 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
|
@@ -1645,7 +1645,7 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
|
||||||
/* .vocab = */ vocab,
|
/* .vocab = */ vocab,
|
||||||
/* .grammar_str = */ grammar_str,
|
/* .grammar_str = */ grammar_str,
|
||||||
/* .grammar_root = */ grammar_root,
|
/* .grammar_root = */ grammar_root,
|
||||||
|
|
|
||||||
|
|
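Visible in the sampling hunks above is a small but recurring pattern: the grammar's C++ trigger-pattern strings are flattened into an array of C pointers before crossing into the C sampler API. Extracted as a standalone sketch (helper name ours):

```cpp
#include <string>
#include <vector>

// Borrow c_str() pointers from each pattern; 'patterns' must outlive the
// returned vector, exactly as the grammar object outlives the sampler call.
static std::vector<const char *> to_c_strings(const std::vector<std::string> & patterns) {
    std::vector<const char *> out;
    out.reserve(patterns.size());
    for (const auto & p : patterns) {
        out.push_back(p.c_str());
    }
    return out;
}
```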
@ -4,17 +4,18 @@ Date: Thu, 1 May 2025 13:45:12 -0700
|
||||||
Subject: [PATCH] add argsort and cuda copy for i32
|
Subject: [PATCH] add argsort and cuda copy for i32
|
||||||
|
|
||||||
---
|
---
|
||||||
ggml/src/ggml-cpu/ops.cpp | 43 +++++++++++++
|
ggml/src/ggml-cpu/ops.cpp | 43 +++++++++++
|
||||||
ggml/src/ggml-cuda/argsort.cu | 102 ++++++++++++++++++++++++++++++-
|
ggml/src/ggml-cuda/argsort.cu | 102 ++++++++++++++++++++++++++-
|
||||||
ggml/src/ggml-cuda/cpy-utils.cuh | 6 ++
|
ggml/src/ggml-cuda/cpy-utils.cuh | 6 ++
|
||||||
ggml/src/ggml-cuda/cpy.cu | 43 +++++++++++++
|
ggml/src/ggml-cuda/cpy.cu | 43 +++++++++++
|
||||||
4 files changed, 192 insertions(+), 2 deletions(-)
|
ggml/src/ggml-metal/ggml-metal.metal | 64 +++++++++++++++++
|
||||||
|
5 files changed, 256 insertions(+), 2 deletions(-)
|
||||||
|
|
||||||
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
|
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
|
||||||
index 854f1c2b..a2924757 100644
|
index 14f7dcf4..f7f8da35 100644
|
||||||
--- a/ggml/src/ggml-cpu/ops.cpp
|
--- a/ggml/src/ggml-cpu/ops.cpp
|
||||||
+++ b/ggml/src/ggml-cpu/ops.cpp
|
+++ b/ggml/src/ggml-cpu/ops.cpp
|
||||||
@@ -8146,6 +8146,45 @@ static void ggml_compute_forward_argsort_f32(
|
@@ -7893,6 +7893,45 @@ static void ggml_compute_forward_argsort_f32(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -60,7 +61,7 @@ index 854f1c2b..a2924757 100644
|
||||||
void ggml_compute_forward_argsort(
|
void ggml_compute_forward_argsort(
|
||||||
const ggml_compute_params * params,
|
const ggml_compute_params * params,
|
||||||
ggml_tensor * dst) {
|
ggml_tensor * dst) {
|
||||||
@@ -8157,6 +8196,10 @@ void ggml_compute_forward_argsort(
|
@@ -7904,6 +7943,10 @@ void ggml_compute_forward_argsort(
|
||||||
{
|
{
|
||||||
ggml_compute_forward_argsort_f32(params, dst);
|
ggml_compute_forward_argsort_f32(params, dst);
|
||||||
} break;
|
} break;
|
||||||
|
|
@ -196,12 +197,12 @@ index 607ded85..53b02634 100644
|
||||||
+ }
|
+ }
|
||||||
}
|
}
|
||||||
diff --git a/ggml/src/ggml-cuda/cpy-utils.cuh b/ggml/src/ggml-cuda/cpy-utils.cuh
|
diff --git a/ggml/src/ggml-cuda/cpy-utils.cuh b/ggml/src/ggml-cuda/cpy-utils.cuh
|
||||||
index 410c12b7..b8e9e107 100644
|
index e621cb98..597c0c8b 100644
|
||||||
--- a/ggml/src/ggml-cuda/cpy-utils.cuh
|
--- a/ggml/src/ggml-cuda/cpy-utils.cuh
|
||||||
+++ b/ggml/src/ggml-cuda/cpy-utils.cuh
|
+++ b/ggml/src/ggml-cuda/cpy-utils.cuh
|
||||||
@@ -223,3 +223,9 @@ template<typename src_t, typename dst_t>
|
@@ -215,3 +215,9 @@ template<typename src_t, typename dst_t>
|
||||||
static __device__ void cpy_1_flt(const char * cxi, char * cdsti) {
|
static __device__ void cpy_1_flt(const char * cxi, char * cdsti) {
|
||||||
convert_flt((const src_t *)cxi, (dst_t *)cdsti);
|
*(dst_t *) cdsti = ggml_cuda_cast<dst_t>(*(const src_t *) cxi);
|
||||||
}
|
}
|
||||||
+
|
+
|
||||||
+static __device__ void cpy_1_i32_i32(const char * cxi, char * cdsti) {
|
+static __device__ void cpy_1_i32_i32(const char * cxi, char * cdsti) {
|
||||||
|
|
@ -210,10 +211,10 @@ index 410c12b7..b8e9e107 100644
|
||||||
+ *dst = *src;
|
+ *dst = *src;
|
||||||
+}
|
+}
|
||||||
diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
|
diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
|
||||||
index f9bb0256..9c3774e5 100644
|
index 746f4396..911220e9 100644
|
||||||
--- a/ggml/src/ggml-cuda/cpy.cu
|
--- a/ggml/src/ggml-cuda/cpy.cu
|
||||||
+++ b/ggml/src/ggml-cuda/cpy.cu
|
+++ b/ggml/src/ggml-cuda/cpy.cu
|
||||||
@@ -278,6 +278,47 @@ static void ggml_cpy_f32_iq4_nl_cuda(
|
@@ -277,6 +277,47 @@ static void ggml_cpy_f32_iq4_nl_cuda(
|
||||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
|
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -261,7 +262,7 @@ index f9bb0256..9c3774e5 100644
|
||||||
void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1, bool disable_indirection_for_this_node) {
|
void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1, bool disable_indirection_for_this_node) {
|
||||||
const int64_t ne = ggml_nelements(src0);
|
const int64_t ne = ggml_nelements(src0);
|
||||||
GGML_ASSERT(ne == ggml_nelements(src1));
|
GGML_ASSERT(ne == ggml_nelements(src1));
|
||||||
@@ -369,6 +410,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
@@ -372,6 +413,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
||||||
ggml_cpy_flt_cuda<half, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
ggml_cpy_flt_cuda<half, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
|
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
|
||||||
ggml_cpy_flt_cuda<half, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
ggml_cpy_flt_cuda<half, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||||
|
|
@ -270,3 +271,80 @@ index f9bb0256..9c3774e5 100644
|
||||||
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16) {
|
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16) {
|
||||||
ggml_cpy_flt_cuda<nv_bfloat16, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
ggml_cpy_flt_cuda<nv_bfloat16, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||||
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F16) {
|
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F16) {
|
||||||
|
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
|
||||||
|
index 96df6f0c..44dc31c0 100644
|
||||||
|
--- a/ggml/src/ggml-metal/ggml-metal.metal
|
||||||
|
+++ b/ggml/src/ggml-metal/ggml-metal.metal
|
||||||
|
@@ -4428,8 +4428,72 @@ kernel void kernel_argsort_f32_i32(
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
+typedef void (i32_argsort_t)(
|
||||||
|
+ constant ggml_metal_kargs_argsort & args,
|
||||||
|
+ device const int32_t * x,
|
||||||
|
+ device int32_t * dst,
|
||||||
|
+ threadgroup int32_t * shared_values [[threadgroup(0)]],
|
||||||
|
+ uint3 tgpig[[threadgroup_position_in_grid]],
|
||||||
|
+ uint3 tpitg[[thread_position_in_threadgroup]]);
|
||||||
|
+
|
||||||
|
+template<ggml_sort_order order>
|
||||||
|
+kernel void kernel_argsort_i32_i32(
|
||||||
|
+ constant ggml_metal_kargs_argsort & args,
|
||||||
|
+ device const int32_t * x,
|
||||||
|
+ device int32_t * dst,
|
||||||
|
+ threadgroup int32_t * shared_values [[threadgroup(0)]],
|
||||||
|
+ uint3 tgpig[[threadgroup_position_in_grid]],
|
||||||
|
+ uint3 tpitg[[thread_position_in_threadgroup]]) {
|
||||||
|
+ // bitonic sort
|
||||||
|
+ int col = tpitg[0];
|
||||||
|
+ int row = tgpig[1];
|
||||||
|
+
|
||||||
|
+ if (col >= args.ncols_pad) return;
|
||||||
|
+
|
||||||
|
+ device const int32_t * x_row = x + row * args.ncols;
|
||||||
|
+ threadgroup int32_t * dst_row = shared_values;
|
||||||
|
+
|
||||||
|
+ // initialize indices
|
||||||
|
+ dst_row[col] = col;
|
||||||
|
+
|
||||||
|
+ threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||||
|
+
|
||||||
|
+ for (int k = 2; k <= args.ncols_pad; k *= 2) {
|
||||||
|
+ for (int j = k / 2; j > 0; j /= 2) {
|
||||||
|
+ int ixj = col ^ j;
|
||||||
|
+ if (ixj > col) {
|
||||||
|
+ if ((col & k) == 0) {
|
||||||
|
+ if (dst_row[col] >= args.ncols ||
|
||||||
|
+ (dst_row[ixj] < args.ncols && (order == GGML_SORT_ORDER_ASC ?
|
||||||
|
+ x_row[dst_row[col]] > x_row[dst_row[ixj]] :
|
||||||
|
+ x_row[dst_row[col]] < x_row[dst_row[ixj]]))
|
||||||
|
+ ) {
|
||||||
|
+ SWAP(dst_row[col], dst_row[ixj]);
|
||||||
|
+ }
|
||||||
|
+ } else {
|
||||||
|
+ if (dst_row[ixj] >= args.ncols ||
|
||||||
|
+ (dst_row[col] < args.ncols && (order == GGML_SORT_ORDER_ASC ?
|
||||||
|
+ x_row[dst_row[col]] < x_row[dst_row[ixj]] :
|
||||||
|
+ x_row[dst_row[col]] > x_row[dst_row[ixj]]))
|
||||||
|
+ ) {
|
||||||
|
+ SWAP(dst_row[col], dst_row[ixj]);
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+ threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ // copy the result to dst without the padding
|
||||||
|
+ if (col < args.ncols) {
|
||||||
|
+ dst[row * args.ncols + col] = dst_row[col];
|
||||||
|
+ }
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
template [[host_name("kernel_argsort_f32_i32_asc")]] kernel argsort_t kernel_argsort_f32_i32<GGML_SORT_ORDER_ASC>;
|
||||||
|
template [[host_name("kernel_argsort_f32_i32_desc")]] kernel argsort_t kernel_argsort_f32_i32<GGML_SORT_ORDER_DESC>;
|
||||||
|
+template [[host_name("kernel_argsort_i32_i32_asc")]] kernel i32_argsort_t kernel_argsort_i32_i32<GGML_SORT_ORDER_ASC>;
|
||||||
|
+template [[host_name("kernel_argsort_i32_i32_desc")]] kernel i32_argsort_t kernel_argsort_i32_i32<GGML_SORT_ORDER_DESC>;
|
||||||
|
|
||||||
|
kernel void kernel_leaky_relu_f32(
|
||||||
|
constant ggml_metal_kargs_leaky_relu & args,
|
||||||
|
|
|
||||||
|
|
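Across CPU, CUDA, and Metal, the patch above adds the same operation: argsort over int32 data, which the Metal version implements with the bitonic network shown, padded to a power of two. Its semantics as a plain CPU reference (tie order is not guaranteed by the GPU kernels; stable_sort here is just a convenient choice):

```cpp
#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

// For one row of ncols int32 values, return the permutation of column
// indices that sorts the row ascending: x[idx[0]] <= x[idx[1]] <= ...
static std::vector<int32_t> argsort_row_i32(const int32_t * x, int ncols) {
    std::vector<int32_t> idx(ncols);
    std::iota(idx.begin(), idx.end(), 0);
    std::stable_sort(idx.begin(), idx.end(),
        [x](int32_t a, int32_t b) { return x[a] < x[b]; });
    return idx;
}
```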
@ -6,12 +6,12 @@ Subject: [PATCH] graph memory reporting on failure
|
||||||
---
|
---
|
||||||
ggml/include/ggml-alloc.h | 1 +
|
ggml/include/ggml-alloc.h | 1 +
|
||||||
ggml/include/ggml-backend.h | 1 +
|
ggml/include/ggml-backend.h | 1 +
|
||||||
ggml/src/ggml-alloc.c | 36 ++++++++++++++++++++++++++++++++----
|
ggml/src/ggml-alloc.c | 34 +++++++++++++++++++++++++++++++---
|
||||||
ggml/src/ggml-backend.cpp | 7 +++++++
|
ggml/src/ggml-backend.cpp | 7 +++++++
|
||||||
4 files changed, 41 insertions(+), 4 deletions(-)
|
4 files changed, 40 insertions(+), 3 deletions(-)
|
||||||
|
|
||||||
diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h
|
diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h
|
||||||
index 2cb150fd2..7ab3f0192 100644
|
index 2cb150fd..7ab3f019 100644
|
||||||
--- a/ggml/include/ggml-alloc.h
|
--- a/ggml/include/ggml-alloc.h
|
||||||
+++ b/ggml/include/ggml-alloc.h
|
+++ b/ggml/include/ggml-alloc.h
|
||||||
@@ -65,6 +65,7 @@ GGML_API bool ggml_gallocr_reserve_n(
|
@@ -65,6 +65,7 @@ GGML_API bool ggml_gallocr_reserve_n(
|
||||||
|
|
@ -23,31 +23,31 @@ index 2cb150fd2..7ab3f0192 100644
|
||||||
// Utils
|
// Utils
|
||||||
// Create a buffer and allocate all the tensors in a ggml_context
|
// Create a buffer and allocate all the tensors in a ggml_context
|
||||||
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
|
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
|
||||||
index a2977ea2e..e8cf30841 100644
|
index 62b6d65e..fe20dca3 100644
|
||||||
--- a/ggml/include/ggml-backend.h
|
--- a/ggml/include/ggml-backend.h
|
||||||
+++ b/ggml/include/ggml-backend.h
|
+++ b/ggml/include/ggml-backend.h
|
||||||
@@ -303,6 +303,7 @@ extern "C" {
|
@@ -316,6 +316,7 @@ extern "C" {
|
||||||
GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
|
|
||||||
|
|
||||||
|
GGML_API ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend);
|
||||||
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
|
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
|
||||||
+ GGML_API size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
|
+ GGML_API size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
|
||||||
|
|
||||||
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
|
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
|
||||||
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
|
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
|
||||||
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
|
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
|
||||||
index 8b6e60283..b58bd671d 100644
|
index fa46f3b4..421ff7c7 100644
|
||||||
--- a/ggml/src/ggml-alloc.c
|
--- a/ggml/src/ggml-alloc.c
|
||||||
+++ b/ggml/src/ggml-alloc.c
|
+++ b/ggml/src/ggml-alloc.c
|
||||||
@@ -350,6 +350,7 @@ struct node_alloc {
|
@@ -492,6 +492,7 @@ struct node_alloc {
|
||||||
struct ggml_gallocr {
|
struct ggml_gallocr {
|
||||||
ggml_backend_buffer_type_t * bufts; // [n_buffers]
|
ggml_backend_buffer_type_t * bufts; // [n_buffers]
|
||||||
ggml_backend_buffer_t * buffers; // [n_buffers]
|
struct vbuffer ** buffers; // [n_buffers]
|
||||||
+ size_t *buffer_sizes; // [n_buffers]
|
+ size_t *buffer_sizes; // [n_buffers]
|
||||||
struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
|
struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
|
||||||
int n_buffers;
|
int n_buffers;
|
||||||
|
|
||||||
@@ -373,6 +374,9 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
|
@@ -515,6 +516,9 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
|
||||||
galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
|
galloc->buffers = calloc(n_bufs, sizeof(struct vbuffer *));
|
||||||
GGML_ASSERT(galloc->buffers != NULL);
|
GGML_ASSERT(galloc->buffers != NULL);
|
||||||
|
|
||||||
+ galloc->buffer_sizes = calloc(n_bufs, sizeof(size_t));
|
+ galloc->buffer_sizes = calloc(n_bufs, sizeof(size_t));
|
||||||
|
|
@ -56,7 +56,7 @@ index 8b6e60283..b58bd671d 100644
|
||||||
galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
|
galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
|
||||||
GGML_ASSERT(galloc->buf_tallocs != NULL);
|
GGML_ASSERT(galloc->buf_tallocs != NULL);
|
||||||
|
|
||||||
@@ -439,6 +443,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
|
@@ -582,6 +586,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
|
||||||
ggml_hash_set_free(&galloc->hash_set);
|
ggml_hash_set_free(&galloc->hash_set);
|
||||||
free(galloc->hash_values);
|
free(galloc->hash_values);
|
||||||
free(galloc->bufts);
|
free(galloc->bufts);
|
||||||
|
|
@ -64,7 +64,7 @@ index 8b6e60283..b58bd671d 100644
|
||||||
free(galloc->buffers);
|
free(galloc->buffers);
|
||||||
free(galloc->buf_tallocs);
|
free(galloc->buf_tallocs);
|
||||||
free(galloc->node_allocs);
|
free(galloc->node_allocs);
|
||||||
@@ -734,6 +739,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
@@ -875,6 +880,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -73,23 +73,21 @@ index 8b6e60283..b58bd671d 100644
|
||||||
// reallocate buffers if needed
|
// reallocate buffers if needed
|
||||||
for (int i = 0; i < galloc->n_buffers; i++) {
|
for (int i = 0; i < galloc->n_buffers; i++) {
|
||||||
// if the buffer type is used multiple times, we reuse the same buffer
|
// if the buffer type is used multiple times, we reuse the same buffer
|
||||||
@@ -755,15 +762,20 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
@@ -896,14 +903,19 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
||||||
|
|
||||||
ggml_backend_buffer_free(galloc->buffers[i]);
|
ggml_vbuffer_free(galloc->buffers[i]);
|
||||||
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
|
galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
||||||
- if (galloc->buffers[i] == NULL) {
|
- if (galloc->buffers[i] == NULL) {
|
||||||
+ if (galloc->buffers[i]) {
|
+ if (galloc->buffers[i]) {
|
||||||
+ galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
|
+ galloc->buffer_sizes[i] = ggml_vbuffer_size(galloc->buffers[i]);
|
||||||
+ ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
|
||||||
+ } else {
|
+ } else {
|
||||||
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
|
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
|
||||||
- return false;
|
- return false;
|
||||||
+ galloc->buffer_sizes[i] = new_size;
|
+ galloc->buffer_sizes[i] = new_size;
|
||||||
+ success = false;
|
+ success = false;
|
||||||
}
|
}
|
||||||
- ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
|
||||||
+ } else {
|
+ } else {
|
||||||
+ galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
|
+ galloc->buffer_sizes[i] = ggml_vbuffer_size(galloc->buffers[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -98,8 +96,8 @@ index 8b6e60283..b58bd671d 100644
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
|
bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
|
||||||
@@ -920,6 +932,22 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
|
@@ -1058,6 +1070,22 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
|
||||||
return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
|
return ggml_vbuffer_size(galloc->buffers[buffer_id]);
|
||||||
}
|
}
|
||||||
|
|
||||||
+size_t ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
|
+size_t ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
|
||||||
|
|
@ -122,10 +120,10 @@ index 8b6e60283..b58bd671d 100644
|
||||||
|
|
||||||
static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
|
static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
|
||||||
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
|
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
|
||||||
index 97f47abd2..d02a40e60 100644
|
index 8ba86f82..cb2b9956 100644
|
||||||
--- a/ggml/src/ggml-backend.cpp
|
--- a/ggml/src/ggml-backend.cpp
|
||||||
+++ b/ggml/src/ggml-backend.cpp
|
+++ b/ggml/src/ggml-backend.cpp
|
||||||
@@ -1631,6 +1631,13 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
|
@@ -1809,6 +1809,13 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
|
||||||
return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
|
return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -137,5 +135,5 @@ index 97f47abd2..d02a40e60 100644
|
||||||
+}
|
+}
|
||||||
+
|
+
|
||||||
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
|
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
|
||||||
|
GGML_ASSERT(sched);
|
||||||
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
||||||
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
|
||||||
|
|
|
||||||
|
|
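The point of the gallocr changes above is that a failed reservation no longer throws away sizing information: the attempted size is recorded per buffer and surfaced through the new ggml_backend_sched_get_attempted_buffer_size(). A sketch of the caller side (the reporting function and message format are ours):

```cpp
#include <cstdio>
#include "ggml-backend.h"

// On a failed reserve, report how much each backend would have needed;
// before this patch the failure path returned no per-backend sizes.
static void report_reserve_failure(ggml_backend_sched_t sched,
                                   ggml_backend_t * backends, int n_backends,
                                   struct ggml_cgraph * graph) {
    if (!ggml_backend_sched_reserve(sched, graph)) {
        for (int i = 0; i < n_backends; i++) {
            const size_t want = ggml_backend_sched_get_attempted_buffer_size(sched, backends[i]);
            fprintf(stderr, "%s: attempted compute buffer of %zu bytes\n",
                    ggml_backend_name(backends[i]), want);
        }
    }
}
```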
@ -7,27 +7,27 @@ This enables matching up devices and information reported by the backend
|
||||||
with tools (e.g. nvidia-smi) and system management libraries (e.g. nvml).
|
with tools (e.g. nvidia-smi) and system management libraries (e.g. nvml).
|
||||||
---
|
---
|
||||||
ggml/include/ggml-backend.h | 1 +
|
ggml/include/ggml-backend.h | 1 +
|
||||||
ggml/src/ggml-cuda/ggml-cuda.cu | 67 +++++++++++++++++++++++++++++---
|
ggml/src/ggml-cuda/ggml-cuda.cu | 67 +++++++++++++++++++++++++++---
|
||||||
ggml/src/ggml-metal/ggml-metal.m | 1 +
|
ggml/src/ggml-metal/ggml-metal.cpp | 1 +
|
||||||
3 files changed, 63 insertions(+), 6 deletions(-)
|
3 files changed, 63 insertions(+), 6 deletions(-)
|
||||||
|
|
||||||
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
|
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
|
||||||
index 8a91b381..9424394e 100644
|
index fe20dca3..48777212 100644
|
||||||
--- a/ggml/include/ggml-backend.h
|
--- a/ggml/include/ggml-backend.h
|
||||||
+++ b/ggml/include/ggml-backend.h
|
+++ b/ggml/include/ggml-backend.h
|
||||||
@@ -152,6 +152,7 @@ extern "C" {
|
@@ -158,6 +158,7 @@ extern "C" {
|
||||||
struct ggml_backend_dev_props {
|
|
||||||
const char * name;
|
|
||||||
const char * description;
|
const char * description;
|
||||||
+ const char * id;
|
// device free memory in bytes
|
||||||
size_t memory_free;
|
size_t memory_free;
|
||||||
|
+ const char * id;
|
||||||
|
// device total memory in bytes
|
||||||
size_t memory_total;
|
size_t memory_total;
|
||||||
enum ggml_backend_dev_type type;
|
// device type
|
||||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||||
index 37ee2a6d..57eae461 100644
|
index fdf8c63d..ad389ece 100644
|
||||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||||
@@ -179,6 +179,51 @@ static int ggml_cuda_parse_id(char devName[]) {
|
@@ -183,6 +183,51 @@ static int ggml_cuda_parse_id(char devName[]) {
|
||||||
}
|
}
|
||||||
#endif // defined(GGML_USE_HIP)
|
#endif // defined(GGML_USE_HIP)
|
||||||
|
|
||||||
|
|
@ -77,9 +77,9 @@ index 37ee2a6d..57eae461 100644
|
||||||
+}
|
+}
|
||||||
+
|
+
|
||||||
static ggml_cuda_device_info ggml_cuda_init() {
|
static ggml_cuda_device_info ggml_cuda_init() {
|
||||||
#if defined(GGML_USE_HIP)
|
ggml_cuda_device_info info = {};
|
||||||
// Workaround for a rocBLAS bug when using multiple graphics cards:
|
|
||||||
@@ -267,22 +312,24 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
@@ -249,22 +294,24 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||||
info.devices[id].cc += prop.minor * 0x10;
|
info.devices[id].cc += prop.minor * 0x10;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -107,18 +107,18 @@ index 37ee2a6d..57eae461 100644
|
||||||
+ GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
|
+ GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
|
||||||
+ id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
|
+ id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
|
||||||
+ ggml_cuda_parse_uuid(prop, id).c_str());
|
+ ggml_cuda_parse_uuid(prop, id).c_str());
|
||||||
#endif // defined(GGML_USE_HIP)
|
std::string device_name(prop.name);
|
||||||
}
|
if (device_name == "NVIDIA GeForce MX450") {
|
||||||
|
turing_devices_without_mma.push_back({ id, device_name });
|
||||||
@@ -3144,6 +3191,7 @@ struct ggml_backend_cuda_device_context {
|
@@ -3273,6 +3320,7 @@ struct ggml_backend_cuda_device_context {
|
||||||
int device;
|
|
||||||
std::string name;
|
std::string name;
|
||||||
std::string description;
|
std::string description;
|
||||||
|
std::string pci_bus_id;
|
||||||
+ std::string id;
|
+ std::string id;
|
||||||
};
|
};
|
||||||
|
|
||||||
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
|
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
|
||||||
@@ -3156,6 +3204,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
|
@@ -3285,6 +3333,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
|
||||||
return ctx->description.c_str();
|
return ctx->description.c_str();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -130,31 +130,31 @@ index 37ee2a6d..57eae461 100644
|
||||||
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
|
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
|
||||||
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
|
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
|
||||||
ggml_cuda_set_device(ctx->device);
|
ggml_cuda_set_device(ctx->device);
|
||||||
@@ -3170,6 +3223,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
|
@@ -3301,6 +3354,7 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
|
||||||
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
|
|
||||||
props->name = ggml_backend_cuda_device_get_name(dev);
|
props->name = ggml_backend_cuda_device_get_name(dev);
|
||||||
props->description = ggml_backend_cuda_device_get_description(dev);
|
props->description = ggml_backend_cuda_device_get_description(dev);
|
||||||
+ props->id = ggml_backend_cuda_device_get_id(dev);
|
+ props->id = ggml_backend_cuda_device_get_id(dev);
|
||||||
props->type = ggml_backend_cuda_device_get_type(dev);
|
props->type = ggml_backend_cuda_device_get_type(dev);
|
||||||
|
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
|
||||||
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||||
|
@@ -3871,6 +3925,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
||||||
@@ -3767,6 +3821,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
|
||||||
cudaDeviceProp prop;
|
cudaDeviceProp prop;
|
||||||
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
|
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
|
||||||
dev_ctx->description = prop.name;
|
dev_ctx->description = prop.name;
|
||||||
+ dev_ctx->id = ggml_cuda_parse_uuid(prop, i);
|
+ dev_ctx->id = ggml_cuda_parse_uuid(prop, i);
|
||||||
|
|
||||||
ggml_backend_dev_t dev = new ggml_backend_device {
|
char pci_bus_id[16] = {};
|
||||||
/* .iface = */ ggml_backend_cuda_device_interface,
|
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
|
||||||
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
|
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
|
||||||
index 7bccc7bf..fe7b2f0a 100644
|
index 909e17de..08ab4fc9 100644
|
||||||
--- a/ggml/src/ggml-metal/ggml-metal.m
|
--- a/ggml/src/ggml-metal/ggml-metal.cpp
|
||||||
+++ b/ggml/src/ggml-metal/ggml-metal.m
|
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
|
||||||
@@ -6522,6 +6522,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
|
@@ -538,6 +538,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
|
||||||
static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
|
static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
|
||||||
props->name = ggml_backend_metal_device_get_name(dev);
|
props->name = ggml_backend_metal_device_get_name(dev);
|
||||||
props->description = ggml_backend_metal_device_get_description(dev);
|
props->description = ggml_backend_metal_device_get_description(dev);
|
||||||
+ props->id = "0";
|
+ props->id = "0";
|
||||||
props->type = ggml_backend_metal_device_get_type(dev);
|
props->type = ggml_backend_metal_device_get_type(dev);
|
||||||
|
|
||||||
ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||||
props->caps = (struct ggml_backend_dev_caps) {
|
|
||||||
|
|
|
||||||
|
|
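On the consumer side of the device-ID patch above, the new props.id can be read through the standard device enumeration and matched against nvidia-smi -L or NVML UUIDs. A minimal sketch:

```cpp
#include <cstdio>
#include "ggml-backend.h"

// Enumerate registered devices and print the patch-added id field
// alongside the device name.
static void print_device_ids(void) {
    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        struct ggml_backend_dev_props props;
        ggml_backend_dev_get_props(dev, &props);
        printf("%s: id=%s\n", props.name, props.id ? props.id : "(none)");
    }
}
```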
@ -10,7 +10,7 @@ Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
|
||||||
2 files changed, 13 insertions(+)
|
2 files changed, 13 insertions(+)
|
||||||
|
|
||||||
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
|
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
|
||||||
index a05373d5..6f70f7f4 100644
|
index cd022c5e..3d680945 100644
|
||||||
--- a/tools/mtmd/mtmd.cpp
|
--- a/tools/mtmd/mtmd.cpp
|
||||||
+++ b/tools/mtmd/mtmd.cpp
|
+++ b/tools/mtmd/mtmd.cpp
|
||||||
@@ -79,6 +79,16 @@ enum mtmd_slice_tmpl {
|
@@ -79,6 +79,16 @@ enum mtmd_slice_tmpl {
|
||||||
|
|
|
||||||
|
|
@ -8,10 +8,10 @@ Subject: [PATCH] no power throttling win32 with gnuc
|
||||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||||
|
|
||||||
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
|
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
|
||||||
index a5689c18..85af19a3 100644
|
index f8574d01..530efce0 100644
|
||||||
--- a/ggml/src/ggml-cpu/ggml-cpu.c
|
--- a/ggml/src/ggml-cpu/ggml-cpu.c
|
||||||
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
|
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
|
||||||
@@ -2412,7 +2412,7 @@ static bool ggml_thread_apply_priority(int32_t prio) {
|
@@ -2431,7 +2431,7 @@ static bool ggml_thread_apply_priority(int32_t prio) {
|
||||||
// Newer Windows 11 versions aggresively park (offline) CPU cores and often place
|
// Newer Windows 11 versions aggresively park (offline) CPU cores and often place
|
||||||
// all our threads onto the first 4 cores which results in terrible performance with
|
// all our threads onto the first 4 cores which results in terrible performance with
|
||||||
// n_threads > 4
|
// n_threads > 4
|
||||||
|
|
|
||||||
|
|
@ -5,23 +5,24 @@ Subject: [PATCH] BF16 macos version guard
|
||||||
|
|
||||||
Only enable BF16 on supported MacOS versions (v14+)
|
Only enable BF16 on supported MacOS versions (v14+)
|
||||||
---
|
---
|
||||||
ggml/src/ggml-metal/ggml-metal.m | 6 +++++-
|
ggml/src/ggml-metal/ggml-metal-context.m | 7 ++++++-
|
||||||
1 file changed, 5 insertions(+), 1 deletion(-)
|
1 file changed, 6 insertions(+), 1 deletion(-)
|
||||||
|
|
||||||
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
|
diff --git a/ggml/src/ggml-metal/ggml-metal-context.m b/ggml/src/ggml-metal/ggml-metal-context.m
|
||||||
index fe7b2f0a..e4c31268 100644
|
index 052efb7a..b47dc787 100644
|
||||||
--- a/ggml/src/ggml-metal/ggml-metal.m
|
--- a/ggml/src/ggml-metal/ggml-metal-context.m
|
||||||
+++ b/ggml/src/ggml-metal/ggml-metal.m
|
+++ b/ggml/src/ggml-metal/ggml-metal-context.m
|
||||||
@@ -106,7 +106,11 @@ static id<MTLDevice> ggml_backend_metal_device_acq(struct ggml_backend_metal_dev
|
@@ -125,7 +125,12 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
|
||||||
ctx->has_bfloat |= [ctx->mtl_device supportsFamily:MTLGPUFamilyApple6];
|
|
||||||
|
|
||||||
#if defined(GGML_METAL_USE_BF16)
|
res->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
|
||||||
- ctx->use_bfloat = ctx->has_bfloat;
|
|
||||||
|
- res->use_bfloat = props_dev->has_bfloat;
|
||||||
+ if (@available(macOS 14.0, *)) {
|
+ if (@available(macOS 14.0, *)) {
|
||||||
+ ctx->use_bfloat = ctx->has_bfloat;
|
+ res->use_bfloat = props_dev->has_bfloat;
|
||||||
+ } else {
|
+ } else {
|
||||||
+ ctx->use_bfloat = false;
|
+ res->use_bfloat = false;
|
||||||
+ }
|
+ }
|
||||||
#else
|
+
|
||||||
ctx->use_bfloat = false;
|
res->use_fusion = getenv("GGML_METAL_FUSION_DISABLE") == nil;
|
||||||
#endif
|
res->use_concurrency = getenv("GGML_METAL_CONCURRENCY_DISABLE") == nil;
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -13,10 +13,10 @@ checks.
|
||||||
1 file changed, 18 insertions(+)
|
1 file changed, 18 insertions(+)
|
||||||
|
|
||||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||||
index 57eae461..c7f9dc3a 100644
|
index ad389ece..e51c5035 100644
|
||||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||||
@@ -2671,12 +2671,24 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
|
@@ -2686,14 +2686,26 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
|
||||||
// Loop over nodes in GGML graph to obtain info needed for CUDA graph
|
// Loop over nodes in GGML graph to obtain info needed for CUDA graph
|
||||||
cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
|
cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
|
||||||
|
|
||||||
|
|
@ -36,12 +36,14 @@ index 57eae461..c7f9dc3a 100644
|
||||||
const std::string ffn_moe_gate_bias_prefix = "ffn_moe_gate_biased";
|
const std::string ffn_moe_gate_bias_prefix = "ffn_moe_gate_biased";
|
||||||
const std::string ffn_moe_up_bias_prefix = "ffn_moe_up_biased";
|
const std::string ffn_moe_up_bias_prefix = "ffn_moe_up_biased";
|
||||||
const std::string ffn_moe_down_bias_prefix = "ffn_moe_down_biased";
|
const std::string ffn_moe_down_bias_prefix = "ffn_moe_down_biased";
|
||||||
|
const std::string nemotron_h_block_out_prefix = "nemotron_h_block_out";
|
||||||
|
const std::string mamba2_y_add_d_prefix = "mamba2_y_add_d";
|
||||||
|
|
||||||
+
|
+
|
||||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||||
ggml_tensor * node = cgraph->nodes[i];
|
ggml_tensor * node = cgraph->nodes[i];
|
||||||
|
|
||||||
@@ -2700,6 +2712,12 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
|
@@ -2717,6 +2729,12 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
|
||||||
|
|
||||||
if (node->op == GGML_OP_ADD &&
|
if (node->op == GGML_OP_ADD &&
|
||||||
node->src[1] && node->src[1]->ne[1] > 1 &&
|
node->src[1] && node->src[1]->ne[1] > 1 &&
|
||||||
|
|
|
||||||
|
|
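The CUDA-graph hunks above extend a name-prefix allowlist: nodes whose names begin with certain prefixes have their parameters rewritten between graph replays, so they need special handling during capture. Restated as a sketch (the helper is ours; the prefix strings are the ones visible in the hunk):

```cpp
#include <string>

// Prefixes of nodes whose parameters change between CUDA graph replays.
static const char * const updated_node_prefixes[] = {
    "ffn_moe_gate_biased", "ffn_moe_up_biased", "ffn_moe_down_biased",
    "nemotron_h_block_out", "mamba2_y_add_d",
};

// starts_with check for pre-C++20 code bases.
static bool node_is_updated(const std::string & node_name) {
    for (const char * prefix : updated_node_prefixes) {
        if (node_name.rfind(prefix, 0) == 0) {
            return true;
        }
    }
    return false;
}
```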
@ -8,10 +8,10 @@ Subject: [PATCH] Disable ggml-blas on macos v13 and older
|
||||||
1 file changed, 5 insertions(+)
|
1 file changed, 5 insertions(+)
|
||||||
|
|
||||||
diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp
|
diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp
|
||||||
index aeac2e57..40738d5b 100644
|
index 5b888cdd..2a9ff7f6 100644
|
||||||
--- a/ggml/src/ggml-blas/ggml-blas.cpp
|
--- a/ggml/src/ggml-blas/ggml-blas.cpp
|
||||||
+++ b/ggml/src/ggml-blas/ggml-blas.cpp
|
+++ b/ggml/src/ggml-blas/ggml-blas.cpp
|
||||||
@@ -505,6 +505,11 @@ static const struct ggml_backend_reg_i ggml_backend_blas_reg_i = {
|
@@ -506,6 +506,11 @@ static const struct ggml_backend_reg_i ggml_backend_blas_reg_i = {
|
||||||
};
|
};
|
||||||
|
|
||||||
ggml_backend_reg_t ggml_backend_blas_reg(void) {
|
ggml_backend_reg_t ggml_backend_blas_reg(void) {
|
||||||
|
|
|
||||||
|
|
@ -16,10 +16,10 @@ must be recreated with no-alloc set to false before loading data.
|
||||||
5 files changed, 310 insertions(+), 44 deletions(-)
|
5 files changed, 310 insertions(+), 44 deletions(-)
|
||||||
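The iface additions in this last patch split sizing from allocation: reserve() may run with alloc set to false, in which case nothing is allocated and buffer_size() just reports what would be needed; per the commit message, a scheduler created in that mode must be recreated with no-alloc off before weights are actually loaded. A toy backend-side sketch of the two hooks (all names beyond the hook semantics are invented):

```cpp
#include <cstddef>

// Minimal stand-in for a backend context implementing the optional
// reserve/buffer_size hooks described above.
struct toy_backend_ctx {
    size_t required  = 0;     // bytes the compute buffers would need
    bool   allocated = false; // whether reserve() actually allocated
};

// reserve: always record the requirement; only allocate when asked.
static bool toy_reserve(toy_backend_ctx & ctx, size_t need, bool alloc) {
    ctx.required = need;
    if (alloc) {
        // ... allocate real intermediate buffers here ...
        ctx.allocated = true;
    }
    return true;
}

// buffer_size: valid after reserve() whether or not memory was allocated.
static size_t toy_buffer_size(const toy_backend_ctx & ctx) {
    return ctx.required;
}
```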
|
|
||||||
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
|
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
|
||||||
index 2773cc310..ae94887dd 100644
|
index 48777212..d4352663 100644
|
||||||
--- a/ggml/include/ggml-backend.h
|
--- a/ggml/include/ggml-backend.h
|
||||||
+++ b/ggml/include/ggml-backend.h
|
+++ b/ggml/include/ggml-backend.h
|
||||||
@@ -291,6 +291,7 @@ extern "C" {
|
@@ -303,6 +303,7 @@ extern "C" {
|
||||||
|
|
||||||
// Initialize a backend scheduler, backends with low index are given priority over backends with high index
|
// Initialize a backend scheduler, backends with low index are given priority over backends with high index
|
||||||
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload);
|
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload);
|
||||||
|
|
@ -28,7 +28,7 @@ index 2773cc310..ae94887dd 100644
|
||||||
|
|
||||||
// Initialize backend buffers from a measure graph
|
// Initialize backend buffers from a measure graph
|
||||||
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
|
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
|
||||||
index c36c12d65..369e9e25a 100644
|
index 07784d6f..869dc07d 100644
|
||||||
--- a/ggml/src/ggml-backend-impl.h
|
--- a/ggml/src/ggml-backend-impl.h
|
||||||
+++ b/ggml/src/ggml-backend-impl.h
|
+++ b/ggml/src/ggml-backend-impl.h
|
||||||
@@ -26,12 +26,17 @@ extern "C" {
|
@@ -26,12 +26,17 @@ extern "C" {
|
||||||
|
|
@ -57,10 +57,10 @@ index c36c12d65..369e9e25a 100644
|
||||||
};
|
};
|
||||||
|
|
||||||
GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
|
GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
|
||||||
@@ -114,6 +120,16 @@ extern "C" {
|
@@ -117,6 +123,16 @@ extern "C" {
|
||||||
void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);
|
|
||||||
// wait for an event on on a different stream
|
// (optional) sort/optimize the nodes in the graph
|
||||||
void (*event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
|
void (*graph_optimize) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||||
+
|
+
|
||||||
+ // (optional) reserves intermediate buffers needed for the computation
|
+ // (optional) reserves intermediate buffers needed for the computation
|
||||||
+ // if alloc is true, memory is actually allocated, otherwise the required amount is just returned by buffer_size
|
+ // if alloc is true, memory is actually allocated, otherwise the required amount is just returned by buffer_size
|
||||||
|
|
@ -75,7 +75,7 @@ index c36c12d65..369e9e25a 100644
|
||||||
|
|
||||||
struct ggml_backend {
|
struct ggml_backend {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index d02a40e60..6b4dee4c7 100644
index cb2b9956..6ef5eeaf 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -41,6 +41,19 @@ ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t

@@ -95,10 +95,10 @@ index d02a40e60..6b4dee4c7 100644
+ return buf;
+ }
+
 GGML_ASSERT(buft);
 return buft->iface.alloc_buffer(buft, size);
 }
@@ -95,7 +108,8 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
@@ -89,7 +102,8 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
 /* .buft = */ buft,
 /* .context = */ context,
 /* .size = */ size,

@@ -108,7 +108,7 @@ index d02a40e60..6b4dee4c7 100644
 };

 return buffer;
@@ -119,6 +133,12 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -127,6 +141,12 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
 return NULL;
 }

@@ -121,7 +121,7 @@ index d02a40e60..6b4dee4c7 100644
 void * base = buffer->iface.get_base(buffer);

 GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
@@ -663,6 +683,12 @@ struct ggml_backend_sched {
@@ -723,6 +743,12 @@ struct ggml_backend_sched {
 bool op_offload;

 int debug;

@@ -134,7 +134,7 @@ index d02a40e60..6b4dee4c7 100644
 };

 #define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
@@ -1449,6 +1475,17 @@ ggml_backend_sched_t ggml_backend_sched_new(
@@ -1606,6 +1632,17 @@ ggml_backend_sched_t ggml_backend_sched_new(
 size_t graph_size,
 bool parallel,
 bool op_offload) {

@@ -152,7 +152,7 @@ index d02a40e60..6b4dee4c7 100644
 GGML_ASSERT(n_backends > 0);
 GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
 GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
@@ -1490,10 +1527,13 @@ ggml_backend_sched_t ggml_backend_sched_new(
@@ -1647,10 +1684,13 @@ ggml_backend_sched_t ggml_backend_sched_new(
 sched->events[b][c] = ggml_backend_event_new(backends[b]->device);
 }
 }

@@ -166,7 +166,7 @@ index d02a40e60..6b4dee4c7 100644

 ggml_backend_sched_reset(sched);

@@ -1508,6 +1548,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
@@ -1665,6 +1705,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
 for (int c = 0; c < sched->n_copies; c++) {
 ggml_backend_event_free(sched->events[b][c]);
 }

@@ -177,7 +177,7 @@ index d02a40e60..6b4dee4c7 100644
 }
 ggml_gallocr_free(sched->galloc);
 ggml_free(sched->ctx);
@@ -1547,6 +1591,24 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
@@ -1708,6 +1752,24 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
 return false;
 }

@@ -202,7 +202,7 @@ index d02a40e60..6b4dee4c7 100644
 ggml_backend_sched_reset(sched);

 return true;
@@ -1635,7 +1697,13 @@ size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched,
@@ -1813,7 +1875,13 @@ size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched,
 int backend_index = ggml_backend_sched_backend_id(sched, backend);
 GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);

@@ -218,7 +218,7 @@ index d02a40e60..6b4dee4c7 100644

 void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 2e5d48797..b915ee1b8 100644
index c4246b65..448badf0 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -35,6 +35,31 @@

@@ -253,7 +253,7 @@ index 2e5d48797..b915ee1b8 100644
 #define STRINGIZE_IMPL(...) #__VA_ARGS__
 #define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)

@@ -771,6 +796,9 @@ struct ggml_cuda_pool {
@@ -880,6 +905,9 @@ struct ggml_cuda_pool {

 virtual void * alloc(size_t size, size_t * actual_size) = 0;
 virtual void free(void * ptr, size_t size) = 0;

@@ -263,7 +263,7 @@ index 2e5d48797..b915ee1b8 100644
 };

 template<typename T>
@@ -914,11 +942,11 @@ struct ggml_backend_cuda_context {
@@ -1023,11 +1051,11 @@ struct ggml_backend_cuda_context {
 // pool
 std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES];

@@ -277,7 +277,7 @@ index 2e5d48797..b915ee1b8 100644
 }
 return *pools[device];
 }
@@ -926,4 +954,20 @@ struct ggml_backend_cuda_context {
@@ -1035,4 +1063,20 @@ struct ggml_backend_cuda_context {
 ggml_cuda_pool & pool() {
 return pool(device);
 }

@@ -299,7 +299,7 @@ index 2e5d48797..b915ee1b8 100644
+ }
 };
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index c7f9dc3a5..d5abe09e0 100644
index e51c5035..d324bc68 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -350,6 +350,8 @@ const ggml_cuda_device_info & ggml_cuda_info() {

@@ -540,7 +540,7 @@ index c7f9dc3a5..d5abe09e0 100644
 };

 ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
@@ -2936,6 +2998,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
@@ -3008,6 +3070,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,

 static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
 bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {

@@ -548,7 +548,7 @@ index c7f9dc3a5..d5abe09e0 100644
 // flag used to determine whether it is an integrated_gpu
 const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;

@@ -2951,6 +3014,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
@@ -3023,6 +3086,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
 continue;
 }

@@ -559,8 +559,8 @@ index c7f9dc3a5..d5abe09e0 100644
+
 static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
 if (!disable_fusion) {
 if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL }, {})) {
@@ -3022,6 +3090,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
@@ -3149,6 +3217,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx

 static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
 ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

@@ -568,7 +568,7 @@ index c7f9dc3a5..d5abe09e0 100644

 ggml_cuda_set_device(cuda_ctx->device);

@@ -3101,6 +3170,71 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
@@ -3228,6 +3297,71 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
 return GGML_STATUS_SUCCESS;
 }

@@ -640,10 +640,10 @@ index c7f9dc3a5..d5abe09e0 100644
 static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
 ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

@@ -3140,6 +3274,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
@@ -3268,6 +3402,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
 /* .graph_compute = */ ggml_backend_cuda_graph_compute,
 /* .event_record = */ ggml_backend_cuda_event_record,
 /* .event_wait = */ ggml_backend_cuda_event_wait,
 /* .graph_optimize = */ NULL,
+ /* .graph_reserve = */ ggml_backend_cuda_graph_reserve,
+ /* .buffer_size = */ ggml_backend_cuda_buffer_size,
+ /* .reset = */ ggml_backend_cuda_reset,
@@ -8,10 +8,10 @@ Subject: [PATCH] decode: disable output_all
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 26a5cf9c..6ece5263 100644
index d8a8b5e6..09247cef 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -962,8 +962,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
@@ -974,8 +974,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
 const int64_t n_vocab = vocab.n_tokens();
 const int64_t n_embd = hparams.n_embd;
@@ -15,10 +15,10 @@ unused then it can be reset to free these data structures.
 5 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index b602a7c78..fda5ceb24 100644
index d4352663..0a2dae26 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -167,6 +167,7 @@ extern "C" {
@@ -178,6 +178,7 @@ extern "C" {
 GGML_API void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props);
 GGML_API ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device);
 GGML_API ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params);

@@ -27,10 +27,10 @@ index b602a7c78..fda5ceb24 100644
 GGML_API ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device);
 GGML_API ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size);
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
index 81749a5a3..6f10c353b 100644
index 869dc07d..4889df79 100644
--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
@@ -178,6 +178,10 @@ extern "C" {
@@ -195,6 +195,10 @@ extern "C" {
 ggml_backend_event_t (*event_new) (ggml_backend_dev_t dev);
 void (*event_free) (ggml_backend_dev_t dev, ggml_backend_event_t event);
 void (*event_synchronize) (ggml_backend_dev_t dev, ggml_backend_event_t event);

@@ -42,10 +42,10 @@ index 81749a5a3..6f10c353b 100644

 struct ggml_backend_device {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 05a842ed5..6556943b0 100644
index 6ef5eeaf..0b757af5 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -477,6 +477,14 @@ ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * par
@@ -526,6 +526,14 @@ ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * par
 return device->iface.init_backend(device, params);
 }

@@ -58,13 +58,13 @@ index 05a842ed5..6556943b0 100644
+}
+
 ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) {
 GGML_ASSERT(device);
 return device->iface.get_buffer_type(device);
 }
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index c7f9dc3a5..e43fde523 100644
index d324bc68..531d6e27 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -103,6 +103,11 @@ int ggml_cuda_get_device() {
@@ -107,6 +107,11 @@ int ggml_cuda_get_device() {
 return id;
 }

@@ -76,10 +76,10 @@ index c7f9dc3a5..e43fde523 100644
 static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
 ggml_cuda_set_device(device);
 cudaError_t err;
@@ -3243,7 +3248,10 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
@@ -3512,7 +3517,10 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
 props->description = ggml_backend_cuda_device_get_description(dev);
 props->id = ggml_backend_cuda_device_get_id(dev);
 props->type = ggml_backend_cuda_device_get_type(dev);
 props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
- ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
+
+ // Memory reporting is disabled to avoid allocation of a CUDA primary context (~300 MB per device).

@@ -88,7 +88,7 @@ index c7f9dc3a5..e43fde523 100644

 bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
 #ifdef GGML_CUDA_NO_PEER_COPY
@@ -3700,6 +3708,11 @@ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g
@@ -3945,6 +3953,11 @@ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g
 CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
 }

@@ -100,7 +100,7 @@ index c7f9dc3a5..e43fde523 100644
 static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
 /* .get_name = */ ggml_backend_cuda_device_get_name,
 /* .get_description = */ ggml_backend_cuda_device_get_description,
@@ -3716,6 +3729,7 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
@@ -3961,6 +3974,7 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
 /* .event_new = */ ggml_backend_cuda_device_event_new,
 /* .event_free = */ ggml_backend_cuda_device_event_free,
 /* .event_synchronize = */ ggml_backend_cuda_device_event_synchronize,

@@ -108,7 +108,7 @@ index c7f9dc3a5..e43fde523 100644
 };

 // backend reg
@@ -3835,7 +3849,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
@@ -4076,7 +4090,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
 dev_ctx->device = i;
 dev_ctx->name = GGML_CUDA_NAME + std::to_string(i);

@@ -117,10 +117,10 @@ index c7f9dc3a5..e43fde523 100644
 CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
 dev_ctx->description = prop.name;
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
index c31f31923..cf22e60d2 100644
index 37386afc..06f9e7c1 100644
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -40,6 +40,7 @@
@@ -41,6 +41,7 @@
 #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
 #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
 #define cudaDeviceProp hipDeviceProp_t
@@ -8,23 +8,23 @@ management libraries for more accurate VRAM usage reporting if available.
 ---
 ggml/include/ggml-backend.h | 9 +
 ggml/src/CMakeLists.txt | 2 +
 ggml/src/ggml-cuda/ggml-cuda.cu | 75 +++++-
 ggml/src/ggml-cuda/ggml-cuda.cu | 72 +++++
 ggml/src/ggml-cuda/vendors/hip.h | 1 +
 ggml/src/ggml-cuda/vendors/hip.h | 4 +
 ggml/src/ggml-impl.h | 8 +
 ggml/src/ggml-metal/ggml-metal.m | 2 +
 ggml/src/ggml-metal/ggml-metal.cpp | 3 +-
 ggml/src/mem_hip.cpp | 449 +++++++++++++++++++++++++++++++
 ggml/src/mem_hip.cpp | 449 +++++++++++++++++++++++++++++
 ggml/src/mem_nvml.cpp | 172 ++++++++++++
 ggml/src/mem_nvml.cpp | 172 +++++++++++
 8 files changed, 717 insertions(+), 1 deletion(-)
 8 files changed, 718 insertions(+), 1 deletion(-)
 create mode 100644 ggml/src/mem_hip.cpp
 create mode 100644 ggml/src/mem_nvml.cpp
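The mem_nvml.cpp helper introduced by this patch queries free VRAM through NVIDIA's NVML management library instead of the CUDA runtime. As a rough sketch of the underlying API only — the vendored helper actually loads the library and resolves these symbols at runtime, and the device index 0 below is purely illustrative — a directly linked query looks like this:

    // Sketch of an NVML free-VRAM query (links against NVML; illustrative only).
    #include <cstdio>
    #include <nvml.h>

    int main() {
        if (nvmlInit_v2() != NVML_SUCCESS) return 1;   // initializes the NVML driver library
        nvmlDevice_t dev;
        if (nvmlDeviceGetHandleByIndex_v2(0, &dev) == NVML_SUCCESS) { // device 0: hypothetical
            nvmlMemory_t mem;
            if (nvmlDeviceGetMemoryInfo(dev, &mem) == NVML_SUCCESS) {
                // Unlike cudaMemGetInfo(), this does not require creating a CUDA
                // context on the device first, which is the point of the patch.
                std::printf("free: %llu total: %llu\n",
                            (unsigned long long) mem.free, (unsigned long long) mem.total);
            }
        }
        nvmlShutdown();
        return 0;
    }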
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index fda5ceb24..7c2d86703 100644
index 0a2dae26..a6bf3378 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -158,6 +158,15 @@ extern "C" {
@@ -169,6 +169,15 @@ extern "C" {
 size_t memory_total;
 const char * device_id;
 enum ggml_backend_dev_type type;
 // device capabilities
 struct ggml_backend_dev_caps caps;
+ int driver_major;
+ int driver_minor;

@@ -39,10 +39,10 @@ index fda5ceb24..7c2d86703 100644

 GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device);
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 5158acd6a..3a428a22d 100644
index 33b3a15f..86191ef2 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -203,6 +203,8 @@ add_library(ggml-base
@@ -206,6 +206,8 @@ add_library(ggml-base
 ggml-threading.h
 ggml-quants.c
 ggml-quants.h

@@ -52,10 +52,10 @@ index 5158acd6a..3a428a22d 100644

 target_include_directories(ggml-base PRIVATE .)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index e43fde523..14baf0fb1 100644
index 531d6e27..3fa3a057 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -279,6 +279,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
@@ -261,6 +261,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
 for (int id = 0; id < info.device_count; ++id) {
 int device_vmm = 0;

@@ -72,7 +72,7 @@ index e43fde523..14baf0fb1 100644
 #if defined(GGML_USE_VMM)
 CUdevice device;
 CU_CHECK(cuDeviceGet(&device, id));
@@ -332,9 +342,15 @@ static ggml_cuda_device_info ggml_cuda_init() {
@@ -314,6 +324,11 @@ static ggml_cuda_device_info ggml_cuda_init() {
 #else
 info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
 info.devices[id].cc = 100*prop.major + 10*prop.minor;

@@ -84,33 +84,29 @@ index e43fde523..14baf0fb1 100644
 GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
 id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
 ggml_cuda_parse_uuid(prop, id).c_str());
+
@@ -3481,6 +3496,14 @@ struct ggml_backend_cuda_device_context {
 #endif // defined(GGML_USE_HIP)
 }

@@ -3215,6 +3231,14 @@
 std::string name;
 std::string description;
 std::string pci_bus_id;
 std::string id;
+ int major;
+ int minor;
+ int driver_major;
+ int driver_minor;
+ int integrated;
+ int pci_bus_id;
+ int pciBusID;
+ int pci_device_id;
+ int pciDeviceID;
+ int pci_domain_id;
+ int pciDomainID;
 };

 static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -3235,6 +3259,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
@@ -3501,6 +3524,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
 static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
 ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
 ggml_cuda_set_device(ctx->device);
+
+#if defined(GGML_USE_HIP)
+ if (ggml_hip_mgmt_init() == 0) {
+ int status = ggml_hip_get_device_memory(ctx->pci_bus_id, ctx->pci_device_id, free, total);
+ int status = ggml_hip_get_device_memory(ctx->pciBusID, ctx->pciDeviceID, free, total);
+ if (status == 0) {
+ GGML_LOG_DEBUG("%s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, *free, *total);
+ ggml_hip_mgmt_release();

@@ -132,19 +128,18 @@ index e43fde523..14baf0fb1 100644
 CUDA_CHECK(cudaMemGetInfo(free, total));
 }

@@ -3243,6 +3289,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
@@ -3509,6 +3554,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
 return GGML_BACKEND_DEVICE_TYPE_GPU;
 }

+#define GGML_HIP_NAME "HIP"
 static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
 props->name = ggml_backend_cuda_device_get_name(dev);
 ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
 props->description = ggml_backend_cuda_device_get_description(dev);
@@ -3253,6 +3300,23 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
@@ -3522,6 +3568,22 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
 // If you need the memory data, call ggml_backend_dev_memory() explicitly.
 props->memory_total = props->memory_free = 0;

+ ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
+#if defined(GGML_USE_HIP)
+ int cc = ggml_cuda_info().devices[ctx->device].cc - GGML_CUDA_CC_OFFSET_AMD;
+ props->compute_major = cc / 0x100;

@@ -156,15 +151,15 @@ index e43fde523..14baf0fb1 100644
+ props->driver_major = ctx->driver_major;
+ props->driver_minor = ctx->driver_minor;
+ props->integrated = ctx->integrated;
+ props->pci_bus_id = ctx->pci_bus_id;
+ props->pci_bus_id = ctx->pciBusID;
+ props->pci_device_id = ctx->pci_device_id;
+ props->pci_device_id = ctx->pciDeviceID;
+ props->pci_domain_id = ctx->pci_domain_id;
+ props->pci_domain_id = ctx->pciDomainID;
+ props->library = GGML_CUDA_NAME;
+
 bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
 #ifdef GGML_CUDA_NO_PEER_COPY
 bool events = false;
@@ -3843,6 +3907,8 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
@@ -4084,6 +4146,8 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
 std::lock_guard<std::mutex> lock(mutex);
 if (!initialized) {
 ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;

@@ -173,27 +168,36 @@ index e43fde523..14baf0fb1 100644

 for (int i = 0; i < ggml_cuda_info().device_count; i++) {
 ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
@@ -3853,7 +3919,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
@@ -4099,6 +4163,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
 CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
 snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
 dev_ctx->description = prop.name;
 dev_ctx->pci_bus_id = pci_bus_id;
 dev_ctx->id = ggml_cuda_parse_uuid(prop, i);
-
+ dev_ctx->major = prop.major;
+ dev_ctx->minor = prop.minor;
+ dev_ctx->driver_major = driverVersion / 1000;
+ dev_ctx->driver_minor = (driverVersion - (dev_ctx->driver_major * 1000)) / 10;
+ dev_ctx->integrated = prop.integrated;
+ dev_ctx->pci_bus_id = prop.pciBusID;
+ dev_ctx->pciBusID = prop.pciBusID;
+ dev_ctx->pci_device_id = prop.pciDeviceID;
+ dev_ctx->pciDeviceID = prop.pciDeviceID;
+ dev_ctx->pci_domain_id = prop.pciDomainID;
+ dev_ctx->pciDomainID = prop.pciDomainID;
 ggml_backend_dev_t dev = new ggml_backend_device {
 /* .iface = */ ggml_backend_cuda_device_interface,
 /* .reg = */ &reg,
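A note on the driver_major/driver_minor arithmetic in the hunk above: CUDA reports the driver version as a single integer encoded as 1000*major + 10*minor, so the two divisions recover the components. A worked example (the concrete value is hypothetical):

    int driverVersion = 12040;                       // hypothetical value from cudaDriverGetVersion()
    int major = driverVersion / 1000;                // 12
    int minor = (driverVersion - major * 1000) / 10; // 4 -> driver 12.4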
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
index cf22e60d2..957a795f2 100644
index 06f9e7c1..eb8f66cb 100644
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -42,6 +42,7 @@
@@ -5,6 +5,9 @@
 #include <hipblas/hipblas.h>
 #include <hip/hip_fp16.h>
 #include <hip/hip_bf16.h>
+// for rocblas_initialize()
+#include "rocblas/rocblas.h"
+

 #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
 #define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
@@ -43,6 +46,7 @@
 #define cudaDeviceProp hipDeviceProp_t
 #define cudaDeviceReset hipDeviceReset
 #define cudaDeviceSynchronize hipDeviceSynchronize

@@ -202,11 +206,11 @@ index cf22e60d2..957a795f2 100644
 #define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
 #define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index 19a7adb2d..b9b102a5e 100644
index 86a1ebf6..9fc9fbfc 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -602,6 +602,14 @@ static inline bool ggml_can_fuse(const struct ggml_cgraph * cgraph, int node_idx
@@ -635,6 +635,14 @@ static inline bool ggml_can_fuse(const struct ggml_cgraph * cgraph, int node_idx
 return true;
 return ggml_can_fuse_ext(cgraph, idxs, ops, num_ops);
 }

+// Management libraries for fetching more accurate free VRAM data

@@ -220,28 +224,30 @@ index 19a7adb2d..b9b102a5e 100644
 #ifdef __cplusplus
 }
 #endif
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
index e4c31268f..ec6b385ba 100644
index 08ab4fc9..17999a61 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
--- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -6523,12 +6523,14 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
@@ -535,6 +535,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
 GGML_UNUSED(dev);
 }

+#define GGML_METAL_NAME "Metal"
 static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
 static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
 props->name = ggml_backend_metal_device_get_name(dev);
 props->description = ggml_backend_metal_device_get_description(dev);
 props->id = "0";
@@ -542,7 +543,7 @@ static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, ggml_bac
 props->type = ggml_backend_metal_device_get_type(dev);

 ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
-
+ props->library = GGML_METAL_NAME;
 props->caps = (struct ggml_backend_dev_caps) {
 props->caps = {
 /* .async = */ false,
 /* .async = */ true,
 /* .host_buffer = */ false,
diff --git a/ggml/src/mem_hip.cpp b/ggml/src/mem_hip.cpp
new file mode 100644
index 000000000..8ef19b8cf
index 00000000..8ef19b8c
--- /dev/null
+++ b/ggml/src/mem_hip.cpp
@@ -0,0 +1,449 @@

@@ -697,7 +703,7 @@ index 000000000..8ef19b8cf
\ No newline at end of file
diff --git a/ggml/src/mem_nvml.cpp b/ggml/src/mem_nvml.cpp
new file mode 100644
index 000000000..aa05e9dc1
index 00000000..aa05e9dc
--- /dev/null
+++ b/ggml/src/mem_nvml.cpp
@@ -0,0 +1,172 @@
@@ -1,57 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@ollama.com>
Date: Tue, 23 Sep 2025 15:41:58 -0700
Subject: [PATCH] ggml: Backport scale kernel fixes

The GGML scale kernel uses signed 32-bit ints to represent
the number of elements in the tensor. For large images,
mistral-small3.2 overflows this, triggering CUDA errors due
to negative arguments.

Currently, this can happen when the user passes a large image
to mistral-small3.2. However, with upcoming changes to reserve
CUDA memory, it happens every time mistral-small is loaded as
we reserve using a worst case batch.

This patch is part of an upstream GGML commit and should be removed
after GGML is updated past 0a1b398 "ggml: add ops for WAN video model
(cuda && cpu) (#15669)".

Fixes #10388
---
ggml/src/ggml-cuda/scale.cu | 19 ++++++++++---------
1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/ggml/src/ggml-cuda/scale.cu b/ggml/src/ggml-cuda/scale.cu
index 2ee9e5889..0ddeff6a1 100644
--- a/ggml/src/ggml-cuda/scale.cu
+++ b/ggml/src/ggml-cuda/scale.cu
@@ -1,18 +1,19 @@
 #include "scale.cuh"

-static __global__ void scale_f32(const float * x, float * dst, const float scale, const float bias, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+#define MAX_GRIDDIM_X 0x7FFFFFFF

-    if (i >= k) {
-        return;
-    }
+static __global__ void scale_f32(const float * x, float * dst, const float scale, const float bias, const int64_t nelements) {
+    int64_t tid = (int64_t)blockIdx.x * (int64_t)blockDim.x + (int64_t)threadIdx.x;
+    int64_t stride = (int64_t)blockDim.x * (int64_t)gridDim.x;

-    dst[i] = scale * x[i] + bias;
+    for (int64_t i = tid; i < nelements; i += stride) {
+        dst[i] = scale * x[i] + bias;
+    }
 }

-static void scale_f32_cuda(const float * x, float * dst, const float scale, const float bias, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
-    scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, bias, k);
+static void scale_f32_cuda(const float * x, float * dst, const float scale, const float bias, const int64_t nelements, cudaStream_t stream) {
+    const int64_t num_blocks = (nelements + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
+    scale_f32<<<MIN(MAX_GRIDDIM_X, num_blocks), CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, bias, nelements);
 }

 void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
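For context on the backport removed above: a tensor with more than 2^31 - 1 elements wraps negative when narrowed to int, which is exactly what the grid-stride rewrite avoids by carrying a 64-bit count and clamping the grid size. A small host-side sketch of the arithmetic (the 256 block size is an assumption standing in for CUDA_SCALE_BLOCK_SIZE):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t block = 256;                         // assumed CUDA_SCALE_BLOCK_SIZE
        const int64_t nelem = 3LL * 1024 * 1024 * 1024;    // e.g. a very large image tensor
        std::printf("narrowed to int: %d\n", (int) nelem); // wraps negative -> the old bug
        const int64_t num_blocks = (nelem + block - 1) / block;
        const int64_t grid_x = std::min<int64_t>(0x7FFFFFFF, num_blocks); // MAX_GRIDDIM_X clamp
        std::printf("blocks: %lld, grid.x: %lld (rest covered by the grid-stride loop)\n",
                    (long long) num_blocks, (long long) grid_x);
        return 0;
    }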
18 ml/backend/ggml/ggml/include/ggml-backend.h vendored
@@ -132,6 +132,8 @@ extern "C" {
     GGML_BACKEND_DEVICE_TYPE_CPU,
     // GPU device using dedicated memory
     GGML_BACKEND_DEVICE_TYPE_GPU,
+    // integrated GPU device using host memory
+    GGML_BACKEND_DEVICE_TYPE_IGPU,
     // accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
     GGML_BACKEND_DEVICE_TYPE_ACCEL
 };

@@ -150,12 +152,22 @@ extern "C" {

     // all the device properties
     struct ggml_backend_dev_props {
+        // device name
         const char * name;
+        // device description
         const char * description;
-        const char * id;
+        // device free memory in bytes
         size_t memory_free;
+        const char * id;
+        // device total memory in bytes
         size_t memory_total;
+        // device type
         enum ggml_backend_dev_type type;
+        // device id
+        // for PCI devices, this should be the PCI bus id formatted as "domain:bus:device.function" (e.g. "0000:01:00.0")
+        // if the id is unknown, this should be NULL
+        const char * device_id;
+        // device capabilities
         struct ggml_backend_dev_caps caps;
         int driver_major;
         int driver_minor;

@@ -314,12 +326,16 @@ extern "C" {
     GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
     GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);

+    GGML_API ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend);
     GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
     GGML_API size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);

     GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
     GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);

+    // Split graph without allocating it
+    GGML_API void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+
     // Allocate and compute graph on the backend scheduler
     GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); // returns success
     GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
2 ml/backend/ggml/ggml/include/ggml-cpu.h vendored
@@ -101,7 +101,6 @@ extern "C" {
     GGML_BACKEND_API int ggml_cpu_has_riscv_v   (void);
     GGML_BACKEND_API int ggml_cpu_has_vsx       (void);
     GGML_BACKEND_API int ggml_cpu_has_vxe       (void);
-    GGML_BACKEND_API int ggml_cpu_has_nnpa      (void);
     GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
     GGML_BACKEND_API int ggml_cpu_has_llamafile (void);

@@ -135,6 +134,7 @@ extern "C" {
     GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);

     GGML_BACKEND_API void ggml_cpu_fp32_to_fp32(const float *, float *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_fp32_to_i32 (const float *, int32_t *, int64_t);
     GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
     GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
     GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);
7 ml/backend/ggml/ggml/include/ggml-metal.h vendored
@@ -39,18 +39,13 @@ extern "C" {
 // user-code should use only these functions
 //

+// TODO: remove in the future
 GGML_BACKEND_API ggml_backend_t ggml_backend_metal_init(void);

 GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend);

-GGML_DEPRECATED(
-        GGML_BACKEND_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size),
-        "obsoleted by the new device interface - https://github.com/ggml-org/llama.cpp/pull/9713");

 GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);

-GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);

 // helper to check if the device supports a specific family
 // ideally, the user code should be doing these checks
 // ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
27 ml/backend/ggml/ggml/include/ggml-opt.h vendored
@@ -74,16 +74,26 @@ extern "C" {
         GGML_OPT_BUILD_TYPE_OPT   = 30,
     };

+    enum ggml_opt_optimizer_type {
+        GGML_OPT_OPTIMIZER_TYPE_ADAMW,
+        GGML_OPT_OPTIMIZER_TYPE_SGD,
+
+        GGML_OPT_OPTIMIZER_TYPE_COUNT
+    };
+
     // parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
     struct ggml_opt_optimizer_params {
-        // AdamW optimizer parameters
         struct {
             float alpha; // learning rate
-            float beta1;
-            float beta2;
+            float beta1; // first AdamW momentum
+            float beta2; // second AdamW momentum
             float eps;   // epsilon for numerical stability
-            float wd;    // weight decay for AdamW, use 0.0f to disable
+            float wd;    // weight decay - 0.0f to disable
         } adamw;
+        struct {
+            float alpha; // learning rate
+            float wd;    // weight decay
+        } sgd;
     };
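With the sgd member added above, a get_opt_pars callback now fills both sub-structs; only the one matching the selected optimizer type is read. A minimal sketch with hypothetical hyperparameters (upstream's defaults may differ):

    static struct ggml_opt_optimizer_params my_opt_pars(void * userdata) {
        (void) userdata;                 // unused in this sketch
        struct ggml_opt_optimizer_params p;
        p.adamw.alpha = 1e-3f;           // learning rate
        p.adamw.beta1 = 0.9f;            // first momentum
        p.adamw.beta2 = 0.999f;          // second momentum
        p.adamw.eps   = 1e-8f;
        p.adamw.wd    = 0.0f;            // weight decay disabled
        p.sgd.alpha   = 1e-2f;           // only read for GGML_OPT_OPTIMIZER_TYPE_SGD
        p.sgd.wd      = 0.0f;
        return p;
    }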
     // callback to calculate optimizer parameters prior to a backward pass

@@ -114,6 +124,9 @@ extern "C" {

         ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
         void * get_opt_pars_ud;                     // userdata for calculating optimizer parameters

+        // only GGML_OPT_OPTIMIZER_TYPE_ADAMW needs m, v momenta per parameter tensor
+        enum ggml_opt_optimizer_type optimizer;
     };

     // get parameters for an optimization context with defaults set where possible

@@ -142,6 +155,10 @@ extern "C" {
     // get the gradient accumulator for a node from the forward graph
     GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node);

+    GGML_API enum ggml_opt_optimizer_type ggml_opt_context_optimizer_type(ggml_opt_context_t); //TODO consistent naming scheme
+
+    GGML_API const char * ggml_opt_optimizer_name(enum ggml_opt_optimizer_type);
+
     // ====== Optimization Result ======

     GGML_API ggml_opt_result_t ggml_opt_result_init(void);

@@ -226,12 +243,14 @@ extern "C" {
             struct ggml_tensor * outputs,  // output tensor, must have shape [ne_label, ndata_batch] if labels are used
             ggml_opt_dataset_t dataset,    // dataset with data and optionally also labels
             enum ggml_opt_loss_type loss_type, // loss to minimize
+            enum ggml_opt_optimizer_type optimizer, // sgd or adamw
             ggml_opt_get_optimizer_params get_opt_pars, // callback to get optimizer params, userdata is pointer to epoch (of type int64_t)
             int64_t nepoch, // how many times the dataset should be iterated over
             int64_t nbatch_logical, // datapoints optimizer step, must be a multiple of ndata_batch in inputs/outputs
             float val_split, // fraction of the dataset to use for validation, must be in [0.0f, 1.0f)
             bool silent); // whether or not info prints to stderr should be suppressed

 #ifdef __cplusplus
 }
 #endif
17 ml/backend/ggml/ggml/include/ggml-zdnn.h vendored Normal file
@@ -0,0 +1,17 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// device buffer
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_zdnn_buffer_type(void);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_zdnn_reg(void);
+
+#ifdef __cplusplus
+}
+#endif
119 ml/backend/ggml/ggml/include/ggml.h vendored
@@ -241,7 +241,16 @@
 #define GGML_ROPE_TYPE_MROPE  8
 #define GGML_ROPE_TYPE_VISION 24

+#define GGML_MROPE_SECTIONS 4
+
 #define GGML_UNUSED(x) (void)(x)
+#ifdef __CUDACC__
+template<typename... Args>
+__host__ __device__ constexpr inline void ggml_unused_vars_impl(Args&&...) noexcept {}
+#define GGML_UNUSED_VARS(...) ggml_unused_vars_impl(__VA_ARGS__)
+#else
+#define GGML_UNUSED_VARS(...) do { (void)sizeof((__VA_ARGS__, 0)); } while(0)
+#endif // __CUDACC__

 #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

@@ -275,19 +284,19 @@
 // GGML_TENSOR_LOCALS(size_t, nb1, src1, nb);
 //
 #define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
-    const type prefix##0 = (pointer)->array[0]; \
+    const type prefix##0 = (pointer) ? (pointer)->array[0] : 0; \
     GGML_UNUSED(prefix##0);
 #define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \
     GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \
-    const type prefix##1 = (pointer)->array[1]; \
+    const type prefix##1 = (pointer) ? (pointer)->array[1] : 0; \
     GGML_UNUSED(prefix##1);
 #define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
     GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \
-    const type prefix##2 = (pointer)->array[2]; \
+    const type prefix##2 = (pointer) ? (pointer)->array[2] : 0; \
     GGML_UNUSED(prefix##2);
 #define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
     GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \
-    const type prefix##3 = (pointer)->array[3]; \
+    const type prefix##3 = (pointer) ? (pointer)->array[3] : 0; \
     GGML_UNUSED(prefix##3);

 #define GGML_TENSOR_UNARY_OP_LOCALS \
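The null-guard added to the locals macros above changes the expansion so that a missing source tensor yields zeroed locals instead of a NULL dereference; for example, GGML_TENSOR_LOCALS_1(int64_t, ne0, src0, ne) now expands (modulo whitespace) to:

    const int64_t ne00 = (src0) ? (src0)->ne[0] : 0; GGML_UNUSED(ne00);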
@@ -502,7 +511,9 @@ extern "C" {
         GGML_OP_CONV_TRANSPOSE_1D,
         GGML_OP_IM2COL,
         GGML_OP_IM2COL_BACK,
+        GGML_OP_IM2COL_3D,
         GGML_OP_CONV_2D,
+        GGML_OP_CONV_3D,
         GGML_OP_CONV_2D_DW,
         GGML_OP_CONV_TRANSPOSE_2D,
         GGML_OP_POOL_1D,
@@ -540,6 +551,7 @@ extern "C" {
         GGML_OP_CROSS_ENTROPY_LOSS,
         GGML_OP_CROSS_ENTROPY_LOSS_BACK,
         GGML_OP_OPT_STEP_ADAMW,
+        GGML_OP_OPT_STEP_SGD,

         GGML_OP_GLU,
@@ -1392,6 +1404,7 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);

+    // note: casting from f32 to i32 will discard the fractional part
     GGML_API struct ggml_tensor * ggml_cast(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
@@ -1516,7 +1529,11 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);

-    // supports 3D: a->ne[2] == b->ne[1]
+    // supports 4D a:
+    //    a      [n_embd, ne1, ne2, ne3]
+    //    b      I32 [n_rows, ne2, ne3, 1]
+    //
+    //    return [n_embd, n_rows, ne2, ne3]
     GGML_API struct ggml_tensor * ggml_get_rows(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,  // data
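A concrete shape walk-through of the widened contract (dimensions are made up for illustration):

    #include "ggml.h"

    // Sketch: batched row lookup with the new 4D contract.
    // e.g. a = [4096, 1024, 4, 2] and b = I32 [8, 4, 2, 1].
    struct ggml_tensor * example_get_rows(struct ggml_context * ctx,
                                          struct ggml_tensor  * a,
                                          struct ggml_tensor  * b) {
        struct ggml_tensor * rows = ggml_get_rows(ctx, a, b);
        // each (ne2, ne3) slice of b selects rows from the matching slice of a,
        // so rows->ne == [4096, 8, 4, 2] = [n_embd, n_rows, ne2, ne3]
        return rows;
    }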
@@ -1660,7 +1677,7 @@ extern "C" {
             struct ggml_tensor  * b,
             struct ggml_tensor  * c,
             int                   n_dims,
-            int                   sections[4],
+            int                   sections[GGML_MROPE_SECTIONS],
             int                   mode,
             int                   n_ctx_orig,
             float                 freq_base,
@@ -1686,6 +1703,22 @@ extern "C" {
             float                 beta_fast,
             float                 beta_slow);

+    GGML_API struct ggml_tensor * ggml_rope_multi_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            struct ggml_tensor  * c,
+            int                   n_dims,
+            int                   sections[GGML_MROPE_SECTIONS],
+            int                   mode,
+            int                   n_ctx_orig,
+            float                 freq_base,
+            float                 freq_scale,
+            float                 ext_factor,
+            float                 attn_factor,
+            float                 beta_fast,
+            float                 beta_slow);
+
     GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
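A call-site sketch for the new in-place variant (all values illustrative; the sections array splits n_dims across the M-RoPE position groups, and now has a named length instead of a bare 4):

    #include "ggml.h"

    // Sketch: apply multimodal rotary embeddings in place.
    struct ggml_tensor * example_mrope(struct ggml_context * ctx,
                                       struct ggml_tensor  * q,     // query tensor
                                       struct ggml_tensor  * pos) { // I32 positions
        int sections[GGML_MROPE_SECTIONS] = {16, 24, 24, 0}; // illustrative split
        return ggml_rope_multi_inplace(ctx, q, pos, /*c=*/NULL,
                                       /*n_dims=*/128, sections, GGML_ROPE_TYPE_MROPE,
                                       /*n_ctx_orig=*/0, /*freq_base=*/1000000.0f,
                                       /*freq_scale=*/1.0f, /*ext_factor=*/0.0f,
                                       /*attn_factor=*/1.0f, /*beta_fast=*/32.0f,
                                       /*beta_slow=*/1.0f);
    }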
@@ -1843,6 +1876,41 @@ extern "C" {
             int                  d0,  // dilation dimension 0
             int                  d1); // dilation dimension 1

+    GGML_API struct ggml_tensor * ggml_im2col_3d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int64_t               IC,
+            int                   s0, // stride width
+            int                   s1, // stride height
+            int                   s2, // stride depth
+            int                   p0, // padding width
+            int                   p1, // padding height
+            int                   p2, // padding depth
+            int                   d0, // dilation width
+            int                   d1, // dilation height
+            int                   d2, // dilation depth
+            enum ggml_type        dst_type);
+
+    // a: [OC*IC, KD, KH, KW]
+    // b: [N*IC, ID, IH, IW]
+    // result: [N*OC, OD, OH, OW]
+    GGML_API struct ggml_tensor * ggml_conv_3d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int64_t               IC,
+            int                   s0, // stride width
+            int                   s1, // stride height
+            int                   s2, // stride depth
+            int                   p0, // padding width
+            int                   p1, // padding height
+            int                   p2, // padding depth
+            int                   d0, // dilation width
+            int                   d1, // dilation height
+            int                   d2  // dilation depth
+    );
+
     // kernel size is a->ne[0] x a->ne[1]
     // stride is equal to kernel size
     // padding is zero
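For the output shape in the comment above, the usual convolution arithmetic applies per axis; a sketch of the formula (textbook math, not code quoted from the implementation):

    // Sketch: output extent of one axis for a dilated convolution.
    // k = kernel, s = stride, p = padding, d = dilation.
    static int64_t conv_out_extent(int64_t in, int64_t k, int s, int p, int d) {
        return (in + 2*p - d*(k - 1) - 1) / s + 1;
    }
    // e.g. OD = conv_out_extent(ID, KD, s2, p2, d2), and likewise for OH and OW.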
@@ -1914,6 +1982,23 @@ extern "C" {
             int                   d0, // dilation dimension 0
             int                   d1); // dilation dimension 1

+    GGML_API struct ggml_tensor * ggml_conv_3d_direct(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,   // kernel [KW, KH, KD, IC * OC]
+            struct ggml_tensor  * b,   // input  [W, H, D, C * N]
+            int                   s0,  // stride
+            int                   s1,
+            int                   s2,
+            int                   p0,  // padding
+            int                   p1,
+            int                   p2,
+            int                   d0,  // dilation
+            int                   d1,
+            int                   d2,
+            int                   n_channels,
+            int                   n_batch,
+            int                   n_channels_out);
+
     enum ggml_op_pool {
         GGML_OP_POOL_MAX,
         GGML_OP_POOL_AVG,
@@ -2004,6 +2089,19 @@ extern "C" {
             int                  p2,
             int                  p3);

+    GGML_API struct ggml_tensor * ggml_pad_ext(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   lp0,
+            int                   rp0,
+            int                   lp1,
+            int                   rp1,
+            int                   lp2,
+            int                   rp2,
+            int                   lp3,
+            int                   rp3
+    );
+
     // pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
     GGML_API struct ggml_tensor * ggml_pad_reflect_1d(
             struct ggml_context * ctx,
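A sketch of how the asymmetric variant relates to the existing ggml_pad, which pads only on the high side of each dimension; the correspondence is inferred from the signatures, not quoted from the implementation:

    #include "ggml.h"

    // Sketch: one element of left-padding and two of right-padding on dim 0 only.
    // ggml_pad(ctx, a, p0, p1, p2, p3) should behave like ggml_pad_ext with all
    // lp* arguments set to 0.
    struct ggml_tensor * example_pad(struct ggml_context * ctx, struct ggml_tensor * a) {
        return ggml_pad_ext(ctx, a, /*lp0=*/1, /*rp0=*/2, 0, 0, 0, 0, 0, 0);
    }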
@@ -2293,7 +2391,14 @@ extern "C" {
             struct ggml_tensor  * grad,
             struct ggml_tensor  * m,
             struct ggml_tensor  * v,
-            struct ggml_tensor  * adamw_params); // parameters such a the learning rate
+            struct ggml_tensor  * adamw_params); // parameters such as the learning rate
+
+    // stochastic gradient descent step (with weight decay)
+    GGML_API struct ggml_tensor * ggml_opt_step_sgd(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * grad,
+            struct ggml_tensor  * sgd_params); // alpha, weight decay

     //
     // automatic differentiation
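For reference, the update this op family implements is the plain decoupled-weight-decay SGD step; a scalar-form sketch that matches the parameter comment above (alpha = learning rate, wd = weight decay), not code copied from the backend kernels:

    // Sketch: one SGD step with weight decay, applied per element.
    static float sgd_step(float x, float grad, float alpha, float wd) {
        return x * (1.0f - alpha * wd) - alpha * grad;
    }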
4  ml/backend/ggml/ggml/src/CMakeLists.txt  (vendored)
@@ -114,6 +114,9 @@ message(STATUS "GGML_SYSTEM_ARCH: ${GGML_SYSTEM_ARCH}")

 if (NOT MSVC)
     if (GGML_STATIC)
+        if (UNIX AND NOT APPLE)
+            set(CMAKE_FIND_LIBRARY_SUFFIXES ".a;.so")
+        endif()
         add_link_options(-static)
         if (MINGW)
             add_link_options(-static-libgcc -static-libstdc++)
@@ -382,6 +385,7 @@ ggml_add_backend(RPC)
 ggml_add_backend(SYCL)
 ggml_add_backend(Vulkan)
 ggml_add_backend(WebGPU)
+ggml_add_backend(zDNN)
 ggml_add_backend(OpenCL)

 foreach (target ggml-base ggml)
378  ml/backend/ggml/ggml/src/ggml-alloc.c  (vendored)
@@ -23,7 +23,7 @@ static bool ggml_is_view(const struct ggml_tensor * t) {
 }

 // ops that return true for this function must not use restrict pointers for their backend implementations
-static bool ggml_op_can_inplace(enum ggml_op op) {
+bool ggml_op_can_inplace(enum ggml_op op) {
     switch (op) {
         case GGML_OP_SCALE:
         case GGML_OP_DIAG_MASK_ZERO:
@@ -95,39 +95,104 @@ enum ggml_status ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_te

 // dynamic tensor allocator

+#define GGML_VBUFFER_MAX_CHUNKS 16
+
+// relative memory address within an allocation that can be split into multiple buffers (chunks)
+struct buffer_address {
+    int    chunk;  // index of a backend buffer
+    size_t offset; // local memory offset within the buffer
+};
+
+static const struct buffer_address GGML_BUFFER_ADDRESS_INVALID = { -1, SIZE_MAX };
+
+static bool ggml_buffer_address_less(struct buffer_address a, struct buffer_address b) {
+    return a.chunk != b.chunk ? a.chunk < b.chunk : a.offset < b.offset;
+}
+
 struct free_block {
     size_t offset;
     size_t size;
 };

+struct tallocr_chunk {
+    struct free_block free_blocks[MAX_FREE_BLOCKS];
+    int n_free_blocks;
+    size_t max_size;
+};
+
 struct ggml_dyn_tallocr {
     size_t alignment;
-    int n_free_blocks;
-    struct free_block free_blocks[MAX_FREE_BLOCKS];
-    size_t max_size;
+    size_t max_chunk_size;
+    struct tallocr_chunk * chunks[GGML_VBUFFER_MAX_CHUNKS];
+    int n_chunks;

 #ifdef GGML_ALLOCATOR_DEBUG
     struct {
         const struct ggml_tensor * tensor;
-        size_t offset;
+        struct buffer_address addr;
     } allocated_tensors[1024];
 #endif
 };

+static void ggml_dyn_tallocr_insert_block(struct tallocr_chunk * chunk, size_t offset, size_t size) {
+    GGML_ASSERT(chunk->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
+    // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
+    int insert_pos = 0;
+    while (insert_pos < chunk->n_free_blocks && chunk->free_blocks[insert_pos].offset < offset) {
+        insert_pos++;
+    }
+    // shift all blocks from insert_pos onward to make room for the new block
+    for (int i = chunk->n_free_blocks; i > insert_pos; i--) {
+        chunk->free_blocks[i] = chunk->free_blocks[i-1];
+    }
+    // insert the new block
+    chunk->free_blocks[insert_pos].offset = offset;
+    chunk->free_blocks[insert_pos].size = size;
+    chunk->n_free_blocks++;
+}
+
+static void ggml_dyn_tallocr_remove_block(struct tallocr_chunk * chunk, int idx) {
+    // shift all elements after idx by 1 to the left, overwriting the element at idx
+    for (int i = idx; i < chunk->n_free_blocks; i++) {
+        chunk->free_blocks[i] = chunk->free_blocks[i+1];
+    }
+    chunk->n_free_blocks--;
+}
+
+static int ggml_dyn_tallocr_new_chunk(struct ggml_dyn_tallocr * alloc, size_t min_size) {
+    if (alloc->n_chunks >= GGML_VBUFFER_MAX_CHUNKS) {
+        return -1;
+    }
+    struct tallocr_chunk * chunk = calloc(1, sizeof(struct tallocr_chunk));
+    chunk->n_free_blocks = 1;
+    chunk->free_blocks[0].offset = 0;
+    // available space in a chunk is limited to max_chunk_size, but can be higher if:
+    // 1. a single tensor exceeds the maximum, and cannot fit any other way
+    // 2. we are running out of chunks
+    // backends will either manage to allocate the larger size, or report an error.
+    chunk->free_blocks[0].size = MAX(min_size, alloc->max_chunk_size);
+    if (alloc->n_chunks == GGML_VBUFFER_MAX_CHUNKS - 1) {
+        chunk->free_blocks[0].size = SIZE_MAX/2;
+    }
+    alloc->chunks[alloc->n_chunks] = chunk;
+    alloc->n_chunks++;
+    return alloc->n_chunks - 1;
+}
+
 #ifdef GGML_ALLOCATOR_DEBUG
-static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
+static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, const struct ggml_tensor * tensor) {
     for (int i = 0; i < 1024; i++) {
         if (alloc->allocated_tensors[i].tensor == NULL) {
             alloc->allocated_tensors[i].tensor = tensor;
-            alloc->allocated_tensors[i].offset = offset;
+            alloc->allocated_tensors[i].addr = addr;
             return;
         }
     }
     GGML_ABORT("out of allocated_tensors");
 }
-static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
+static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, const struct ggml_tensor * tensor) {
     for (int i = 0; i < 1024; i++) {
-        if (alloc->allocated_tensors[i].offset == offset) {
+        if (alloc->allocated_tensors[i].addr.chunk == addr.chunk && alloc->allocated_tensors[i].addr.offset == addr.offset) {
             alloc->allocated_tensors[i].tensor = NULL;
             return;
         }
@@ -136,76 +201,94 @@ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offs
 }
 #endif

-static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
+static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
     size = aligned_offset(NULL, size, alloc->alignment);

     AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);

+    int best_fit_chunk = -1;
+    int best_fit_block = -1;
     size_t max_avail = 0;

-    // find the best fitting free block besides the last block
-    int best_fit_block = -1;
-    size_t best_fit_size = SIZE_MAX;
-    for (int i = 0; i < alloc->n_free_blocks - 1; i++) {
-        struct free_block * block = &alloc->free_blocks[i];
-        max_avail = MAX(max_avail, block->size);
-        if (block->size >= size && block->size <= best_fit_size) {
-            best_fit_block = i;
-            best_fit_size = block->size;
-        }
-    }
+    // find the best fitting free block besides the last block, within any chunk
+    for (int c = 0; c < alloc->n_chunks; ++c) {
+        struct tallocr_chunk * chunk = alloc->chunks[c];
+        size_t best_fit_size = SIZE_MAX;
+        for (int i = 0; i < chunk->n_free_blocks - 1; i++) {
+            struct free_block * block = &chunk->free_blocks[i];
+            max_avail = MAX(max_avail, block->size);
+            if (block->size >= size && block->size <= best_fit_size) {
+                best_fit_chunk = c;
+                best_fit_block = i;
+                best_fit_size = block->size;
+            }
+        }
+    }

     if (best_fit_block == -1) {
-        // the last block is our last resort
-        struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
-        max_avail = MAX(max_avail, block->size);
-        if (block->size >= size) {
-            best_fit_block = alloc->n_free_blocks - 1;
-        } else {
-            // this should never happen
-            GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
-                    __func__, size, max_avail);
-            GGML_ABORT("not enough space in the buffer");
-        }
-    }
+        // no suitable block found, try the last block (this will grow a chunks size)
+        for (int c = 0; c < alloc->n_chunks; ++c) {
+            struct tallocr_chunk * chunk = alloc->chunks[c];
+            if (chunk->n_free_blocks > 0) {
+                struct free_block * block = &chunk->free_blocks[chunk->n_free_blocks - 1];
+                max_avail = MAX(max_avail, block->size);
+                if (block->size >= size) {
+                    best_fit_chunk = c;
+                    best_fit_block = chunk->n_free_blocks - 1;
+                    break;
+                }
+            }
+        }
+    }

-    struct free_block * block = &alloc->free_blocks[best_fit_block];
-    size_t offset = block->offset;
-    block->offset = offset + size;
+    if (best_fit_block == -1) {
+        // none of the existing chunks have enough space left
+        best_fit_chunk = ggml_dyn_tallocr_new_chunk(alloc, size);
+        best_fit_block = 0;
+    }
+    if (best_fit_chunk == -1) {
+        // since the last chunk always has virtually endless memory, this should never happen
+        GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
+                __func__, size, max_avail);
+        GGML_ABORT("graph allocation: failed to reserve memory");
+    }
+
+    struct tallocr_chunk * chunk = alloc->chunks[best_fit_chunk];
+    struct free_block * block = &chunk->free_blocks[best_fit_block];
+    struct buffer_address addr = {.chunk = best_fit_chunk, .offset = block->offset };
+    block->offset += size;
     block->size -= size;
     if (block->size == 0) {
         // remove block if empty
-        alloc->n_free_blocks--;
-        for (int j = best_fit_block; j < alloc->n_free_blocks; j++) {
-            alloc->free_blocks[j] = alloc->free_blocks[j+1];
-        }
+        ggml_dyn_tallocr_remove_block(chunk, best_fit_block);
     }

-    AT_PRINTF("block %d, offset %zu\n", best_fit_block, offset);
+    AT_PRINTF("block %d, offset %zu, chunk %d\n", best_fit_block, addr.offset, addr.chunk);

 #ifdef GGML_ALLOCATOR_DEBUG
-    add_allocated_tensor(alloc, offset, tensor);
-    size_t cur_max = offset + size;
-    if (cur_max > alloc->max_size) {
-        // sort allocated_tensors by offset
+    add_allocated_tensor(alloc, addr, tensor);
+    size_t cur_max = addr.offset + size;
+    if (cur_max > alloc->max_size[addr.chunk]) {
+        // sort allocated_tensors by chunk/offset
         for (int i = 0; i < 1024; i++) {
             for (int j = i + 1; j < 1024; j++) {
-                if (alloc->allocated_tensors[i].offset > alloc->allocated_tensors[j].offset) {
+                if (ggml_buffer_address_less(alloc->allocated_tensors[j].addr, alloc->allocated_tensors[i].addr)) {
                     const struct ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor;
-                    size_t tmp_offset = alloc->allocated_tensors[i].offset;
+                    struct buffer_address tmp_addr = alloc->allocated_tensors[i].addr;
                     alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor;
-                    alloc->allocated_tensors[i].offset = alloc->allocated_tensors[j].offset;
+                    alloc->allocated_tensors[i].addr = alloc->allocated_tensors[j].addr;
                     alloc->allocated_tensors[j].tensor = tmp_tensor;
-                    alloc->allocated_tensors[j].offset = tmp_offset;
+                    alloc->allocated_tensors[j].addr = tmp_addr;
                 }
             }
         }
-        GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
+        GGML_LOG_DEBUG("max_size[%d] = %.2f MB: tensors: ", addr.chunk, cur_max / 1024.0 / 1024.0);
         for (int i = 0; i < 1024; i++) {
             if (alloc->allocated_tensors[i].tensor) {
-                GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
-                        alloc->allocated_tensors[i].offset,
-                        alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
+                GGML_LOG_DEBUG("%s [%d: %zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
+                        alloc->allocated_tensors[i].addr.chunk,
+                        alloc->allocated_tensors[i].addr.offset,
+                        alloc->allocated_tensors[i].addr.offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
                         ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
             }
         }
@@ -213,78 +296,69 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
     }
 #endif

-    alloc->max_size = MAX(alloc->max_size, offset + size);
+    chunk->max_size = MAX(chunk->max_size, addr.offset + size);

-    return offset;
+    return addr;

     GGML_UNUSED(tensor);
 }

 // this is a very naive implementation, but for our case the number of free blocks should be very small
-static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, size_t size, const struct ggml_tensor * tensor) {
+static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size, const struct ggml_tensor * tensor) {
     size = aligned_offset(NULL, size, alloc->alignment);

-    AT_PRINTF("%s: freeing %s at %zu (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, offset, size, alloc->n_free_blocks);
+    AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n",
+        __func__, tensor->name, addr.chunk, addr.offset, size, alloc->chunks[addr.chunk]->n_free_blocks);

 #ifdef GGML_ALLOCATOR_DEBUG
-    remove_allocated_tensor(alloc, offset, tensor);
+    remove_allocated_tensor(alloc, addr, tensor);
 #endif

+    struct tallocr_chunk * chunk = alloc->chunks[addr.chunk];
+
     // see if we can merge with an existing block
-    for (int i = 0; i < alloc->n_free_blocks; i++) {
-        struct free_block * block = &alloc->free_blocks[i];
+    for (int i = 0; i < chunk->n_free_blocks; i++) {
+        struct free_block * block = &chunk->free_blocks[i];
         // check if ptr is at the end of the block
-        if (block->offset + block->size == offset) {
+        if (block->offset + block->size == addr.offset) {
             block->size += size;
             // check if we can merge with the next block
-            if (i < alloc->n_free_blocks - 1 && block->offset + block->size == alloc->free_blocks[i+1].offset) {
-                block->size += alloc->free_blocks[i+1].size;
-                alloc->n_free_blocks--;
-                for (int j = i+1; j < alloc->n_free_blocks; j++) {
-                    alloc->free_blocks[j] = alloc->free_blocks[j+1];
-                }
-            }
+            if (i < chunk->n_free_blocks - 1) {
+                struct free_block * next = &chunk->free_blocks[i+1];
+                if (block->offset + block->size == next->offset) {
+                    block->size += next->size;
+                    ggml_dyn_tallocr_remove_block(chunk, i+1);
+                }
+            }
             return;
         }
         // check if ptr is at the beginning of the block
-        if (offset + size == block->offset) {
-            block->offset = offset;
+        if (addr.offset + size == block->offset) {
+            block->offset = addr.offset;
             block->size += size;
             // check if we can merge with the previous block
-            if (i > 0 && alloc->free_blocks[i-1].offset + alloc->free_blocks[i-1].size == block->offset) {
-                alloc->free_blocks[i-1].size += block->size;
-                alloc->n_free_blocks--;
-                for (int j = i; j < alloc->n_free_blocks; j++) {
-                    alloc->free_blocks[j] = alloc->free_blocks[j+1];
-                }
-            }
+            if (i > 0) {
+                struct free_block * prev = &chunk->free_blocks[i-1];
+                if (prev->offset + prev->size == block->offset) {
+                    prev->size += block->size;
+                    ggml_dyn_tallocr_remove_block(chunk, i);
+                }
+            }
             return;
         }
     }
     // otherwise, add a new block
-    GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
-    // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
-    int insert_pos = 0;
-    while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].offset < offset) {
-        insert_pos++;
-    }
-    // shift all blocks from insert_pos onward to make room for the new block
-    for (int i = alloc->n_free_blocks; i > insert_pos; i--) {
-        alloc->free_blocks[i] = alloc->free_blocks[i-1];
-    }
-    // insert the new block
-    alloc->free_blocks[insert_pos].offset = offset;
-    alloc->free_blocks[insert_pos].size = size;
-    alloc->n_free_blocks++;
+    ggml_dyn_tallocr_insert_block(chunk, addr.offset, size);

     GGML_UNUSED(tensor);
 }

 static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
-    alloc->n_free_blocks = 1;
-    alloc->free_blocks[0].offset = 0;
-    alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
-    alloc->max_size = 0;
+    for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS; i++) {
+        free(alloc->chunks[i]);
+        alloc->chunks[i] = NULL;
+    }
+    alloc->n_chunks = 0;

 #ifdef GGML_ALLOCATOR_DEBUG
     for (int i = 0; i < 1024; i++) {
@@ -293,14 +367,14 @@ static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
 #endif
 }

-static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
+static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment, size_t max_buffer_size) {
     struct ggml_dyn_tallocr * alloc = (struct ggml_dyn_tallocr *)malloc(sizeof(struct ggml_dyn_tallocr));

     *alloc = (struct ggml_dyn_tallocr) {
         /*.alignment      = */ alignment,
-        /*.n_free_blocks  = */ 0,
-        /*.free_blocks    = */ {{0}},
-        /*.max_size       = */ 0,
+        /*.max_chunk_size = */ MIN(max_buffer_size, SIZE_MAX/2), // clamp to avoid overflows
+        /*.chunks         = */ {NULL},
+        /*.n_chunks       = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
         /*.allocated_tensors = */ {{0}},
 #endif
@@ -312,11 +386,79 @@ static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
 }

 static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) {
+    for (int i = 0; i < alloc->n_chunks; ++i) {
+        free(alloc->chunks[i]);
+    }
     free(alloc);
 }

 static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
-    return alloc->max_size;
+    size_t max_size = 0;
+    for (int i = 0; i < alloc->n_chunks; i++) {
+        max_size += alloc->chunks[i]->max_size;
+    }
+    return max_size;
+}
+
+
+// virtual buffer with contiguous memory range, split into multiple backend buffers (chunks)
+
+struct vbuffer {
+    ggml_backend_buffer_t chunks[GGML_VBUFFER_MAX_CHUNKS];
+};
+
+static void ggml_vbuffer_free(struct vbuffer * buf) {
+    if (buf == NULL) {
+        return;
+    }
+    for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS; ++i) {
+        ggml_backend_buffer_free(buf->chunks[i]);
+    }
+    free(buf);
+}
+
+static int ggml_vbuffer_n_chunks(struct vbuffer * buf) {
+    int n = 0;
+    while (n < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[n]) n++;
+    return n;
+}
+
+static size_t ggml_vbuffer_size(struct vbuffer * buf) {
+    size_t size = 0;
+    for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[i]; ++i) {
+        size += ggml_backend_buffer_get_size(buf->chunks[i]);
+    }
+    return size;
+}
+
+static struct vbuffer * ggml_vbuffer_alloc(ggml_backend_buffer_type_t buft, const struct ggml_dyn_tallocr * talloc, enum ggml_backend_buffer_usage usage) {
+    struct vbuffer * buf = (struct vbuffer *)calloc(1, sizeof(struct vbuffer));
+    if (buf == NULL) {
+        return NULL;
+    }
+
+    for (int n = 0; n < talloc->n_chunks; n++) {
+        size_t chunk_size = talloc->chunks[n]->max_size;
+        buf->chunks[n] = ggml_backend_buft_alloc_buffer(buft, chunk_size);
+        if (buf->chunks[n] == NULL) {
+            ggml_vbuffer_free(buf);
+            return NULL;
+        }
+        ggml_backend_buffer_set_usage(buf->chunks[n], usage);
+    }
+    return buf;
+}
+
+static void ggml_vbuffer_tensor_alloc(struct vbuffer * buf, struct ggml_tensor * tensor, struct buffer_address buf_addr) {
+    void * base = ggml_backend_buffer_get_base(buf->chunks[buf_addr.chunk]);
+    void * addr = (char *)base + buf_addr.offset;
+    ggml_backend_tensor_alloc(buf->chunks[buf_addr.chunk], tensor, addr);
+}
+
+static void ggml_vbuffer_reset(struct vbuffer * buf) {
+    for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[i]; ++i) {
+        ggml_backend_buffer_reset(buf->chunks[i]);
+    }
 }
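The key invariant of the chunked design, restated as a sketch: a (chunk, offset) pair resolves against the backing chunk exactly as ggml_vbuffer_tensor_alloc above does, so callers keep working with one logical buffer even when it is split across several backend allocations. The struct names below come from this diff, so the sketch only compiles inside ggml-alloc.c where they are defined:

    // Sketch: resolving a buffer_address to a concrete pointer.
    static void * example_resolve(struct vbuffer * buf, struct buffer_address addr) {
        void * base = ggml_backend_buffer_get_base(buf->chunks[addr.chunk]);
        return (char *)base + addr.offset;
    }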
@@ -328,13 +470,13 @@ struct hash_node {
     int n_children;
     int n_views;
     int buffer_id;
-    size_t offset; // offset within the buffer
+    struct buffer_address addr;
     bool allocated;
 };

 struct tensor_alloc {
     int buffer_id;
-    size_t offset;
+    struct buffer_address addr;
     size_t size_max; // 0 = pre-allocated, unused, or view
 };
@@ -349,7 +491,7 @@ struct node_alloc {

 struct ggml_gallocr {
     ggml_backend_buffer_type_t * bufts; // [n_buffers]
-    ggml_backend_buffer_t * buffers; // [n_buffers]
+    struct vbuffer ** buffers; // [n_buffers]
     size_t *buffer_sizes; // [n_buffers]
     struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
     int n_buffers;
@@ -371,7 +513,7 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
     galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
     GGML_ASSERT(galloc->bufts != NULL);

-    galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
+    galloc->buffers = calloc(n_bufs, sizeof(struct vbuffer *));
     GGML_ASSERT(galloc->buffers != NULL);

     galloc->buffer_sizes = calloc(n_bufs, sizeof(size_t));
@@ -394,7 +536,8 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs

         if (galloc->buf_tallocs[i] == NULL) {
             size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
-            galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
+            size_t max_size = ggml_backend_buft_get_max_size(bufts[i]);
+            galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment, max_size);
         }
     }
     galloc->n_buffers = n_bufs;
@@ -422,7 +565,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
             }
         }
         if (!freed) {
-            ggml_backend_buffer_free(galloc->buffers[i]);
+            ggml_vbuffer_free(galloc->buffers[i]);
         }
     }
     if (galloc->buf_tallocs != NULL) {
@@ -472,7 +615,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor

     if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
         hn->allocated = true;
-        assert(hn->offset == 0);
+        assert(hn->addr.offset == 0);

         // try to reuse a parent's buffer (inplace)
         if (ggml_op_can_inplace(node->op)) {
@@ -506,9 +649,9 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
                     struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
                     if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
                         AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
-                        assert(view_src_hn->offset == p_hn->offset);
+                        assert(view_src_hn->addr.chunk == p_hn->addr.chunk && view_src_hn->addr.offset == p_hn->addr.offset);
                         hn->buffer_id = p_hn->buffer_id;
-                        hn->offset = p_hn->offset;
+                        hn->addr = p_hn->addr;
                         p_hn->allocated = false; // avoid freeing the parent
                         view_src_hn->allocated = false;
                         return;
@@ -516,7 +659,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
                 } else {
                     AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
                     hn->buffer_id = p_hn->buffer_id;
-                    hn->offset = p_hn->offset;
+                    hn->addr = p_hn->addr;
                     p_hn->allocated = false; // avoid freeing the parent
                     return;
                 }
@@ -527,9 +670,8 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
         struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
         ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
         size_t size = ggml_backend_buft_get_alloc_size(buft, node);
-        size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node);
         hn->buffer_id = buffer_id;
-        hn->offset = offset;
+        hn->addr = ggml_dyn_tallocr_alloc(alloc, size, node);
     }
 }
@@ -541,12 +683,11 @@ static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * n
     }

     struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
-    size_t offset = hn->offset;
     int buffer_id = hn->buffer_id;
     struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
     ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
     size_t size = ggml_backend_buft_get_alloc_size(buft, node);
-    ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
+    ggml_dyn_tallocr_free_tensor(alloc, hn->addr, size, node);
     hn->allocated = false;
 }
@@ -697,24 +838,24 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
         struct node_alloc * node_alloc = &galloc->node_allocs[i];
         if (node->view_src || node->data) {
             node_alloc->dst.buffer_id = -1;
-            node_alloc->dst.offset = SIZE_MAX;
+            node_alloc->dst.addr = GGML_BUFFER_ADDRESS_INVALID;
             node_alloc->dst.size_max = 0;
         } else {
             struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
             node_alloc->dst.buffer_id = hn->buffer_id;
-            node_alloc->dst.offset    = hn->offset;
+            node_alloc->dst.addr      = hn->addr;
             node_alloc->dst.size_max  = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
         }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (!src || src->view_src || src->data) {
                 node_alloc->src[j].buffer_id = -1;
-                node_alloc->src[j].offset = SIZE_MAX;
+                node_alloc->src[j].addr = GGML_BUFFER_ADDRESS_INVALID;
                 node_alloc->src[j].size_max = 0;
             } else {
                 struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
                 node_alloc->src[j].buffer_id = hn->buffer_id;
-                node_alloc->src[j].offset   = hn->offset;
+                node_alloc->src[j].addr     = hn->addr;
                 node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
             }
         }
@@ -730,11 +871,11 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
         struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
         if (leaf->view_src || leaf->data) {
             galloc->leaf_allocs[i].leaf.buffer_id = -1;
-            galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
+            galloc->leaf_allocs[i].leaf.addr = GGML_BUFFER_ADDRESS_INVALID;
             galloc->leaf_allocs[i].leaf.size_max = 0;
         } else {
             galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id;
-            galloc->leaf_allocs[i].leaf.offset = hn->offset;
+            galloc->leaf_allocs[i].leaf.addr = hn->addr;
             galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
         }
     }
@@ -751,7 +892,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
             }
         }

-        size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
+        size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0;
         size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);

         // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
@@ -760,18 +901,17 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
             GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
 #endif

-            ggml_backend_buffer_free(galloc->buffers[i]);
-            galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
+            ggml_vbuffer_free(galloc->buffers[i]);
+            galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
             if (galloc->buffers[i]) {
-                galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
-                ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
+                galloc->buffer_sizes[i] = ggml_vbuffer_size(galloc->buffers[i]);
             } else {
                 GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
                 galloc->buffer_sizes[i] = new_size;
                 success = false;
             }
         } else {
-            galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
+            galloc->buffer_sizes[i] = ggml_vbuffer_size(galloc->buffers[i]);
         }
     }
@@ -784,11 +924,11 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {

 static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
     int buffer_id = tensor_alloc->buffer_id;
-    assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
+    assert(tensor->data || tensor->view_src || ggml_backend_buft_get_alloc_size(galloc->bufts[buffer_id], tensor) <= tensor_alloc->size_max);

     if (tensor->view_src != NULL) {
         if (tensor->buffer == NULL) {
-            assert(tensor_alloc->offset == SIZE_MAX);
+            assert(tensor_alloc->addr.offset == SIZE_MAX);
             if (tensor->view_src->buffer == NULL) {
                 // this tensor was allocated without ggml-backend
                 return;
@@ -797,11 +937,9 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
         }
     } else {
         if (tensor->data == NULL) {
-            assert(tensor_alloc->offset != SIZE_MAX);
-            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
-            void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
-            void * addr = (char *)base + tensor_alloc->offset;
-            ggml_backend_tensor_alloc(galloc->buffers[buffer_id], tensor, addr);
+            assert(tensor_alloc->addr.offset != SIZE_MAX);
+            assert(ggml_backend_buft_get_alloc_size(galloc->bufts[buffer_id], tensor) <= tensor_alloc->size_max);
+            ggml_vbuffer_tensor_alloc(galloc->buffers[buffer_id], tensor, tensor_alloc->addr);
         } else {
             if (tensor->buffer == NULL) {
                 // this tensor was allocated without ggml-backend
@@ -886,7 +1024,7 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
     // reset buffers
     for (int i = 0; i < galloc->n_buffers; i++) {
         if (galloc->buffers[i] != NULL) {
-            ggml_backend_buffer_reset(galloc->buffers[i]);
+            ggml_vbuffer_reset(galloc->buffers[i]);
         }
     }
@@ -929,7 +1067,7 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
         }
     }

-    return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
+    return ggml_vbuffer_size(galloc->buffers[buffer_id]);
 }

 size_t ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
5  ml/backend/ggml/ggml/src/ggml-backend-impl.h  (vendored)
@@ -8,7 +8,7 @@
 extern "C" {
 #endif

-    #define GGML_BACKEND_API_VERSION 1
+    #define GGML_BACKEND_API_VERSION 2

     //
     // Backend buffer type
@@ -121,6 +121,9 @@ extern "C" {
         // wait for an event on on a different stream
         void (*event_wait)     (ggml_backend_t backend, ggml_backend_event_t event);

+        // (optional) sort/optimize the nodes in the graph
+        void (*graph_optimize) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+
         // (optional) reserves intermediate buffers needed for the compution
         // if alloc is true, memory is actually allocated, otherwise the required amount is just returned by buffer_size
         enum ggml_status (*graph_reserve) (ggml_backend_t backend, struct ggml_cgraph * cgraph, bool alloc);
25  ml/backend/ggml/ggml/src/ggml-backend-reg.cpp  (vendored)
@@ -49,6 +49,10 @@
 #include "ggml-webgpu.h"
 #endif

+#ifdef GGML_USE_ZDNN
+#include "ggml-zdnn.h"
+#endif
+
 #ifdef GGML_USE_OPENCL
 #include "ggml-opencl.h"
 #endif
@@ -131,6 +135,10 @@ static void * dl_get_sym(dl_handle * handle, const char * name) {
     return p;
 }

+static const char * dl_error() {
+    return "";
+}
+
 #else

 using dl_handle = void;
@@ -151,6 +159,11 @@ static void * dl_get_sym(dl_handle * handle, const char * name) {
     return dlsym(handle, name);
 }

+static const char * dl_error() {
+    const char *rslt = dlerror();
+    return rslt != nullptr ? rslt : "";
+}
+
 #endif

 using dl_handle_ptr = std::unique_ptr<dl_handle, dl_handle_deleter>;
@@ -180,6 +193,9 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_WEBGPU
         register_backend(ggml_backend_webgpu_reg());
 #endif
+#ifdef GGML_USE_ZDNN
+        register_backend(ggml_backend_zdnn_reg());
+#endif
 #ifdef GGML_USE_OPENCL
         register_backend(ggml_backend_opencl_reg());
 #endif
@@ -238,7 +254,7 @@ struct ggml_backend_registry {
         dl_handle_ptr handle { dl_load_library(path) };
         if (!handle) {
             if (!silent) {
-                GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_str(path).c_str());
+                GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path_str(path).c_str(), dl_error());
             }
             return nullptr;
         }
@@ -398,9 +414,8 @@ ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const

 ggml_backend_t ggml_backend_init_best(void) {
     ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
-    if (!dev) {
-        dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-    }
+    dev = dev ? dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU);
+    dev = dev ? dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
     if (!dev) {
         return nullptr;
     }
@@ -529,7 +544,7 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
             if (filename.native().find(file_prefix) == 0 && ext == file_extension) {
                 dl_handle_ptr handle { dl_load_library(entry) };
                 if (!handle && !silent) {
-                    GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_str(entry.path()).c_str());
+                    GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path_str(entry.path()).c_str(), dl_error());
                 }
                 if (handle) {
                     auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
208  ml/backend/ggml/ggml/src/ggml-backend.cpp  (vendored)
@@ -19,9 +19,8 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <string>
-#include <vector>
 #include <algorithm>
+#include <vector>

 #ifdef __APPLE__
 #include <sys/types.h>
@@ -32,6 +31,7 @@
 // backend buffer type

 const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
+    GGML_ASSERT(buft);
     return buft->iface.get_name(buft);
 }
@@ -54,14 +54,17 @@ ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t
         return buf;
     }

+    GGML_ASSERT(buft);
     return buft->iface.alloc_buffer(buft, size);
 }

 size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
+    GGML_ASSERT(buft);
     return buft->iface.get_alignment(buft);
 }

 size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
+    GGML_ASSERT(buft);
     // get_max_size is optional, defaults to SIZE_MAX
     if (buft->iface.get_max_size) {
         return buft->iface.get_max_size(buft);
@@ -70,6 +73,7 @@ size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
 }

 size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) {
+    GGML_ASSERT(buft);
     // get_alloc_size is optional, defaults to ggml_nbytes
     if (buft->iface.get_alloc_size) {
         size_t size = buft->iface.get_alloc_size(buft, tensor);
@@ -80,6 +84,7 @@ size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const s
 }

 bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
+    GGML_ASSERT(buft);
     if (buft->iface.is_host) {
         return buft->iface.is_host(buft);
     }
@@ -87,6 +92,7 @@ bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
 }

 ggml_backend_dev_t ggml_backend_buft_get_device(ggml_backend_buffer_type_t buft) {
+    GGML_ASSERT(buft);
     return buft->device;
 }
@ -124,10 +130,12 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
|
size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
|
||||||
|
GGML_ASSERT(buffer);
|
||||||
return buffer->size;
|
return buffer->size;
|
||||||
}
|
}
|
||||||
|
|
||||||
void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
|
void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||||
|
GGML_ASSERT(buffer);
|
||||||
// get_base is optional if the buffer is zero-sized
|
// get_base is optional if the buffer is zero-sized
|
||||||
if (buffer->size == 0) {
|
if (buffer->size == 0) {
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
@ -147,6 +155,7 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||||
}
|
}
|
||||||
|
|
||||||
enum ggml_status ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
enum ggml_status ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
||||||
|
GGML_ASSERT(buffer);
|
||||||
// init_tensor is optional
|
// init_tensor is optional
|
||||||
if (buffer->iface.init_tensor) {
|
if (buffer->iface.init_tensor) {
|
||||||
return buffer->iface.init_tensor(buffer, tensor);
|
return buffer->iface.init_tensor(buffer, tensor);
|
||||||
|
|
@ -155,6 +164,7 @@ enum ggml_status ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, s
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||||
|
GGML_ASSERT(buffer);
|
||||||
// clear is optional if the buffer is zero-sized
|
// clear is optional if the buffer is zero-sized
|
||||||
if (buffer->size == 0) {
|
if (buffer->size == 0) {
|
||||||
return;
|
return;
|
||||||
|
|
@ -180,6 +190,7 @@ bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
|
void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
|
||||||
|
GGML_ASSERT(buffer);
|
||||||
buffer->usage = usage;
|
buffer->usage = usage;
|
||||||
|
|
||||||
// FIXME: add a generic callback to the buffer interface
|
// FIXME: add a generic callback to the buffer interface
|
||||||
|
|
@ -189,14 +200,17 @@ void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backe
|
||||||
}
|
}
|
||||||
|
|
||||||
enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage(ggml_backend_buffer_t buffer) {
|
enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage(ggml_backend_buffer_t buffer) {
|
||||||
|
GGML_ASSERT(buffer);
|
||||||
return buffer->usage;
|
return buffer->usage;
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
|
ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
|
||||||
|
GGML_ASSERT(buffer);
|
||||||
return buffer->buft;
|
return buffer->buft;
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
|
void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
|
||||||
|
GGML_ASSERT(buffer);
|
||||||
if (buffer->iface.reset) {
|
if (buffer->iface.reset) {
|
||||||
buffer->iface.reset(buffer);
|
buffer->iface.reset(buffer);
|
||||||
}
|
}
|
||||||
|
|
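Nearly every hunk in this file applies the same hardening: each public entry point now validates its handle with GGML_ASSERT before the first dereference, so a NULL argument fails loudly at the call site instead of segfaulting inside the accessor. A minimal self-contained sketch of the pattern (names hypothetical; MY_ASSERT stands in for GGML_ASSERT):

#include <cassert>
#include <cstddef>

// Hypothetical stand-in for ggml's GGML_ASSERT: abort loudly on a bad handle.
#define MY_ASSERT(x) assert((x) && #x)

struct my_buffer {
    size_t size;
};

// The pattern this diff applies to every public entry point:
// validate the handle first, then dereference.
size_t my_buffer_get_size(const my_buffer * buffer) {
    MY_ASSERT(buffer);
    return buffer->size;
}

int main() {
    my_buffer buf{128};
    return my_buffer_get_size(&buf) == 128 ? 0 : 1;
}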
@@ -235,6 +249,7 @@ void ggml_backend_free(ggml_backend_t backend) {
 }
 
 ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
+    GGML_ASSERT(backend);
     return ggml_backend_dev_buffer_type(backend->device);
 }
 

@@ -251,6 +266,8 @@ size_t ggml_backend_get_max_size(ggml_backend_t backend) {
 }
 
 void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    GGML_ASSERT(backend);
+    GGML_ASSERT(tensor);
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
 

@@ -262,6 +279,8 @@ void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor *
 }
 
 void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_ASSERT(backend);
+    GGML_ASSERT(tensor);
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
 

@@ -303,6 +322,7 @@ void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, siz
 }
 
 void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+    GGML_ASSERT(tensor);
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
     if (size == 0) {

@@ -318,6 +338,7 @@ void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size
 }
 
 void ggml_backend_synchronize(ggml_backend_t backend) {
+    GGML_ASSERT(backend);
     if (backend->iface.synchronize == NULL) {
         return;
     }

@@ -326,18 +347,21 @@ void ggml_backend_synchronize(ggml_backend_t backend) {
 }
 
 ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    GGML_ASSERT(backend);
     GGML_ASSERT(backend->iface.graph_plan_create != NULL);
 
     return backend->iface.graph_plan_create(backend, cgraph);
 }
 
 void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    GGML_ASSERT(backend);
     GGML_ASSERT(backend->iface.graph_plan_free != NULL);
 
     backend->iface.graph_plan_free(backend, plan);
 }
 
 enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    GGML_ASSERT(backend);
     GGML_ASSERT(backend->iface.graph_plan_compute != NULL);
 
     return backend->iface.graph_plan_compute(backend, plan);

@@ -350,22 +374,27 @@ enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_
 }
 
 enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    GGML_ASSERT(backend);
     return backend->iface.graph_compute(backend, cgraph);
 }
 
 bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+    GGML_ASSERT(backend);
     return ggml_backend_dev_supports_op(backend->device, op);
 }
 
 bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    GGML_ASSERT(backend);
     return ggml_backend_dev_supports_buft(backend->device, buft);
 }
 
 bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+    GGML_ASSERT(backend);
     return ggml_backend_dev_offload_op(backend->device, op);
 }
 
 ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
+    GGML_ASSERT(backend);
     return backend->device;
 }
 

@@ -401,6 +430,7 @@ void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t b
         return;
     }
 
+    GGML_ASSERT(backend_dst);
     if (backend_dst->iface.cpy_tensor_async != NULL) {
         if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
             return;

@@ -432,38 +462,52 @@ void ggml_backend_event_free(ggml_backend_event_t event) {
 }
 
 void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend) {
+    GGML_ASSERT(backend);
     GGML_ASSERT(backend->iface.event_record != NULL);
 
     backend->iface.event_record(backend, event);
 }
 
 void ggml_backend_event_synchronize(ggml_backend_event_t event) {
+    GGML_ASSERT(event);
     GGML_ASSERT(event->device->iface.event_synchronize);
 
     event->device->iface.event_synchronize(event->device, event);
 }
 
 void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
+    GGML_ASSERT(backend);
     GGML_ASSERT(backend->iface.event_wait != NULL);
 
     backend->iface.event_wait(backend, event);
 }
 
+static void ggml_backend_graph_optimize(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    GGML_ASSERT(backend);
+    if (backend->iface.graph_optimize != NULL) {
+        backend->iface.graph_optimize(backend, cgraph);
+    }
+}
+
 // Backend device
 
 const char * ggml_backend_dev_name(ggml_backend_dev_t device) {
+    GGML_ASSERT(device);
     return device->iface.get_name(device);
 }
 
 const char * ggml_backend_dev_description(ggml_backend_dev_t device) {
+    GGML_ASSERT(device);
     return device->iface.get_description(device);
 }
 
 void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
+    GGML_ASSERT(device);
     device->iface.get_memory(device, free, total);
 }
 
 enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) {
+    GGML_ASSERT(device);
     return device->iface.get_type(device);
 }
 
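The new static ggml_backend_graph_optimize() above is a thin dispatcher: backends that implement iface.graph_optimize get a chance to rewrite each split before it is compiled, and backends that leave the hook NULL (as ggml-blas does later in this commit) are skipped. A hedged sketch of the dispatch shape, with hypothetical stand-in types since the real ggml structs carry many more members:

#include <cstdio>

// Hypothetical stand-ins for the real ggml types.
struct my_cgraph { int n_nodes; };
struct my_backend;

struct my_backend_iface {
    // Optional hook, mirroring iface.graph_optimize in this diff:
    // backends that cannot optimize simply leave it NULL.
    void (*graph_optimize)(my_backend * backend, my_cgraph * cgraph);
};

struct my_backend { my_backend_iface iface; };

// Mirrors the new ggml_backend_graph_optimize(): a NULL hook means no-op.
static void graph_optimize(my_backend * backend, my_cgraph * cgraph) {
    if (backend->iface.graph_optimize != nullptr) {
        backend->iface.graph_optimize(backend, cgraph);
    }
}

int main() {
    my_cgraph g{3};
    my_backend no_opt{{nullptr}};   // e.g. a backend with no optimizer
    graph_optimize(&no_opt, &g);    // safely does nothing

    my_backend logging{{[](my_backend *, my_cgraph * cg) {
        std::printf("optimizing split with %d nodes\n", cg->n_nodes);
    }}};
    graph_optimize(&logging, &g);
    return 0;
}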
@@ -473,10 +517,12 @@ void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_d
 }
 
 ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device) {
+    GGML_ASSERT(device);
     return device->reg;
 }
 
 ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params) {
+    GGML_ASSERT(device);
     return device->iface.init_backend(device, params);
 }
 

@@ -489,10 +535,12 @@ void ggml_backend_dev_reset(ggml_backend_dev_t device) {
 }
 
 ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) {
+    GGML_ASSERT(device);
     return device->iface.get_buffer_type(device);
 }
 
 ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device) {
+    GGML_ASSERT(device);
     if (device->iface.get_host_buffer_type == NULL) {
         return NULL;
     }
 

@@ -501,18 +549,22 @@ ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t
 }
 
 ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) {
+    GGML_ASSERT(device);
     return device->iface.buffer_from_host_ptr(device, ptr, size, max_tensor_size);
 }
 
 bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
+    GGML_ASSERT(device);
     return device->iface.supports_op(device, op);
 }
 
 bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft) {
+    GGML_ASSERT(device);
     return device->iface.supports_buft(device, buft);
 }
 
 bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
+    GGML_ASSERT(device);
     if (device->iface.offload_op != NULL) {
         return device->iface.offload_op(device, op);
     }
 

@@ -523,18 +575,22 @@ bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_te
 // Backend (reg)
 
 const char * ggml_backend_reg_name(ggml_backend_reg_t reg) {
+    GGML_ASSERT(reg);
     return reg->iface.get_name(reg);
 }
 
 size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg) {
+    GGML_ASSERT(reg);
     return reg->iface.get_device_count(reg);
 }
 
 ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index) {
+    GGML_ASSERT(reg);
     return reg->iface.get_device(reg, index);
 }
 
 void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
+    GGML_ASSERT(reg);
     if (!reg->iface.get_proc_address) {
         return NULL;
     }
 

@@ -549,6 +605,7 @@ struct ggml_backend_multi_buffer_context {
 };
 
 static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    GGML_ASSERT(buffer);
     ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
     for (size_t i = 0; i < ctx->n_buffers; i++) {
         ggml_backend_buffer_free(ctx->buffers[i]);

@@ -560,6 +617,7 @@ static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer)
 }
 
 static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    GGML_ASSERT(buffer);
     ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
     for (size_t i = 0; i < ctx->n_buffers; i++) {
         ggml_backend_buffer_clear(ctx->buffers[i], value);

@@ -595,10 +653,12 @@ ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer
 }
 
 bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
+    GGML_ASSERT(buffer);
     return buffer->iface.free_buffer == ggml_backend_multi_buffer_free_buffer;
 }
 
 void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
+    GGML_ASSERT(buffer);
     GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer));
     ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
     for (size_t i = 0; i < ctx->n_buffers; i++) {
 

@@ -626,7 +686,7 @@ static bool ggml_is_view_op(enum ggml_op op) {
 #endif
 
 #ifndef GGML_SCHED_MAX_SPLIT_INPUTS
-#define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC
+#define GGML_SCHED_MAX_SPLIT_INPUTS 30
 #endif
 
 #ifndef GGML_SCHED_MAX_COPIES
 

@@ -883,7 +943,7 @@ static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, stru
 }
 
 // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
-static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
     // reset splits
     sched->n_splits = 0;
     sched->n_graph_inputs = 0;
 

@@ -1279,6 +1339,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         struct ggml_backend_sched_split * split = &sched->splits[i];
         split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
 
+        // Optimize this split of the graph. This needs to happen before we make graph_copy,
+        // so they are in sync.
+        ggml_backend_graph_optimize(sched->backends[split->backend_id], &split->graph);
+
         // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
         for (int j = 0; j < split->n_inputs; j++) {
             assert(graph_copy->size > (graph_copy->n_nodes + 1));

@@ -1384,17 +1448,22 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
 }
 
 static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
+    GGML_ASSERT(sched);
     struct ggml_backend_sched_split * splits = sched->splits;
 
-    for (int i = 0; i < sched->n_splits; i++) {
-        struct ggml_backend_sched_split * split = &splits[i];
+    ggml_tensor * prev_ids_tensor = nullptr;
+    std::vector<int32_t> ids;
+    std::vector<ggml_bitset_t> used_ids;
+
+    for (int split_id = 0; split_id < sched->n_splits; split_id++) {
+        struct ggml_backend_sched_split * split = &splits[split_id];
         int split_backend_id = split->backend_id;
         ggml_backend_t split_backend = sched->backends[split_backend_id];
 
         // copy the input tensors to the split backend
-        for (int j = 0; j < split->n_inputs; j++) {
-            ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
-            struct ggml_tensor * input = split->inputs[j];
+        for (int input_id = 0; input_id < split->n_inputs; input_id++) {
+            ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[input_id]);
+            struct ggml_tensor * input = split->inputs[input_id];
             struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);
 
             if (input->flags & GGML_TENSOR_FLAG_INPUT) {
 

@@ -1412,6 +1481,93 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                 } else {
                     ggml_backend_synchronize(split_backend);
                 }
 
+                // when offloading MoE weights, we can reduce the amount of data copied by copying only the experts that are used
+                ggml_tensor * node = split->graph.nodes[0];
+                if (split->graph.n_nodes > 0 &&
+                    ggml_backend_buffer_get_usage(input->buffer) == GGML_BACKEND_BUFFER_USAGE_WEIGHTS &&
+                    ggml_backend_buffer_is_host(input->buffer) && (
+                        (node->src[0] == input_cpy && node->op == GGML_OP_MUL_MAT_ID)
+                        //|| (node->src[1] == input_cpy && node->op == GGML_OP_ADD_ID) /* GGML_OP_ADD_ID weights are small and not worth splitting */
+                    )) {
+
+                    const int64_t n_expert    = node->op == GGML_OP_MUL_MAT_ID ? input->ne[2] : input->ne[1];
+                    const size_t  expert_size = node->op == GGML_OP_MUL_MAT_ID ? input->nb[2] : input->nb[1];
+
+                    ggml_backend_synchronize(input_backend);
+
+                    // get the ids
+                    ggml_tensor * ids_tensor = node->src[2];
+                    ggml_backend_t ids_backend = split_backend;
+
+                    // if the ids tensor is also an input of the split, it may not have been copied yet to the split backend
+                    // in that case, we use the original ids tensor
+                    for (int i = input_id + 1; i < split->n_inputs; i++) {
+                        if (ids_tensor == tensor_copy(split->inputs[i], split_backend_id, sched->cur_copy)) {
+                            ids_tensor = split->inputs[i];
+                            ids_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[i]);
+                            break;
+                        }
+                    }
+
+                    if (ids_tensor != prev_ids_tensor) {
+                        ids.resize(ggml_nbytes(ids_tensor) / sizeof(int32_t));
+                        ggml_backend_tensor_get_async(ids_backend, ids_tensor, ids.data(), 0, ggml_nbytes(ids_tensor));
+                        ggml_backend_synchronize(ids_backend);
+
+                        // find the used experts
+                        used_ids.clear();
+                        used_ids.resize(ggml_bitset_size(n_expert));
+                        for (int64_t i1 = 0; i1 < ids_tensor->ne[1]; i1++) {
+                            for (int64_t i0 = 0; i0 < ids_tensor->ne[0]; i0++) {
+                                int32_t id = ids[i1 * ids_tensor->nb[1]/sizeof(int32_t) + i0 * ids_tensor->nb[0]/sizeof(int32_t)];
+                                GGML_ASSERT(id >= 0 && id < n_expert);
+                                ggml_bitset_set(used_ids.data(), id);
+                            }
+                        }
+
+                        prev_ids_tensor = ids_tensor;
+                    }
+
+                    // group consecutive experts and copy them together
+                    auto copy_experts = [&](int32_t first_id, int32_t last_id) {
+                        const size_t expert_offset = first_id * expert_size;
+                        const size_t expert_size_copy = (last_id - first_id + 1) * expert_size;
+                        const size_t padding = std::min<size_t>(expert_size, 512);
+                        const size_t padding_end = last_id < n_expert - 1 ? padding : 0;
+
+                        ggml_backend_tensor_set_async(split_backend,
+                            input_cpy,
+                            (const uint8_t *)input->data + expert_offset, expert_offset,
+                            // copy a bit extra at the end to ensure there are no NaNs in the padding of the last expert
+                            // this is necessary for MMQ in the CUDA backend
+                            expert_size_copy + padding_end);
+                    };
+
+                    int id = 0;
+                    while (!ggml_bitset_get(used_ids.data(), id)) {
+                        id++;
+                    }
+                    int32_t first_id = id;
+                    int32_t last_id  = first_id;
+
+                    for (++id; id < n_expert; ++id) {
+                        if (!ggml_bitset_get(used_ids.data(), id)) {
+                            continue;
+                        }
+
+                        if (id == last_id + 1) {
+                            last_id = id;
+                            continue;
+                        }
+
+                        copy_experts(first_id, last_id);
+
+                        first_id = id;
+                        last_id = id;
+                    }
+                    copy_experts(first_id, last_id);
+                } else {
                     // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
                     // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
                     if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
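The MoE fast path added above boils down to: read the ids tensor once, mark each used expert in a bitset, then walk the bitset and copy maximal runs of consecutive experts in single transfers. A self-contained sketch of that grouping step (illustrative only; std::vector<bool> stands in for ggml_bitset_t, and unlike the scheduler code it also tolerates an all-unused bitset):

#include <cstdio>
#include <vector>

// Group the set bits of `used` into maximal runs [first, last] and invoke
// copy_range once per run -- the same walk the new scheduler code performs
// with ggml_bitset_get() before calling ggml_backend_tensor_set_async().
template <typename F>
void for_each_used_range(const std::vector<bool> & used, F copy_range) {
    const int n = (int) used.size();
    int id = 0;
    while (id < n && !used[id]) {
        id++;                      // skip to the first used expert
    }
    if (id == n) {
        return;                    // no experts used at all
    }
    int first = id, last = id;
    for (++id; id < n; ++id) {
        if (!used[id]) {
            continue;
        }
        if (id == last + 1) {      // extend the current run
            last = id;
            continue;
        }
        copy_range(first, last);   // flush the finished run
        first = last = id;
    }
    copy_range(first, last);       // flush the final run
}

int main() {
    // experts 1..3 and 6 are used -> two transfers: [1,3] and [6,6]
    std::vector<bool> used = {false, true, true, true, false, false, true, false};
    for_each_used_range(used, [](int first, int last) {
        std::printf("copy experts %d..%d in one transfer\n", first, last);
    });
    return 0;
}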
@@ -1425,6 +1581,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                     }
                 }
             }
+        }
 
         if (!sched->callback_eval) {
             enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);

@@ -1578,6 +1735,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
 }
 
 void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
+    GGML_ASSERT(sched);
     // reset state for the next run
     if (!sched->is_reset) {
         ggml_hash_set_reset(&sched->hash_set);

@@ -1589,8 +1747,11 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
 }
 
 bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
+    GGML_ASSERT(sched);
     GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
 
+    ggml_backend_sched_reset(sched);
+
     ggml_backend_sched_synchronize(sched);
 
     ggml_backend_sched_split_graph(sched, measure_graph);
 

@@ -1623,6 +1784,7 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
 }
 
 bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+    GGML_ASSERT(sched);
     GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
     GGML_ASSERT(!sched->is_alloc);
 

@@ -1647,6 +1809,7 @@ enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, st
 }
 
 enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+    GGML_ASSERT(sched);
     if (!sched->is_reset && !sched->is_alloc) {
         ggml_backend_sched_reset(sched);
     }
 

@@ -1661,6 +1824,7 @@ enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sch
 }
 
 void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
+    GGML_ASSERT(sched);
     for (int i = 0; i < sched->n_backends; i++) {
         ggml_backend_synchronize(sched->backends[i]);
     }
 

@@ -1673,28 +1837,42 @@ void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
 }
 
 void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
+    GGML_ASSERT(sched);
     sched->callback_eval = callback;
     sched->callback_eval_user_data = user_data;
 }
 
 int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
+    GGML_ASSERT(sched);
     return sched->n_splits;
 }
 
 int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
+    GGML_ASSERT(sched);
    return sched->n_copies;
 }
 
 int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched) {
+    GGML_ASSERT(sched);
     return sched->n_backends;
 }
 
 ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i) {
+    GGML_ASSERT(sched);
     GGML_ASSERT(i >= 0 && i < sched->n_backends);
     return sched->backends[i];
 }
 
+ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend) {
+    GGML_ASSERT(sched);
+    int backend_index = ggml_backend_sched_backend_id(sched, backend);
+    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
+
+    return sched->bufts[backend_index];
+}
+
 size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
+    GGML_ASSERT(sched);
     int backend_index = ggml_backend_sched_backend_id(sched, backend);
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
 
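ggml_backend_sched_get_buffer_type() above is a new public accessor: it maps a backend back to its scheduler index and returns the buffer type the scheduler was constructed with for that backend. A hedged usage sketch (assumes `sched` and `backend` were created elsewhere with the usual ggml-backend API; the two *_name calls are pre-existing functions):

#include <cstdio>
#include "ggml-backend.h"

// Report which buffer type the scheduler allocates from for a given backend.
static void report_sched_buft(ggml_backend_sched_t sched, ggml_backend_t backend) {
    ggml_backend_buffer_type_t buft = ggml_backend_sched_get_buffer_type(sched, backend);
    std::printf("%s allocates from %s\n",
                ggml_backend_name(backend), ggml_backend_buft_name(buft));
}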
@@ -1715,6 +1893,7 @@ size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched,
 }
 
 void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
+    GGML_ASSERT(sched);
     int backend_index = ggml_backend_sched_backend_id(sched, backend);
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
     tensor_backend_id(node) = backend_index;
 

@@ -1723,6 +1902,7 @@ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct gg
 }
 
 ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
+    GGML_ASSERT(sched);
     int backend_index = tensor_backend_id(node);
     if (backend_index == -1) {
         return NULL;
 

@@ -1733,6 +1913,7 @@ ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched,
 // utils
 
 enum ggml_status ggml_backend_view_init(struct ggml_tensor * tensor) {
+    GGML_ASSERT(tensor);
     GGML_ASSERT(tensor->buffer == NULL);
     GGML_ASSERT(tensor->view_src != NULL);
     GGML_ASSERT(tensor->view_src->buffer != NULL);
 

@@ -1744,6 +1925,7 @@ enum ggml_status ggml_backend_view_init(struct ggml_tensor * tensor) {
 }
 
 enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
+    GGML_ASSERT(tensor);
     GGML_ASSERT(tensor->buffer == NULL);
     GGML_ASSERT(tensor->data == NULL);
     GGML_ASSERT(tensor->view_src == NULL);
 

@@ -1817,6 +1999,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_
 }
 
 struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
+    GGML_ASSERT(graph);
     struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size);
     struct ggml_tensor ** node_copies = (ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
     bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0]));
 

@@ -1961,6 +2144,7 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
 // CPU backend - buffer
 
 static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
+    GGML_ASSERT(buffer);
     uintptr_t data = (uintptr_t)buffer->context;
 
     // align the buffer
 

@@ -1972,6 +2156,7 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
 }
 
 static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    GGML_ASSERT(buffer);
     ggml_aligned_free(buffer->context, buffer->size);
     delete buffer;
 }
 

@@ -1981,24 +2166,28 @@ static void ggml_backend_cpu_ptr_buffer_free_buffer(ggml_backend_buffer_t buffer
 }
 
 static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+    GGML_ASSERT(tensor);
     memset((char *)tensor->data + offset, value, size);
 
     GGML_UNUSED(buffer);
 }
 
 static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    GGML_ASSERT(tensor);
     memcpy((char *)tensor->data + offset, data, size);
 
     GGML_UNUSED(buffer);
 }
 
 static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_ASSERT(tensor);
     memcpy(data, (const char *)tensor->data + offset, size);
 
     GGML_UNUSED(buffer);
 }
 
 static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
+    GGML_ASSERT(src);
     if (ggml_backend_buffer_is_host(src->buffer)) {
         memcpy(dst->data, src->data, ggml_nbytes(src));
         return true;
 

@@ -2009,6 +2198,7 @@ static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con
 }
 
 static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    GGML_ASSERT(buffer);
     memset(buffer->context, value, buffer->size);
 }
@@ -74,7 +74,7 @@ if (BLAS_FOUND)
 
         target_compile_options(ggml-blas PRIVATE ${BLAS_LINKER_FLAGS})
 
-        if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${GGML_BLAS_VENDOR} MATCHES "Generic" OR ${GGML_BLAS_VENDOR} MATCHES "Intel"))
+        if ("${BLAS_INCLUDE_DIRS}" MATCHES "mkl" AND (${GGML_BLAS_VENDOR} MATCHES "Generic" OR ${GGML_BLAS_VENDOR} MATCHES "Intel"))
             add_compile_definitions(GGML_BLAS_USE_MKL)
         endif()
 

@@ -270,6 +270,7 @@ static struct ggml_backend_i blas_backend_i = {
     /* .graph_compute           = */ ggml_backend_blas_graph_compute,
     /* .event_record            = */ NULL,
     /* .event_wait              = */ NULL,
+    /* .graph_optimize          = */ NULL,
 };
 
 static ggml_guid_t ggml_backend_blas_guid(void) {
51 ml/backend/ggml/ggml/src/ggml-cpu/CMakeLists.txt vendored

@@ -224,8 +224,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC SME)
                 string(FIND "${ARM_FEATURE}" "__ARM_FEATURE_${feature} 1" feature_pos)
                 if (NOT ${feature_pos} EQUAL -1)
+                    # Special handling for MATMUL_INT8 when machine doesn't support i8mm
+                    if ("${feature}" STREQUAL "MATMUL_INT8" AND GGML_MACHINE_SUPPORTS_noi8mm)
+                        message(STATUS "ARM feature ${feature} detected but unsetting due to machine not supporting i8mm")
+                        list(APPEND ARCH_FLAGS -U__ARM_FEATURE_MATMUL_INT8)
+                    else()
                     message(STATUS "ARM feature ${feature} enabled")
+                    endif()
                 endif()
             endforeach()
         endif()
     endif()
 

@@ -433,15 +439,31 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             ggml-cpu/arch/riscv/quants.c
             ggml-cpu/arch/riscv/repack.cpp
             )
-        if (GGML_RVV)
-            if (GGML_XTHEADVECTOR)
-                list(APPEND ARCH_FLAGS -march=rv64gc_xtheadvector -mabi=lp64d)
-            elseif (GGML_RV_ZFH)
-                list(APPEND ARCH_FLAGS -march=rv64gcv_zfhmin -mabi=lp64d)
-            else()
-                list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
-            endif()
+        if (GGML_CPU_RISCV64_SPACEMIT)
+            target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_RISCV64_SPACEMIT ${RISCV64_SPACEMIT_IME_SPEC})
+            list(APPEND GGML_CPU_SOURCES
+                ggml-cpu/spacemit/ime.cpp
+                ggml-cpu/spacemit/ime.h
+                ggml-cpu/spacemit/ime1_kernels.cpp
+                ggml-cpu/spacemit/ime_kernels.h
+                )
+        endif()
+        set(MARCH_STR "rv64gc")
+        if (GGML_RV_ZFH)
+            string(APPEND MARCH_STR "_zfh")
+        endif()
+        if (GGML_XTHEADVECTOR)
+            string(APPEND MARCH_STR "_xtheadvector")
+        elseif (GGML_RVV)
+            string(APPEND MARCH_STR "_v")
+            if (GGML_RV_ZVFH)
+                string(APPEND MARCH_STR "_zvfh")
+            endif()
         endif()
+        if (GGML_RV_ZICBOP)
+            string(APPEND MARCH_STR "_zicbop")
+        endif()
+        list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d)
     elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
         message(STATUS "s390x detected")
         list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/s390/quants.c)
 
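The rewritten RISC-V block above composes a single -march string from individual feature toggles instead of choosing between three hard-coded flags; for example, enabling GGML_RVV, GGML_RV_ZVFH, and GGML_RV_ZICBOP yields -march=rv64gc_v_zvfh_zicbop. A small C++ sketch mirroring the same composition logic (illustrative only; the real logic lives in CMake):

#include <cstdio>
#include <string>

// Mirrors the CMake logic: start from the rv64gc baseline and append one ISA
// extension suffix per enabled toggle (parameter names match the GGML_* options).
static std::string riscv_march(bool zfh, bool xtheadvector, bool rvv, bool zvfh, bool zicbop) {
    std::string march = "rv64gc";
    if (zfh)          march += "_zfh";
    if (xtheadvector) march += "_xtheadvector";
    else if (rvv) {
        march += "_v";
        if (zvfh)     march += "_zvfh";
    }
    if (zicbop)       march += "_zicbop";
    return march;
}

int main() {
    // GGML_RVV + GGML_RV_ZVFH + GGML_RV_ZICBOP:
    std::printf("-march=%s\n", riscv_march(false, false, true, true, true).c_str());
    return 0;
}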
@@ -450,7 +472,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
 
         # TODO: Separation to determine activation of VX/VXE/VXE2
         if (${S390X_M} MATCHES "8561|8562")
-            set(GGML_NNPA OFF)
             message(STATUS "z15 target")
             list(APPEND ARCH_FLAGS -march=z15)
         elseif (${S390X_M} MATCHES "3931")
 

@@ -460,7 +481,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             # NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version.
             # binutils must also be updated to the latest for the -march=z17 flag to work. Otherwise, use -march=arch15.
             message(STATUS "z17 target")
-            list(APPEND ARCH_FLAGS -march=z17)
+            list(APPEND ARCH_FLAGS -march=arch15)
         else()
             message(STATUS "Unknown target")
             message(WARNING "Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF.")
 

@@ -472,11 +493,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             list(APPEND ARCH_FLAGS -mvx -mzvector)
             list(APPEND ARCH_DEFINITIONS GGML_VXE)
         endif()
-
-        if (GGML_NNPA)
-            message(STATUS "NNPA enabled")
-            list(APPEND ARCH_DEFINITIONS GGML_NNPA)
-        endif()
     elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm")
         message(STATUS "Wasm detected")
         list (APPEND GGML_CPU_SOURCES ggml-cpu/arch/wasm/quants.c)
 

@@ -497,9 +513,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
 
         # Fetch KleidiAI sources:
         include(FetchContent)
-        set(KLEIDIAI_COMMIT_TAG "v1.11.0")
+        set(KLEIDIAI_COMMIT_TAG "v1.14.0")
         set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
-        set(KLEIDIAI_ARCHIVE_MD5 "3fe9e5ab964c375c53839296eb71eaa2")
+        set(KLEIDIAI_ARCHIVE_MD5 "45e110675d93f99f82c23a1afcca76bc")
 
         if (POLICY CMP0135)
             cmake_policy(SET CMP0135 NEW)
 

@@ -555,6 +571,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
 
         list(APPEND GGML_KLEIDIAI_SOURCES
             ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c
+            ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p4x8sb_f32_neon.c
             ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c
             ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c
             ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c)
 

@@ -575,8 +592,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
                 ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c
                 ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c
                 ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa_asm.S
                 ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c
-                ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c)
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c
+                ${KLEIDIAI_SRC}/kai/kai_common_sme_asm.S)
             set(PRIVATE_ARCH_FLAGS "-fno-tree-vectorize;${PRIVATE_ARCH_FLAGS}+sve+sve2")
         endif()
 

@@ -7,7 +7,7 @@
 #include "ggml-cpu.h"
 #include "traits.h"
 
-#if defined(__gnu_linux__)
+#if defined(__linux__)
 #include <sys/syscall.h>
 #include <unistd.h>
 #endif
 

@@ -186,7 +186,7 @@ static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_ty
 #define XFEATURE_XTILEDATA 18
 
 static bool ggml_amx_init() {
-#if defined(__gnu_linux__)
+#if defined(__linux__)
     if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
         fprintf(stderr, "AMX is not ready to be used!\n");
         return false;
 

@@ -194,6 +194,8 @@ static bool ggml_amx_init() {
     return true;
 #elif defined(_WIN32)
     return true;
+#else
+    return false;
 #endif
 }
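Worth noting for the AMX hunks above: __gnu_linux__ is only defined for glibc targets, so the old guard silently skipped the AMX permission request on musl-based Linux systems; __linux__ covers any Linux libc, and the new #else branch makes other platforms report AMX as unavailable instead of falling off the end of the function. A minimal sketch of the resulting guard shape (the syscall itself elided; see the real code above):

#include <cstdio>

// Sketch of the platform guard after this change: Linux asks the kernel for
// AMX tile-data permission, Windows assumes it is granted, and everything
// else now fails explicitly.
static bool amx_init_sketch() {
#if defined(__linux__)
    // real code: syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)
    return true;
#elif defined(_WIN32)
    return true;
#else
    return false;
#endif
}

int main() {
    std::printf("AMX usable here: %s\n", amx_init_sketch() ? "maybe" : "no");
    return 0;
}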
@@ -40,18 +40,22 @@
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
+#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
 #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
+#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
 #elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
 // repack.cpp
 #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
+#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
+#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
 // repack.cpp
 

@@ -69,7 +73,6 @@
 #define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
 #define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
 #define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
-#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
 #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
 

@@ -80,12 +83,14 @@
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
+#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||||
|
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
|
||||||
#elif defined(__loongarch64)
|
#elif defined(__loongarch64)
|
||||||
// quants.c
|
// quants.c
|
||||||
#define quantize_row_q8_K_generic quantize_row_q8_K
|
#define quantize_row_q8_K_generic quantize_row_q8_K
|
||||||
|
|
@ -103,12 +108,14 @@
|
||||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||||
|
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
|
||||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||||
|
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
|
||||||
#elif defined(__riscv)
|
#elif defined(__riscv)
|
||||||
// quants.c
|
// quants.c
|
||||||
#define quantize_row_q8_K_generic quantize_row_q8_K
|
#define quantize_row_q8_K_generic quantize_row_q8_K
|
||||||
|
|
@ -133,16 +140,16 @@
|
||||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||||
|
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
|
||||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||||
|
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
|
||||||
#elif defined(__s390x__)
|
#elif defined(__s390x__)
|
||||||
// quants.c
|
// quants.c
|
||||||
#define quantize_row_q8_K_generic quantize_row_q8_K
|
#define quantize_row_q8_K_generic quantize_row_q8_K
|
||||||
#define ggml_vec_dot_q5_0_q8_0_generic ggml_vec_dot_q5_0_q8_0
|
|
||||||
#define ggml_vec_dot_q5_1_q8_1_generic ggml_vec_dot_q5_1_q8_1
|
|
||||||
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
|
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
|
||||||
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
|
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
|
||||||
#define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
|
#define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
|
||||||
|
|
@ -153,7 +160,6 @@
|
||||||
#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
|
#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
|
||||||
#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
|
#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
|
||||||
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
|
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
|
||||||
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
|
|
||||||
// repack.cpp
|
// repack.cpp
|
||||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||||
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
|
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
|
||||||
|
|
@ -164,12 +170,14 @@
|
||||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||||
|
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
|
||||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||||
|
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
|
||||||
#elif defined(__wasm__)
|
#elif defined(__wasm__)
|
||||||
// quants.c
|
// quants.c
|
||||||
#define ggml_vec_dot_q4_1_q8_1_generic ggml_vec_dot_q4_1_q8_1
|
#define ggml_vec_dot_q4_1_q8_1_generic ggml_vec_dot_q4_1_q8_1
|
||||||
|
|
@ -195,10 +203,12 @@
|
||||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||||
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||||
|
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
|
||||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||||
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||||
|
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
|
||||||
#endif
|
#endif
|
||||||
|
|
|
||||||
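These hunks register `_generic` fallbacks for the new iq4_nl 8x8 gemv/gemm kernels on every architecture that does not ship a specialized version. The mechanism is an alias macro seen before the generic implementation is compiled, so the preprocessor renames the generic definition to the public kernel name. A minimal sketch of the pattern, with `HAVE_SPECIALIZED_KERNEL` and `my_kernel*` as illustrative stand-ins rather than names from the ggml sources:

```cpp
// Sketch of the alias-macro fallback pattern used by this header.
#if !defined(HAVE_SPECIALIZED_KERNEL)
// Mirrors e.g.: #define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
// With the alias active, compiling the "generic" body below actually
// defines the public symbol my_kernel().
#define my_kernel_generic my_kernel
#endif

// Portable scalar fallback; on targets with a specialized kernel this would
// keep its _generic name and the optimized definition would own my_kernel.
static void my_kernel_generic(const float * x, float * y, int n) {
    for (int i = 0; i < n; ++i) {
        y[i] = 2.0f * x[i];
    }
}

int main() {
    float x[4] = { 1, 2, 3, 4 }, y[4];
    my_kernel(x, y, 4);   // resolves to the generic definition here
    return y[3] == 8.0f ? 0 : 1;
}
```

This keeps callers such as repack.cpp free to reference one kernel name unconditionally while each architecture opts in to its own implementations.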
1579	ml/backend/ggml/ggml/src/ggml-cpu/arch/x86/repack.cpp vendored
File diff suppressed because it is too large
14	ml/backend/ggml/ggml/src/ggml-cpu/common.h vendored
@@ -28,6 +28,14 @@ static inline float bf16_to_f32(ggml_bf16_t x) {
     return GGML_BF16_TO_FP32(x);
 }
 
+static inline float i32_to_f32(int32_t x) {
+    return x;
+}
+
+static inline int32_t f32_to_i32(float x) {
+    return x;
+}
+
 static inline float f32_to_f32(float x) {
     return x;
 }
@@ -54,6 +62,12 @@ struct type_conversion_table<ggml_bf16_t> {
     static constexpr ggml_bf16_t (*from_f32)(float) = f32_to_bf16;
 };
 
+template <>
+struct type_conversion_table<int32_t> {
+    static constexpr float   (*to_f32)(int32_t) = i32_to_f32;
+    static constexpr int32_t (*from_f32)(float) = f32_to_i32;
+};
+
 static std::pair<int64_t, int64_t> get_thread_range(const struct ggml_compute_params * params, const struct ggml_tensor * src0) {
     const int64_t ith = params->ith;
     const int64_t nth = params->nth;
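The `type_conversion_table` specializations give templated CPU ops one uniform way to route any supported element type through F32; this hunk extends the table to I32. A sketch of how such a table is consumed, assuming the table shape shown above (the `copy_through_f32` helper is illustrative, not a function from the ggml sources):

```cpp
// One generic body handles every (SRC, DST) pair that has a table entry:
// convert to F32, then convert back out.
#include <cstdint>

static inline float   i32_to_f32(int32_t x) { return x; }
static inline int32_t f32_to_i32(float x)   { return x; }

template <typename T> struct type_conversion_table;

template <> struct type_conversion_table<int32_t> {
    static constexpr float   (*to_f32)(int32_t) = i32_to_f32;
    static constexpr int32_t (*from_f32)(float) = f32_to_i32;
};

template <typename SRC, typename DST>
static void copy_through_f32(const SRC * src, DST * dst, int64_t n) {
    for (int64_t i = 0; i < n; ++i) {
        dst[i] = type_conversion_table<DST>::from_f32(
                     type_conversion_table<SRC>::to_f32(src[i]));
    }
}

int main() {
    int32_t src[3] = { 1, 2, 3 };
    int32_t dst[3];
    copy_through_f32(src, dst, 3);   // I32 -> F32 -> I32 round trip
    return dst[2] == 3 ? 0 : 1;
}
```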
@@ -68,12 +68,6 @@ struct ggml_compute_params {
 #endif // __VXE2__
 #endif // __s390x__ && __VEC__
 
-#if defined(__s390x__) && defined(GGML_NNPA)
-#ifndef __NNPA__
-#define __NNPA__
-#endif // __NNPA__
-#endif // __s390x__ && GGML_NNPA
-
 #if defined(__ARM_FEATURE_SVE)
 #include <sys/prctl.h>
 #endif
@@ -486,6 +480,19 @@ inline static int16x8_t vec_padd_s16(int16x8_t a, int16x8_t b) {
     return v_abo + v_abe;
 }
 
+/**
+ * @see https://github.com/ggml-org/llama.cpp/pull/14037
+ */
+inline static float vec_hsum_f32x4(float32x4_t v) {
+    float32x4_t v_temp = v + vec_reve(v);
+    return v_temp[0] + v_temp[1];
+}
+
+inline static int32_t vec_hsum_i32x4(int32x4_t v) {
+    int32x4_t v_temp = v + vec_reve(v);
+    return v_temp[0] + v_temp[1];
+}
+
 inline static int32x4_t ggml_vec_dot(int32x4_t acc, int8x16_t a, int8x16_t b) {
     const int16x8_t p = vec_mule(a, b) + vec_mulo(a, b);
     return acc + (vec_unpackh(p) + vec_unpackl(p));
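The new `vec_hsum_f32x4` reduces a 4-lane vector by adding it to its lane-reverse (`vec_reve`), which leaves `{v0+v3, v1+v2, v2+v1, v3+v0}` in the lanes; summing lanes 0 and 1 then yields the full horizontal sum. A portable scalar restatement, with plain arrays standing in for the s390x vector registers:

```cpp
// Scalar restatement of the vec_hsum_f32x4 trick above.
#include <cstdio>

static float hsum_f32x4(const float v[4]) {
    float rev[4] = { v[3], v[2], v[1], v[0] };    // vec_reve(v)
    float tmp[4];
    for (int i = 0; i < 4; ++i) {
        tmp[i] = v[i] + rev[i];                   // pairwise partial sums
    }
    return tmp[0] + tmp[1];                       // (v0+v3) + (v1+v2)
}

int main() {
    float v[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
    printf("%f\n", hsum_f32x4(v));                // prints 10.000000
    return 0;
}
```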
95	ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c vendored
@@ -375,6 +375,9 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_K,
         .nrows = 1,
     },
+    [GGML_TYPE_I32] = {
+        .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_i32,
+    },
 };
 
 const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) {
@@ -472,10 +475,10 @@ struct ggml_threadpool {
 struct ggml_compute_state {
 #ifndef GGML_USE_OPENMP
     ggml_thread_t thrd;
-    bool cpumask[GGML_MAX_N_THREADS];
     int last_graph;
     bool pending;
 #endif
+    bool cpumask[GGML_MAX_N_THREADS];
     struct ggml_threadpool * threadpool;
     int ith;
 };
@@ -1878,10 +1881,18 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_im2col_back_f32(params, tensor);
             } break;
+        case GGML_OP_IM2COL_3D:
+            {
+                ggml_compute_forward_im2col_3d(params, tensor);
+            } break;
         case GGML_OP_CONV_2D:
             {
                 ggml_compute_forward_conv_2d(params, tensor);
             } break;
+        case GGML_OP_CONV_3D:
+            {
+                ggml_compute_forward_conv_3d(params, tensor);
+            } break;
         case GGML_OP_CONV_2D_DW:
             {
                 ggml_compute_forward_conv_2d_dw(params, tensor);
@@ -2024,6 +2035,11 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
                 ggml_compute_forward_opt_step_adamw(params, tensor);
             }
             break;
+        case GGML_OP_OPT_STEP_SGD:
+            {
+                ggml_compute_forward_opt_step_sgd(params, tensor);
+            }
+            break;
         case GGML_OP_NONE:
             {
                 // nop
@@ -2248,7 +2264,9 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             } break;
         case GGML_OP_IM2COL:
         case GGML_OP_IM2COL_BACK:
+        case GGML_OP_IM2COL_3D:
         case GGML_OP_CONV_2D:
+        case GGML_OP_CONV_3D:
         case GGML_OP_CONV_2D_DW:
         case GGML_OP_CONV_TRANSPOSE_1D:
         case GGML_OP_CONV_TRANSPOSE_2D:
@@ -2327,6 +2345,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
         case GGML_OP_CROSS_ENTROPY_LOSS:
         case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
         case GGML_OP_OPT_STEP_ADAMW:
+        case GGML_OP_OPT_STEP_SGD:
            {
                n_tasks = n_threads;
            } break;
@@ -2682,7 +2701,10 @@ struct ggml_cplan ggml_graph_plan(
                 if (ggml_is_quantized(node->type) ||
                     // F16 -> BF16 and BF16 -> F16 copies go through intermediate F32
                     (node->src[0]->type == GGML_TYPE_F16 && node->src[1] && node->src[1]->type == GGML_TYPE_BF16) ||
-                    (node->src[0]->type == GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == GGML_TYPE_F16)) {
+                    (node->src[0]->type == GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == GGML_TYPE_F16) ||
+                    // conversion between F32 and I32
+                    (node->src[0]->type == GGML_TYPE_F32 && node->src[1] && node->src[1]->type == GGML_TYPE_I32) ||
+                    (node->src[0]->type == GGML_TYPE_I32 && node->src[1] && node->src[1]->type == GGML_TYPE_F32)) {
                     cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
                 }
             } break;
@@ -2769,6 +2791,7 @@ struct ggml_cplan ggml_graph_plan(
                 }
             } break;
         case GGML_OP_CONV_2D:
+        case GGML_OP_CONV_3D:
             {
                 cur = GGML_IM2COL_WORK_SIZE;
             } break;
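The scratch rule above reserves one intermediate F32 row per worker thread for copies that now also convert between F32 and I32. A worked example of the sizing arithmetic (the concrete numbers are illustrative, not from the diff):

```cpp
// An I32 -> F32 copy of a tensor with ne[0] = 4096 planned for 8 threads
// reserves one F32 row per thread in the cplan work buffer.
#include <cstddef>
#include <cstdint>

int main() {
    const size_t  type_size_f32 = sizeof(float); // ggml_type_size(GGML_TYPE_F32)
    const int64_t ne0           = 4096;          // row length of the node
    const int     n_tasks       = 8;             // threads assigned to the op

    size_t cur = type_size_f32 * ne0 * n_tasks;  // 4 * 4096 * 8 = 131072 bytes
    return cur == 131072 ? 0 : 1;
}
```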
@@ -3064,7 +3087,14 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
 
     threadpool->workers = workers;
 
-#ifndef GGML_USE_OPENMP
+#ifdef GGML_USE_OPENMP
+    int32_t cpumask_iter = 0;
+
+    // Compute CPU masks for each thread
+    for (int j = 0; j < tpp->n_threads; j++) {
+        ggml_thread_cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter);
+    }
+#else // GGML_USE_OPENMP
     ggml_mutex_init(&threadpool->mutex);
     ggml_cond_init(&threadpool->cond);
@@ -3137,7 +3167,14 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
                 atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
             }
 
-            ggml_graph_compute_thread(&threadpool->workers[omp_get_thread_num()]);
+            // Apply thread CPU mask and priority
+            int ith = omp_get_thread_num();
+
+            ggml_thread_apply_priority(threadpool->prio);
+            if (ggml_thread_cpumask_is_valid(threadpool->workers[ith].cpumask)) {
+                ggml_thread_apply_affinity(threadpool->workers[ith].cpumask);
+            }
+            ggml_graph_compute_thread(&threadpool->workers[ith]);
         }
     } else {
         atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed);
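Taken together, these two hunks make CPU masks available under OpenMP and have each OpenMP worker pin itself (and set its priority) before running graph compute, which previously only happened on the non-OpenMP thread path. A minimal standalone sketch of per-thread pinning inside an OpenMP region; `pthread_setaffinity_np` is the Linux/glibc call, an assumption here, while ggml's own `ggml_thread_apply_affinity` wraps the platform-specific equivalent:

```cpp
// Each OpenMP worker pins itself before doing work, mirroring the pattern
// added to ggml_graph_compute above. Build with: g++ -fopenmp pin.cpp
#define _GNU_SOURCE
#include <omp.h>
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

int main() {
    #pragma omp parallel num_threads(4)
    {
        int ith = omp_get_thread_num();

        cpu_set_t mask;
        CPU_ZERO(&mask);
        CPU_SET(ith, &mask);   // pin worker i to core i (illustrative policy)

        if (pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask) == 0) {
            printf("thread %d pinned to core %d\n", ith, ith);
        }
        // ... per-thread compute runs here, as ggml_graph_compute_thread does
    }
    return 0;
}
```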
@@ -3200,20 +3237,12 @@ void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
         __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
         _mm_storel_epi64((__m128i *)(y + i), y_vec);
     }
-#elif defined(__NNPA__)
-    for (; i + 7 < n; i += 8) {
-        float32x4_t v_xh = vec_xl(0, (const float *)(x + i + 0));
-        float32x4_t v_xl = vec_xl(0, (const float *)(x + i + 4));
-        uint16x8_t v_yd = vec_round_from_fp32(v_xh, v_xl, 0);
-        uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
-        vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
-    }
-    for (; i + 3 < n; i += 4) {
-        float32x4_t v_x = vec_xl(0, (const float *)(x + i));
-        float32x4_t v_zero = vec_splats(0.0f);
-        uint16x8_t v_yd = vec_round_from_fp32(v_x, v_zero, 0);
-        uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
-        vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
-    }
+#elif defined(__riscv_zvfh)
+    for (int vl; i < n; i += vl) {
+        vl = __riscv_vsetvl_e32m2(n - i);
+        vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], vl);
+        vfloat16m1_t vy = __riscv_vfncvt_f_f_w_f16m1(vx, vl);
+        __riscv_vse16_v_f16m1((_Float16 *)&y[i], vy, vl);
+    }
 #endif
     for (; i < n; ++i) {
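The new RVV path is strip-mined: `__riscv_vsetvl_e32m2(n - i)` asks the hardware how many 32-bit elements it will process this pass, so the final, shorter pass is handled by the same loop instead of a separate scalar tail. A portable restatement of the control flow, with a pretend fixed-width vector standing in for the CPU's VLMAX:

```cpp
// Scalar model of the RVV strip-mining loop above. Each pass processes "vl"
// elements, where vl is whatever the hardware grants for the remaining count.
#include <cstdint>

static int set_vl(int64_t remaining) {
    const int vlmax = 8;                    // stand-in for the CPU's VLMAX
    return remaining < vlmax ? (int) remaining : vlmax;
}

void fp32_to_fp16_strip_mined(const float * x, uint16_t * y, int64_t n) {
    int64_t i = 0;
    for (int vl; i < n; i += vl) {
        vl = set_vl(n - i);                 // __riscv_vsetvl_e32m2(n - i)
        for (int j = 0; j < vl; ++j) {
            // placeholder narrowing; the real kernel converts to _Float16
            // with __riscv_vfncvt_f_f_w_f16m1
            y[i + j] = (uint16_t) x[i + j];
        }
    }
}
```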
@@ -3241,21 +3270,6 @@ void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
         __m128 y_vec = _mm_cvtph_ps(x_vec);
         _mm_storeu_ps(y + i, y_vec);
     }
-#elif defined(__NNPA__)
-    for (; i + 7 < n; i += 8) {
-        uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i));
-        uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
-        float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
-        float32x4_t v_yl = vec_extend_to_fp32_lo(v_yd, 0);
-        vec_xst(v_yh, 0, (float *)(y + i + 0));
-        vec_xst(v_yl, 0, (float *)(y + i + 4));
-    }
-    for (; i + 3 < n; i += 4) {
-        uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i));
-        uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
-        float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
-        vec_xst(v_yh, 0, (float *)(y + i));
-    }
 #endif
 
     for (; i < n; ++i) {
@@ -3270,6 +3284,13 @@ void ggml_cpu_fp32_to_bf16(const float * x, ggml_bf16_t * y, int64_t n) {
     }
 }
 
+void ggml_cpu_fp32_to_i32(const float * x, int32_t * y, int64_t n) {
+    int64_t i = 0;
+    for (; i < n; ++i) {
+        y[i] = x[i];
+    }
+}
+
 void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) {
     int64_t i = 0;
 #if defined(__AVX2__)
@@ -3459,14 +3480,6 @@ int ggml_cpu_has_vxe(void) {
 #endif
 }
 
-int ggml_cpu_has_nnpa(void) {
-#if defined(GGML_NNPA)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
 int ggml_cpu_has_neon(void) {
 #if defined(__ARM_ARCH) && defined(__ARM_NEON)
     return 1;
Some files were not shown because too many files have changed in this diff.