ggml: Enable op_offload to improve partial offload performance

When a model is partially offloaded to system RAM, we can either
do the calculations on the CPU or we can temporarily transfer the
data to the GPU to do the calculations there. Small batches tend
to be better on the CPU, large batches on the GPU.

The llamarunner used the GPU in most cases and the ollamarunner
used the CPU. Although the ollamarunner saw an improvement in
token generation performance, there was a large performance hit
in prompt processing (3-10x).

There is an existing heuristic to dynamically switch between these
two modes but in practice it doesn't have enough information to
accurately make that decision. This adds authoritative data to make
the check work to get the best of both worlds.

Fixes #12037
This commit is contained in:
Jesse Gross 2025-10-27 16:32:05 -07:00 committed by Jesse Gross
parent 26465fb85f
commit afaf7ce8c3
15 changed files with 405 additions and 128 deletions

View File

@ -1,58 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Oliver Simons <osimons@nvidia.com>
Date: Tue, 22 Jul 2025 11:02:28 +0200
Subject: [PATCH] Enable CUDA Graphs for gemma3n.
Similar to
https://github.com/ggml-org/llama.cpp/pull/14741,
though ollama has a slightly different model graph
than llama.cpp which requires different workaround
checks.
---
ggml/src/ggml-cuda/ggml-cuda.cu | 18 ++++++++++++++++++
1 file changed, 18 insertions(+)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 5b852f69..827e3205 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2689,14 +2689,26 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
// Loop over nodes in GGML graph to obtain info needed for CUDA graph
cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
+ // This fix was added in llama.cpp and Ollama in parallel, but with
+ // different tensor names.
+ // llama.cpp: https://github.com/ggml-org/llama.cpp/pull/14741
+ // ollama: https://github.com/ollama/ollama/pull/11525
+
+ const std::string gemma3n_per_layer_proj_src1_name_ollama = " (reshaped)";
+ const std::string gemma3n_node_name_ollama = "node_";
+
const std::string gemma3n_per_layer_proj_src0_name = "inp_per_layer_selected";
const std::string gemma3n_per_layer_proj_src1_name = "per_layer_proj";
+
+ const std::string ffn_moe_bias_suffix = "_exps.bias";
+
const std::string ffn_moe_gate_bias_prefix = "ffn_moe_gate_biased";
const std::string ffn_moe_up_bias_prefix = "ffn_moe_up_biased";
const std::string ffn_moe_down_bias_prefix = "ffn_moe_down_biased";
const std::string nemotron_h_block_out_prefix = "nemotron_h_block_out";
const std::string mamba2_y_add_d_prefix = "mamba2_y_add_d";
+
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
@@ -2720,6 +2732,12 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
if (node->op == GGML_OP_ADD &&
node->src[1] && node->src[1]->ne[1] > 1 &&
+ // ollama
+ // workarounds to exclude Gemma3n's `project_per_layer_input` operation from the batch-size heuristic, specific to ollama's implementation of gemma3n
+ // number of layers is different for per_layer_proj between gemma3n:2b and gemma3n:4b, which is why we don't check that value here
+ !(node->ne[0] == 256 && node->ne[2] == 1 && node->ne[3] == 1 && node->src[0] ? std::string(node->src[0]->name).find(gemma3n_node_name_ollama) != std::string::npos : false && node->src[1] ? node->src[1]->name == gemma3n_per_layer_proj_src1_name_ollama : false) &&
+ node->src[1] ? std::string(node->src[1]->name).find(ffn_moe_bias_suffix) == std::string::npos : false &&
+ // upstream
(node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) &&
(node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true) &&
strncmp(node->name, ffn_moe_gate_bias_prefix.c_str(), ffn_moe_gate_bias_prefix.size()) != 0 &&

View File

@ -0,0 +1,300 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@ollama.com>
Date: Tue, 28 Oct 2025 17:36:54 -0700
Subject: [PATCH] ggml: Add batch size hint
Some operations use heuristics to determine the batch size, which
affects offloading decisions. However, these are not always
accurate when looking at single operations. This provides an
explicit signal on the batch size from higher layers to ensure
consistent performance.
---
ggml/include/ggml-backend.h | 5 ++-
ggml/src/ggml-backend-impl.h | 4 +--
ggml/src/ggml-backend.cpp | 19 +++++++----
ggml/src/ggml-blas/ggml-blas.cpp | 3 +-
ggml/src/ggml-cpu/ggml-cpu.cpp | 4 ++-
ggml/src/ggml-cuda/ggml-cuda.cu | 48 +++++++++++++++++-----------
ggml/src/ggml-metal/ggml-metal.cpp | 4 ++-
ggml/src/ggml-vulkan/ggml-vulkan.cpp | 3 +-
8 files changed, 58 insertions(+), 32 deletions(-)
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 229bf387b..2763f2bd6 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -98,7 +98,7 @@ extern "C" {
GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
- GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+ GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size);
// NOTE: will be removed, use device version instead
GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
@@ -307,6 +307,9 @@ extern "C" {
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload);
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
+ // Provide a hint on the batch size to optimize processing (uses heuristics if unset)
+ GGML_API void ggml_backend_sched_set_batch_size(ggml_backend_sched_t sched, int batch_size);
+
// Initialize backend buffers from a measure graph
GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
index 6792ba986..0f5b03cef 100644
--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
@@ -106,8 +106,8 @@ extern "C" {
// compute the graph with the plan
enum ggml_status (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
- // compute graph (always async if supported by the backend)
- enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+ // compute graph (always async if supported by the backend). batch_size may be -1 if unknown
+ enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size);
// (optional) event synchronization
// record an event on this stream
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index cb2b99562..41eef3b5f 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -348,14 +348,14 @@ enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_ba
}
enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
- enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph);
+ enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph, -1);
ggml_backend_synchronize(backend);
return err;
}
-enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size) {
GGML_ASSERT(backend);
- return backend->iface.graph_compute(backend, cgraph);
+ return backend->iface.graph_compute(backend, cgraph, batch_size);
}
bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
@@ -722,6 +722,8 @@ struct ggml_backend_sched {
bool op_offload;
+ int batch_size; // a hint on the batch size to optimize processing, -1 to use heuristics
+
int debug;
};
@@ -814,7 +816,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
// check if a backend with higher prio wants to offload the op
- if (sched->op_offload && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) {
+ if (sched->op_offload && (sched->batch_size < 0 || sched->batch_size >= 32) && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) {
for (int b = 0; b < src_backend_id; b++) {
if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
SET_CAUSE(tensor, "1.off");
@@ -1550,7 +1552,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
}
if (!sched->callback_eval) {
- enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
+ enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph, sched->batch_size);
if (ec != GGML_STATUS_SUCCESS) {
return ec;
}
@@ -1572,7 +1574,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
- enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv);
+ enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv, sched->batch_size);
if (ec != GGML_STATUS_SUCCESS) {
return ec;
}
@@ -1651,6 +1653,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
sched->op_offload = op_offload;
+ sched->batch_size = -1;
ggml_backend_sched_reset(sched);
@@ -1682,6 +1685,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
free(sched);
}
+void ggml_backend_sched_set_batch_size(ggml_backend_sched_t sched, int batch_size) {
+ sched->batch_size = batch_size;
+}
+
void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
GGML_ASSERT(sched);
// reset state for the next run
diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp
index 5b888cdd8..88d088952 100644
--- a/ggml/src/ggml-blas/ggml-blas.cpp
+++ b/ggml/src/ggml-blas/ggml-blas.cpp
@@ -224,7 +224,7 @@ static void ggml_backend_blas_free(ggml_backend_t backend) {
delete backend;
}
-static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size) {
ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context;
for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -254,6 +254,7 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend,
return GGML_STATUS_SUCCESS;
GGML_UNUSED(backend);
+ GGML_UNUSED(batch_size);
}
static struct ggml_backend_i blas_backend_i = {
diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp
index 3191faaa4..32f14c811 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -164,7 +164,7 @@ static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backe
GGML_UNUSED(backend);
}
-static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size) {
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
@@ -184,6 +184,8 @@ static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, s
cplan.abort_callback_data = cpu_ctx->abort_callback_data;
return ggml_graph_compute(cgraph, &cplan);
+
+ GGML_UNUSED(batch_size);
}
static const struct ggml_backend_i ggml_backend_cpu_i = {
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 5b852f690..c555cd30f 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2684,7 +2684,7 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
#ifdef USE_CUDA_GRAPH
static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
- bool use_cuda_graph) {
+ int batch_size, bool use_cuda_graph) {
// Loop over nodes in GGML graph to obtain info needed for CUDA graph
cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
@@ -2718,24 +2718,34 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
#endif
}
- if (node->op == GGML_OP_ADD &&
- node->src[1] && node->src[1]->ne[1] > 1 &&
- (node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) &&
- (node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true) &&
- strncmp(node->name, ffn_moe_gate_bias_prefix.c_str(), ffn_moe_gate_bias_prefix.size()) != 0 &&
- strncmp(node->name, ffn_moe_up_bias_prefix.c_str(), ffn_moe_up_bias_prefix.size()) != 0 &&
- strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0 &&
- strncmp(node->name, nemotron_h_block_out_prefix.c_str(), nemotron_h_block_out_prefix.size()) != 0 &&
- strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0) {
- // disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation
- // by means of matching node names. See
- // https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and
- // https://github.com/huggingface/transformers/blob/bda75b4011239d065de84aa3e744b67ebfa7b245/src/transformers/models/gemma3n/modeling_gemma3n.py#L1773,
- // Generally, changes in batch size or context size can cause changes to the grid size of some kernels.
- use_cuda_graph = false;
+ // If we have an explicit batch size hint then we don't need to use the tensor name heuristics
+ if (batch_size >= 0) {
+ if (batch_size > 1) {
+ use_cuda_graph = false;
#ifndef NDEBUG
- GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+ GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%d]\n", __func__, batch_size);
#endif
+ }
+ } else {
+ if (node->op == GGML_OP_ADD &&
+ node->src[1] && node->src[1]->ne[1] > 1 &&
+ (node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) &&
+ (node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true) &&
+ strncmp(node->name, ffn_moe_gate_bias_prefix.c_str(), ffn_moe_gate_bias_prefix.size()) != 0 &&
+ strncmp(node->name, ffn_moe_up_bias_prefix.c_str(), ffn_moe_up_bias_prefix.size()) != 0 &&
+ strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0 &&
+ strncmp(node->name, nemotron_h_block_out_prefix.c_str(), nemotron_h_block_out_prefix.size()) != 0 &&
+ strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0) {
+ // disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation
+ // by means of matching node names. See
+ // https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and
+ // https://github.com/huggingface/transformers/blob/bda75b4011239d065de84aa3e744b67ebfa7b245/src/transformers/models/gemma3n/modeling_gemma3n.py#L1773,
+ // Generally, changes in batch size or context size can cause changes to the grid size of some kernels.
+ use_cuda_graph = false;
+#ifndef NDEBUG
+ GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+#endif
+ }
}
if (node->op == GGML_OP_CPY) {
@@ -3132,7 +3142,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
}
}
-static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
ggml_cuda_set_device(cuda_ctx->device);
@@ -3170,7 +3180,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
if (use_cuda_graph) {
cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);
- use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph, use_cuda_graph);
+ use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph, batch_size, use_cuda_graph);
// Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
if (use_cuda_graph && cuda_graph_update_required) {
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
index f2ff9f322..05ff6a5a6 100644
--- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -410,10 +410,12 @@ static bool ggml_backend_metal_cpy_tensor_async(ggml_backend_t backend_src, ggml
GGML_UNUSED(dst);
}
-static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) {
ggml_metal_t ctx = (ggml_metal_t)backend->context;
return ggml_metal_graph_compute(ctx, cgraph);
+
+ GGML_UNUSED(batch_size);
}
static void ggml_backend_metal_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) {
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index ed83236f4..bd3ece516 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -12015,7 +12015,7 @@ static uint32_t ggml_vk_fuse_multi_add(ggml_backend_vk_context * ctx, const stru
return num_adds;
}
-static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) {
VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)");
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
@@ -12211,6 +12211,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
return GGML_STATUS_SUCCESS;
UNUSED(backend);
+ UNUSED(batch_size);
}
// Sort the graph for improved parallelism.

View File

@ -16,7 +16,7 @@ must be recreated with no-alloc set to false before loading data.
5 files changed, 310 insertions(+), 44 deletions(-)
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 229bf387..1ff53ed0 100644
index 2763f2bd6..b3b5b356a 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -305,6 +305,7 @@ extern "C" {
@ -26,9 +26,9 @@ index 229bf387..1ff53ed0 100644
+ GGML_API ggml_backend_sched_t ggml_backend_sched_new_ext(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload, bool alloc_buffers);
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
// Initialize backend buffers from a measure graph
// Provide a hint on the batch size to optimize processing (uses heuristics if unset)
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
index 6792ba98..3c3f22fc 100644
index 0f5b03cef..7bdf9d81f 100644
--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
@@ -26,12 +26,17 @@ extern "C" {
@ -75,7 +75,7 @@ index 6792ba98..3c3f22fc 100644
struct ggml_backend {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index cb2b9956..6ef5eeaf 100644
index 41eef3b5f..c81a2e48a 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -41,6 +41,19 @@ ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t
@ -121,8 +121,8 @@ index cb2b9956..6ef5eeaf 100644
void * base = buffer->iface.get_base(buffer);
GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
@@ -723,6 +743,12 @@ struct ggml_backend_sched {
bool op_offload;
@@ -725,6 +745,12 @@ struct ggml_backend_sched {
int batch_size; // a hint on the batch size to optimize processing, -1 to use heuristics
int debug;
+
@ -134,7 +134,7 @@ index cb2b9956..6ef5eeaf 100644
};
#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
@@ -1606,6 +1632,17 @@ ggml_backend_sched_t ggml_backend_sched_new(
@@ -1608,6 +1634,17 @@ ggml_backend_sched_t ggml_backend_sched_new(
size_t graph_size,
bool parallel,
bool op_offload) {
@ -152,7 +152,7 @@ index cb2b9956..6ef5eeaf 100644
GGML_ASSERT(n_backends > 0);
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
@@ -1647,10 +1684,13 @@ ggml_backend_sched_t ggml_backend_sched_new(
@@ -1649,11 +1686,14 @@ ggml_backend_sched_t ggml_backend_sched_new(
sched->events[b][c] = ggml_backend_event_new(backends[b]->device);
}
}
@ -162,11 +162,12 @@ index cb2b9956..6ef5eeaf 100644
sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
sched->op_offload = op_offload;
sched->batch_size = -1;
+ sched->alloc_buffers = alloc_buffers;
ggml_backend_sched_reset(sched);
@@ -1665,6 +1705,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
@@ -1668,6 +1708,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
for (int c = 0; c < sched->n_copies; c++) {
ggml_backend_event_free(sched->events[b][c]);
}
@ -177,7 +178,7 @@ index cb2b9956..6ef5eeaf 100644
}
ggml_gallocr_free(sched->galloc);
ggml_free(sched->ctx);
@@ -1708,6 +1752,24 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
@@ -1715,6 +1759,24 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
return false;
}
@ -202,7 +203,7 @@ index cb2b9956..6ef5eeaf 100644
ggml_backend_sched_reset(sched);
return true;
@@ -1813,7 +1875,13 @@ size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched,
@@ -1820,7 +1882,13 @@ size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched,
int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
@ -218,7 +219,7 @@ index cb2b9956..6ef5eeaf 100644
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index e0abde54..28d6bcd7 100644
index e0abde542..28d6bcd71 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -35,6 +35,31 @@
@ -299,7 +300,7 @@ index e0abde54..28d6bcd7 100644
+ }
};
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 827e3205..811462c7 100644
index f4d4a4267..ac70dcac8 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -350,6 +350,8 @@ const ggml_cuda_device_info & ggml_cuda_info() {
@ -540,7 +541,7 @@ index 827e3205..811462c7 100644
};
ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
@@ -3011,6 +3073,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
@@ -3003,6 +3065,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
@ -548,7 +549,7 @@ index 827e3205..811462c7 100644
// flag used to determine whether it is an integrated_gpu
const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;
@@ -3026,6 +3089,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
@@ -3018,6 +3081,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
continue;
}
@ -560,15 +561,15 @@ index 827e3205..811462c7 100644
static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
if (!disable_fusion) {
@@ -3152,6 +3220,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
@@ -3144,6 +3212,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+ cuda_ctx->pool_set_alloc(true);
ggml_cuda_set_device(cuda_ctx->device);
@@ -3231,6 +3300,71 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
@@ -3223,6 +3292,71 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
return GGML_STATUS_SUCCESS;
}
@ -640,7 +641,7 @@ index 827e3205..811462c7 100644
static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
@@ -3271,6 +3405,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
@@ -3263,6 +3397,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
/* .event_record = */ ggml_backend_cuda_event_record,
/* .event_wait = */ ggml_backend_cuda_event_wait,
/* .graph_optimize = */ NULL,

View File

@ -106,6 +106,11 @@ type Context interface {
Arange(start, stop, step float32, dtype DType) Tensor
Forward(...Tensor) Context
// SetBatchSize provides a hint on the batch size to optimize processing
// Uses heuristics if not set
SetBatchSize(int)
Compute(...Tensor)
ComputeWithNotify(func(), ...Tensor) // notify callback once compute has begun

View File

@ -386,7 +386,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
C.int(len(schedBackends)),
C.size_t(maxGraphNodes),
C._Bool(false),
C._Bool(false),
C._Bool(true),
C._Bool(params.AllocMemory),
)
@ -749,6 +749,9 @@ type Context struct {
ctx *C.struct_ggml_context
graph *C.struct_ggml_cgraph
// batchSize is a hint to optimize processing
batchSize int
// buft is the buffer type used for new tensors
buft C.ggml_backend_buffer_type_t
@ -805,6 +808,10 @@ func (c *Context) Forward(tensors ...ml.Tensor) ml.Context {
return c
}
func (c *Context) SetBatchSize(batchSize int) {
c.batchSize = batchSize
}
func (c *Context) Compute(tensors ...ml.Tensor) {
c.ComputeWithNotify(nil, tensors...)
}
@ -815,6 +822,11 @@ func (c *Context) ComputeWithNotify(cb func(), tensors ...ml.Tensor) {
if cb != nil {
go cb()
}
if c.batchSize > 0 {
C.ggml_backend_sched_set_batch_size(c.b.sched, C.int(c.batchSize))
}
if status := C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph); status != C.GGML_STATUS_SUCCESS {
panic(fmt.Errorf("error computing ggml graph: %v", status))
}
@ -836,6 +848,10 @@ func (c *Context) ComputeWithNotify(cb func(), tensors ...ml.Tensor) {
}
func (c *Context) Reserve() {
if c.batchSize > 0 {
C.ggml_backend_sched_set_batch_size(c.b.sched, C.int(c.batchSize))
}
reserved := C.ggml_backend_sched_reserve(c.b.sched, c.graph)
slog.Debug("compute graph", "nodes", C.ggml_graph_n_nodes(c.graph), "splits", C.ggml_backend_sched_get_n_splits(c.b.sched))

View File

@ -98,7 +98,7 @@ extern "C" {
GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size);
// NOTE: will be removed, use device version instead
GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
@ -317,6 +317,9 @@ extern "C" {
GGML_API ggml_backend_sched_t ggml_backend_sched_new_ext(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload, bool alloc_buffers);
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
// Provide a hint on the batch size to optimize processing (uses heuristics if unset)
GGML_API void ggml_backend_sched_set_batch_size(ggml_backend_sched_t sched, int batch_size);
// Initialize backend buffers from a measure graph
GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success

View File

@ -112,8 +112,8 @@ extern "C" {
// compute the graph with the plan
enum ggml_status (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
// compute graph (always async if supported by the backend)
enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
// compute graph (always async if supported by the backend). batch_size may be -1 if unknown
enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size);
// (optional) event synchronization
// record an event on this stream

View File

@ -368,14 +368,14 @@ enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_ba
}
enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph);
enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph, -1);
ggml_backend_synchronize(backend);
return err;
}
enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size) {
GGML_ASSERT(backend);
return backend->iface.graph_compute(backend, cgraph);
return backend->iface.graph_compute(backend, cgraph, batch_size);
}
bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
@ -750,6 +750,8 @@ struct ggml_backend_sched {
bool op_offload;
int batch_size; // a hint on the batch size to optimize processing, -1 to use heuristics
int debug;
// allocate buffers on attached ggml_backend_buffer_type_t's and during reservation
@ -848,7 +850,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
// check if a backend with higher prio wants to offload the op
if (sched->op_offload && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) {
if (sched->op_offload && (sched->batch_size < 0 || sched->batch_size >= 32) && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) {
for (int b = 0; b < src_backend_id; b++) {
if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
SET_CAUSE(tensor, "1.off");
@ -1584,7 +1586,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
}
if (!sched->callback_eval) {
enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph, sched->batch_size);
if (ec != GGML_STATUS_SUCCESS) {
return ec;
}
@ -1606,7 +1608,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv);
enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv, sched->batch_size);
if (ec != GGML_STATUS_SUCCESS) {
return ec;
}
@ -1698,6 +1700,7 @@ ggml_backend_sched_t ggml_backend_sched_new_ext(
sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
sched->op_offload = op_offload;
sched->batch_size = -1;
sched->alloc_buffers = alloc_buffers;
ggml_backend_sched_reset(sched);
@ -1734,6 +1737,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
free(sched);
}
void ggml_backend_sched_set_batch_size(ggml_backend_sched_t sched, int batch_size) {
sched->batch_size = batch_size;
}
void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
GGML_ASSERT(sched);
// reset state for the next run

View File

@ -224,7 +224,7 @@ static void ggml_backend_blas_free(ggml_backend_t backend) {
delete backend;
}
static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size) {
ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context;
for (int i = 0; i < cgraph->n_nodes; i++) {
@ -254,6 +254,7 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend,
return GGML_STATUS_SUCCESS;
GGML_UNUSED(backend);
GGML_UNUSED(batch_size);
}
static struct ggml_backend_i blas_backend_i = {

View File

@ -164,7 +164,7 @@ static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backe
GGML_UNUSED(backend);
}
static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size) {
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
@ -184,6 +184,8 @@ static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, s
cplan.abort_callback_data = cpu_ctx->abort_callback_data;
return ggml_graph_compute(cgraph, &cplan);
GGML_UNUSED(batch_size);
}
static const struct ggml_backend_i ggml_backend_cpu_i = {

View File

@ -2775,31 +2775,19 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
#ifdef USE_CUDA_GRAPH
static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
bool use_cuda_graph) {
int batch_size, bool use_cuda_graph) {
// Loop over nodes in GGML graph to obtain info needed for CUDA graph
cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
// This fix was added in llama.cpp and Ollama in parallel, but with
// different tensor names.
// llama.cpp: https://github.com/ggml-org/llama.cpp/pull/14741
// ollama: https://github.com/ollama/ollama/pull/11525
const std::string gemma3n_per_layer_proj_src1_name_ollama = " (reshaped)";
const std::string gemma3n_node_name_ollama = "node_";
const std::string gemma3n_per_layer_proj_src0_name = "inp_per_layer_selected";
const std::string gemma3n_per_layer_proj_src1_name = "per_layer_proj";
const std::string ffn_moe_bias_suffix = "_exps.bias";
const std::string ffn_moe_gate_bias_prefix = "ffn_moe_gate_biased";
const std::string ffn_moe_up_bias_prefix = "ffn_moe_up_biased";
const std::string ffn_moe_down_bias_prefix = "ffn_moe_down_biased";
const std::string nemotron_h_block_out_prefix = "nemotron_h_block_out";
const std::string mamba2_y_add_d_prefix = "mamba2_y_add_d";
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
@ -2821,30 +2809,34 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
#endif
}
if (node->op == GGML_OP_ADD &&
node->src[1] && node->src[1]->ne[1] > 1 &&
// ollama
// workarounds to exclude Gemma3n's `project_per_layer_input` operation from the batch-size heuristic, specific to ollama's implementation of gemma3n
// number of layers is different for per_layer_proj between gemma3n:2b and gemma3n:4b, which is why we don't check that value here
!(node->ne[0] == 256 && node->ne[2] == 1 && node->ne[3] == 1 && node->src[0] ? std::string(node->src[0]->name).find(gemma3n_node_name_ollama) != std::string::npos : false && node->src[1] ? node->src[1]->name == gemma3n_per_layer_proj_src1_name_ollama : false) &&
node->src[1] ? std::string(node->src[1]->name).find(ffn_moe_bias_suffix) == std::string::npos : false &&
// upstream
(node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) &&
(node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true) &&
strncmp(node->name, ffn_moe_gate_bias_prefix.c_str(), ffn_moe_gate_bias_prefix.size()) != 0 &&
strncmp(node->name, ffn_moe_up_bias_prefix.c_str(), ffn_moe_up_bias_prefix.size()) != 0 &&
strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0 &&
strncmp(node->name, nemotron_h_block_out_prefix.c_str(), nemotron_h_block_out_prefix.size()) != 0 &&
strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0) {
// disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation
// by means of matching node names. See
// https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and
// https://github.com/huggingface/transformers/blob/bda75b4011239d065de84aa3e744b67ebfa7b245/src/transformers/models/gemma3n/modeling_gemma3n.py#L1773,
// Generally, changes in batch size or context size can cause changes to the grid size of some kernels.
use_cuda_graph = false;
// If we have an explicit batch size hint then we don't need to use the tensor name heuristics
if (batch_size >= 0) {
if (batch_size > 1) {
use_cuda_graph = false;
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%d]\n", __func__, batch_size);
#endif
}
} else {
if (node->op == GGML_OP_ADD &&
node->src[1] && node->src[1]->ne[1] > 1 &&
(node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) &&
(node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true) &&
strncmp(node->name, ffn_moe_gate_bias_prefix.c_str(), ffn_moe_gate_bias_prefix.size()) != 0 &&
strncmp(node->name, ffn_moe_up_bias_prefix.c_str(), ffn_moe_up_bias_prefix.size()) != 0 &&
strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0 &&
strncmp(node->name, nemotron_h_block_out_prefix.c_str(), nemotron_h_block_out_prefix.size()) != 0 &&
strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0) {
// disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation
// by means of matching node names. See
// https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and
// https://github.com/huggingface/transformers/blob/bda75b4011239d065de84aa3e744b67ebfa7b245/src/transformers/models/gemma3n/modeling_gemma3n.py#L1773,
// Generally, changes in batch size or context size can cause changes to the grid size of some kernels.
use_cuda_graph = false;
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
#endif
}
}
if (node->op == GGML_OP_CPY) {
@ -3247,7 +3239,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
}
}
static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
cuda_ctx->pool_set_alloc(true);
@ -3286,7 +3278,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
if (use_cuda_graph) {
cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);
use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph, use_cuda_graph);
use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph, batch_size, use_cuda_graph);
// Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
if (use_cuda_graph && cuda_graph_update_required) {

View File

@ -410,10 +410,12 @@ static bool ggml_backend_metal_cpy_tensor_async(ggml_backend_t backend_src, ggml
GGML_UNUSED(dst);
}
static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) {
ggml_metal_t ctx = (ggml_metal_t)backend->context;
return ggml_metal_graph_compute(ctx, cgraph);
GGML_UNUSED(batch_size);
}
static void ggml_backend_metal_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) {

View File

@ -12039,7 +12039,7 @@ static uint32_t ggml_vk_fuse_multi_add(ggml_backend_vk_context * ctx, const stru
return num_adds;
}
static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) {
VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)");
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
@ -12235,6 +12235,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
return GGML_STATUS_SUCCESS;
UNUSED(backend);
UNUSED(batch_size);
}
// Sort the graph for improved parallelism.

View File

@ -86,6 +86,9 @@ func (m multimodalStore) getTensor(backend ml.Backend, ctx ml.Context, in ml.Ten
computeCtx.Forward(tensors...)
entry.data = make([][]float32, len(entry.mm))
// Multimodal processing is computationally intensive, so treat it similarly to a large batch
computeCtx.SetBatchSize(512)
if !reserve {
computeCtx.Compute(tensors...)

View File

@ -598,6 +598,7 @@ func (s *Server) forwardBatch(pendingBatch batchState) (nextBatch batchState, er
// Actual batchInputs values will be injected into the batch.Inputs tensor before calling Compute
batch.Inputs = nextBatch.ctx.Input().Empty(ml.DTypeI32, len(batchInputs))
batch.Outputs = nextBatch.ctx.Input().FromInts(batchOutputs, len(batchOutputs))
nextBatch.ctx.SetBatchSize(len(batchInputs))
nextBatch.modelOutput, err = model.Forward(nextBatch.ctx, s.model, batch)
if err != nil {
err = fmt.Errorf("failed to build graph: %w", err)
@ -1108,6 +1109,7 @@ func (s *Server) reserveWorstCaseGraph(prompt bool) error {
return err
}
ctx.SetBatchSize(batchSize)
ctx.Forward(t).Reserve()
return nil