ggml: Remove allocation status reporting
For each memory allocation we report the size of the (attempted) allocation and whether it succeeded or failed. The latter status reporting proved to be of little use in practice, as systems such as Windows can automatically overflow from VRAM into RAM, resulting in successful allocations even when there isn't enough memory where we wanted it. As a result, the status is only used for debug logging, which isn't worthwhile enough for the amount of code it requires. It also isn't fully accurate, as multiple allocations may result in partial failures.
This commit is contained in:
parent 0469861d9d
commit 734b57da0e
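
In short, the change collapses a size-plus-status pair into a plain size on both sides of the vendored boundary: the ggml helpers ggml_gallocr_get_attempted_buffer_size and ggml_backend_sched_get_attempted_buffer_size now return size_t instead of a status struct, and the Go ml package drops its Memory type entirely. A minimal before/after sketch of the Go types (the "before" shape is quoted from the code this commit deletes; comments are paraphrased):

    // Before: every tracked allocation carried a status alongside its size.
    //
    //     type Memory struct {
    //         Size   uint64
    //         Status AllocationStatus // Unallocated, Failed, or Allocated
    //     }
    //
    // DeviceMemory then held Weights []Memory, Cache []Memory, and Graph Memory.

    // After: only the attempted sizes remain.
    type DeviceMemory struct {
        // Weights is the per-layer memory needed for the model weights.
        Weights []uint64

        // Cache is the per-layer memory needed for the KV cache.
        Cache []uint64

        // Graph is the size of the compute graph. It is not per-layer.
        Graph uint64
    }

The full diffs follow. The first diff modifies the carried patch file (a diff of a diff, so lines keep two prefixes: the outer change, then the patch's own +/-/space prefix).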
@@ -4,48 +4,38 @@ Date: Fri, 18 Apr 2025 15:58:19 -0700
 Subject: [PATCH] graph memory reporting on failure
 
 ---
- ggml/include/ggml-alloc.h   |  6 ++++++
- ggml/include/ggml-backend.h |  6 ++++++
- ggml/src/ggml-alloc.c       | 38 +++++++++++++++++++++++++++++++++----
- ggml/src/ggml-backend.cpp   | 10 ++++++++++
- 4 files changed, 56 insertions(+), 4 deletions(-)
+ ggml/include/ggml-alloc.h   |  1 +
+ ggml/include/ggml-backend.h |  1 +
+ ggml/src/ggml-alloc.c       | 36 ++++++++++++++++++++++++++++++++----
+ ggml/src/ggml-backend.cpp   |  7 +++++++
+ 4 files changed, 41 insertions(+), 4 deletions(-)
 
 diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h
-index 2cb150fd..781b1e10 100644
+index 2cb150fd2..7ab3f0192 100644
 --- a/ggml/include/ggml-alloc.h
 +++ b/ggml/include/ggml-alloc.h
-@@ -66,6 +66,12 @@ GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph
+@@ -65,6 +65,7 @@ GGML_API bool ggml_gallocr_reserve_n(
+ GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
  
  GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
++GGML_API size_t ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id);
  
-+struct ggml_allocr_buffer_status {
-+    size_t size;
-+    bool allocated;
-+};
-+GGML_API struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id);
-+
  // Utils
  // Create a buffer and allocate all the tensors in a ggml_context
- GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
 diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
-index a2977ea2..8a91b381 100644
+index a2977ea2e..e8cf30841 100644
 --- a/ggml/include/ggml-backend.h
 +++ b/ggml/include/ggml-backend.h
-@@ -304,6 +304,12 @@ extern "C" {
+@@ -303,6 +303,7 @@ extern "C" {
+     GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
  
      GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
++    GGML_API size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
  
-+    struct ggml_backend_buffer_status {
-+        size_t size;
-+        bool allocated;
-+    };
-+    GGML_API struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
-+
      GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
      GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
 
 diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
-index 8b6e6028..41c8c4a2 100644
+index 8b6e60283..b58bd671d 100644
 --- a/ggml/src/ggml-alloc.c
 +++ b/ggml/src/ggml-alloc.c
 @@ -350,6 +350,7 @@ struct node_alloc {
 
@@ -108,11 +98,11 @@ index 8b6e6028..41c8c4a2 100644
  }
 
  bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
-@@ -920,6 +932,24 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
+@@ -920,6 +932,22 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
      return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
  }
 
-+struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
++size_t ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
 +    GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);
 +
 +    for (int i = 0; i < buffer_id; i++) {
@@ -121,34 +111,29 @@ index 8b6e6028..41c8c4a2 100644
 +            // (See above.) However, we need a different check because multiple buffers might be NULL in our
 +            // case and we still want to know the attempted size.
 +
-+            struct ggml_allocr_buffer_status status = {0, true};
-+            return status;
++            return 0;
 +        }
 +    }
 +
-+    struct ggml_allocr_buffer_status status = {galloc->buffer_sizes[buffer_id], galloc->buffers[buffer_id] != NULL};
-+    return status;
++    return galloc->buffer_sizes[buffer_id];
 +}
 +
  // utils
 
  static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
 diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
-index 97f47abd..eded0291 100644
+index 97f47abd2..d02a40e60 100644
 --- a/ggml/src/ggml-backend.cpp
 +++ b/ggml/src/ggml-backend.cpp
-@@ -1631,6 +1631,16 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
+@@ -1631,6 +1631,13 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
      return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
  }
 
-+struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
++size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
 +    int backend_index = ggml_backend_sched_backend_id(sched, backend);
 +    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
 +
-+    struct ggml_allocr_buffer_status allocr_status = ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index);
-+    struct ggml_backend_buffer_status status = {allocr_status.size, allocr_status.allocated};
-+
-+    return status;
++    return ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index);
 +}
 +
  void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
@@ -853,19 +853,19 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d
 
 	if memory == nil {
 		memory = &ml.BackendMemory{CPU: ml.DeviceMemory{
-			Weights: make([]ml.Memory, s.totalLayers),
-			Cache:   make([]ml.Memory, s.totalLayers),
+			Weights: make([]uint64, s.totalLayers),
+			Cache:   make([]uint64, s.totalLayers),
 		}}
 	}
 
 	layers := make([]uint64, len(memory.CPU.Weights))
 	for i := range layers {
 		for j := range memory.GPUs {
-			layers[i] += memory.GPUs[j].Weights[i].Size
-			layers[i] += memory.GPUs[j].Cache[i].Size
+			layers[i] += memory.GPUs[j].Weights[i]
+			layers[i] += memory.GPUs[j].Cache[i]
 		}
-		layers[i] += memory.CPU.Weights[i].Size
-		layers[i] += memory.CPU.Cache[i].Size
+		layers[i] += memory.CPU.Weights[i]
+		layers[i] += memory.CPU.Cache[i]
 		logutil.Trace("layer to assign", "layer", i, "size", format.HumanBytes2(layers[i]))
 	}
 
@@ -880,11 +880,11 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d
 			found := false
 			for j := range memory.GPUs {
 				if gl[i].ID == memory.GPUs[j].ID {
-					if memory.GPUs[j].Graph.Size != 0 {
+					if memory.GPUs[j].Graph != 0 {
 						lastUsedGPU = i
 					}
 
-					reserved := uint64(float32(gl[i].FreeMemory)*backoff) + gl[i].MinimumMemory + envconfig.GpuOverhead() + memory.GPUs[j].Graph.Size
+					reserved := uint64(float32(gl[i].FreeMemory)*backoff) + gl[i].MinimumMemory + envconfig.GpuOverhead() + memory.GPUs[j].Graph
 					if gl[i].FreeMemory > reserved {
 						gl[i].FreeMemory -= reserved
 					} else {
@@ -895,7 +895,7 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d
 						"available layer vram", format.HumanBytes2(gl[i].FreeMemory),
 						"backoff", fmt.Sprintf("%.2f", backoff), "minimum", format.HumanBytes2(gl[i].MinimumMemory),
 						"overhead", format.HumanBytes2(envconfig.GpuOverhead()),
-						"graph", format.HumanBytes2(memory.GPUs[j].Graph.Size))
+						"graph", format.HumanBytes2(memory.GPUs[j].Graph))
 
 					found = true
 					break
@@ -914,12 +914,12 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d
 	}
 
 	// These sizes will only increase as we go through additional iterations and get additional information.
-	cpuSize := memory.InputWeights.Size + memory.CPU.Graph.Size
+	cpuSize := memory.InputWeights + memory.CPU.Graph
 	var vramSize uint64
 	for _, gl := range gpuLayers {
 		for _, gpu := range memory.GPUs {
 			if gl.ID == gpu.ID {
-				vramSize += gpu.Graph.Size
+				vramSize += gpu.Graph
 				break
 			}
 		}
@@ -1723,21 +1723,21 @@ func (s *ollamaServer) VRAMSize() uint64 {
 	var mem uint64
 
 	for _, g := range s.mem.GPUs {
-		mem += g.Allocated()
+		mem += g.Size()
 	}
 
 	// Some elements are always on CPU. However, if we have allocated all layers
 	// on the GPU then include the CPU components as well, to represent complete offloading.
 	noCPULayers := true
 	for i := range s.mem.CPU.Weights {
-		if s.mem.CPU.Weights[i].Size != 0 || s.mem.CPU.Cache[i].Size != 0 {
+		if s.mem.CPU.Weights[i] != 0 || s.mem.CPU.Cache[i] != 0 {
 			noCPULayers = false
 			break
 		}
 	}
 	if noCPULayers {
-		mem += s.mem.InputWeights.Size
-		mem += s.mem.CPU.Graph.Size
+		mem += s.mem.InputWeights
+		mem += s.mem.CPU.Graph
 	}
 
 	return mem
@@ -1748,10 +1748,10 @@ func (s *ollamaServer) TotalSize() uint64 {
 		return 0
 	}
 
-	mem := s.mem.InputWeights.Size
-	mem += s.mem.CPU.Allocated()
+	mem := s.mem.InputWeights
+	mem += s.mem.CPU.Size()
 	for _, g := range s.mem.GPUs {
-		mem += g.Allocated()
+		mem += g.Size()
 	}
 
 	return mem
@@ -1764,7 +1764,7 @@ func (s *ollamaServer) VRAMByGPU(gpuID string) uint64 {
 
 	for _, g := range s.mem.GPUs {
 		if g.ID == gpuID {
-			return g.Allocated()
+			return g.Size()
 		}
 	}
 
@@ -155,18 +155,18 @@ func TestLLMServerFitGPU(t *testing.T) {
 	}
 
 	s.mem = &ml.BackendMemory{CPU: ml.DeviceMemory{
-		Weights: make([]ml.Memory, s.totalLayers),
-		Cache:   make([]ml.Memory, s.totalLayers),
+		Weights: make([]uint64, s.totalLayers),
+		Cache:   make([]uint64, s.totalLayers),
 	}, GPUs: make([]ml.DeviceMemory, len(gpus))}
 
 	for i := range tt.layers {
-		s.mem.CPU.Weights[i].Size = uint64(tt.layers[i])
+		s.mem.CPU.Weights[i] = uint64(tt.layers[i])
 	}
 
 	for i := range s.mem.GPUs {
 		s.mem.GPUs[i].ID = fmt.Sprintf("gpu%d", i)
-		s.mem.GPUs[i].Weights = make([]ml.Memory, s.totalLayers)
-		s.mem.GPUs[i].Cache = make([]ml.Memory, s.totalLayers)
+		s.mem.GPUs[i].Weights = make([]uint64, s.totalLayers)
+		s.mem.GPUs[i].Cache = make([]uint64, s.totalLayers)
 	}
 
 	gpuLayers, err := s.createLayout(systemInfo, gpus, s.mem, tt.requireFull, 0)
@@ -158,40 +158,6 @@ func (e ErrNoMem) Error() string {
 	return fmt.Sprintf("insufficient memory - required allocations: %+v", e.BackendMemory)
 }
 
-type AllocationStatus int
-
-const (
-	// Unallocated memory - have not yet attempted to allocate
-	Unallocated AllocationStatus = iota
-
-	// Failed memory - tried to allocate the memory and did not succeed
-	Failed
-
-	// Allocated memory = tried and succeeded to allocate memory
-	Allocated
-)
-
-// Memory is the size of an allocation and whether it was successful.
-type Memory struct {
-	Size   uint64
-	Status AllocationStatus
-}
-
-func (m Memory) String() string {
-	s := fmt.Sprint(m.Size)
-
-	switch m.Status {
-	case Unallocated:
-		s += "U"
-	case Failed:
-		s += "F"
-	case Allocated:
-		s += "A"
-	}
-
-	return s
-}
-
 // DeviceMemory provides a breakdown of the memory needed
 // per device, such as a CPU or GPU.
 type DeviceMemory struct {
@@ -204,39 +170,32 @@ type DeviceMemory struct {
 	ID string
 
 	// Weights is the per-layer memory needed for the model weights.
-	Weights []Memory
+	Weights []uint64
 
 	// Cache is the per-layer memory needed for the KV cache.
-	Cache []Memory
+	Cache []uint64
 
 	// Graph is the size of the compute graph. It is not per-layer.
-	Graph Memory
+	Graph uint64
 }
 
-// Allocated returns the total size of the memory that has been successfully
-// allocated on this device
-func (m DeviceMemory) Allocated() uint64 {
-	var mem uint64
-
-	for _, w := range m.Weights {
-		if w.Status == Allocated {
-			mem += w.Size
-		}
-	}
-	for _, c := range m.Cache {
-		if c.Status == Allocated {
-			mem += c.Size
-		}
-	}
-	if m.Graph.Status == Allocated {
-		mem += m.Graph.Size
+func sumMemory(mem []uint64) uint64 {
+	var sum uint64
+	for _, m := range mem {
+		sum += m
 	}
 
-	return mem
+	return sum
 }
 
-func memoryPresent(mem []Memory) bool {
-	return slices.ContainsFunc(mem, func(m Memory) bool { return m.Size != 0 })
+// Size returns the total size of the memory required by this device
+func (m DeviceMemory) Size() uint64 {
+	return sumMemory(m.Weights) + sumMemory(m.Cache) + m.Graph
+}
+
+func memoryPresent(mem []uint64) bool {
+	return slices.ContainsFunc(mem, func(m uint64) bool { return m != 0 })
 }
 
 func (m DeviceMemory) LogValue() slog.Value {
@@ -249,7 +208,7 @@ func (m DeviceMemory) LogValue() slog.Value {
 		attrs = append(attrs, slog.Any("Cache", m.Cache))
 	}
 
-	if m.Graph.Size != 0 {
+	if m.Graph != 0 {
 		attrs = append(attrs, slog.Any("Graph", m.Graph))
 	}
 
@@ -267,7 +226,7 @@ func (m DeviceMemory) LogValue() slog.Value {
 // accommodate that to make forward progress.
 type BackendMemory struct {
 	// InputWeights are always located on the CPU and cannot be moved
-	InputWeights Memory
+	InputWeights uint64
 
 	// CPU model components are located in system memory. This does not
 	// include unified memory allocated through the GPU.
@@ -279,7 +238,7 @@ type BackendMemory struct {
 
 func (m BackendMemory) LogValue() slog.Value {
 	var attrs []slog.Attr
-	if m.InputWeights.Size != 0 {
+	if m.InputWeights != 0 {
 		attrs = append(attrs, slog.Any("InputWeights", m.InputWeights))
 	}
 
@@ -291,17 +250,7 @@ func (m BackendMemory) LogValue() slog.Value {
 	return slog.GroupValue(attrs...)
 }
 
-func sumMemory(mem []Memory) uint64 {
-	var sum uint64
-
-	for _, m := range mem {
-		sum += m.Size
-	}
-
-	return sum
-}
-
-// Log prints a high level summary of the memory (allocated or not)
+// Log prints a high level summary of the memory
 func (m BackendMemory) Log(level slog.Level) {
 	var total uint64
@@ -311,7 +260,7 @@ func (m BackendMemory) Log(level slog.Level) {
 			total += sum
 		}
 	}
-	if sum := m.InputWeights.Size + sumMemory(m.CPU.Weights); sum > 0 {
+	if sum := m.InputWeights + sumMemory(m.CPU.Weights); sum > 0 {
 		slog.Log(context.TODO(), level, "model weights", "device", m.CPU.Name, "size", format.HumanBytes2(sum))
 		total += sum
 	}
@@ -328,12 +277,12 @@ func (m BackendMemory) Log(level slog.Level) {
 	}
 
 	for _, gpu := range m.GPUs {
-		if sum := gpu.Graph.Size; sum > 0 {
+		if sum := gpu.Graph; sum > 0 {
 			slog.Log(context.TODO(), level, "compute graph", "device", gpu.Name, "size", format.HumanBytes2(sum))
 			total += sum
 		}
 	}
-	if sum := m.CPU.Graph.Size; sum > 0 {
+	if sum := m.CPU.Graph; sum > 0 {
 		slog.Log(context.TODO(), level, "compute graph", "device", m.CPU.Name, "size", format.HumanBytes2(sum))
 		total += sum
 	}
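
To illustrate the new accounting model, here is a hypothetical, self-contained sketch that mirrors the types above (it is not code from this commit): totals are now plain sums over uint64 sizes, and a reported size says nothing about whether the underlying allocation succeeded.

    package main

    import "fmt"

    // DeviceMemory mirrors the post-commit shape of ml.DeviceMemory.
    type DeviceMemory struct {
        Weights []uint64 // per-layer weight sizes
        Cache   []uint64 // per-layer KV-cache sizes
        Graph   uint64   // compute graph size, not per-layer
    }

    // sumMemory adds up a slice of sizes, as in the diff above.
    func sumMemory(mem []uint64) uint64 {
        var sum uint64
        for _, m := range mem {
            sum += m
        }
        return sum
    }

    // Size returns the total memory required by the device,
    // matching the DeviceMemory.Size method introduced above.
    func (m DeviceMemory) Size() uint64 {
        return sumMemory(m.Weights) + sumMemory(m.Cache) + m.Graph
    }

    func main() {
        m := DeviceMemory{
            Weights: []uint64{512 << 20, 512 << 20}, // two 512 MiB layers
            Cache:   []uint64{64 << 20, 64 << 20},   // two 64 MiB KV caches
            Graph:   128 << 20,                      // 128 MiB compute graph
        }
        // 512+512+64+64+128 = 1280 MiB, reported whether or not the
        // underlying allocations actually succeeded.
        fmt.Printf("total: %d MiB\n", m.Size()>>20)
    }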
@@ -169,8 +169,8 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 	var props C.struct_ggml_backend_dev_props
 	C.ggml_backend_dev_get_props(cpuDeviceBufferType.d, &props)
 	requiredMemory.CPU.ID = C.GoString(props.id)
-	requiredMemory.CPU.Weights = make([]ml.Memory, blocks+1)
-	requiredMemory.CPU.Cache = make([]ml.Memory, blocks+1)
+	requiredMemory.CPU.Weights = make([]uint64, blocks+1)
+	requiredMemory.CPU.Cache = make([]uint64, blocks+1)
 
 	// create list of buffer types for each gpu
 	var gpuDeviceBufferTypes []deviceBufferType
@@ -188,8 +188,8 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 		var props C.struct_ggml_backend_dev_props
 		C.ggml_backend_dev_get_props(d, &props)
 		requiredMemory.GPUs[i].ID = C.GoString(props.id)
-		requiredMemory.GPUs[i].Weights = make([]ml.Memory, blocks+1)
-		requiredMemory.GPUs[i].Cache = make([]ml.Memory, blocks+1)
+		requiredMemory.GPUs[i].Weights = make([]uint64, blocks+1)
+		requiredMemory.GPUs[i].Cache = make([]uint64, blocks+1)
 	}
 
 	// inputs always use cpu
@@ -275,13 +275,9 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 
 			size := pad(C.ggml_backend_buft_get_alloc_size(bt, tt), C.ggml_backend_buft_get_alignment(bt))
 			if layer == -1 {
-				// Assume that InputWeights can be allocated - they're always in system memory and can't be moved in any case
-				if params.AllocMemory {
-					requiredMemory.InputWeights.Status = ml.Allocated
-				}
-				requiredMemory.InputWeights.Size += uint64(size)
+				requiredMemory.InputWeights += uint64(size)
 			} else {
-				btDeviceMemory[bt].Weights[layer].Size += uint64(size)
+				btDeviceMemory[bt].Weights[layer] += uint64(size)
 			}
 
 			//nolint:staticcheck // TODO: check if buffer type supports this tensor
@@ -349,18 +345,6 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 		}
 
 		b := C.ggml_backend_alloc_ctx_tensors_from_buft(c, bt)
-		if params.AllocMemory {
-			for i := range btDeviceMemory[bt].Weights {
-				if btDeviceMemory[bt].Weights[i].Size != 0 {
-					if b != nil {
-						btDeviceMemory[bt].Weights[i].Status = ml.Allocated
-					} else {
-						btDeviceMemory[bt].Weights[i].Status = ml.Failed
-					}
-				}
-			}
-		}
-
 		if b == nil {
 			for _, b := range bbs {
 				C.ggml_backend_buffer_free(b)
@@ -795,24 +779,15 @@ func (c *Context) Reserve() {
 
 	// Reserve may get called multiple times for different graphs - we just want the last run, which will contain the max allocations
 	for _, bt := range c.b.schedBufts {
-		c.b.btDeviceMemory[bt].Graph = ml.Memory{}
+		c.b.btDeviceMemory[bt].Graph = 0
 	}
 
 	for i := range c.b.schedBackends {
-		bufferStatus := C.ggml_backend_sched_get_attempted_buffer_size(c.b.sched, c.b.schedBackends[i])
-
-		graph := &c.b.btDeviceMemory[c.b.schedBufts[i]].Graph
-		graph.Size += uint64(bufferStatus.size)
-		if c.b.allocMemory {
-			if bufferStatus.allocated && graph.Status != ml.Failed {
-				graph.Status = ml.Allocated
-			} else {
-				graph.Status = ml.Failed
-			}
-		}
+		bufferSize := C.ggml_backend_sched_get_attempted_buffer_size(c.b.sched, c.b.schedBackends[i])
+		c.b.btDeviceMemory[c.b.schedBufts[i]].Graph += uint64(bufferSize)
 
 		logutil.Trace("compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])),
-			"buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])), "size", format.HumanBytes2(uint64(bufferStatus.size)))
+			"buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])), "size", format.HumanBytes2(uint64(bufferSize)))
 	}
 
 	if !reserved {
@@ -862,16 +837,7 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
 
 	b := C.ggml_backend_buft_alloc_buffer(c.buft, size)
 	if c.layer >= 0 {
-		cache := &c.b.btDeviceMemory[c.buft].Cache[c.layer]
-
-		cache.Size += uint64(size)
-		if c.b.allocMemory {
-			if b != nil {
-				cache.Status = ml.Allocated
-			} else {
-				cache.Status = ml.Failed
-			}
-		}
+		c.b.btDeviceMemory[c.buft].Cache[c.layer] += uint64(size)
 	}
 
 	if b == nil {
ml/backend/ggml/ggml/include/ggml-alloc.h vendored (7 changes)

@@ -65,12 +65,7 @@ GGML_API bool ggml_gallocr_reserve_n(
 GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
 
 GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
+GGML_API size_t ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id);
-struct ggml_allocr_buffer_status {
-    size_t size;
-    bool allocated;
-};
-GGML_API struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id);
-
 // Utils
 // Create a buffer and allocate all the tensors in a ggml_context

ml/backend/ggml/ggml/include/ggml-backend.h vendored (7 changes)

@@ -306,12 +306,7 @@ extern "C" {
     GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
 
     GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
+    GGML_API size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
-    struct ggml_backend_buffer_status {
-        size_t size;
-        bool allocated;
-    };
-    GGML_API struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
-
     GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
     GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);

ml/backend/ggml/ggml/src/ggml-alloc.c vendored (8 changes)

@@ -932,7 +932,7 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
     return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
 }
 
-struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
+size_t ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
     GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);
 
     for (int i = 0; i < buffer_id; i++) {
@@ -941,13 +941,11 @@ struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gal
             // (See above.) However, we need a different check because multiple buffers might be NULL in our
             // case and we still want to know the attempted size.
 
-            struct ggml_allocr_buffer_status status = {0, true};
-            return status;
+            return 0;
         }
     }
 
-    struct ggml_allocr_buffer_status status = {galloc->buffer_sizes[buffer_id], galloc->buffers[buffer_id] != NULL};
-    return status;
+    return galloc->buffer_sizes[buffer_id];
 }
 
 // utils

ml/backend/ggml/ggml/src/ggml-backend.cpp vendored (7 changes)

@@ -1656,14 +1656,11 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
     return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
 }
 
-struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
+size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
     int backend_index = ggml_backend_sched_backend_id(sched, backend);
    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
 
-    struct ggml_allocr_buffer_status allocr_status = ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index);
-    struct ggml_backend_buffer_status status = {allocr_status.size, allocr_status.allocated};
-
-    return status;
+    return ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index);
 }
 
 void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {