llama: fix defrag patch to defragment when no slots are available (#10695)

Jeffrey Morgan 2025-05-13 14:02:08 -07:00 committed by GitHub
parent c6bcdc4223
commit f46df4e5d2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 50 additions and 7 deletions


@@ -950,9 +950,12 @@ int llama_context::decode(llama_batch & inp_batch) {
// find KV slot
if (!kv_self->find_slot(ubatch)) {
- LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-
- return 1;
+ kv_self->defrag_sched(-1.0f);
+ kv_self->update(*this);
+ if (!kv_self->find_slot(ubatch)) {
+ LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+ return 1;
+ }
}
ggml_backend_sched_reset(sched.get());
@@ -1967,9 +1970,12 @@ void llama_context::opt_epoch_iter(
// TODO: not sure if this is needed
if (!kv_self->find_slot(ubatch)) {
- LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-
- GGML_ABORT("TODO: handle this error");
+ kv_self->defrag_sched(-1.0f);
+ kv_self->update(*this);
+ if (!kv_self->find_slot(ubatch)) {
+ LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+ GGML_ABORT("TODO: handle this error");
+ }
}
auto * gf = graph_init();
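
Read on its own, the change to llama_context::decode amounts to a defragment-and-retry fallback: when no contiguous KV-cache slot can be found for a micro-batch, the cache is compacted and the allocation is attempted once more before the call fails. Below is a minimal sketch of that control flow, using only the identifiers visible in the diff above; the reading that a negative threshold passed to defrag_sched forces the defragmentation pass is an assumption, not something stated in the patch.

// Sketch of the fallback added to llama_context::decode (simplified).
// If no free KV-cache slot exists for this ubatch, schedule and apply a
// defragmentation pass, then retry the allocation once before failing.
if (!kv_self->find_slot(ubatch)) {
    // Assumed: a negative threshold forces the defrag to run on the next
    // cache update regardless of the measured fragmentation.
    kv_self->defrag_sched(-1.0f);
    kv_self->update(*this);

    if (!kv_self->find_slot(ubatch)) {
        // Still no room after compacting: report the failure to the caller.
        LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n",
                __func__, ubatch.n_tokens);
        return 1;
    }
}

The opt_epoch_iter path receives the same two-step fallback, but it aborts via GGML_ABORT("TODO: handle this error") instead of returning an error code, matching its previous behavior.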


@@ -15,11 +15,48 @@ but this can leave a cache that still does not have adequate space
even after defragmentation is triggered. Instead, we should do
multiple batches of processing until everything is complete.
---
+ src/llama-context.cpp | 18 ++++---
 src/llama-context.h | 1 +
- src/llama-kv-cache.cpp | 107 ++++++++++++++---------------------------
+ src/llama-kv-cache.cpp | 107 ++++++++++++++---------------------
 src/llama-kv-cache.h | 12 ++++-
- 3 files changed, 47 insertions(+), 73 deletions(-)
+ 4 files changed, 59 insertions(+), 79 deletions(-)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index c22687e4..c5948e8f 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -950,9 +950,12 @@ int llama_context::decode(llama_batch & inp_batch) {
// find KV slot
if (!kv_self->find_slot(ubatch)) {
- LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-
- return 1;
+ kv_self->defrag_sched(-1.0f);
+ kv_self->update(*this);
+ if (!kv_self->find_slot(ubatch)) {
+ LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+ return 1;
+ }
}
ggml_backend_sched_reset(sched.get());
@@ -1967,9 +1970,12 @@ void llama_context::opt_epoch_iter(
// TODO: not sure if this is needed
if (!kv_self->find_slot(ubatch)) {
- LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-
- GGML_ABORT("TODO: handle this error");
+ kv_self->defrag_sched(-1.0f);
+ kv_self->update(*this);
+ if (!kv_self->find_slot(ubatch)) {
+ LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+ GGML_ABORT("TODO: handle this error");
+ }
}
auto * gf = graph_init();
diff --git a/src/llama-context.h b/src/llama-context.h
index c4ab242a..9970dfc6 100644
--- a/src/llama-context.h