llama: fix defrag patch to defragment when no slots are available (#10695)
parent c6bcdc4223
commit f46df4e5d2
llama/llama.cpp/src/llama-context.cpp (vendored) | 18 ++++++++++++------
@@ -950,9 +950,12 @@ int llama_context::decode(llama_batch & inp_batch) {
 
         // find KV slot
         if (!kv_self->find_slot(ubatch)) {
-            LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-
-            return 1;
+            kv_self->defrag_sched(-1.0f);
+            kv_self->update(*this);
+            if (!kv_self->find_slot(ubatch)) {
+                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+                return 1;
+            }
         }
 
         ggml_backend_sched_reset(sched.get());
@@ -1967,9 +1970,12 @@ void llama_context::opt_epoch_iter(
 
             // TODO: not sure if this is needed
             if (!kv_self->find_slot(ubatch)) {
-                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-
-                GGML_ABORT("TODO: handle this error");
+                kv_self->defrag_sched(-1.0f);
+                kv_self->update(*this);
+                if (!kv_self->find_slot(ubatch)) {
+                    LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+                    GGML_ABORT("TODO: handle this error");
+                }
             }
 
             auto * gf = graph_init();
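In effect, both hunks replace an immediate failure with a one-shot recovery path: when find_slot cannot place the ubatch, the cache schedules a full defragmentation (defrag_sched(-1.0f) presumably queues it unconditionally, since any fragmentation measure exceeds a negative threshold), update(*this) applies it, and the slot search is retried once before the old error handling kicks in. The toy model below only illustrates that pattern under those assumptions; ToyKVCache and all of its members are invented here and are not the llama.cpp API.

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

// Toy stand-in for the KV cache, invented for illustration only -- it is not
// the llama.cpp llama_kv_cache interface. Cells are a simple occupancy bitmap.
struct ToyKVCache {
    std::vector<bool> used;    // true = cell is occupied by a cached token
    bool defrag_pending = false;

    explicit ToyKVCache(std::size_t n_cells) : used(n_cells, false) {}

    std::size_t longest_free_run() const {
        std::size_t best = 0, run = 0;
        for (bool u : used) {
            run  = u ? 0 : run + 1;
            best = std::max(best, run);
        }
        return best;
    }

    // Analogue of find_slot(): a ubatch needs a contiguous run of free cells.
    bool find_slot(std::size_t n_tokens) const {
        return longest_free_run() >= n_tokens;
    }

    // Analogue of defrag_sched(thold): queue a defrag when fragmentation exceeds
    // the threshold, so a negative threshold (-1.0f) always queues one.
    void defrag_sched(float thold) {
        const std::size_t n_free = std::count(used.begin(), used.end(), false);
        const float frag = n_free == 0 ? 0.0f
                                       : 1.0f - float(longest_free_run()) / float(n_free);
        if (frag > thold) {
            defrag_pending = true;
        }
    }

    // Analogue of update(): apply the queued defrag by compacting occupied cells.
    void update() {
        if (!defrag_pending) return;
        const std::size_t n_used = std::count(used.begin(), used.end(), true);
        std::fill(used.begin(), used.end(), false);
        std::fill(used.begin(), used.begin() + n_used, true);
        defrag_pending = false;
    }
};

// The decode-path pattern from the patch: try once, force a defrag, retry once,
// and only then report failure.
int reserve_slot(ToyKVCache & kv, std::size_t n_tokens) {
    if (!kv.find_slot(n_tokens)) {
        kv.defrag_sched(-1.0f); // any fragmentation value is > -1, so this always queues
        kv.update();
        if (!kv.find_slot(n_tokens)) {
            std::fprintf(stderr, "failed to find KV cache slot for ubatch of size %zu\n", n_tokens);
            return 1;
        }
    }
    return 0;
}
```

Note the deliberate single retry: if the cache is genuinely full rather than fragmented, the second find_slot still fails and the caller sees the same error as before.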
@@ -15,11 +15,48 @@ but this can leave a cache that still does not have adequate space
 even after defragmentation is triggered. Instead, we should do
 multiple batches of processing until everything is complete.
 ---
 src/llama-context.cpp  |  18 ++++---
 src/llama-context.h    |   1 +
 src/llama-kv-cache.cpp | 107 ++++++++++++++---------------------------
 src/llama-kv-cache.h   |  12 ++++-
-3 files changed, 47 insertions(+), 73 deletions(-)
+4 files changed, 59 insertions(+), 79 deletions(-)

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index c22687e4..c5948e8f 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -950,9 +950,12 @@ int llama_context::decode(llama_batch & inp_batch) {
 
         // find KV slot
         if (!kv_self->find_slot(ubatch)) {
-            LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-
-            return 1;
+            kv_self->defrag_sched(-1.0f);
+            kv_self->update(*this);
+            if (!kv_self->find_slot(ubatch)) {
+                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+                return 1;
+            }
         }
 
         ggml_backend_sched_reset(sched.get());
@@ -1967,9 +1970,12 @@ void llama_context::opt_epoch_iter(
 
             // TODO: not sure if this is needed
             if (!kv_self->find_slot(ubatch)) {
-                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-
-                GGML_ABORT("TODO: handle this error");
+                kv_self->defrag_sched(-1.0f);
+                kv_self->update(*this);
+                if (!kv_self->find_slot(ubatch)) {
+                    LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+                    GGML_ABORT("TODO: handle this error");
+                }
             }
 
             auto * gf = graph_init();
diff --git a/src/llama-context.h b/src/llama-context.h
index c4ab242a..9970dfc6 100644
--- a/src/llama-context.h
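The patch's commit message argues that a single defragmentation pass can still leave too little contiguous space, and that the work should instead continue in multiple batches of processing until everything is complete. The llama-kv-cache.cpp hunks that implement this are not shown above, so the following is only a hypothetical sketch of that batching idea: CellMove, plan_moves, apply_moves, and defrag_until_done are invented names, and a plain occupancy bitmap stands in for the real cache cells (in llama.cpp the per-pass limit presumably comes from the node budget of the defrag compute graph).

```cpp
#include <cstddef>
#include <vector>

// Hypothetical sketch of "defragment in multiple bounded batches"; this is not
// the actual llama-kv-cache.cpp code. A planner returns at most max_moves cell
// moves per pass, and passes are applied until the plan comes back empty.
struct CellMove {
    std::size_t src;
    std::size_t dst;
};

// Plan up to max_moves moves that shift occupied cells toward the front.
static std::vector<CellMove> plan_moves(const std::vector<bool> & used, std::size_t max_moves) {
    std::vector<CellMove> moves;
    std::size_t dst = 0;
    for (std::size_t src = 0; src < used.size() && moves.size() < max_moves; ++src) {
        if (!used[src]) continue;
        if (src != dst) {
            moves.push_back({src, dst});
        }
        ++dst;
    }
    return moves;
}

// Apply one bounded batch of moves to the occupancy bitmap.
static void apply_moves(std::vector<bool> & used, const std::vector<CellMove> & moves) {
    for (const CellMove & m : moves) {
        used[m.dst] = true;
        used[m.src] = false;
    }
}

// Instead of stopping after a single bounded pass, keep going until the planner
// reports that nothing is left to move.
void defrag_until_done(std::vector<bool> & used, std::size_t max_moves_per_pass) {
    for (;;) {
        const std::vector<CellMove> moves = plan_moves(used, max_moves_per_pass);
        if (moves.empty()) {
            break;
        }
        apply_moves(used, moves);
    }
}
```

The key property is that each bounded pass makes progress, so looping until the planner returns no moves terminates with a fully compacted cache rather than a partially defragmented one.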