llama: fix defrag patch to defragment when no slots are available (#10695)

parent c6bcdc4223
commit f46df4e5d2

llama/llama.cpp/src/llama-context.cpp (vendored), 18 lines changed
@@ -950,9 +950,12 @@ int llama_context::decode(llama_batch & inp_batch) {
 
         // find KV slot
         if (!kv_self->find_slot(ubatch)) {
-            LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-
-            return 1;
+            kv_self->defrag_sched(-1.0f);
+            kv_self->update(*this);
+            if (!kv_self->find_slot(ubatch)) {
+                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+                return 1;
+            }
         }
 
         ggml_backend_sched_reset(sched.get());
@@ -1967,9 +1970,12 @@ void llama_context::opt_epoch_iter(
 
         // TODO: not sure if this is needed
         if (!kv_self->find_slot(ubatch)) {
-            LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-
-            GGML_ABORT("TODO: handle this error");
+            kv_self->defrag_sched(-1.0f);
+            kv_self->update(*this);
+            if (!kv_self->find_slot(ubatch)) {
+                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+                GGML_ABORT("TODO: handle this error");
+            }
         }
 
         auto * gf = graph_init();
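Both hunks apply the same fix: instead of failing as soon as find_slot cannot place a ubatch, the context first forces a defragmentation pass (defrag_sched with a negative threshold schedules it unconditionally, since measured fragmentation is never negative), applies it via kv_self->update(*this), and only then retries the slot search. The following sketch models that control flow as a self-contained program; MockKVCache, decode_step, and the fragmentation figure are invented stand-ins for illustration, not llama.cpp's API. Only the retry logic and the call names it mimics (find_slot, defrag_sched, update) come from the diff.

#include <algorithm>
#include <cstdio>
#include <vector>

// Invented stand-in for the KV cache: a cell is free (false) or occupied
// (true); a ubatch needs one contiguous run of free cells.
struct MockKVCache {
    std::vector<bool> cells;
    bool defrag_pending = false;

    // True if a contiguous run of n free cells exists.
    bool find_slot(int n) const {
        int run = 0;
        for (bool used : cells) {
            run = used ? 0 : run + 1;
            if (run >= n) {
                return true;
            }
        }
        return false;
    }

    // Schedule defrag when fragmentation exceeds the threshold; a negative
    // threshold (as in kv_self->defrag_sched(-1.0f)) always triggers it.
    void defrag_sched(float thold) {
        const float fragmentation = 0.5f; // pretend measurement, always >= 0
        if (fragmentation > thold) {
            defrag_pending = true;
        }
    }

    // Apply a pending defrag by compacting occupied cells to the front
    // (the role kv_self->update(*this) plays in the patch).
    void update() {
        if (!defrag_pending) {
            return;
        }
        const auto n_used = std::count(cells.begin(), cells.end(), true);
        std::fill(cells.begin(), cells.end(), false);
        std::fill(cells.begin(), cells.begin() + n_used, true);
        defrag_pending = false;
    }
};

// Mirrors the patched decode() path: fail only if the slot search still
// fails after a forced defrag.
int decode_step(MockKVCache & kv, int n_tokens) {
    if (!kv.find_slot(n_tokens)) {
        kv.defrag_sched(-1.0f);
        kv.update();
        if (!kv.find_slot(n_tokens)) {
            std::fprintf(stderr, "failed to find KV cache slot for ubatch of size %d\n", n_tokens);
            return 1;
        }
    }
    return 0;
}

int main() {
    // Four cells are free, but no three are contiguous: the first
    // find_slot fails and the forced defrag rescues the request.
    MockKVCache kv{{true, false, true, false, true, false, true, false}};
    std::printf("decode_step returned %d\n", decode_step(kv, 3));
    return 0;
}

The same change is mirrored below in the repository's patch file, so the vendored tree and the carried patch stay in sync.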
@@ -15,11 +15,48 @@ but this can leave a cache that still does not have adequate space
 even after defragmentation is triggered. Instead, we should do
 multiple batches of processing until everything is complete.
 ---
+ src/llama-context.cpp  |  18 ++++---
  src/llama-context.h    |   1 +
  src/llama-kv-cache.cpp | 107 ++++++++++++++---------------------------
  src/llama-kv-cache.h   |  12 ++++-
- 3 files changed, 47 insertions(+), 73 deletions(-)
+ 4 files changed, 59 insertions(+), 79 deletions(-)
 
+diff --git a/src/llama-context.cpp b/src/llama-context.cpp
+index c22687e4..c5948e8f 100644
+--- a/src/llama-context.cpp
++++ b/src/llama-context.cpp
+@@ -950,9 +950,12 @@ int llama_context::decode(llama_batch & inp_batch) {
+ 
+         // find KV slot
+         if (!kv_self->find_slot(ubatch)) {
+-            LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+-
+-            return 1;
++            kv_self->defrag_sched(-1.0f);
++            kv_self->update(*this);
++            if (!kv_self->find_slot(ubatch)) {
++                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
++                return 1;
++            }
+         }
+ 
+         ggml_backend_sched_reset(sched.get());
+@@ -1967,9 +1970,12 @@ void llama_context::opt_epoch_iter(
+ 
+         // TODO: not sure if this is needed
+         if (!kv_self->find_slot(ubatch)) {
+-            LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+-
+-            GGML_ABORT("TODO: handle this error");
++            kv_self->defrag_sched(-1.0f);
++            kv_self->update(*this);
++            if (!kv_self->find_slot(ubatch)) {
++                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
++                GGML_ABORT("TODO: handle this error");
++            }
+         }
+ 
+         auto * gf = graph_init();
 diff --git a/src/llama-context.h b/src/llama-context.h
 index c4ab242a..9970dfc6 100644
 --- a/src/llama-context.h
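This second diff edits the patch file itself (its name is not captured in this view): it embeds the llama-context.cpp hunks into the carried patch and updates the patch's diffstat from 3 to 4 changed files to match. The two call sites keep their original failure modes, judging from the hunks above: decode() can surface an error to its caller, so it logs and returns 1 when even a compacted cache cannot host the ubatch, whereas opt_epoch_iter() has no recovery path and still hits GGML_ABORT in that case.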