From f46df4e5d2e964ccfd0f23f9377240b6d9897ed8 Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan <jmorganca@gmail.com>
Date: Tue, 13 May 2025 14:02:08 -0700
Subject: [PATCH] llama: fix defrag patch to defragment when no slots are available (#10695)

---
 llama/llama.cpp/src/llama-context.cpp         | 18 ++++++---
 ...nsure-KV-cache-is-fully-defragmented.patch | 39 ++++++++++++++++++-
 2 files changed, 50 insertions(+), 7 deletions(-)

diff --git a/llama/llama.cpp/src/llama-context.cpp b/llama/llama.cpp/src/llama-context.cpp
index c22687e40..c5948e8fb 100644
--- a/llama/llama.cpp/src/llama-context.cpp
+++ b/llama/llama.cpp/src/llama-context.cpp
@@ -950,9 +950,12 @@ int llama_context::decode(llama_batch & inp_batch) {
 
         // find KV slot
         if (!kv_self->find_slot(ubatch)) {
-            LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-
-            return 1;
+            kv_self->defrag_sched(-1.0f);
+            kv_self->update(*this);
+            if (!kv_self->find_slot(ubatch)) {
+                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+                return 1;
+            }
         }
 
         ggml_backend_sched_reset(sched.get());
@@ -1967,9 +1970,12 @@ void llama_context::opt_epoch_iter(
 
             // TODO: not sure if this is needed
             if (!kv_self->find_slot(ubatch)) {
-                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-
-                GGML_ABORT("TODO: handle this error");
+                kv_self->defrag_sched(-1.0f);
+                kv_self->update(*this);
+                if (!kv_self->find_slot(ubatch)) {
+                    LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+                    GGML_ABORT("TODO: handle this error");
+                }
             }
 
             auto * gf = graph_init();
diff --git a/llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch b/llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch
index 81c179694..c5faeaaae 100644
--- a/llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch
+++ b/llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch
@@ -15,11 +15,48 @@ but this can leave a cache that still does not have adequate
 space even after defragmentation is triggered. Instead, we should do
 multiple batches of processing until everything is complete.
 ---
+ src/llama-context.cpp  |  18 ++++---
  src/llama-context.h    |   1 +
  src/llama-kv-cache.cpp | 107 ++++++++++++++---------------------------
  src/llama-kv-cache.h   |  12 ++++-
- 3 files changed, 47 insertions(+), 73 deletions(-)
+ 4 files changed, 59 insertions(+), 79 deletions(-)
 
+diff --git a/src/llama-context.cpp b/src/llama-context.cpp
+index c22687e4..c5948e8f 100644
+--- a/src/llama-context.cpp
++++ b/src/llama-context.cpp
+@@ -950,9 +950,12 @@ int llama_context::decode(llama_batch & inp_batch) {
+ 
+         // find KV slot
+         if (!kv_self->find_slot(ubatch)) {
+-            LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+-
+-            return 1;
++            kv_self->defrag_sched(-1.0f);
++            kv_self->update(*this);
++            if (!kv_self->find_slot(ubatch)) {
++                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
++                return 1;
++            }
+         }
+ 
+         ggml_backend_sched_reset(sched.get());
+@@ -1967,9 +1970,12 @@ void llama_context::opt_epoch_iter(
+ 
+             // TODO: not sure if this is needed
+             if (!kv_self->find_slot(ubatch)) {
+-                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+-
+-                GGML_ABORT("TODO: handle this error");
++                kv_self->defrag_sched(-1.0f);
++                kv_self->update(*this);
++                if (!kv_self->find_slot(ubatch)) {
++                    LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
++                    GGML_ABORT("TODO: handle this error");
++                }
+             }
+ 
+             auto * gf = graph_init();
 diff --git a/src/llama-context.h b/src/llama-context.h
 index c4ab242a..9970dfc6 100644
 --- a/src/llama-context.h
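
Note: both hunks apply the same fallback — when find_slot() fails, force-schedule
defragmentation with defrag_sched(-1.0f), apply it via update(), and retry the slot
search exactly once. The self-contained C++ sketch below illustrates that control
flow under simplified assumptions: kv_cache, free_contiguous, free_scattered, and
reserve_slot are invented stand-ins for illustration, not llama.cpp's actual types.

#include <cstdio>

// Simplified stand-in for the KV cache; the fields and method bodies are
// invented for illustration, not llama.cpp's real API.
struct kv_cache {
    int  free_contiguous = 0;   // largest contiguous run of free cells
    int  free_scattered  = 0;   // free cells fragmented across the buffer
    bool defrag_pending  = false;

    bool find_slot(int n_tokens) { return n_tokens <= free_contiguous; }

    // A negative threshold forces defragmentation on the next update(),
    // mirroring defrag_sched(-1.0f) in the patch.
    void defrag_sched(float thresh) { if (thresh < 0.0f) defrag_pending = true; }

    void update() {
        if (defrag_pending) {
            // compaction merges the scattered free cells into one run
            free_contiguous += free_scattered;
            free_scattered   = 0;
            defrag_pending   = false;
        }
    }
};

// Mirrors the control flow this patch adds to llama_context::decode:
// on a failed slot search, force a defrag and retry exactly once.
int reserve_slot(kv_cache & kv, int n_tokens) {
    if (!kv.find_slot(n_tokens)) {
        kv.defrag_sched(-1.0f);
        kv.update();
        if (!kv.find_slot(n_tokens)) {
            std::fprintf(stderr, "failed to find KV cache slot for ubatch of size %d\n", n_tokens);
            return 1;
        }
    }
    kv.free_contiguous -= n_tokens;  // claim the slot
    return 0;
}

int main() {
    kv_cache kv;
    kv.free_contiguous = 8;
    kv.free_scattered  = 24;
    // 16 tokens fit only after the scattered free cells are compacted,
    // so without the forced defrag this request would fail.
    return reserve_slot(kv, 16);
}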