diff --git a/llama/llama.cpp b/llama/llama.cpp
index 938368687..88da0f4b2 100644
--- a/llama/llama.cpp
+++ b/llama/llama.cpp
@@ -3051,6 +3051,13 @@ struct llama_kv_cache {
     }
 };
 
+// block of KV slots to move when defragging
+struct llama_kv_defrag_move {
+    uint32_t src;
+    uint32_t dst;
+    uint32_t len;
+};
+
 struct llama_control_vector {
     std::vector<struct ggml_tensor *> tensors; // per layer
     std::vector<struct ggml_context *> ctxs;
@@ -10828,35 +10835,23 @@ struct llm_build_context {
         return gf;
     }
 
-    struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
+    struct ggml_cgraph * build_defrag(const std::vector<llama_kv_defrag_move> & moves) {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
-        for (uint32_t i = 0; i < ids.size(); ++i) {
-            const uint32_t id = ids[i];
-
-            if (i == id || id == ids.size()) {
-                continue;
-            }
-
-            uint32_t nm = 1;
-
-            while (i + nm < ids.size() && ids[i + nm] == id + nm) {
-                nm++;
-            }
-
+        for (const auto & move : moves) {
             for (int il = 0; il < n_layer; ++il) {
                 const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
                 const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
 
                 ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il],
-                        n_embd_k_gqa, nm,
+                        n_embd_k_gqa, move.len,
                         ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
-                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i));
+                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*move.src));
 
                 ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il],
-                        n_embd_k_gqa, nm,
+                        n_embd_k_gqa, move.len,
                         ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
-                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
+                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*move.dst));
 
                 ggml_tensor * view_v_src;
                 ggml_tensor * view_v_dst;
@@ -10864,31 +10859,29 @@ struct llm_build_context {
                 if (flash_attn) {
                     // NOTE: the V cache is not transposed when using flash attention
                     view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
-                            n_embd_v_gqa, nm,
+                            n_embd_v_gqa, move.len,
                             ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
-                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i));
+                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*move.src));
 
                     view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
-                            n_embd_v_gqa, nm,
+                            n_embd_v_gqa, move.len,
                             ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
-                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id));
+                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*move.dst));
                 } else {
                     view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
-                            nm, n_embd_v_gqa,
+                            move.len, n_embd_v_gqa,
                             ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
-                            ggml_row_size(kv_self.v_l[il]->type, i));
+                            ggml_row_size(kv_self.v_l[il]->type, move.src));
 
                     view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
-                            nm, n_embd_v_gqa,
+                            move.len, n_embd_v_gqa,
                             ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
-                            ggml_row_size(kv_self.v_l[il]->type, id));
+                            ggml_row_size(kv_self.v_l[il]->type, move.dst));
                 }
 
                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
             }
-
-            i += nm - 1;
         }
 
         //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
@@ -17351,7 +17344,7 @@ struct llm_build_context {
     }
 };
 
-static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
+static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<llama_kv_defrag_move> & moves) {
     llama_ubatch dummy = {};
     dummy.equal_seqs = true;
 
@@ -17361,7 +17354,7 @@ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const
 
     llm.init();
 
-    struct ggml_cgraph * result = llm.build_defrag(ids);
+    struct ggml_cgraph * result = llm.build_defrag(moves);
 
     llm.free();
 
@@ -18377,7 +18370,12 @@ static int llama_decode_internal(
                 kv_self.head = 0;
             }
 
-            const auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+            auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+            if (!slot) {
+                llama_kv_cache_defrag(kv_self);
+                llama_kv_cache_update(&lctx);
+                slot = llama_kv_cache_find_slot(kv_self, ubatch);
+            }
             if (!slot) {
                 return 1;
             }
@@ -18782,8 +18780,8 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
 
     //const int64_t t_start = ggml_time_us();
 
-    // number of cells moved
-    uint32_t n_moves = 0;
+    // groups of cells moved
+    std::vector<llama_kv_defrag_move> moves;
 
     // each move requires 6*n_layer tensors (see build_defrag)
     //   - source view, destination view, copy operation
@@ -18847,19 +18845,11 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
         // are we moving a continuous block of memory?
         bool cont = false;
 
-        // should we stop searching for the next move?
-        bool stop = false;
-
         // go back and move the nf cells to the hole
        for (; i1 < n_kv; ++i1) {
             auto & cell1 = kv_self.cells[i1];
 
             if (cell1.is_empty() || ids[i1] != n_kv) {
-                if (n_moves == max_moves) {
-                    stop = true;
-                    break;
-                }
-
                 cont = false;
                 continue;
             }
@@ -18875,8 +18865,10 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
             kv_self.head = n_used;
 
             if (!cont) {
-                n_moves++;
+                moves.push_back({i1, i0 + nf, 1});
                 cont = true;
+            } else {
+                moves.back().len++;
             }
 
             nf++;
@@ -18886,22 +18878,16 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
             }
         }
 
-        if (stop || n_moves == max_moves) {
-            break;
-        }
-
         //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
 
         i0 += nh - 1;
     }
 
-    if (n_moves == 0) {
+    if (moves.size() == 0) {
         return;
     }
 
-    //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves);
-
-    //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer);
+    //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", moves.size());
 
 #if 0
     // CPU defrag
@@ -18976,11 +18962,18 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
 #else
     // ggml_graph defrag
 
-    ggml_backend_sched_reset(lctx.sched.get());
+    for (std::size_t i = 0; i < moves.size(); i += max_moves) {
+        std::vector<llama_kv_defrag_move> chunk;
+        auto end = std::min(i + max_moves, moves.size());
+        chunk.assign(moves.begin() + i, moves.begin() + end);
 
-    ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
+        ggml_backend_sched_reset(lctx.sched.get());
 
-    llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
+        //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*chunk.size()*n_layer);
+        ggml_cgraph * gf = llama_build_graph_defrag(lctx, chunk);
+
+        llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
+    }
 #endif
 
     //const int64_t t_end = ggml_time_us();
diff --git a/llama/patches/0014-llama-Ensure-KV-cache-is-fully-defragmented.patch b/llama/patches/0014-llama-Ensure-KV-cache-is-fully-defragmented.patch
new file mode 100644
index 000000000..b92c24dd9
--- /dev/null
+++ b/llama/patches/0014-llama-Ensure-KV-cache-is-fully-defragmented.patch
@@ -0,0 +1,242 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Jesse Gross
+Date: Fri, 13 Dec 2024 16:11:59 -0800
+Subject: [PATCH] llama: Ensure KV cache is fully defragmented.
+
+Sometimes the KV cache requires defragmentation even without
+triggering the threshold heuristic. In this case, decoding
+will not be able to find a KV cache slot. This is particularly
+difficult for the caller to handle if it happens in between
+ubatches. To avoid this, we should immediately trigger a defrag.
+
+In addition, a heavily fragmented cache can require more than
+max_moves to defragment. Currently, we stop when we hit the limit,
+but this can leave a cache that still does not have adequate space
+even after defragmentation is triggered. Instead, we should do
+multiple batches of processing until everything is complete.
+---
+ src/llama.cpp | 99 ++++++++++++++++++++++++---------------------------
+ 1 file changed, 46 insertions(+), 53 deletions(-)
+
+diff --git a/src/llama.cpp b/src/llama.cpp
+index 4778a9ed..654e32bc 100644
+--- a/src/llama.cpp
++++ b/src/llama.cpp
+@@ -3025,6 +3025,13 @@ struct llama_kv_cache {
+     }
+ };
+ 
++// block of KV slots to move when defragging
++struct llama_kv_defrag_move {
++    uint32_t src;
++    uint32_t dst;
++    uint32_t len;
++};
++
+ struct llama_control_vector {
+     std::vector<struct ggml_tensor *> tensors; // per layer
+     std::vector<struct ggml_context *> ctxs;
+@@ -10802,35 +10809,23 @@ struct llm_build_context {
+         return gf;
+     }
+ 
+-    struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
++    struct ggml_cgraph * build_defrag(const std::vector<llama_kv_defrag_move> & moves) {
+         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ 
+-        for (uint32_t i = 0; i < ids.size(); ++i) {
+-            const uint32_t id = ids[i];
+-
+-            if (i == id || id == ids.size()) {
+-                continue;
+-            }
+-
+-            uint32_t nm = 1;
+-
+-            while (i + nm < ids.size() && ids[i + nm] == id + nm) {
+-                nm++;
+-            }
+-
++        for (const auto & move : moves) {
+             for (int il = 0; il < n_layer; ++il) {
+                 const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
+                 const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
+ 
+                 ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il],
+-                        n_embd_k_gqa, nm,
++                        n_embd_k_gqa, move.len,
+                         ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
+-                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i));
++                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*move.src));
+ 
+                 ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il],
+-                        n_embd_k_gqa, nm,
++                        n_embd_k_gqa, move.len,
+                         ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
+-                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
++                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*move.dst));
+ 
+                 ggml_tensor * view_v_src;
+                 ggml_tensor * view_v_dst;
+@@ -10838,31 +10833,29 @@ struct llm_build_context {
+                 if (flash_attn) {
+                     // NOTE: the V cache is not transposed when using flash attention
+                     view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
+-                            n_embd_v_gqa, nm,
++                            n_embd_v_gqa, move.len,
+                             ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
+-                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i));
++                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*move.src));
+ 
+                     view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
+-                            n_embd_v_gqa, nm,
++                            n_embd_v_gqa, move.len,
+                             ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
+-                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id));
++                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*move.dst));
+                 } else {
+                     view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
+-                            nm, n_embd_v_gqa,
++                            move.len, n_embd_v_gqa,
+                             ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
+-                            ggml_row_size(kv_self.v_l[il]->type, i));
++                            ggml_row_size(kv_self.v_l[il]->type, move.src));
+ 
+                     view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
+-                            nm, n_embd_v_gqa,
++                            move.len, n_embd_v_gqa,
+                             ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
+-                            ggml_row_size(kv_self.v_l[il]->type, id));
++                            ggml_row_size(kv_self.v_l[il]->type, move.dst));
+                 }
+ 
+                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
+                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
+             }
+-
+-            i += nm - 1;
+         }
+ 
+         //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
+@@ -17325,7 +17318,7 @@ struct llm_build_context {
+     }
+ };
+ 
+-static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
++static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<llama_kv_defrag_move> & moves) {
+     llama_ubatch dummy = {};
+     dummy.equal_seqs = true;
+ 
+@@ -17335,7 +17328,7 @@ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const
+ 
+     llm.init();
+ 
+-    struct ggml_cgraph * result = llm.build_defrag(ids);
++    struct ggml_cgraph * result = llm.build_defrag(moves);
+ 
+     llm.free();
+ 
+@@ -18351,7 +18344,12 @@ static int llama_decode_internal(
+                 kv_self.head = 0;
+             }
+ 
+-            const auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
++            auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
++            if (!slot) {
++                llama_kv_cache_defrag(kv_self);
++                llama_kv_cache_update(&lctx);
++                slot = llama_kv_cache_find_slot(kv_self, ubatch);
++            }
+             if (!slot) {
+                 return 1;
+             }
+@@ -18756,8 +18754,8 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
+ 
+     //const int64_t t_start = ggml_time_us();
+ 
+-    // number of cells moved
+-    uint32_t n_moves = 0;
++    // groups of cells moved
++    std::vector<llama_kv_defrag_move> moves;
+ 
+     // each move requires 6*n_layer tensors (see build_defrag)
+     //   - source view, destination view, copy operation
+@@ -18821,19 +18819,11 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
+         // are we moving a continuous block of memory?
+         bool cont = false;
+ 
+-        // should we stop searching for the next move?
+-        bool stop = false;
+-
+         // go back and move the nf cells to the hole
+         for (; i1 < n_kv; ++i1) {
+             auto & cell1 = kv_self.cells[i1];
+ 
+             if (cell1.is_empty() || ids[i1] != n_kv) {
+-                if (n_moves == max_moves) {
+-                    stop = true;
+-                    break;
+-                }
+-
+                 cont = false;
+                 continue;
+             }
+@@ -18849,8 +18839,10 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
+             kv_self.head = n_used;
+ 
+             if (!cont) {
+-                n_moves++;
++                moves.push_back({i1, i0 + nf, 1});
+                 cont = true;
++            } else {
++                moves.back().len++;
+             }
+ 
+             nf++;
+@@ -18860,22 +18852,16 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
+             }
+         }
+ 
+-        if (stop || n_moves == max_moves) {
+-            break;
+-        }
+-
+         //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
+ 
+         i0 += nh - 1;
+     }
+ 
+-    if (n_moves == 0) {
++    if (moves.size() == 0) {
+         return;
+     }
+ 
+-    //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves);
+-
+-    //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer);
++    //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", moves.size());
+ 
+ #if 0
+     // CPU defrag
+@@ -18950,11 +18936,18 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
+ #else
+     // ggml_graph defrag
+ 
+-    ggml_backend_sched_reset(lctx.sched.get());
++    for (std::size_t i = 0; i < moves.size(); i += max_moves) {
++        std::vector<llama_kv_defrag_move> chunk;
++        auto end = std::min(i + max_moves, moves.size());
++        chunk.assign(moves.begin() + i, moves.begin() + end);
+ 
+-    ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
++        ggml_backend_sched_reset(lctx.sched.get());
++
++        //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*chunk.size()*n_layer);
++        ggml_cgraph * gf = llama_build_graph_defrag(lctx, chunk);
+ 
+-    llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
++        llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
++    }
+ #endif
+ 
+     //const int64_t t_end = ggml_time_us();
diff --git a/llama/runner/runner.go b/llama/runner/runner.go
index 9771420ef..86c010096 100644
--- a/llama/runner/runner.go
+++ b/llama/runner/runner.go
@@ -433,14 +433,7 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 
 	err := s.lc.Decode(batch)
 	if err != nil {
-		if errors.Is(err, llama.ErrKvCacheFull) {
-			slog.Debug("defragmenting kv cache")
-			s.cache.lc.KvCacheDefrag()
-			err = s.lc.Decode(batch)
-		}
-		if err != nil {
-			return fmt.Errorf("failed to decode batch: %w", err)
-		}
+		return fmt.Errorf("failed to decode batch: %w", err)
 	}
 
 	if crossAttention {
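
For readers following the patch, here is a minimal, self-contained C++ sketch (not part of the diff above) of the scheduling idea it introduces: contiguous runs of KV cells are grouped into llama_kv_defrag_move entries, and the resulting move list is then processed in chunks of at most max_moves, so that each defrag graph stays within the node budget (each move costs 6*n_layer tensors, per the comment in build_defrag). The defrag_in_chunks helper and its callback are hypothetical stand-ins for the real llama_build_graph_defrag()/llama_graph_compute() plumbing.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// Mirrors the struct added by the patch: a contiguous run of KV cells
// copied from src to dst.
struct llama_kv_defrag_move {
    uint32_t src;
    uint32_t dst;
    uint32_t len;
};

// Process moves in chunks of at most max_moves, analogous to the loop the
// patch adds around llama_build_graph_defrag()/llama_graph_compute().
// compute_chunk stands in for building and running one defrag graph.
template <typename Fn>
void defrag_in_chunks(const std::vector<llama_kv_defrag_move> & moves,
                      std::size_t max_moves, Fn compute_chunk) {
    for (std::size_t i = 0; i < moves.size(); i += max_moves) {
        const std::size_t end = std::min(i + max_moves, moves.size());
        std::vector<llama_kv_defrag_move> chunk(moves.begin() + i, moves.begin() + end);
        compute_chunk(chunk);
    }
}

int main() {
    // Hypothetical move list: three runs of cells sliding toward the front.
    std::vector<llama_kv_defrag_move> moves = {
        {100, 10, 4}, {220, 14, 8}, {512, 22, 2},
    };

    // With max_moves = 2, the moves are split across two defrag "graphs".
    defrag_in_chunks(moves, 2, [](const std::vector<llama_kv_defrag_move> & chunk) {
        std::printf("defrag graph with %zu move(s)\n", chunk.size());
        for (const auto & m : chunk) {
            std::printf("  copy %u cells: %u -> %u\n", m.len, m.src, m.dst);
        }
    });
    return 0;
}

Because every chunk is built and computed to completion before the next one starts, the cache ends up fully defragmented even when the move list exceeds max_moves, which is the behavior the patch relies on when a decode retries after an initial slot-allocation failure.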