From 1ef59057d005ad318cb2d14bf4072dba1183af51 Mon Sep 17 00:00:00 2001
From: Josh Yan
Date: Wed, 10 Jul 2024 13:02:37 -0700
Subject: [PATCH] patch llama.cpp

---
 llm/llama.cpp                         |  2 +-
 llm/patches/10-quantize-progress.diff | 28 ++++++++++++++++++---------
 server/images.go                      |  7 -------
 3 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/llm/llama.cpp b/llm/llama.cpp
index 7c26775ad..fa509abf2 160000
--- a/llm/llama.cpp
+++ b/llm/llama.cpp
@@ -1 +1 @@
-Subproject commit 7c26775adb579e92b59c82e8084c07a1d0f75e9c
+Subproject commit fa509abf281177eacdc71a2a14432c4e6ed74a47
diff --git a/llm/patches/10-quantize-progress.diff b/llm/patches/10-quantize-progress.diff
index c588c0957..ca8a4696a 100644
--- a/llm/patches/10-quantize-progress.diff
+++ b/llm/patches/10-quantize-progress.diff
@@ -1,24 +1,31 @@
+From fa509abf281177eacdc71a2a14432c4e6ed74a47 Mon Sep 17 00:00:00 2001
+From: Josh Yan
+Date: Wed, 10 Jul 2024 12:58:31 -0700
+Subject: [PATCH] quantize callback
+
+---
+ llama.cpp | 8 ++++++++
+ llama.h   | 3 +++
+ 2 files changed, 11 insertions(+)
+
 diff --git a/llama.cpp b/llama.cpp
-index 61948751..2c683ef6 100644
+index 61948751..d3126510 100644
 --- a/llama.cpp
 +++ b/llama.cpp
-@@ -15586,6 +15586,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
+@@ -15586,6 +15586,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
      const auto tn = LLM_TN(model.arch);
      new_ofstream(0);
      for (int i = 0; i < ml.n_tensors; ++i) {
-+
 +        if (params->quantize_callback){
-+            LLAMA_LOG_INFO("ENTERED CALLBACK\n");
 +            if (!params->quantize_callback(i, params->quantize_callback_data)) {
 +                return;
 +            }
-+            LLAMA_LOG_INFO("CURRENTLY AT ", i/ml.n_tensors * 100);
 +        }
 +
          auto weight = ml.get_weight(i);
          struct ggml_tensor * tensor = weight->tensor;
          if (weight->idx != cur_split && params->keep_split) {
-@@ -16119,6 +16128,8 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
+@@ -16119,6 +16125,8 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
          /*.keep_split                =*/ false,
          /*.imatrix                   =*/ nullptr,
          /*.kv_overrides              =*/ nullptr,
@@ -28,7 +35,7 @@ index 61948751..2c683ef6 100644
 
      return result;
 
 diff --git a/llama.h b/llama.h
-index da310ffa..9b48d889 100644
+index da310ffa..3cbe6023 100644
 --- a/llama.h
 +++ b/llama.h
@@ -35,9 +42,12 @@
 @@ -337,6 +337,9 @@ extern "C" {
          void * imatrix;      // pointer to importance matrix data
          void * kv_overrides; // pointer to vector containing overrides
 +
-+        llama_progress_callback quantize_callback;
-+        void * quantize_callback_data;
++        llama_progress_callback quantize_callback; // callback to report quantization progress
++        void * quantize_callback_data;             // user data for the callback
      } llama_model_quantize_params;
 
      // grammar types
+-- 
+2.39.3 (Apple Git-146)
+
diff --git a/server/images.go b/server/images.go
index 65fe30725..fc68c7acb 100644
--- a/server/images.go
+++ b/server/images.go
@@ -440,8 +440,6 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
 		defer temp.Close()
 		defer os.Remove(temp.Name())
 
-		// Quantizes per layer
-		// Save total quantized tensors
 		if err := llm.Quantize(blob, temp.Name(), want, fn, tensorCount); err != nil {
 			return err
 		}
@@ -473,11 +471,6 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
 			config.ModelFamilies = append(config.ModelFamilies, baseLayer.GGML.KV().Architecture())
 		}
 
-		/* fn(api.ProgressResponse{
-			Status:   fmt.Sprintf("quantizing model %d%%", i*100/layerCount),
-			Quantize: quantization,
-		}) */
-
 		layers = append(layers, baseLayer.Layer)
 	}
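
Reviewer note, not part of the patch: a minimal C++ sketch of how a caller might
drive the new quantize_callback once 10-quantize-progress.diff is applied.
print_progress, the file names, and the tensor count are illustrative;
llama_model_quantize, llama_model_quantize_default_params, llama_ftype, and
llama_progress_callback are existing llama.cpp API.

    #include "llama.h"
    #include <cstdio>

    // Matches llama_progress_callback; returning false aborts quantization,
    // which is how the patched loop in llama_model_quantize_internal exits early.
    static bool print_progress(float tensor_index, void * user_data) {
        int total = *(int *) user_data; // illustrative: total tensor count
        fprintf(stderr, "quantized tensor %d of %d\n", (int) tensor_index, total);
        return true;
    }

    int main() {
        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.ftype = LLAMA_FTYPE_MOSTLY_Q4_0;

        int total_tensors = 291; // illustrative; ollama reads this from the GGUF metadata
        params.quantize_callback      = print_progress;  // field added by this patch
        params.quantize_callback_data = &total_tensors;  // field added by this patch

        // Returns 0 on success; the callback fires once per tensor.
        return (int) llama_model_quantize("model-f16.gguf", "model-q4_0.gguf", &params);
    }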