From 5fd359d1174004a68180e64cd8705b01cee63a49 Mon Sep 17 00:00:00 2001
From: Josh Yan
Date: Wed, 10 Jul 2024 10:28:42 -0700
Subject: [PATCH] added patch

---
 llm/llm.go                            |  2 +-
 llm/patches/10-quantize-progress.diff |  0
 llm/quantize.diff                     | 76 ---------------------------
 3 files changed, 1 insertion(+), 77 deletions(-)
 create mode 100644 llm/patches/10-quantize-progress.diff
 delete mode 100644 llm/quantize.diff

diff --git a/llm/llm.go b/llm/llm.go
index 38b2aa13a..8c558bcab 100644
--- a/llm/llm.go
+++ b/llm/llm.go
@@ -65,7 +65,7 @@ func Quantize(infile, outfile string, ftype fileType, fn func(resp api.ProgressR
 			fmt.Println("Progress: ", *((*C.float)(store)))
 		case <-done:
 			fn(api.ProgressResponse{
-				Status:   "quantizing model",
+				Status:   fmt.Sprintf("quantizing model %d/%d", tensorCount, tensorCount),
 				Quantize: "quant",
 			})
 			return
diff --git a/llm/patches/10-quantize-progress.diff b/llm/patches/10-quantize-progress.diff
new file mode 100644
index 000000000..e69de29bb
diff --git a/llm/quantize.diff b/llm/quantize.diff
deleted file mode 100644
index bd3fd8bd6..000000000
--- a/llm/quantize.diff
+++ /dev/null
@@ -1,76 +0,0 @@
-commit c260daa84166c568cd998410dc9ba5628c530bee
-Author: Josh Yan
-Date:   Tue Jul 9 15:34:24 2024 -0700
-
-    quantize progress
-
-diff --git a/llama.cpp b/llama.cpp
-index 61948751..c06d31b6 100644
---- a/llama.cpp
-+++ b/llama.cpp
-@@ -15370,7 +15370,7 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
-     return new_size;
- }
-
--static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
-+static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, llama_model_quantize_params * params) {
-     ggml_type default_type;
-     llama_ftype ftype = params->ftype;
-
-@@ -15586,6 +15586,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
-     const auto tn = LLM_TN(model.arch);
-     new_ofstream(0);
-     for (int i = 0; i < ml.n_tensors; ++i) {
-+
-+        if (params->quantize_callback){
-+            if (!params->quantize_callback(i/ml.n_tensors, params->quantize_callback_data)) {
-+                close_ofstream();
-+                params->quantize_callback_data = nullptr;
-+                return;
-+            }
-+        }
-+
-         auto weight = ml.get_weight(i);
-         struct ggml_tensor * tensor = weight->tensor;
-         if (weight->idx != cur_split && params->keep_split) {
-@@ -16119,6 +16128,8 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
-         /*.keep_split             =*/ false,
-         /*.imatrix                =*/ nullptr,
-         /*.kv_overrides           =*/ nullptr,
-+        /*.quantize_callback      =*/ nullptr,
-+        /*.quantize_callback_data =*/ nullptr,
-     };
-
-     return result;
-@@ -16784,7 +16795,7 @@ struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const ch
- uint32_t llama_model_quantize(
-         const char * fname_inp,
-         const char * fname_out,
--        const llama_model_quantize_params * params) {
-+        llama_model_quantize_params * params) {
-     try {
-         llama_model_quantize_internal(fname_inp, fname_out, params);
-         return 0;
-diff --git a/llama.h b/llama.h
-index da310ffa..847c40d4 100644
---- a/llama.h
-+++ b/llama.h
-@@ -196,6 +196,8 @@ extern "C" {
-
-     typedef bool (*llama_progress_callback)(float progress, void * user_data);
-
-+    typedef bool (*llama_quantize_callback)(int progress, void * user_data);
-+
-     // Input data for llama_decode
-     // A llama_batch object can contain input about one or many sequences
-     // The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
-@@ -337,6 +339,9 @@ extern "C" {
-         bool keep_split;            // quantize to the same number of shards
-         void * imatrix;             // pointer to importance matrix data
-         void * kv_overrides;        // pointer to vector containing overrides
-+
-+        llama_quantize_callback quantize_callback;
-+        void * quantize_callback_data;
-     } llama_model_quantize_params;
-
-     // grammar types
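
For context, here is a minimal C sketch of how the callback API introduced by
the embedded llama.cpp diff could be driven directly. Only
llama_model_quantize_default_params, llama_model_quantize, the
llama_quantize_callback typedef, and the quantize_callback /
quantize_callback_data fields come from the diff above (the fields exist only
once the patch is applied); the file names, the ftype choice, and the callback
body are illustrative assumptions, not part of the patch.

    #include <stdbool.h>
    #include <stdio.h>

    #include "llama.h"

    // Matches the llama_quantize_callback typedef added in llama.h;
    // returning false cancels quantization (the patched loop closes its
    // output stream and returns early).
    static bool on_quantize_progress(int progress, void * user_data) {
        int * last = (int *) user_data;
        if (progress != *last) {
            *last = progress;
            printf("quantize progress: %d\n", progress);
        }
        return true;
    }

    int main(void) {
        int last = -1;
        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.ftype                  = LLAMA_FTYPE_MOSTLY_Q4_0; // illustrative
        params.quantize_callback      = on_quantize_progress;
        params.quantize_callback_data = &last;

        // Placeholder paths; llama_model_quantize returns 0 on success.
        return (int) llama_model_quantize("model-f16.gguf", "model-q4_0.gguf", &params);
    }

One caveat when reading the hunk above: i/ml.n_tensors is integer division
between two ints, so the int progress value delivered to the callback is 0
for every tensor; a caller that wants per-tensor granularity would need the
raw index (or a pre-scaled value) instead.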