diff --git a/llm/llm.go b/llm/llm.go index 37eee2bc3..db20cedce 100644 --- a/llm/llm.go +++ b/llm/llm.go @@ -10,14 +10,17 @@ package llm // #cgo linux,arm64 LDFLAGS: -L${SRCDIR}/build/linux/arm64_static -L${SRCDIR}/build/linux/arm64_static/src -L${SRCDIR}/build/linux/arm64_static/ggml/src // #include // #include "llama.h" -// bool update_quantize_progress(int progress, void* data) { -// *((int*)data) = progress; +// static bool update_quantize_progress(float progress, void* data) { +// *((float*)data) = progress; // return true; // } import "C" import ( "fmt" "unsafe" + "time" + + "github.com/ollama/ollama/api" ) // SystemInfo is an unused example of calling llama.cpp functions using CGo @@ -25,7 +28,7 @@ func SystemInfo() string { return C.GoString(C.llama_print_system_info()) } -func Quantize(infile, outfile string, ftype fileType, count *int) error { +func Quantize(infile, outfile string, ftype fileType, fn func(resp api.ProgressResponse) ) error { cinfile := C.CString(infile) defer C.free(unsafe.Pointer(cinfile)) @@ -37,7 +40,11 @@ func Quantize(infile, outfile string, ftype fileType, count *int) error { params.ftype = ftype.Value() // Initialize "global" to store progress - store := C.malloc(C.sizeof(int)) + store := C.malloc(C.sizeof_float) + defer C.free(unsafe.Pointer(store)) + + // Initialize store value, e.g., setting initial progress to 0 + *(*C.float)(store) = 0.0 params.quantize_callback_data = store params.quantize_callback = C.update_quantize_progress @@ -48,7 +55,11 @@ func Quantize(infile, outfile string, ftype fileType, count *int) error { if params.quantize_callback_data == nil { return } else { - *count = int(*(*C.int)(store)) + progress := *((*C.float)(store)) + fn(api.ProgressResponse{ + Status: fmt.Sprintf("quantizing model %d%%", int(progress*100)), + Quantize: "quant", + }) } } }() diff --git a/llm/quantize.diff b/llm/quantize.diff new file mode 100644 index 000000000..bd3fd8bd6 --- /dev/null +++ b/llm/quantize.diff @@ -0,0 +1,76 @@ +commit c260daa84166c568cd998410dc9ba5628c530bee +Author: Josh Yan +Date: Tue Jul 9 15:34:24 2024 -0700 + + quantize progress + +diff --git a/llama.cpp b/llama.cpp +index 61948751..c06d31b6 100644 +--- a/llama.cpp ++++ b/llama.cpp +@@ -15370,7 +15370,7 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa + return new_size; + } + +-static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) { ++static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, llama_model_quantize_params * params) { + ggml_type default_type; + llama_ftype ftype = params->ftype; + +@@ -15586,6 +15586,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s + const auto tn = LLM_TN(model.arch); + new_ofstream(0); + for (int i = 0; i < ml.n_tensors; ++i) { ++ ++ if (params->quantize_callback){ ++ if (!params->quantize_callback(i/ml.n_tensors, params->quantize_callback_data)) { ++ close_ofstream(); ++ params->quantize_callback_data = nullptr; ++ return; ++ } ++ } ++ + auto weight = ml.get_weight(i); + struct ggml_tensor * tensor = weight->tensor; + if (weight->idx != cur_split && params->keep_split) { +@@ -16119,6 +16128,8 @@ struct llama_model_quantize_params llama_model_quantize_default_params() { + /*.keep_split =*/ false, + /*.imatrix =*/ nullptr, + /*.kv_overrides =*/ nullptr, ++ /*.quantize_callback =*/ nullptr, ++ /*.quantize_callback_data =*/ nullptr, + }; + + return result; +@@ -16784,7 +16795,7 @@ struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const ch + uint32_t llama_model_quantize( + const char * fname_inp, + const char * fname_out, +- const llama_model_quantize_params * params) { ++ llama_model_quantize_params * params) { + try { + llama_model_quantize_internal(fname_inp, fname_out, params); + return 0; +diff --git a/llama.h b/llama.h +index da310ffa..847c40d4 100644 +--- a/llama.h ++++ b/llama.h +@@ -196,6 +196,8 @@ extern "C" { + + typedef bool (*llama_progress_callback)(float progress, void * user_data); + ++ typedef bool (*llama_quantize_callback)(int progress, void * user_data); ++ + // Input data for llama_decode + // A llama_batch object can contain input about one or many sequences + // The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens +@@ -337,6 +339,9 @@ extern "C" { + bool keep_split; // quantize to the same number of shards + void * imatrix; // pointer to importance matrix data + void * kv_overrides; // pointer to vector containing overrides ++ ++ llama_quantize_callback quantize_callback; ++ void * quantize_callback_data; + } llama_model_quantize_params; + + // grammar types diff --git a/server/images.go b/server/images.go index a4dcfc9da..a050e78fb 100644 --- a/server/images.go +++ b/server/images.go @@ -21,7 +21,6 @@ import ( "slices" "strconv" "strings" - "time" "github.com/ollama/ollama/api" "github.com/ollama/ollama/auth" @@ -414,8 +413,6 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio return fmt.Errorf("invalid model reference: %s", c.Args) } - var quantized int - tensorCount := 0 for _, baseLayer := range baseLayers { if quantization != "" && baseLayer.MediaType == "application/vnd.ollama.image.model" && @@ -426,27 +423,10 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio return err } - tensorCount = len(baseLayer.GGML.Tensors()) - ticker := time.NewTicker(60 * time.Millisecond) - done := make(chan struct{}) - defer close(done) - - go func() { - defer ticker.Stop() - for { - select { - case <-ticker.C: - fn(api.ProgressResponse{ - Status: fmt.Sprintf("quantizing model %d%%", quantized*100/tensorCount), - Quantize: quantization}) - case <-done: - fn(api.ProgressResponse{ - Status: "quantizing model", - Quantize: quantization}) - } - return - } - }() + fn(api.ProgressResponse{ + Status: "quantizing model", + Quantize: "quant", + }) ft := baseLayer.GGML.KV().FileType() if !slices.Contains([]string{"F16", "F32"}, ft.String()) { @@ -466,7 +446,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio // Quantizes per layer // Save total quantized tensors - if err := llm.Quantize(blob, temp.Name(), want, &quantized); err != nil { + if err := llm.Quantize(blob, temp.Name(), want, fn); err != nil { return err }