llama: update llama.cpp vendor code to commit d7cfe1ff (#9356)
Jeffrey Morgan

llama/llama.cpp/common/common.cpp (vendored): 353 changed lines

@@ -2,6 +2,9 @@
 #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
 #endif
 
+#include "ggml.h"
+#include "gguf.h"
+
 #include "common.h"
 #include "log.h"
 // Change JSON_ASSERT from assert() to GGML_ASSERT:
@@ -70,6 +73,22 @@
 #include <sys/syslimits.h>
 #endif
 #define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
+
+//
+// CURL utils
+//
+
+using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
+
+// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
+struct curl_slist_ptr {
+    struct curl_slist * ptr = nullptr;
+    ~curl_slist_ptr() {
+        if (ptr) {
+            curl_slist_free_all(ptr);
+        }
+    }
+};
 #endif // LLAMA_USE_CURL
 
 using json = nlohmann::ordered_json;
@@ -464,6 +483,48 @@ void string_replace_all(std::string & s, const std::string & search, const std::
     s = std::move(builder);
 }
 
+std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
+    std::ostringstream result;
+    for (size_t i = 0; i < values.size(); ++i) {
+        if (i > 0) {
+            result << separator;
+        }
+        result << values[i];
+    }
+    return result.str();
+}
+
+std::vector<std::string> string_split(const std::string & str, const std::string & delimiter) {
+    std::vector<std::string> parts;
+    size_t start = 0;
+    size_t end = str.find(delimiter);
+
+    while (end != std::string::npos) {
+        parts.push_back(str.substr(start, end - start));
+        start = end + delimiter.length();
+        end = str.find(delimiter, start);
+    }
+
+    parts.push_back(str.substr(start));
+
+    return parts;
+}
+
+std::string string_repeat(const std::string & str, size_t n) {
+    if (n == 0) {
+        return "";
+    }
+
+    std::string result;
+    result.reserve(str.length() * n);
+
+    for (size_t i = 0; i < n; ++i) {
+        result += str;
+    }
+
+    return result;
+}
+
 std::string string_from(bool value) {
     return value ? "true" : "false";
 }
@@ -846,7 +907,7 @@ struct common_init_result common_init_from_params(common_params & params) {
     } else if (!params.model_url.empty()) {
         model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
     } else {
-        model = llama_load_model_from_file(params.model.c_str(), mparams);
+        model = llama_model_load_from_file(params.model.c_str(), mparams);
     }
 
     if (model == NULL) {
@@ -854,26 +915,28 @@ struct common_init_result common_init_from_params(common_params & params) {
         return iparams;
     }
 
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
     if (params.reranking) {
         bool ok = true;
 
-        if (llama_token_bos(model) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: model does not have a  BOS token, reranking will not work\n", __func__);
+        if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have a  BOS token, reranking will not work\n", __func__);
             ok = false;
         }
 
-        if (llama_token_eos(model) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: model does not have an EOS token, reranking will not work\n", __func__);
+        if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__);
             ok = false;
         }
 
-        if (llama_token_sep(model) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: model does not have a  SEP token, reranking will not work\n", __func__);
+        if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have a  SEP token, reranking will not work\n", __func__);
             ok = false;
         }
 
         if (!ok) {
-            llama_free_model(model);
+            llama_model_free(model);
 
             return iparams;
         }
@@ -881,10 +944,10 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     auto cparams = common_context_params_to_llama(params);
 
-    llama_context * lctx = llama_new_context_with_model(model, cparams);
+    llama_context * lctx = llama_init_from_model(model, cparams);
     if (lctx == NULL) {
         LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
-        llama_free_model(model);
+        llama_model_free(model);
         return iparams;
     }
 
@@ -895,25 +958,26 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     if (!params.control_vectors.empty()) {
         if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
-        if (params.control_vector_layer_end   <= 0) params.control_vector_layer_end   = llama_n_layer(model);
+        if (params.control_vector_layer_end   <= 0) params.control_vector_layer_end   = llama_model_n_layer(model);
 
         const auto cvec = common_control_vector_load(params.control_vectors);
         if (cvec.n_embd == -1) {
             llama_free(lctx);
-            llama_free_model(model);
+            llama_model_free(model);
 
             return iparams;
         }
 
-        int err = llama_control_vector_apply(lctx,
-                                             cvec.data.data(),
-                                             cvec.data.size(),
-                                             cvec.n_embd,
-                                             params.control_vector_layer_start,
-                                             params.control_vector_layer_end);
+        int err = llama_apply_adapter_cvec(
+                lctx,
+                cvec.data.data(),
+                cvec.data.size(),
+                cvec.n_embd,
+                params.control_vector_layer_start,
+                params.control_vector_layer_end);
         if (err) {
             llama_free(lctx);
-            llama_free_model(model);
+            llama_model_free(model);
 
             return iparams;
         }
@@ -921,12 +985,12 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     // load and optionally apply lora adapters
     for (auto & la : params.lora_adapters) {
-        llama_lora_adapter_ptr lora;
-        lora.reset(llama_lora_adapter_init(model, la.path.c_str()));
+        llama_adapter_lora_ptr lora;
+        lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
         if (lora == nullptr) {
             LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
             llama_free(lctx);
-            llama_free_model(model);
+            llama_model_free(model);
             return iparams;
         }
 
@@ -935,17 +999,17 @@ struct common_init_result common_init_from_params(common_params & params) {
     }
 
     if (!params.lora_init_without_apply) {
-        common_lora_adapters_apply(lctx, params.lora_adapters);
+        common_set_adapter_lora(lctx, params.lora_adapters);
     }
 
-    if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
-        LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
+    if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
+        LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
         params.sampling.ignore_eos = false;
     }
 
     if (params.sampling.ignore_eos) {
-        for (llama_token i = 0; i < llama_n_vocab(model); i++) {
-            if (llama_token_is_eog(model, i)) {
+        for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
+            if (llama_vocab_is_eog(vocab, i)) {
                 LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
                 params.sampling.logit_bias.push_back({i, -INFINITY});
             }
@@ -966,8 +1030,9 @@ struct common_init_result common_init_from_params(common_params & params) {
         LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
 
         std::vector<llama_token> tmp;
-        llama_token bos = llama_token_bos(model);
-        llama_token eos = llama_token_eos(model);
+        llama_token bos = llama_vocab_bos(vocab);
+        llama_token eos = llama_vocab_eos(vocab);
+
         // some models (e.g. T5) don't have a BOS token
         if (bos != LLAMA_TOKEN_NULL) {
             tmp.push_back(bos);
@@ -982,7 +1047,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         if (llama_model_has_encoder(model)) {
             llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
             llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
-            if (decoder_start_token_id == -1) {
+            if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
                 decoder_start_token_id = bos;
             }
             tmp.clear();
@@ -1002,11 +1067,11 @@ struct common_init_result common_init_from_params(common_params & params) {
     return iparams;
 }
 
-void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora) {
-    llama_lora_adapter_clear(ctx);
+void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
+    llama_clear_adapter_lora(ctx);
     for (auto & la : lora) {
         if (la.scale != 0.0f) {
-            llama_lora_adapter_set(ctx, la.ptr, la.scale);
+            llama_set_adapter_lora(ctx, la.ptr, la.scale);
         }
     }
 }
@@ -1020,7 +1085,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     if (params.n_gpu_layers != -1) {
         mparams.n_gpu_layers = params.n_gpu_layers;
     }
-    mparams.rpc_servers     = params.rpc_servers.c_str();
     mparams.main_gpu        = params.main_gpu;
     mparams.split_mode      = params.split_mode;
     mparams.tensor_split    = params.tensor_split;
@@ -1123,7 +1187,8 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int ma
 
 static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
     // Initialize libcurl
-    std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
+    curl_ptr       curl(curl_easy_init(), &curl_easy_cleanup);
+    curl_slist_ptr http_headers;
    if (!curl) {
        LOG_ERR("%s: error initializing libcurl\n", __func__);
        return false;
@@ -1137,11 +1202,9 @@ static bool common_download_file(const std::string & url, const std::string & pa
 
     // Check if hf-token or bearer-token was specified
     if (!hf_token.empty()) {
-      std::string auth_header = "Authorization: Bearer ";
-      auth_header += hf_token.c_str();
-      struct curl_slist *http_headers = NULL;
-      http_headers = curl_slist_append(http_headers, auth_header.c_str());
-      curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers);
+        std::string auth_header = "Authorization: Bearer " + hf_token;
+        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
+        curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
     }
 
 #if defined(_WIN32)
@@ -1411,7 +1474,7 @@ struct llama_model * common_load_model_from_url(
         }
     }
 
-    return llama_load_model_from_file(local_path.c_str(), params);
+    return llama_model_load_from_file(local_path.c_str(), params);
 }
 
 struct llama_model * common_load_model_from_hf(
@@ -1437,6 +1500,80 @@ struct llama_model * common_load_model_from_hf(
     return common_load_model_from_url(model_url, local_path, hf_token, params);
 }
 
+/**
+ * Allow getting the HF file from the HF repo with tag (like ollama), for example:
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
+ * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
+ *
+ * Return pair of <repo, file> (with "repo" already having tag removed)
+ *
+ * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
+ */
+std::pair<std::string, std::string> common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) {
+    auto parts = string_split<std::string>(hf_repo_with_tag, ':');
+    std::string tag = parts.size() > 1 ? parts.back() : "latest";
+    std::string hf_repo = parts[0];
+    if (string_split<std::string>(hf_repo, '/').size() != 2) {
+        throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
+    }
+
+    // fetch model info from Hugging Face Hub API
+    json model_info;
+    curl_ptr       curl(curl_easy_init(), &curl_easy_cleanup);
+    curl_slist_ptr http_headers;
+    std::string res_str;
+    std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag;
+    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
+    curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
+    typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
+    auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
+        static_cast<std::string *>(data)->append((char * ) ptr, size * nmemb);
+        return size * nmemb;
+    };
+    curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
+    curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str);
+#if defined(_WIN32)
+    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
+#endif
+    if (!hf_token.empty()) {
+        std::string auth_header = "Authorization: Bearer " + hf_token;
+        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
+    }
+    // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
+    http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
+    http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json");
+    curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
+
+    CURLcode res = curl_easy_perform(curl.get());
+
+    if (res != CURLE_OK) {
+        throw std::runtime_error("error: cannot make GET request to HF API");
+    }
+
+    long res_code;
+    curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
+    if (res_code == 200) {
+        model_info = json::parse(res_str);
+    } else if (res_code == 401) {
+        throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
+    } else {
+        throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
+    }
+
+    // check response
+    if (!model_info.contains("ggufFile")) {
+        throw std::runtime_error("error: model does not have ggufFile");
+    }
+    json & gguf_file = model_info.at("ggufFile");
+    if (!gguf_file.contains("rfilename")) {
+        throw std::runtime_error("error: ggufFile does not have rfilename");
+    }
+
+    return std::make_pair(hf_repo, gguf_file.at("rfilename"));
+}
+
 #else
 
 struct llama_model * common_load_model_from_url(
@@ -1458,6 +1595,11 @@ struct llama_model * common_load_model_from_hf(
     return nullptr;
 }
 
+std::pair<std::string, std::string> common_get_hf_file(const std::string &, const std::string &) {
+    LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
+    return std::make_pair("", "");
+}
+
 #endif // LLAMA_USE_CURL
 
 //
@@ -1556,21 +1698,23 @@ std::vector<llama_token> common_tokenize(
            const std::string & text,
                         bool   add_special,
                         bool   parse_special) {
-    return common_tokenize(llama_get_model(ctx), text, add_special, parse_special);
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    return common_tokenize(vocab, text, add_special, parse_special);
 }
 
 std::vector<llama_token> common_tokenize(
-    const struct llama_model * model,
+    const struct llama_vocab * vocab,
            const std::string & text,
                         bool   add_special,
                         bool   parse_special) {
     // upper limit for the number of tokens
     int n_tokens = text.length() + 2 * add_special;
     std::vector<llama_token> result(n_tokens);
-    n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+    n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+        int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
@@ -1579,12 +1723,18 @@ std::vector<llama_token> common_tokenize(
 }
 
 std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    return common_token_to_piece(vocab, token, special);
+}
+
+std::string common_token_to_piece(const struct llama_vocab * vocab, llama_token token, bool special) {
     std::string piece;
     piece.resize(piece.capacity());  // using string internal cache, 15 bytes + '\n'
-    const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+    const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
     if (n_chars < 0) {
         piece.resize(-n_chars);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+        int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
         GGML_ASSERT(check == -n_chars);
     }
     else {
@@ -1594,13 +1744,19 @@ std::string common_token_to_piece(const struct llama_context * ctx, llama_token
     return piece;
 }
 
-std::string common_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+std::string common_detokenize(const struct llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    return common_detokenize(vocab, tokens, special);
+}
+
+std::string common_detokenize(const struct llama_vocab * vocab, const std::vector<llama_token> & tokens, bool special) {
     std::string text;
     text.resize(std::max(text.capacity(), tokens.size()));
-    int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+    int32_t n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
     if (n_chars < 0) {
         text.resize(-n_chars);
-        n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+        n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
         GGML_ASSERT(n_chars <= (int32_t)text.size());  // whitespace trimming is performed after per-token detokenization
     }
 
@@ -1610,103 +1766,6 @@ std::string common_detokenize(llama_context * ctx, const std::vector<llama_token
     return text;
 }
 
-//
-// Chat template utils
-//
-
-std::string common_get_builtin_chat_template(const struct llama_model * model) {
-    static const char * template_key = "tokenizer.chat_template";
-    // call with NULL buffer to get the total size of the string
-    int32_t res = llama_model_meta_val_str(model, template_key, NULL, 0);
-    if (res > 0) {
-        std::vector<char> model_template(res + 1, 0);
-        llama_model_meta_val_str(model, template_key, model_template.data(), model_template.size());
-        return std::string(model_template.data(), model_template.size() - 1);
-    }
-    return "";
-}
-
-bool common_chat_verify_template(const std::string & tmpl) {
-    llama_chat_message chat[] = {{"user", "test"}};
-    int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
-    return res >= 0;
-}
-
-std::string common_chat_apply_template(const struct llama_model * model,
-        const std::string & tmpl,
-        const std::vector<common_chat_msg> & msgs,
-        bool add_ass) {
-    int alloc_size = 0;
-    bool fallback = false; // indicate if we must fallback to default chatml
-    std::vector<llama_chat_message> chat;
-    for (auto & msg : msgs) {
-        chat.push_back({msg.role.c_str(), msg.content.c_str()});
-        alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
-    }
-
-    const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
-    std::vector<char> buf(alloc_size);
-
-    // run the first time to get the total output length
-    int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
-
-    // error: chat template is not supported
-    if (res < 0) {
-        if (ptr_tmpl != nullptr) {
-            // if the custom "tmpl" is not supported, we throw an error
-            // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
-            throw std::runtime_error("this custom template is not supported");
-        } else {
-            // If the built-in template is not supported, we default to chatml
-            res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
-            fallback = true;
-        }
-    }
-
-    // if it turns out that our buffer is too small, we resize it
-    if ((size_t) res > buf.size()) {
-        buf.resize(res);
-        res = llama_chat_apply_template(
-            fallback ? nullptr : model,
-            fallback ? "chatml" : ptr_tmpl,
-            chat.data(), chat.size(), add_ass, buf.data(), buf.size());
-    }
-
-    std::string formatted_chat(buf.data(), res);
-    return formatted_chat;
-}
-
-std::string common_chat_format_single(const struct llama_model * model,
-        const std::string & tmpl,
-        const std::vector<common_chat_msg> & past_msg,
-        const common_chat_msg & new_msg,
-        bool add_ass) {
-    std::ostringstream ss;
-    auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(model, tmpl, past_msg, false);
-    std::vector<common_chat_msg> chat_new(past_msg);
-    // if the past_msg ends with a newline, we must preserve it in the formatted version
-    if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
-        ss << "\n";
-    };
-    // format chat with new_msg
-    chat_new.push_back(new_msg);
-    auto fmt_new_msg = common_chat_apply_template(model, tmpl, chat_new, add_ass);
-    // get the diff part
-    ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
-    return ss.str();
-}
-
-std::string common_chat_format_example(const struct llama_model * model,
-        const std::string & tmpl) {
-    std::vector<common_chat_msg> msgs = {
-        {"system",    "You are a helpful assistant"},
-        {"user",      "Hello"},
-        {"assistant", "Hi there"},
-        {"user",      "How are you?"},
-    };
-    return common_chat_apply_template(model, tmpl, msgs, true);
-}
-
 //
 // KV cache utils
 //
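Most of the hunks above are mechanical call-site updates for renamed llama.cpp C API entry points: model loading and teardown move to llama_model_load_from_file / llama_model_free, context creation moves to llama_init_from_model, token queries move from the llama_model to a llama_vocab handle, and the lora / control-vector helpers become the adapter_* family. A minimal sketch of the new call pattern, for orientation only ("model.gguf" is a placeholder path, not a file from this commit):

// Sketch only: the renamed llama.cpp C API exercised by the diff above.
// "model.gguf" is a placeholder path, not a file shipped with this commit.
#include <cstdio>
#include "llama.h"

int main() {
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("model.gguf", mparams); // was llama_load_model_from_file
    if (model == NULL) {
        return 1;
    }

    // token metadata is now queried through the vocab handle, not the model
    const llama_vocab * vocab = llama_model_get_vocab(model);
    if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) { // was llama_token_bos(model)
        fprintf(stderr, "no BOS token defined (expected for some architectures, e.g. T5)\n");
    }

    llama_context_params cparams = llama_context_default_params();
    llama_context * lctx = llama_init_from_model(model, cparams); // was llama_new_context_with_model
    if (lctx == NULL) {
        llama_model_free(model);
        return 1;
    }

    llama_free(lctx);
    llama_model_free(model); // was llama_free_model
    return 0;
}

The old and new entry points map one-to-one; the only functional removal in the diff is the mparams.rpc_servers assignment in common_model_params_to_llama.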
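The largest addition is common_get_hf_file, which resolves a "<user>/<model>[:quant]" spec to a concrete GGUF filename through the huggingface.co/v2/<repo>/manifests/<tag> endpoint (the request must send User-Agent: llama-cpp, or the response omits the ggufFile field the helper reads). A hypothetical caller, reusing the repo named in the new doc comment and assuming the declaration is exported via common.h as in upstream llama.cpp:

// Hypothetical caller of the new helper; pick_gguf is illustrative,
// only common_get_hf_file comes from the diff above.
#include <string>
#include <utility>
#include "common.h"

std::pair<std::string, std::string> pick_gguf(const std::string & hf_token) {
    // tag defaults to "latest" when omitted; throws std::invalid_argument on a malformed repo spec
    auto res = common_get_hf_file("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", hf_token);
    // res.first  -> "bartowski/Llama-3.2-3B-Instruct-GGUF" (tag stripped)
    // res.second -> the "rfilename" reported by the manifest API
    return res;
}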