llama: update to commit 71e90e88 (#10192)

2025-04-16 18:14:01 -04:00 · 2025-04-16 18:14:01 -04:00 · 943464ccb8
commit 943464ccb8
parent 369de832cd
160 changed files with 42219 additions and 33080 deletions
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@ -237,5 +237,5 @@ jobs:
      - uses: actions/checkout@v4
      - name: Verify patches apply cleanly and do not change files
        run: |
-          make -f Makefile.sync clean sync
-          git diff --compact-summary --exit-code
+          make -f Makefile.sync clean checkout apply-patches sync
+          git diff --compact-summary --exit-code
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -51,7 +51,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cp
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu/amx)

 set(GGML_CPU ON)
-add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src)
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml)
 set_property(TARGET ggml PROPERTY EXCLUDE_FROM_ALL TRUE)

 get_target_property(CPU_VARIANTS ggml-cpu MANUALLY_ADDED_DEPENDENCIES)
--- a/Makefile.sync
+++ b/Makefile.sync
@ -1,6 +1,6 @@
 UPSTREAM=https://github.com/ggerganov/llama.cpp.git
 WORKDIR=llama/vendor
-FETCH_HEAD=d7cfe1ffe0f435d0048a6058d529daf76e072d9c
+FETCH_HEAD=71e90e8813f90097701e62f7fce137d96ddf41e2

 .PHONY: help
 help:
@ -15,18 +15,18 @@ help:
 	@echo "    make -f $(lastword $(MAKEFILE_LIST)) clean sync"

 .PHONY: sync
-sync: llama/build-info.cpp llama/llama.cpp ml/backend/ggml/ggml apply-patches
+sync: llama/build-info.cpp llama/llama.cpp ml/backend/ggml/ggml

 .PHONY: llama/build-info.cpp
 llama/build-info.cpp: llama/build-info.cpp.in
 	sed -e 's|@FETCH_HEAD@|$(FETCH_HEAD)|' $< > $@

 .PHONY: llama/llama.cpp
-llama/llama.cpp: llama/vendor/ apply-patches
+llama/llama.cpp: llama/vendor/
 	rsync -arvzc -f "merge $@/.rsync-filter" $< $@

-.PHONY: ml/backend/ggml/ggml apply-patches
-ml/backend/ggml/ggml: llama/vendor/ggml/ apply-patches
+.PHONY: ml/backend/ggml/ggml
+ml/backend/ggml/ggml: llama/vendor/ggml/
 	rsync -arvzc -f "merge $@/.rsync-filter" $< $@

 PATCHES=$(wildcard llama/patches/*.patch)
--- a/llama/build-info.cpp
+++ b/llama/build-info.cpp
@ -1,4 +1,4 @@
 int LLAMA_BUILD_NUMBER = 0;
-char const *LLAMA_COMMIT = "d7cfe1ffe0f435d0048a6058d529daf76e072d9c";
+char const *LLAMA_COMMIT = "71e90e8813f90097701e62f7fce137d96ddf41e2";
 char const *LLAMA_COMPILER = "";
 char const *LLAMA_BUILD_TARGET = "";
--- a/llama/llama.cpp/.rsync-filter
+++ b/llama/llama.cpp/.rsync-filter
@ -13,6 +13,7 @@ include include/llama-*.*
 include examples/
 include examples/llava/
 include examples/llava/clip.*
+include examples/llava/clip-impl.*
 include examples/llava/llava.*
 include src/
 include src/llama.*
--- a/llama/llama.cpp/common/common.cpp
+++ b/llama/llama.cpp/common/common.cpp
@ -7,10 +7,6 @@

 #include "common.h"
 #include "log.h"
-// Change JSON_ASSERT from assert() to GGML_ASSERT:
-#define JSON_ASSERT GGML_ASSERT
-#include "json.hpp"
-#include "json-schema-to-grammar.h"
 #include "llama.h"

 #include <algorithm>
@ -52,47 +48,11 @@
 #include <sys/stat.h>
 #include <unistd.h>
 #endif
-#if defined(LLAMA_USE_CURL)
-#include <curl/curl.h>
-#include <curl/easy.h>
-#include <future>
-#endif

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

-#if defined(LLAMA_USE_CURL)
-#ifdef __linux__
-#include <linux/limits.h>
-#elif defined(_WIN32)
-#   if !defined(PATH_MAX)
-#   define PATH_MAX MAX_PATH
-#   endif
-#else
-#include <sys/syslimits.h>
-#endif
-#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
-
-//
-// CURL utils
-//
-
-using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
-
-// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
-struct curl_slist_ptr {
-    struct curl_slist * ptr = nullptr;
-    ~curl_slist_ptr() {
-        if (ptr) {
-            curl_slist_free_all(ptr);
-        }
-    }
-};
-#endif // LLAMA_USE_CURL
-
-using json = nlohmann::ordered_json;
-
 //
 // CPU utils
 //
@ -483,6 +443,11 @@ void string_replace_all(std::string & s, const std::string & search, const std::
    s = std::move(builder);
 }

+std::string regex_escape(const std::string & s) { // returns a copy of s with every regex metacharacter backslash-escaped
+    static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]"); // function-local static: constructed on first call only
+    return std::regex_replace(s, special_chars, "\\$&"); // "$&" is the standard whole-match placeholder ("$0" is an extension)
+}
+
 std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
    std::ostringstream result;
    for (size_t i = 0; i < values.size(); ++i) {
@ -865,7 +830,7 @@ std::string fs_get_cache_directory() {
    if (getenv("LLAMA_CACHE")) {
        cache_directory = std::getenv("LLAMA_CACHE");
    } else {
-#ifdef __linux__
+#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)
        if (std::getenv("XDG_CACHE_HOME")) {
            cache_directory = std::getenv("XDG_CACHE_HOME");
        } else {
@ -875,7 +840,9 @@ std::string fs_get_cache_directory() {
        cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
 #elif defined(_WIN32)
        cache_directory = std::getenv("LOCALAPPDATA");
-#endif // __linux__
+#else
+#  error Unknown architecture
+#endif
        cache_directory = ensure_trailing_slash(cache_directory);
        cache_directory += "llama.cpp";
    }
@ -896,22 +863,14 @@ std::string fs_get_cache_file(const std::string & filename) {
 //
 // Model utils
 //
+
 struct common_init_result common_init_from_params(common_params & params) {
    common_init_result iparams;
    auto mparams = common_model_params_to_llama(params);

-    llama_model * model = nullptr;
-
-    if (!params.hf_repo.empty() && !params.hf_file.empty()) {
-        model = common_load_model_from_hf(params.hf_repo, params.hf_file, params.model, params.hf_token, mparams);
-    } else if (!params.model_url.empty()) {
-        model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
-    } else {
-        model = llama_model_load_from_file(params.model.c_str(), mparams);
-    }
-
+    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
    if (model == NULL) {
-        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
+        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
        return iparams;
    }

@ -946,13 +905,13 @@ struct common_init_result common_init_from_params(common_params & params) {

    llama_context * lctx = llama_init_from_model(model, cparams);
    if (lctx == NULL) {
-        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
+        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
        llama_model_free(model);
        return iparams;
    }

-    if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
-        LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__);
+    if (params.ctx_shift && !llama_kv_self_can_shift(lctx)) {
+        LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
        params.ctx_shift = false;
    }

@ -1029,6 +988,8 @@ struct common_init_result common_init_from_params(common_params & params) {
    if (params.warmup) {
        LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);

+        llama_set_warmup(lctx, true);
+
        std::vector<llama_token> tmp;
        llama_token bos = llama_vocab_bos(vocab);
        llama_token eos = llama_vocab_eos(vocab);
@ -1056,9 +1017,10 @@ struct common_init_result common_init_from_params(common_params & params) {
        if (llama_model_has_decoder(model)) {
            llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
        }
-        llama_kv_cache_clear(lctx);
+        llama_kv_self_clear(lctx);
        llama_synchronize(lctx);
        llama_perf_context_reset(lctx);
+        llama_set_warmup(lctx, false);
    }

    iparams.model.reset(model);
@ -1067,6 +1029,19 @@ struct common_init_result common_init_from_params(common_params & params) {
    return iparams;
 }

+std::string get_model_endpoint() {
+    const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
+    // We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
+    const char * hf_endpoint_env = getenv("HF_ENDPOINT");
+    const char * endpoint_env = (model_endpoint_env && *model_endpoint_env) ? model_endpoint_env : hf_endpoint_env; // empty counts as unset
+    std::string model_endpoint = "https://huggingface.co/"; // default when neither variable provides a value
+    if (endpoint_env && *endpoint_env) { // skip empty values: back() on an empty std::string is undefined behaviour
+        model_endpoint = endpoint_env;
+        if (model_endpoint.back() != '/') model_endpoint += '/'; // result always ends with exactly the caller's path plus '/'
+    }
+    return model_endpoint;
+}
+
 void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
    llama_clear_adapter_lora(ctx);
    for (auto & la : lora) {
@ -1082,15 +1057,18 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
    if (!params.devices.empty()) {
        mparams.devices = params.devices.data();
    }
+
    if (params.n_gpu_layers != -1) {
        mparams.n_gpu_layers = params.n_gpu_layers;
    }
+
    mparams.main_gpu        = params.main_gpu;
    mparams.split_mode      = params.split_mode;
    mparams.tensor_split    = params.tensor_split;
    mparams.use_mmap        = params.use_mmap;
    mparams.use_mlock       = params.use_mlock;
    mparams.check_tensors   = params.check_tensors;
+
    if (params.kv_overrides.empty()) {
        mparams.kv_overrides = NULL;
    } else {
@ -1098,6 +1076,13 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
        mparams.kv_overrides = params.kv_overrides.data();
    }

+    if (params.tensor_buft_overrides.empty()) {
+        mparams.tensor_buft_overrides = NULL;
+    } else {
+        GGML_ASSERT(params.tensor_buft_overrides.back().pattern == nullptr && "Tensor buffer overrides not terminated with empty pattern");
+        mparams.tensor_buft_overrides = params.tensor_buft_overrides.data();
+    }
+
    return mparams;
 }

@ -1157,451 +1142,6 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
    return tpp;
 }

-#ifdef LLAMA_USE_CURL
-
-#define CURL_MAX_RETRY 3
-#define CURL_RETRY_DELAY_SECONDS 2
-
-static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
-    int remaining_attempts = max_attempts;
-
-    while (remaining_attempts > 0) {
-        LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
-
-        CURLcode res = curl_easy_perform(curl);
-        if (res == CURLE_OK) {
-            return true;
-        }
-
-        int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
-        LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
-
-        remaining_attempts--;
-        std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
-    }
-
-    LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
-
-    return false;
-}
-
-static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
-    // Initialize libcurl
-    curl_ptr       curl(curl_easy_init(), &curl_easy_cleanup);
-    curl_slist_ptr http_headers;
-    if (!curl) {
-        LOG_ERR("%s: error initializing libcurl\n", __func__);
-        return false;
-    }
-
-    bool force_download = false;
-
-    // Set the URL, allow to follow http redirection
-    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
-    curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
-
-    // Check if hf-token or bearer-token was specified
-    if (!hf_token.empty()) {
-        std::string auth_header = "Authorization: Bearer " + hf_token;
-        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
-        curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
-    }
-
-#if defined(_WIN32)
-    // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
-    //   operating system. Currently implemented under MS-Windows.
-    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
-#endif
-
-    // Check if the file already exists locally
-    auto file_exists = std::filesystem::exists(path);
-
-    // If the file exists, check its JSON metadata companion file.
-    std::string metadata_path = path + ".json";
-    nlohmann::json metadata;
-    std::string etag;
-    std::string last_modified;
-
-    if (file_exists) {
-        // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
-        std::ifstream metadata_in(metadata_path);
-        if (metadata_in.good()) {
-            try {
-                metadata_in >> metadata;
-                LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
-                if (metadata.contains("url") && metadata.at("url").is_string()) {
-                    auto previous_url = metadata.at("url").get<std::string>();
-                    if (previous_url != url) {
-                        LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
-                        return false;
-                    }
-                }
-                if (metadata.contains("etag") && metadata.at("etag").is_string()) {
-                    etag = metadata.at("etag");
-                }
-                if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
-                    last_modified = metadata.at("lastModified");
-                }
-            } catch (const nlohmann::json::exception & e) {
-            LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
-                return false;
-            }
-        }
-    } else {
-        LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
-    }
-
-    // Send a HEAD request to retrieve the etag and last-modified headers
-    struct common_load_model_from_url_headers {
-        std::string etag;
-        std::string last_modified;
-    };
-
-    common_load_model_from_url_headers headers;
-
-    {
-        typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
-        auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
-            common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
-
-            static std::regex header_regex("([^:]+): (.*)\r\n");
-            static std::regex etag_regex("ETag", std::regex_constants::icase);
-            static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
-
-            std::string header(buffer, n_items);
-            std::smatch match;
-            if (std::regex_match(header, match, header_regex)) {
-                const std::string & key = match[1];
-                const std::string & value = match[2];
-                if (std::regex_match(key, match, etag_regex)) {
-                    headers->etag = value;
-                } else if (std::regex_match(key, match, last_modified_regex)) {
-                    headers->last_modified = value;
-                }
-            }
-            return n_items;
-        };
-
-        curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
-        curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
-        curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
-        curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
-
-        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
-        if (!was_perform_successful) {
-            return false;
-        }
-
-        long http_code = 0;
-        curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
-        if (http_code != 200) {
-            // HEAD not supported, we don't know if the file has changed
-            // force trigger downloading
-            force_download = true;
-            LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
-        }
-    }
-
-    bool should_download = !file_exists || force_download;
-    if (!should_download) {
-        if (!etag.empty() && etag != headers.etag) {
-            LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
-            should_download = true;
-        } else if (!last_modified.empty() && last_modified != headers.last_modified) {
-            LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
-            should_download = true;
-        }
-    }
-    if (should_download) {
-        std::string path_temporary = path + ".downloadInProgress";
-        if (file_exists) {
-            LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
-            if (remove(path.c_str()) != 0) {
-                LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
-                return false;
-            }
-        }
-
-        // Set the output file
-
-        struct FILE_deleter {
-            void operator()(FILE * f) const {
-                fclose(f);
-            }
-        };
-
-        std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
-        if (!outfile) {
-            LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str());
-            return false;
-        }
-
-        typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd);
-        auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t {
-            return fwrite(data, size, nmemb, (FILE *)fd);
-        };
-        curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L);
-        curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
-        curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get());
-
-        //  display download progress
-        curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L);
-
-        // helper function to hide password in URL
-        auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
-            std::size_t protocol_pos = url.find("://");
-            if (protocol_pos == std::string::npos) {
-                return url;  // Malformed URL
-            }
-
-            std::size_t at_pos = url.find('@', protocol_pos + 3);
-            if (at_pos == std::string::npos) {
-                return url;  // No password in URL
-            }
-
-            return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos);
-        };
-
-        // start the download
-        LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
-            llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
-        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
-        if (!was_perform_successful) {
-            return false;
-        }
-
-        long http_code = 0;
-        curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
-        if (http_code < 200 || http_code >= 400) {
-            LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
-            return false;
-        }
-
-        // Causes file to be closed explicitly here before we rename it.
-        outfile.reset();
-
-        // Write the updated JSON metadata file.
-        metadata.update({
-            {"url", url},
-            {"etag", headers.etag},
-            {"lastModified", headers.last_modified}
-        });
-        std::ofstream(metadata_path) << metadata.dump(4);
-        LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
-
-        if (rename(path_temporary.c_str(), path.c_str()) != 0) {
-            LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
-            return false;
-        }
-    }
-
-    return true;
-}
-
-struct llama_model * common_load_model_from_url(
-        const std::string & model_url,
-        const std::string & local_path,
-        const std::string & hf_token,
-        const struct llama_model_params & params) {
-    // Basic validation of the model_url
-    if (model_url.empty()) {
-        LOG_ERR("%s: invalid model_url\n", __func__);
-        return NULL;
-    }
-
-    if (!common_download_file(model_url, local_path, hf_token)) {
-        return NULL;
-    }
-
-    // check for additional GGUFs split to download
-    int n_split = 0;
-    {
-        struct gguf_init_params gguf_params = {
-            /*.no_alloc = */ true,
-            /*.ctx      = */ NULL,
-        };
-        auto * ctx_gguf = gguf_init_from_file(local_path.c_str(), gguf_params);
-        if (!ctx_gguf) {
-            LOG_ERR("\n%s:  failed to load input GGUF from %s\n", __func__, local_path.c_str());
-            return NULL;
-        }
-
-        auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
-        if (key_n_split >= 0) {
-            n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
-        }
-
-        gguf_free(ctx_gguf);
-    }
-
-    if (n_split > 1) {
-        char split_prefix[PATH_MAX] = {0};
-        char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
-
-        // Verify the first split file format
-        // and extract split URL and PATH prefixes
-        {
-            if (!llama_split_prefix(split_prefix, sizeof(split_prefix), local_path.c_str(), 0, n_split)) {
-                LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, local_path.c_str(), n_split);
-                return NULL;
-            }
-
-            if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url.c_str(), 0, n_split)) {
-                LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url.c_str(), n_split);
-                return NULL;
-            }
-        }
-
-        // Prepare download in parallel
-        std::vector<std::future<bool>> futures_download;
-        for (int idx = 1; idx < n_split; idx++) {
-            futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split, hf_token](int download_idx) -> bool {
-                char split_path[PATH_MAX] = {0};
-                llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split);
-
-                char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
-                llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
-
-                return common_download_file(split_url, split_path, hf_token);
-            }, idx));
-        }
-
-        // Wait for all downloads to complete
-        for (auto & f : futures_download) {
-            if (!f.get()) {
-                return NULL;
-            }
-        }
-    }
-
-    return llama_model_load_from_file(local_path.c_str(), params);
-}
-
-struct llama_model * common_load_model_from_hf(
-        const std::string & repo,
-        const std::string & remote_path,
-        const std::string & local_path,
-        const std::string & hf_token,
-        const struct llama_model_params & params) {
-    // construct hugging face model url:
-    //
-    //  --repo ggml-org/models --file tinyllama-1.1b/ggml-model-f16.gguf
-    //    https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf
-    //
-    //  --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf
-    //    https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf
-    //
-
-    std::string model_url = "https://huggingface.co/";
-    model_url += repo;
-    model_url += "/resolve/main/";
-    model_url += remote_path;
-
-    return common_load_model_from_url(model_url, local_path, hf_token, params);
-}
-
-/**
- * Allow getting the HF file from the HF repo with tag (like ollama), for example:
- * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
- * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
- * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
- * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
- *
- * Return pair of <repo, file> (with "repo" already having tag removed)
- *
- * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
- */
-std::pair<std::string, std::string> common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) {
-    auto parts = string_split<std::string>(hf_repo_with_tag, ':');
-    std::string tag = parts.size() > 1 ? parts.back() : "latest";
-    std::string hf_repo = parts[0];
-    if (string_split<std::string>(hf_repo, '/').size() != 2) {
-        throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
-    }
-
-    // fetch model info from Hugging Face Hub API
-    json model_info;
-    curl_ptr       curl(curl_easy_init(), &curl_easy_cleanup);
-    curl_slist_ptr http_headers;
-    std::string res_str;
-    std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag;
-    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
-    curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
-    typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
-    auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
-        static_cast<std::string *>(data)->append((char * ) ptr, size * nmemb);
-        return size * nmemb;
-    };
-    curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
-    curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str);
-#if defined(_WIN32)
-    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
-#endif
-    if (!hf_token.empty()) {
-        std::string auth_header = "Authorization: Bearer " + hf_token;
-        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
-    }
-    // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
-    http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
-    http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json");
-    curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
-
-    CURLcode res = curl_easy_perform(curl.get());
-
-    if (res != CURLE_OK) {
-        throw std::runtime_error("error: cannot make GET request to HF API");
-    }
-
-    long res_code;
-    curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
-    if (res_code == 200) {
-        model_info = json::parse(res_str);
-    } else if (res_code == 401) {
-        throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
-    } else {
-        throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
-    }
-
-    // check response
-    if (!model_info.contains("ggufFile")) {
-        throw std::runtime_error("error: model does not have ggufFile");
-    }
-    json & gguf_file = model_info.at("ggufFile");
-    if (!gguf_file.contains("rfilename")) {
-        throw std::runtime_error("error: ggufFile does not have rfilename");
-    }
-
-    return std::make_pair(hf_repo, gguf_file.at("rfilename"));
-}
-
-#else
-
-struct llama_model * common_load_model_from_url(
-        const std::string & /*model_url*/,
-        const std::string & /*local_path*/,
-        const std::string & /*hf_token*/,
-        const struct llama_model_params & /*params*/) {
-    LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
-    return nullptr;
-}
-
-struct llama_model * common_load_model_from_hf(
-        const std::string & /*repo*/,
-        const std::string & /*remote_path*/,
-        const std::string & /*local_path*/,
-        const std::string & /*hf_token*/,
-        const struct llama_model_params & /*params*/) {
-    LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
-    return nullptr;
-}
-
-std::pair<std::string, std::string> common_get_hf_file(const std::string &, const std::string &) {
-    LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
-    return std::make_pair("", "");
-}
-
-#endif // LLAMA_USE_CURL
-
 //
 // Batch utils
 //
@ -2025,4 +1565,3 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c

    return result;
 }
-
--- a/llama/llama.cpp/common/common.h
+++ b/llama/llama.cpp/common/common.h
@ -110,9 +110,17 @@ enum common_conversation_mode {
    COMMON_CONVERSATION_MODE_AUTO     = 2,
 };

+enum common_grammar_trigger_type { // kind tag stored in common_grammar_trigger::type; per-value notes are inferred from names - TODO confirm
+    COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN,         // presumably: trigger identified by a token id (common_grammar_trigger::token)
+    COMMON_GRAMMAR_TRIGGER_TYPE_WORD,          // presumably: trigger is a literal word held in common_grammar_trigger::value
+    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,       // presumably: trigger is a regex pattern held in common_grammar_trigger::value
+    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START, // presumably: like PATTERN but anchored at the start - verify against the consumer
+};
+
 struct common_grammar_trigger {
-    std::string word;
-    bool at_start;
+    common_grammar_trigger_type type;
+    std::string value;
+    llama_token token = LLAMA_TOKEN_NULL;
 };

 // sampling parameters
@ -163,8 +171,7 @@ struct common_params_sampling {

    std::string                         grammar; // optional BNF-like grammar to constrain sampling
    bool                                grammar_lazy = false;
-    std::vector<common_grammar_trigger> grammar_trigger_words;  // optional trigger words to trigger lazy grammar
-    std::vector<llama_token>            grammar_trigger_tokens; // optional trigger tokens to trigger lazy grammar and print trigger special tokens.
+    std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
    std::set<llama_token>               preserved_tokens;

    std::vector<llama_logit_bias> logit_bias; // logit biases to apply
@ -173,6 +180,13 @@ struct common_params_sampling {
    std::string print() const;
 };

+struct common_params_model { // bundles the locations one model can be given by; every field defaults to an empty string
+    std::string path    = ""; // model local path                                           // NOLINT
+    std::string url     = ""; // model url to download                                      // NOLINT
+    std::string hf_repo = ""; // HF repo                                                    // NOLINT
+    std::string hf_file = ""; // HF file                                                    // NOLINT
+};
+
 struct common_params_speculative {
    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

@ -186,19 +200,13 @@ struct common_params_speculative {
    struct cpu_params cpuparams;
    struct cpu_params cpuparams_batch;

-    std::string hf_repo = ""; // HF repo                                                     // NOLINT
-    std::string hf_file = ""; // HF file                                                     // NOLINT
-
-    std::string model = "";     // draft model for speculative decoding                      // NOLINT
-    std::string model_url = ""; // model url to download                                     // NOLINT
+    struct common_params_model model;
 };

 struct common_params_vocoder {
-    std::string hf_repo = ""; // HF repo                                                     // NOLINT
-    std::string hf_file = ""; // HF file                                                     // NOLINT
+    struct common_params_model model;

-    std::string model     = ""; // model path                                                // NOLINT
-    std::string model_url = ""; // model url to download                                     // NOLINT
+    std::string speaker_file = ""; // speaker file path                                      // NOLINT

    bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy            // NOLINT
 };
@ -254,13 +262,12 @@ struct common_params {
    struct common_params_speculative speculative;
    struct common_params_vocoder     vocoder;

-    std::string model                = ""; // model path                                                    // NOLINT
+    struct common_params_model model;
+
    std::string model_alias          = ""; // model alias                                                   // NOLINT
-    std::string model_url            = ""; // model url to download                                         // NOLINT
    std::string hf_token             = ""; // HF token                                                      // NOLINT
-    std::string hf_repo              = ""; // HF repo                                                       // NOLINT
-    std::string hf_file              = ""; // HF file                                                       // NOLINT
    std::string prompt               = "";                                                                  // NOLINT
+    std::string system_prompt        = "";                                                                  // NOLINT
    std::string prompt_file          = ""; // store the external prompt file name                           // NOLINT
    std::string path_prompt_cache    = ""; // path to file for saving/loading prompt eval state             // NOLINT
    std::string input_prefix         = ""; // string to prefix user inputs with                             // NOLINT
@ -272,6 +279,7 @@ struct common_params {
    std::vector<std::string> in_files;   // all input files
    std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
    std::vector<llama_model_kv_override> kv_overrides;
+    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;

    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
    std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
@ -325,13 +333,15 @@ struct common_params {
    bool warmup            = true;  // warmup run
    bool check_tensors     = false; // validate tensor data

+    bool single_turn       = false; // single turn chat conversation
+
    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V

    common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;

    // multimodal models (see examples/llava)
-    std::string mmproj = "";        // path to multimodal projector                                         // NOLINT
+    struct common_params_model mmproj;
    std::vector<std::string> image; // path to image file(s)

    // embedding
@ -391,8 +401,6 @@ struct common_params {
    int32_t i_pos  = -1;  // position of the passkey in the junk text

    // imatrix params
-    std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
-
    int32_t n_out_freq  = 10; // output the imatrix every n_out_freq iterations
    int32_t n_save_freq =  0; // save the imatrix every n_save_freq iterations
    int32_t i_chunk     =  0; // start processing from this chunk
@ -404,16 +412,16 @@ struct common_params {
    int n_pca_batch = 100;
    int n_pca_iterations = 1000;
    dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
-    std::string cvector_outfile       = "control_vector.gguf";
    std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
    std::string cvector_negative_file = "examples/cvector-generator/negative.txt";

    bool spm_infill = false; // suffix/prefix/middle pattern for infill

-    std::string lora_outfile = "ggml-lora-merged-f16.gguf";
-
    // batched-bench params
    bool batched_bench_output_jsonl = false;
+
+    // common params
+    std::string out_file; // output filename for all example programs
 };

 // call once at the start of a program if it uses libcommon
@ -453,6 +461,8 @@ std::string string_repeat(const std::string & str, size_t n);

 void string_replace_all(std::string & s, const std::string & search, const std::string & replace);

+std::string regex_escape(const std::string & s);
+
 template<class T>
 static std::vector<T> string_split(const std::string & str, char delim) {
    static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
@ -530,26 +540,11 @@ struct llama_model_params     common_model_params_to_llama  (      common_params
 struct llama_context_params   common_context_params_to_llama(const common_params & params);
 struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);

-struct llama_model * common_load_model_from_url(
-    const std::string & model_url,
-    const std::string & local_path,
-    const std::string & hf_token,
-    const struct llama_model_params & params);
-
-struct llama_model * common_load_model_from_hf(
-    const std::string & repo,
-    const std::string & remote_path,
-    const std::string & local_path,
-    const std::string & hf_token,
-    const struct llama_model_params & params);
-
-std::pair<std::string, std::string> common_get_hf_file(
-    const std::string & hf_repo_with_tag,
-    const std::string & hf_token);
-
 // clear LoRA adapters from context, then apply new list of adapters
 void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);

+std::string                   get_model_endpoint();
+
 //
 // Batch utils
 //
--- a/llama/llama.cpp/common/json-schema-to-grammar.cpp
+++ b/llama/llama.cpp/common/json-schema-to-grammar.cpp
@ -264,7 +264,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
    throw std::runtime_error("At least one of min_value or max_value must be set");
 }

-const std::string SPACE_RULE = "| \" \" | \"\\n\" [ \\t]{0,20}";
+const std::string SPACE_RULE = "| \" \" | \"\\n\"{1,2} [ \\t]{0,20}";

 struct BuiltinRule {
    std::string content;
@ -764,11 +764,10 @@ private:
 public:
    SchemaConverter(
        const std::function<json(const std::string &)> & fetch_json,
-        bool dotall,
-        bool compact_spaces)
+        bool dotall)
          : _fetch_json(fetch_json), _dotall(dotall)
    {
-        _rules["space"] = compact_spaces ? "\" \"?" : SPACE_RULE;
+        _rules["space"] = SPACE_RULE;
    }

    void resolve_refs(json & schema, const std::string & url) {
@ -1007,7 +1006,7 @@ std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
 }

 std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options) {
-    SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall, options.compact_spaces);
+    SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall);
    common_grammar_builder builder {
        /* .add_rule = */ [&](const std::string & name, const std::string & rule) {
            return converter._add_rule(name, rule);
--- a/llama/llama.cpp/common/json-schema-to-grammar.h
+++ b/llama/llama.cpp/common/json-schema-to-grammar.h
@ -16,7 +16,6 @@ struct common_grammar_builder {

 struct common_grammar_options {
    bool dotall = false;
-    bool compact_spaces = false;
 };

 std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options = {});
--- a/llama/llama.cpp/common/sampling.cpp
+++ b/llama/llama.cpp/common/sampling.cpp
@ -4,6 +4,7 @@

 #include <cmath>
 #include <unordered_map>
+#include <algorithm>

 // the ring buffer works similarly to std::deque, but with a fixed capacity
 // TODO: deduplicate with llama-impl.h
@ -159,17 +160,57 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
        GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
 #endif // LLAMA_USE_LLGUIDANCE
    } else {
-        std::vector<const char *> trigger_words;
-        trigger_words.reserve(params.grammar_trigger_words.size());
-        for (const auto & str : params.grammar_trigger_words) {
-            trigger_words.push_back(str.word.c_str());
+        std::vector<std::string> patterns_at_start;
+        std::vector<std::string> patterns_anywhere;
+        std::vector<llama_token> trigger_tokens;
+        for (const auto & trigger : params.grammar_triggers) {
+            switch (trigger.type) {
+                case COMMON_GRAMMAR_TRIGGER_TYPE_WORD:
+                {
+                    const auto & word = trigger.value;
+                    patterns_anywhere.push_back(regex_escape(word));
+                    break;
+                }
+                case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN:
+                case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START:
+                {
+                    const auto & pattern = trigger.value;
+                    (trigger.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START ? patterns_at_start : patterns_anywhere).push_back(pattern);
+                    break;
+                }
+                case COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN:
+                {
+                    const auto token = trigger.token;
+                    trigger_tokens.push_back(token);
+                    break;
+                }
+                default:
+                    GGML_ASSERT(false && "unknown trigger type");
+            }
+        }
+
+        std::vector<std::string> trigger_patterns;
+        if (!patterns_at_start.empty()) {
+            trigger_patterns.push_back("^(" + string_join(patterns_at_start, "|") + ")[\\s\\S]*");
+        }
+        if (!patterns_anywhere.empty()) {
+            trigger_patterns.push_back("^[\\s\\S]*?(" + string_join(patterns_anywhere, "|") + ")[\\s\\S]*");
+        }
+
+        std::vector<const char *> trigger_patterns_c;
+        trigger_patterns_c.reserve(trigger_patterns.size());
+        for (const auto & regex : trigger_patterns) {
+            trigger_patterns_c.push_back(regex.c_str());
        }

        grmr = params.grammar_lazy
-             ? llama_sampler_init_grammar_lazy(vocab, params.grammar.c_str(), "root",
-                                               trigger_words.data(), trigger_words.size(),
-                                               params.grammar_trigger_tokens.data(), params.grammar_trigger_tokens.size())
+             ? llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
+                                                        trigger_patterns_c.data(), trigger_patterns_c.size(),
+                                                        trigger_tokens.data(), trigger_tokens.size())
             :      llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
+        if (!grmr) {
+            return nullptr;
+        }
    }

    auto * result = new common_sampler {
--- a/llama/llama.cpp/examples/llava/clip-impl.h
+++ b/llama/llama.cpp/examples/llava/clip-impl.h
@ -0,0 +1,344 @@
+#include "ggml.h"
+#include "gguf.h"
+#include "clip.h"
+
+#include "clip.h"
+
+#include <climits>
+#include <cstdarg>
+#include <string>
+#include <map>
+#include <sstream>
+#include <vector>
+#include <memory>
+
+// Internal header for clip.cpp
+
+#define KEY_FTYPE               "general.file_type"
+#define KEY_NAME                "general.name"
+#define KEY_DESCRIPTION         "general.description"
+#define KEY_HAS_TEXT_ENC        "clip.has_text_encoder"
+#define KEY_HAS_VIS_ENC         "clip.has_vision_encoder"
+#define KEY_HAS_LLAVA_PROJ      "clip.has_llava_projector"
+#define KEY_HAS_MINICPMV_PROJ   "clip.has_minicpmv_projector"
+#define KEY_HAS_GLM_PROJ        "clip.has_glm_projector"
+#define KEY_MINICPMV_VERSION    "clip.minicpmv_version"
+#define KEY_HAS_QWEN2VL_MERGER  "clip.has_qwen2vl_merger"
+#define KEY_USE_GELU            "clip.use_gelu"
+#define KEY_USE_SILU            "clip.use_silu"
+#define KEY_N_EMBD              "clip.%s.embedding_length"
+#define KEY_N_FF                "clip.%s.feed_forward_length"
+#define KEY_N_BLOCK             "clip.%s.block_count"
+#define KEY_N_HEAD              "clip.%s.attention.head_count"
+#define KEY_LAYER_NORM_EPS      "clip.%s.attention.layer_norm_epsilon"
+#define KEY_PROJ_DIM            "clip.%s.projection_dim"
+#define KEY_TOKENS              "tokenizer.ggml.tokens"
+#define KEY_N_POSITIONS         "clip.text.context_length"
+#define KEY_IMAGE_SIZE          "clip.vision.image_size"
+#define KEY_PATCH_SIZE          "clip.vision.patch_size"
+#define KEY_IMAGE_MEAN          "clip.vision.image_mean"
+#define KEY_IMAGE_STD           "clip.vision.image_std"
+#define KEY_PROJ_TYPE           "clip.projector_type"
+#define KEY_FEATURE_LAYER       "clip.vision.feature_layer"
+
+#define KEY_MM_PATCH_MERGE_TYPE   "clip.vision.mm_patch_merge_type"
+#define KEY_IMAGE_GRID_PINPOINTS  "clip.vision.image_grid_pinpoints"
+#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
+
+
+//
+// tensor name constants
+//
+
+#define TN_TOKEN_EMBD      "%s.token_embd.weight"
+#define TN_POS_EMBD        "%s.position_embd.weight"
+#define TN_CLASS_EMBD      "v.class_embd"
+#define TN_PATCH_EMBD      "v.patch_embd.weight"  // tensor is not renamed with a ".0" postfix, for backward compatibility
+#define TN_PATCH_EMBD_1    "v.patch_embd.weight.1"
+#define TN_PATCH_BIAS      "v.patch_embd.bias"
+#define TN_ATTN_K          "%s.blk.%d.attn_k.%s"
+#define TN_ATTN_Q          "%s.blk.%d.attn_q.%s"
+#define TN_ATTN_V          "%s.blk.%d.attn_v.%s"
+#define TN_ATTN_OUTPUT     "%s.blk.%d.attn_out.%s"
+#define TN_FFN_DOWN        "%s.blk.%d.ffn_down.%s"
+#define TN_FFN_UP          "%s.blk.%d.ffn_up.%s"
+#define TN_LN_1            "%s.blk.%d.ln1.%s"
+#define TN_LN_2            "%s.blk.%d.ln2.%s"
+#define TN_LN_PRE          "%s.pre_ln.%s"
+#define TN_LN_POST         "%s.post_ln.%s"
+#define TN_TEXT_PROJ       "text_projection.weight"
+#define TN_VIS_PROJ        "visual_projection.weight"
+#define TN_LLAVA_PROJ      "mm.%d.%s"
+#define TN_MVLM_PROJ_MLP   "mm.model.mlp.%d.%s"
+#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
+#define TN_MVLM_PROJ_PEG   "mm.model.peg.%d.%s"
+#define TN_IMAGE_NEWLINE   "model.image_newline"
+#define TN_MM_INP_PROJ     "mm.input_projection.weight" // gemma3
+#define TN_MM_SOFT_EMB_N   "mm.soft_emb_norm.weight"    // gemma3
+
+// minicpmv
+#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
+#define TN_MINICPMV_QUERY      "resampler.query"
+#define TN_MINICPMV_PROJ       "resampler.proj.weight"
+#define TN_MINICPMV_KV_PROJ    "resampler.kv.weight"
+#define TN_MINICPMV_ATTN       "resampler.attn.%s.%s"
+#define TN_MINICPMV_LN         "resampler.ln_%s.%s"
+
+#define TN_GLM_ADAPER_CONV      "adapter.conv.%s"
+#define TN_GLM_ADAPTER_LINEAR   "adapter.linear.linear.%s"
+#define TN_GLM_ADAPTER_NORM_1   "adapter.linear.norm1.%s"
+#define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s"
+#define TN_GLM_ADAPTER_GATE     "adapter.linear.gate.%s"
+#define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"
+#define TN_GLM_BOI_W            "adapter.boi"
+#define TN_GLM_EOI_W            "adapter.eoi"
+
+enum projector_type {
+    PROJECTOR_TYPE_MLP,
+    PROJECTOR_TYPE_MLP_NORM,
+    PROJECTOR_TYPE_LDP,
+    PROJECTOR_TYPE_LDPV2,
+    PROJECTOR_TYPE_RESAMPLER,
+    PROJECTOR_TYPE_GLM_EDGE,
+    PROJECTOR_TYPE_MERGER,
+    PROJECTOR_TYPE_GEMMA3,
+    PROJECTOR_TYPE_UNKNOWN,
+};
+
+static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
+    { PROJECTOR_TYPE_MLP,       "mlp" },
+    { PROJECTOR_TYPE_LDP,       "ldp" },
+    { PROJECTOR_TYPE_LDPV2,     "ldpv2"},
+    { PROJECTOR_TYPE_RESAMPLER, "resampler"},
+    { PROJECTOR_TYPE_GLM_EDGE,  "adapter"},
+    { PROJECTOR_TYPE_MERGER,    "qwen2vl_merger"},
+    { PROJECTOR_TYPE_GEMMA3,    "gemma3"},
+};
+
+static projector_type clip_projector_type_from_string(const std::string & str) { // reverse lookup in PROJECTOR_TYPE_NAMES
+    for (const auto & pair : PROJECTOR_TYPE_NAMES) {
+        if (pair.second == str) {
+            return pair.first; // first entry whose name equals str
+        }
+    }
+    return PROJECTOR_TYPE_UNKNOWN; // str matches none of the names listed in PROJECTOR_TYPE_NAMES
+}
+
+// RGB uint8 image
+struct clip_image_u8 {
+    int nx;
+    int ny;
+
+    std::vector<uint8_t> buf;
+};
+
+// RGB float32 image (NHWC)
+// Memory layout: RGBRGBRGB...
+struct clip_image_f32 {
+    int nx;
+    int ny;
+
+    std::vector<float> buf;
+};
+
+//
+// logging
+//
+
+static void clip_log_callback_default(enum ggml_log_level level, const char * text, void * user_data) {
+    (void) level;
+    (void) user_data;
+    fputs(text, stderr);
+    fflush(stderr);
+}
+
+struct clip_logger_state {
+    ggml_log_level verbosity_thold;
+    ggml_log_callback log_callback;
+    void * log_callback_user_data;
+};
+
+extern struct clip_logger_state g_logger_state;
+
+// Formats a printf-style message and forwards it to g_logger_state.log_callback.
+// Messages shorter than 128 bytes are formatted on the stack; longer ones use a
+// temporary heap buffer sized from the first vsnprintf pass. Nothing is logged
+// when format is NULL, when formatting fails, or when that allocation fails.
+static void clip_log_internal_v(enum ggml_log_level level, const char * format, va_list args) {
+    if (format == NULL) {
+        return;
+    }
+    // args is consumed by the first vsnprintf call, so keep a copy for the second pass
+    va_list args_copy;
+    va_copy(args_copy, args);
+    char buffer[128];
+    const int len = vsnprintf(buffer, sizeof(buffer), format, args);
+    if (len < 0) {
+        // formatting error: the buffer contents are unspecified, so there is nothing safe to log
+    } else if ((size_t) len < sizeof(buffer)) {
+        g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data);
+    } else {
+        // output was truncated: len is the full length, so allocate exactly len + 1 bytes
+        char * buffer2 = (char *) calloc((size_t) len + 1, sizeof(char));
+        if (buffer2 != NULL) {
+            vsnprintf(buffer2, (size_t) len + 1, format, args_copy);
+            buffer2[len] = 0;
+            g_logger_state.log_callback(level, buffer2, g_logger_state.log_callback_user_data);
+            free(buffer2);
+        }
+    }
+    va_end(args_copy);
+}
+
+static void clip_log_internal(enum ggml_log_level level, const char * format, ...) {
+    va_list args;
+    va_start(args, format);
+    clip_log_internal_v(level, format, args);
+    va_end(args);
+}
+
+#define LOG_TMPL(level, ...) \
+    do { \
+        if ((level) >= g_logger_state.verbosity_thold) { \
+            clip_log_internal((level), __VA_ARGS__); \
+        } \
+    } while (0)
+#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  __VA_ARGS__)
+#define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  __VA_ARGS__)
+#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
+#define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT,  __VA_ARGS__)
+
+//
+// cpp wrappers
+//
+
+// wrapper for clip_image_size
+struct clip_image_size_deleter {
+    void operator()(clip_image_size * val) { clip_image_size_free(val); }
+};
+typedef std::unique_ptr<clip_image_size, clip_image_size_deleter> clip_image_size_ptr;
+
+// wrapper for clip_image_u8
+struct clip_image_u8_deleter {
+    void operator()(clip_image_u8 * val) { clip_image_u8_free(val); }
+};
+typedef std::unique_ptr<clip_image_u8, clip_image_u8_deleter> clip_image_u8_ptr;
+
+// wrapper for clip_image_f32
+struct clip_image_f32_deleter {
+    void operator()(clip_image_f32 * val) { clip_image_f32_free(val); }
+};
+typedef std::unique_ptr<clip_image_f32, clip_image_f32_deleter> clip_image_f32_ptr;
+
+struct clip_image_u8_batch {
+    std::vector<clip_image_u8_ptr> entries;
+};
+
+struct clip_image_f32_batch {
+    std::vector<clip_image_f32_ptr> entries;
+};
+
+//
+// common utils
+//
+
+// printf-style formatting into a std::string.
+// A first vsnprintf pass measures the output, a second pass writes it.
+static std::string string_format(const char * fmt, ...) {
+    va_list ap;
+    va_list ap2;
+    va_start(ap, fmt);
+    va_copy(ap2, ap); // the measuring pass consumes ap
+    int size = vsnprintf(NULL, 0, fmt, ap);
+    GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
+    std::vector<char> buf(size + 1);
+    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+    GGML_ASSERT(size2 == size);
+    va_end(ap2);
+    va_end(ap);
+    // buf holds size characters plus the terminating NUL; keep the NUL out of the result
+    return std::string(buf.data(), size);
+}
+
+static void string_replace_all(std::string & s, const std::string & search, const std::string & replace) { // replaces every occurrence of search in s, in place
+    if (search.empty()) {
+        return; // nothing to search for: s is left untouched
+    }
+    std::string builder;
+    builder.reserve(s.length());
+    size_t pos = 0;
+    size_t last_pos = 0;
+    while ((pos = s.find(search, last_pos)) != std::string::npos) {
+        builder.append(s, last_pos, pos - last_pos); // text between the previous match and this one
+        builder.append(replace);
+        last_pos = pos + search.length(); // resume after the match, so inserted text is never rescanned
+    }
+    builder.append(s, last_pos, std::string::npos); // remainder after the last match
+    s = std::move(builder);
+}
+
+// split string by a `std::string delim` instead of `char delim`
+// Returns the pieces of `s` between occurrences of `delimiter`; the result always has
+// at least one element. An empty delimiter would match at offset 0 forever, so in that
+// case the whole input is returned as a single piece.
+static std::vector<std::string> string_split_str(std::string s, const std::string & delimiter) {
+    std::vector<std::string> tokens;
+    if (delimiter.empty()) {
+        tokens.push_back(s);
+        return tokens;
+    }
+    size_t start = 0;
+    size_t pos = 0;
+    // scan forward from an offset instead of erasing the consumed prefix on every match
+    while ((pos = s.find(delimiter, start)) != std::string::npos) {
+        tokens.push_back(s.substr(start, pos - start));
+        start = pos + delimiter.length();
+    }
+    tokens.push_back(s.substr(start));
+    return tokens;
+}
+
+//
+// gguf utils
+//
+
+static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) { // formats element i of data, read as an array of the given gguf type, as text
+    switch (type) {
+        case GGUF_TYPE_UINT8:   return std::to_string(((const uint8_t  *)data)[i]);
+        case GGUF_TYPE_INT8:    return std::to_string(((const int8_t   *)data)[i]);
+        case GGUF_TYPE_UINT16:  return std::to_string(((const uint16_t *)data)[i]);
+        case GGUF_TYPE_INT16:   return std::to_string(((const int16_t  *)data)[i]);
+        case GGUF_TYPE_UINT32:  return std::to_string(((const uint32_t *)data)[i]);
+        case GGUF_TYPE_INT32:   return std::to_string(((const int32_t  *)data)[i]);
+        case GGUF_TYPE_UINT64:  return std::to_string(((const uint64_t *)data)[i]);
+        case GGUF_TYPE_INT64:   return std::to_string(((const int64_t  *)data)[i]);
+        case GGUF_TYPE_FLOAT32: return std::to_string(((const float    *)data)[i]);
+        case GGUF_TYPE_FLOAT64: return std::to_string(((const double   *)data)[i]);
+        case GGUF_TYPE_BOOL:    return ((const bool *)data)[i] ? "true" : "false";
+        default:                return string_format("unknown type %d", type); // any type without a case above (e.g. string, array) gets this placeholder
+    }
+}
+
+static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { // renders the value of key/value entry i of ctx_gguf as text
+    const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
+
+    switch (type) {
+        case GGUF_TYPE_STRING:
+            return gguf_get_val_str(ctx_gguf, i);
+        case GGUF_TYPE_ARRAY:
+            {
+                const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
+                int arr_n = gguf_get_arr_n(ctx_gguf, i);
+                const void * data = arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx_gguf, i); // string arrays are read per element below, so no raw data pointer is fetched for them
+                std::stringstream ss;
+                ss << "[";
+                for (int j = 0; j < arr_n; j++) {
+                    if (arr_type == GGUF_TYPE_STRING) {
+                        std::string val = gguf_get_arr_str(ctx_gguf, i, j);
+                        // escape backslashes first, then double quotes, so the value can be wrapped in quotes
+                        string_replace_all(val, "\\", "\\\\");
+                        string_replace_all(val, "\"", "\\\"");
+                        ss << '"' << val << '"';
+                    } else if (arr_type == GGUF_TYPE_ARRAY) {
+                        ss << "???"; // nested arrays are not expanded
+                    } else {
+                        ss << gguf_data_to_str(arr_type, data, j);
+                    }
+                    if (j < arr_n - 1) {
+                        ss << ", "; // separator between elements, none after the last
+                    }
+                }
+                ss << "]";
+                return ss.str();
+            }
+        default:
+            return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0); // scalar: format the value as element 0
+    }
+}
+
+//
+// API used internally with mtmd
+//
+
+projector_type clip_get_projector_type(const struct clip_ctx * ctx);
--- a/llama/llama.cpp/examples/llava/clip.cpp
+++ b/llama/llama.cpp/examples/llava/clip.cpp
--- a/llama/llama.cpp/examples/llava/clip.h
+++ b/llama/llama.cpp/examples/llava/clip.h
@ -1,6 +1,7 @@
 #ifndef CLIP_H
 #define CLIP_H

+#include "ggml.h"
 #include <stddef.h>
 #include <stdint.h>

@ -29,27 +30,28 @@ struct clip_image_size {
    int height;
 };

-struct clip_image_u8_batch {
-    struct clip_image_u8 * data;
-    size_t size;
+struct clip_image_f32;
+struct clip_image_u8_batch;
+struct clip_image_f32_batch;
+
+struct clip_context_params {
+    bool use_gpu;
+    enum ggml_log_level verbosity;
 };

-struct clip_image_f32_batch {
-    struct clip_image_f32 * data;
-    size_t size;
-};
+// deprecated, use clip_init
+CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity);

-CLIP_API struct clip_ctx * clip_model_load    (const char * fname, int verbosity);
-CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity);
+CLIP_API struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params);

 CLIP_API void clip_free(struct clip_ctx * ctx);

 CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
 CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w);

-CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx);
-CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx);
-CLIP_API int32_t clip_hidden_size(const struct clip_ctx * ctx);
+CLIP_API int32_t clip_get_image_size (const struct clip_ctx * ctx);
+CLIP_API int32_t clip_get_patch_size (const struct clip_ctx * ctx);
+CLIP_API int32_t clip_get_hidden_size(const struct clip_ctx * ctx);

 // TODO: should be enum, not string
 CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
@ -65,16 +67,30 @@ CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
 CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
 CLIP_API struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip);

-CLIP_API struct clip_image_size * clip_image_size_init();
-CLIP_API struct clip_image_u8  * clip_image_u8_init ();
-CLIP_API struct clip_image_f32 * clip_image_f32_init();
+CLIP_API struct clip_image_size      * clip_image_size_init();
+CLIP_API struct clip_image_u8        * clip_image_u8_init ();
+CLIP_API struct clip_image_f32       * clip_image_f32_init();
+CLIP_API struct clip_image_f32_batch * clip_image_f32_batch_init(); // only used by libllava

+// nx, ny are the output image dimensions
+CLIP_API unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny);
+
+CLIP_API void clip_image_size_free (struct clip_image_size * img_size);
 CLIP_API void clip_image_u8_free (struct clip_image_u8  * img);
 CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
 CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch  * batch);
 CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);

-/** build image from pixels decoded by other libraries instead of stb_image.h for better performance. The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes */
+// used for accessing the underlying data of clip_image_f32_batch
+CLIP_API size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch); // equivalent to batch->size()
+CLIP_API size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->nx
+CLIP_API size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny
+CLIP_API struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data
+
+/**
+ * Build image from pixels decoded by other libraries instead of stb_image.h for better performance.
+ * The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes
+ */
 CLIP_API void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);

 CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
@ -95,6 +111,8 @@ CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out
 CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
 CLIP_API bool clip_is_glm(const struct clip_ctx * ctx);
 CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
+CLIP_API bool clip_is_llava(const struct clip_ctx * ctx);
+CLIP_API bool clip_is_gemma3(const struct clip_ctx * ctx);

 CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx);

--- a/llama/llama.cpp/examples/llava/llava.cpp
+++ b/llama/llama.cpp/examples/llava/llava.cpp
@ -10,6 +10,7 @@
 #include <cstring>
 #include <limits>
 #include <vector>
+#include <memory>

 #if defined(LLAVA_LOG_OFF)
 #   define LOG_INF(...)
@ -45,6 +46,17 @@ struct clip_image_grid_shape {
    int second;
 };

+// convenience cpp wrapper
+struct clip_image_f32_batch_deleter {
+    void operator()(clip_image_f32_batch * val) { clip_image_f32_batch_free(val); }
+};
+typedef std::unique_ptr<clip_image_f32_batch, clip_image_f32_batch_deleter> clip_image_f32_batch_ptr;
+
+// frees a clip_image_size through the clip C API when the owning unique_ptr is destroyed
+struct clip_image_size_deleter {
+    void operator()(clip_image_size * val) { clip_image_size_free(val); }
+};
+typedef std::unique_ptr<clip_image_size, clip_image_size_deleter> clip_image_size_ptr;
+
 /**
 * Selects the best resolution from a list of possible resolutions based on the original size.
 *
@ -105,8 +117,8 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
        struct ggml_context * ctx;
    } model;

-    const int32_t image_size = clip_image_size(ctx_clip);
-    const int32_t patch_size = clip_patch_size(ctx_clip);
+    const int32_t image_size = clip_get_image_size(ctx_clip);
+    const int32_t patch_size = clip_get_patch_size(ctx_clip);

    int32_t num_patches_per_side = image_size / patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches)

@ -246,12 +258,9 @@ static clip_image_f32 * reshape_by_patch(clip_image_f32 * image, int patch_size)

 static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
    // std::vector<clip_image_f32*> img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336
-    clip_image_f32_batch img_res_v;
-    img_res_v.size = 0;
-    img_res_v.data = nullptr;
-    if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
+    clip_image_f32_batch_ptr img_res_v(clip_image_f32_batch_init());
+    if (!clip_image_preprocess(ctx_clip, img, img_res_v.get())) {
        LOG_ERR("%s: unable to preprocess image\n", __func__);
-        delete[] img_res_v.data;
        return false;
    }

@ -259,66 +268,72 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli

    const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);

+    const size_t n_imgs = clip_image_f32_batch_n_images(img_res_v.get());
+
    if (clip_is_minicpmv(ctx_clip) || clip_is_qwen2vl(ctx_clip)) {
        std::vector<float *> image_embd_v;
-        image_embd_v.resize(img_res_v.size);
-        struct clip_image_size * load_image_size = clip_image_size_init();
+        image_embd_v.resize(n_imgs);
+        clip_image_size load_image_size;

-        for (size_t i = 0; i < img_res_v.size; i++) {
+        for (size_t i = 0; i < n_imgs; i++) {
            const int64_t t_img_enc_step_start_us = ggml_time_us();
-            image_embd_v[i] = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny));
-            int patch_size=14;
-            load_image_size->width = img_res_v.data[i].nx;
-            load_image_size->height = img_res_v.data[i].ny;
-            clip_add_load_image_size(ctx_clip, load_image_size);
+            int nx = clip_image_f32_batch_nx(img_res_v.get(), i);
+            int ny = clip_image_f32_batch_ny(img_res_v.get(), i);
+            image_embd_v[i] = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, nx, ny));
+            int patch_size = 14;
+            load_image_size.width = nx;
+            load_image_size.height = ny;
+            clip_add_load_image_size(ctx_clip, &load_image_size);

            bool encoded = false;
+            clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), i);
            if (clip_is_qwen2vl(ctx_clip)) {
-                encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
+                encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd_v[i]);
            }
            else {
-                encoded = clip_image_encode(ctx_clip, n_threads, reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
+                encoded = clip_image_encode(ctx_clip, n_threads, reshape_by_patch(img_res, patch_size), image_embd_v[i]);
            }

            if (!encoded) {
-                LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
+                LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) n_imgs);
                return false;
            }
            const int64_t t_img_enc_steop_batch_us = ggml_time_us();
-            LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
+            LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)n_imgs, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
        }
        const int64_t t_img_enc_batch_us = ggml_time_us();
-        LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
+        LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)n_imgs, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);

        int n_img_pos_out = 0;
        for (size_t i = 0; i < image_embd_v.size(); i++) {
+            int nx = clip_image_f32_batch_nx(img_res_v.get(), i);
+            int ny = clip_image_f32_batch_ny(img_res_v.get(), i);
+            clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), i);
            std::memcpy(
                image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
                image_embd_v[i],
-                clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny));
-            n_img_pos_out += clip_n_patches_by_img(ctx_clip, &img_res_v.data[i]);
+                clip_embd_nbytes_by_img(ctx_clip, nx, ny));
+            n_img_pos_out += clip_n_patches_by_img(ctx_clip, img_res);
        }
        *n_img_pos = n_img_pos_out;
        for (size_t i = 0; i < image_embd_v.size(); i++) {
            free(image_embd_v[i]);
        }
        image_embd_v.clear();
-        load_image_size->width = img->nx;
-        load_image_size->height = img->ny;
-        clip_add_load_image_size(ctx_clip, load_image_size);
-        LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
-        delete[] img_res_v.data;
-        img_res_v.size = 0;
-        img_res_v.data = nullptr;
+        load_image_size.width = img->nx;
+        load_image_size.height = img->ny;
+        clip_add_load_image_size(ctx_clip, &load_image_size);
+        LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size.width, load_image_size.height);
    }
    else if (clip_is_glm(ctx_clip)){
        struct clip_image_size * load_image_size = clip_image_size_init();
-        load_image_size->width = img_res_v.data[0].nx;
-        load_image_size->height = img_res_v.data[0].ny;
+        load_image_size->width  = clip_image_f32_batch_nx(img_res_v.get(), 0);
+        load_image_size->height = clip_image_f32_batch_ny(img_res_v.get(), 0);
        clip_add_load_image_size(ctx_clip, load_image_size);

-        bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd);
-        int pos = int(load_image_size->width/clip_patch_size(ctx_clip)/2);
+        clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
+        bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd);
+        int pos = int(load_image_size->width/clip_get_patch_size(ctx_clip)/2);
        *n_img_pos = (pos * pos + 2);
        if (!encoded){
            LOG_ERR("Unable to encode image \n");
@ -328,8 +343,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
    else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
        // flat / default llava-1.5 type embedding
        *n_img_pos = clip_n_patches(ctx_clip);
-        bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
-        delete[] img_res_v.data;
+        clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
+        bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096
        if (!encoded) {
            LOG_ERR("Unable to encode image\n");

@ -340,17 +355,18 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
        // spatial_unpad llava-1.6 type embedding
        // TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working
        std::vector<float *> image_embd_v;
-        image_embd_v.resize(img_res_v.size);
-        for (size_t i = 0; i < img_res_v.size; i++) {
+        image_embd_v.resize(n_imgs);
+        for (size_t i = 0; i < n_imgs; i++) {
+            clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), i);
            image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
-            const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
+            const bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
            if (!encoded) {
-                LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
+                LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) n_imgs);
                return false;
            }
        }
        const int64_t t_img_enc_batch_us = ggml_time_us();
-        LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
+        LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)n_imgs, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);

        const int32_t * image_grid = clip_image_grid(ctx_clip);
        const size_t num_gridpoints = get_clip_image_grid_size(ctx_clip);
@ -360,12 +376,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
            grid_pinpoints.push_back({image_grid[i], image_grid[i+1]});
        }

-        // free all img_res_v - not needed anymore
-        delete[] img_res_v.data;
-        img_res_v.size = 0;
-        img_res_v.data = nullptr;
-
-        const int32_t image_size = clip_image_size(ctx_clip);
+        const int32_t image_size = clip_get_image_size(ctx_clip);

        struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size);

--- a/llama/llama.cpp/include/llama.h
+++ b/llama/llama.cpp/include/llama.h
@ -60,6 +60,7 @@ extern "C" {
    struct llama_model;
    struct llama_context;
    struct llama_sampler;
+    struct llama_kv_cache;

    typedef int32_t llama_pos;
    typedef int32_t llama_token;
@ -106,6 +107,10 @@ extern "C" {
        LLAMA_VOCAB_PRE_TYPE_MINERVA        = 27,
        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM  = 28,
        LLAMA_VOCAB_PRE_TYPE_GPT4O          = 29,
+        LLAMA_VOCAB_PRE_TYPE_SUPERBPE       = 30,
+        LLAMA_VOCAB_PRE_TYPE_TRILLION       = 31,
+        LLAMA_VOCAB_PRE_TYPE_BAILINGMOE     = 32,
+        LLAMA_VOCAB_PRE_TYPE_LLAMA4         = 33,
    };

    enum llama_rope_type {
@ -277,10 +282,18 @@ extern "C" {
        };
    };

+    struct llama_model_tensor_buft_override {
+        const char * pattern;            // pattern used to select tensors (matching rules are not visible here - TODO confirm)
+        ggml_backend_buffer_type_t buft; // buffer type to use for tensors that match the pattern
+    };
+
    struct llama_model_params {
        // NULL-terminated list of devices to use for offloading (if NULL, all available devices are used)
        ggml_backend_dev_t * devices;

+        // NULL-terminated list of buffer types to use for tensors that match a pattern
+        const struct llama_model_tensor_buft_override * tensor_buft_overrides;
+
        int32_t n_gpu_layers; // number of layers to store in VRAM
        enum llama_split_mode split_mode; // how to split the model across multiple GPUs

@ -356,17 +369,18 @@ extern "C" {

    // model quantization parameters
    typedef struct llama_model_quantize_params {
-        int32_t nthread;                     // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
-        enum llama_ftype ftype;              // quantize to this llama_ftype
-        enum ggml_type output_tensor_type;   // output tensor type
-        enum ggml_type token_embedding_type; // token embeddings tensor type
-        bool allow_requantize;               // allow quantizing non-f32/f16 tensors
-        bool quantize_output_tensor;         // quantize output.weight
-        bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
-        bool pure;                           // quantize all tensors to the default type
-        bool keep_split;                     // quantize to the same number of shards
-        void * imatrix;                      // pointer to importance matrix data
-        void * kv_overrides;                 // pointer to vector containing overrides
+        int32_t nthread;                      // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+        enum llama_ftype ftype;               // quantize to this llama_ftype
+        enum ggml_type output_tensor_type;    // output tensor type
+        enum ggml_type token_embedding_type;  // token embeddings tensor type
+        bool allow_requantize;                // allow quantizing non-f32/f16 tensors
+        bool quantize_output_tensor;          // quantize output.weight
+        bool only_copy;                       // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+        bool pure;                            // quantize all tensors to the default type
+        bool keep_split;                      // quantize to the same number of shards
+        void * imatrix;                       // pointer to importance matrix data
+        void * kv_overrides;                  // pointer to vector containing overrides
+        void * tensor_types;                  // pointer to vector containing tensor types
    } llama_model_quantize_params;

    typedef struct llama_logit_bias {
@ -475,7 +489,8 @@ extern "C" {
    DEPRECATED(LLAMA_API int32_t llama_n_vocab    (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead");

    LLAMA_API const struct llama_model * llama_get_model   (const struct llama_context * ctx);
-    LLAMA_API enum llama_pooling_type    llama_pooling_type(const struct llama_context * ctx);
+    LLAMA_API    struct llama_kv_cache * llama_get_kv_self (      struct llama_context * ctx);
+    LLAMA_API  enum llama_pooling_type   llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type

    LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
    LLAMA_API enum llama_rope_type       llama_model_rope_type(const struct llama_model * model);
@ -592,7 +607,7 @@ extern "C" {
    // KV cache
    //

-    // TODO: remove llama_kv_cache_view_* API
+    // TODO: start using struct llama_kv_cache

    // Information associated with an individual cell in the KV cache view.
    struct llama_kv_cache_view_cell {
@ -647,13 +662,19 @@ extern "C" {

    // Returns the number of tokens in the KV cache (slow, use only for debug)
    // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
-    LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx);
+    LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx);
+
+    DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx),
+            "use llama_kv_self_n_tokens instead");

    // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
-    LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx);
+    LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx);
+
+    DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx),
+            "use llama_kv_self_used_cells instead");

    // Clear the KV cache - both cell info is erased and KV data is zeroed
-    LLAMA_API void llama_kv_cache_clear(
+    LLAMA_API void llama_kv_self_clear(
            struct llama_context * ctx);

    // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
@ -661,7 +682,7 @@ extern "C" {
    // seq_id < 0 : match any sequence
    // p0 < 0     : [0,  p1]
    // p1 < 0     : [p0, inf)
-    LLAMA_API bool llama_kv_cache_seq_rm(
+    LLAMA_API bool llama_kv_self_seq_rm(
            struct llama_context * ctx,
                    llama_seq_id   seq_id,
                       llama_pos   p0,
@ -671,7 +692,7 @@ extern "C" {
    // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
    // p0 < 0 : [0,  p1]
    // p1 < 0 : [p0, inf)
-    LLAMA_API void llama_kv_cache_seq_cp(
+    LLAMA_API void llama_kv_self_seq_cp(
            struct llama_context * ctx,
                    llama_seq_id   seq_id_src,
                    llama_seq_id   seq_id_dst,
@ -679,17 +700,17 @@ extern "C" {
                       llama_pos   p1);

    // Removes all tokens that do not belong to the specified sequence
-    LLAMA_API void llama_kv_cache_seq_keep(
+    LLAMA_API void llama_kv_self_seq_keep(
            struct llama_context * ctx,
                    llama_seq_id   seq_id);

    // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
    // If the KV cache is RoPEd, the KV data is updated accordingly:
    //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_cache_update()
+    //   - explicitly with llama_kv_self_update()
    // p0 < 0 : [0,  p1]
    // p1 < 0 : [p0, inf)
-    LLAMA_API void llama_kv_cache_seq_add(
+    LLAMA_API void llama_kv_self_seq_add(
            struct llama_context * ctx,
                    llama_seq_id   seq_id,
                       llama_pos   p0,
@ -699,10 +720,10 @@ extern "C" {
    // Integer division of the positions by factor of `d > 1`
    // If the KV cache is RoPEd, the KV data is updated accordingly:
    //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_cache_update()
+    //   - explicitly with llama_kv_self_update()
    // p0 < 0 : [0,  p1]
    // p1 < 0 : [p0, inf)
-    LLAMA_API void llama_kv_cache_seq_div(
+    LLAMA_API void llama_kv_self_seq_div(
            struct llama_context * ctx,
                    llama_seq_id   seq_id,
                       llama_pos   p0,
@ -710,24 +731,76 @@ extern "C" {
                             int   d);

    // Returns the largest position present in the KV cache for the specified sequence
-    LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
+    LLAMA_API llama_pos llama_kv_self_seq_pos_max(
            struct llama_context * ctx,
-                    llama_seq_id   seq_id);
-
-    // TODO: the llama_kv_cache_defrag and llama_kv_cache_update API tightly couples llama_context with llama_kv_cache
-    //       how to avoid this?
+                     llama_seq_id   seq_id);

    // Defragment the KV cache
    // This will be applied:
    //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_cache_update()
-    LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx);
-
-    // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
-    LLAMA_API void llama_kv_cache_update(struct llama_context * ctx);
+    //   - explicitly with llama_kv_self_update()
+    LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx);

    // Check if the context supports KV cache shifting
-    LLAMA_API bool llama_kv_cache_can_shift(struct llama_context * ctx);
+    LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx);
+
+    // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
+    LLAMA_API void llama_kv_self_update(struct llama_context * ctx);
+
+    DEPRECATED(LLAMA_API void llama_kv_cache_clear(
+            struct llama_context * ctx),
+            "use llama_kv_self_clear instead");
+
+    DEPRECATED(LLAMA_API bool llama_kv_cache_seq_rm(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id,
+                       llama_pos   p0,
+                       llama_pos   p1),
+            "use llama_kv_self_seq_rm instead");
+
+    DEPRECATED(LLAMA_API void llama_kv_cache_seq_cp(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id_src,
+                    llama_seq_id   seq_id_dst,
+                       llama_pos   p0,
+                       llama_pos   p1),
+            "use llama_kv_self_seq_cp instead");
+
+    DEPRECATED(LLAMA_API void llama_kv_cache_seq_keep(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id),
+            "use llama_kv_self_seq_keep instead");
+
+    DEPRECATED(LLAMA_API void llama_kv_cache_seq_add(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id,
+                       llama_pos   p0,
+                       llama_pos   p1,
+                       llama_pos   delta),
+            "use llama_kv_self_seq_add instead");
+
+    DEPRECATED(LLAMA_API void llama_kv_cache_seq_div(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id,
+                       llama_pos   p0,
+                       llama_pos   p1,
+                             int   d),
+            "use llama_kv_self_seq_div instead");
+
+    DEPRECATED(LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id),
+            "use llama_kv_self_seq_pos_max instead");
+
+    DEPRECATED(LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx),
+            "use llama_kv_self_defrag instead");
+
+    DEPRECATED(LLAMA_API bool llama_kv_cache_can_shift(const struct llama_context * ctx),
+            "use llama_kv_self_can_shift instead");
+
+    DEPRECATED(LLAMA_API void llama_kv_cache_update(struct llama_context * ctx),
+            "use llama_kv_self_update instead");
+

    //
    // State / sessions
@ -891,6 +964,10 @@ extern "C" {
    // If set to true, the model will only attend to the past tokens
    LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);

+    // Set whether the model is in warmup mode or not
+    // If true, all model tensors are activated during llama_decode() to load and cache their weights.
+    LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup);
+
    // Set abort callback
    LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);

@ -1206,22 +1283,38 @@ extern "C" {
                               float   tau,
                               float   eta);

+    /// @details Initializes a GBNF grammar, see grammars/README.md for details.
+    /// @param vocab The vocabulary that this grammar will be used with.
+    /// @param grammar_str The production rules for the grammar, encoded as a string. Returns an empty grammar if empty. Returns NULL if parsing of grammar_str fails.
+    /// @param grammar_root The name of the start symbol for the grammar.
    LLAMA_API struct llama_sampler * llama_sampler_init_grammar(
            const struct llama_vocab * vocab,
                          const char * grammar_str,
                          const char * grammar_root);

-    /// @details Lazy grammar sampler, introduced in https://github.com/ggml-org/llama.cpp/pull/9639
-    /// @param trigger_words A list of words that will trigger the grammar sampler. This may be updated to a loose regex syntax (w/ ^) in a near future.
-    /// @param trigger_tokens A list of tokens that will trigger the grammar sampler.
-    LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy(
+    DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy(
            const struct llama_vocab * vocab,
                          const char * grammar_str,
                          const char * grammar_root,
                         const char ** trigger_words,
                                size_t num_trigger_words,
                   const llama_token * trigger_tokens,
-                                size_t num_trigger_tokens);
+                                size_t num_trigger_tokens),
+        "use llama_sampler_init_grammar_lazy_patterns instead");
+
+
+    /// @details Lazy grammar sampler, introduced in https://github.com/ggml-org/llama.cpp/pull/9639
+    /// @param trigger_patterns A list of patterns that will trigger the grammar sampler. Pattern will be matched from the start of the generation output, and grammar sampler will be fed content starting from its first match group.
+    /// @param trigger_tokens A list of tokens that will trigger the grammar sampler. Grammar sampler will be fed content starting from the trigger token included.
+    LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy_patterns(
+        const struct llama_vocab * vocab,
+                      const char * grammar_str,
+                      const char * grammar_root,
+                     const char ** trigger_patterns,
+                            size_t num_trigger_patterns,
+               const llama_token * trigger_tokens,
+                            size_t num_trigger_tokens);
+

    /// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first.
    LLAMA_API struct llama_sampler * llama_sampler_init_penalties(
--- a/llama/llama.cpp/src/llama-adapter.cpp
+++ b/llama/llama.cpp/src/llama-adapter.cpp
@ -4,14 +4,13 @@
 #include "llama-mmap.h"
 #include "llama-model.h"

-#include <algorithm>
 #include <map>
 #include <cassert>
 #include <stdexcept>

 // vec

-struct ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
+ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
    if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
        return nullptr;
    }
@ -19,7 +18,7 @@ struct ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
    return tensors[il];
 }

-struct ggml_tensor * llama_adapter_cvec::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int  il) const {
+ggml_tensor * llama_adapter_cvec::apply_to(ggml_context * ctx, ggml_tensor * cur, int  il) const {
    ggml_tensor * layer_dir = tensor_for(il);
    if (layer_dir != nullptr) {
        cur = ggml_add(ctx, cur, layer_dir);
@ -40,7 +39,7 @@ bool llama_adapter_cvec::init(const llama_model & model) {
    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
        auto it = ctx_map.find(buft);
        if (it == ctx_map.end()) {
-            struct ggml_init_params params = {
+            ggml_init_params params = {
                /*.mem_size   =*/ hparams.n_layer*ggml_tensor_overhead(),
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true,
@ -91,7 +90,7 @@ bool llama_adapter_cvec::init(const llama_model & model) {
    return true;
 }

-int32_t llama_adapter_cvec::apply(
+bool llama_adapter_cvec::apply(
        const llama_model & model,
        const float * data,
        size_t len,
@ -104,17 +103,17 @@ int32_t llama_adapter_cvec::apply(
        // disable the current control vector (but leave allocated for later)
        layer_start = -1;
        layer_end   = -1;
-        return 0;
+        return true;
    }

    if (n_embd != (int) hparams.n_embd) {
        LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
-        return 1;
+        return false;
    }

    if (tensors.empty()) {
        if (!init(model)) {
-            return 1;
+            return false;
        }
    }

@ -130,12 +129,12 @@ int32_t llama_adapter_cvec::apply(
        }
    }

-    return 0;
+    return true;
 }

 // lora

-llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor * w) {
+llama_adapter_lora_weight * llama_adapter_lora::get_weight(ggml_tensor * w) {
    const std::string name(w->name);

    const auto pos = ab_map.find(name);
@ -146,11 +145,11 @@ llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor *
    return nullptr;
 }

-static void llama_adapter_lora_init_impl(struct llama_model & model, const char * path_lora, struct llama_adapter_lora & adapter) {
+static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) {
    LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);

    ggml_context * ctx_init;
-    struct gguf_init_params meta_gguf_params = {
+    gguf_init_params meta_gguf_params = {
        /* .no_alloc = */ true,
        /* .ctx      = */ &ctx_init,
    };
@ -201,7 +200,7 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
        auto it = ctx_map.find(buft);
        if (it == ctx_map.end()) {
            // add a new context
-            struct ggml_init_params params = {
+            ggml_init_params params = {
                /*.mem_size   =*/ n_tensors*ggml_tensor_overhead(),
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true,
@ -248,6 +247,26 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
        }
    }

+    // get extra buffer types of the CPU
+    // TODO: a more general solution for non-CPU extra buft should be implemented in the future
+    //       ref: https://github.com/ggml-org/llama.cpp/pull/12593#pullrequestreview-2718659948
+    std::vector<ggml_backend_buffer_type_t> buft_extra;
+    {
+        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
+
+        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
+            ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
+
+        if (ggml_backend_dev_get_extra_bufts_fn) {
+            ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
+            while (extra_bufts && *extra_bufts) {
+                buft_extra.emplace_back(*extra_bufts);
+                ++extra_bufts;
+            }
+        }
+    }
+
    // add tensors
    for (auto & it : ab_map) {
        const std::string & name = it.first;
@ -264,7 +283,23 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
            throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
        }

-        struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
+        auto * buft = ggml_backend_buffer_get_type(model_tensor->buffer);
+
+        // do not load loras to extra buffer types (i.e. bufts for repacking) -> use the CPU in that case
+        for (auto & ex : buft_extra) {
+            if (ex == buft) {
+                LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));
+
+                auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+                buft = ggml_backend_dev_buffer_type(cpu_dev);
+
+                break;
+            }
+        }
+
+        LLAMA_LOG_DEBUG("%s: lora for '%s' -> '%s'\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));
+
+        ggml_context * dev_ctx = ctx_for_buft(buft);
        // validate tensor shape
        if (is_token_embd) {
            // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
@ -281,8 +316,8 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
        }

        // save tensor to adapter
-        struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
-        struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
+        ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
+        ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
        ggml_set_name(tensor_a, w.a->name);
        ggml_set_name(tensor_b, w.b->name);
        adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b);
@ -308,7 +343,7 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
    {
        llama_file gguf_file(path_lora, "rb");
        std::vector<uint8_t> read_buf;
-        auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) {
+        auto set_tensor = [&](ggml_tensor * orig, ggml_tensor * dev) {
            size_t offs = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), gguf_find_tensor(ctx_gguf.get(), orig->name));
            size_t size = ggml_nbytes(orig);
            read_buf.resize(size);
@ -327,8 +362,8 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
    LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
 }

-struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model, const char * path_lora) {
-    struct llama_adapter_lora * adapter = new llama_adapter_lora();
+llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
+    llama_adapter_lora * adapter = new llama_adapter_lora();

    try {
        llama_adapter_lora_init_impl(*model, path_lora, *adapter);
@ -342,6 +377,6 @@ struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model,
    return nullptr;
 }

-void llama_adapter_lora_free(struct llama_adapter_lora * adapter) {
+void llama_adapter_lora_free(llama_adapter_lora * adapter) {
    delete adapter;
 }
--- a/llama/llama.cpp/src/llama-adapter.h
+++ b/llama/llama.cpp/src/llama-adapter.h
@ -15,11 +15,11 @@
 //

 struct llama_adapter_cvec {
-    struct ggml_tensor * tensor_for(int il) const;
+    ggml_tensor * tensor_for(int il) const;

-    struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int  il) const;
+    ggml_tensor * apply_to(ggml_context * ctx, ggml_tensor * cur, int  il) const;

-    int32_t apply(
+    bool apply(
            const llama_model & model,
            const float * data,
            size_t len,
@ -36,7 +36,7 @@ private:
    std::vector<ggml_context_ptr> ctxs;
    std::vector<ggml_backend_buffer_ptr> bufs;

-    std::vector<struct ggml_tensor *> tensors; // per layer
+    std::vector<ggml_tensor *> tensors; // per layer
 };

 //
@ -44,8 +44,8 @@ private:
 //

 struct llama_adapter_lora_weight {
-    struct ggml_tensor * a = nullptr;
-    struct ggml_tensor * b = nullptr;
+    ggml_tensor * a = nullptr;
+    ggml_tensor * b = nullptr;

    // get actual scale based on rank and alpha
    float get_scale(float alpha, float adapter_scale) const {
@ -55,12 +55,12 @@ struct llama_adapter_lora_weight {
    }

    llama_adapter_lora_weight() = default;
-    llama_adapter_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
+    llama_adapter_lora_weight(ggml_tensor * a, ggml_tensor * b) : a(a), b(b) {}
 };

 struct llama_adapter_lora {
    // map tensor name to lora_a_b
-    std::unordered_map<std::string, struct llama_adapter_lora_weight> ab_map;
+    std::unordered_map<std::string, llama_adapter_lora_weight> ab_map;

    std::vector<ggml_context_ptr> ctxs;
    std::vector<ggml_backend_buffer_ptr> bufs;
@ -70,5 +70,7 @@ struct llama_adapter_lora {
    llama_adapter_lora() = default;
    ~llama_adapter_lora() = default;

-    llama_adapter_lora_weight * get_weight(struct ggml_tensor * w);
+    llama_adapter_lora_weight * get_weight(ggml_tensor * w);
 };
+
+using llama_adapter_loras = std::unordered_map<llama_adapter_lora *, float>;
--- a/llama/llama.cpp/src/llama-arch.cpp
+++ b/llama/llama.cpp/src/llama-arch.cpp
@ -7,6 +7,7 @@
 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_LLAMA,            "llama"            },
    { LLM_ARCH_MLLAMA,           "mllama"           },
+    { LLM_ARCH_LLAMA4,           "llama4"           },
    { LLM_ARCH_DECI,             "deci"             },
    { LLM_ARCH_FALCON,           "falcon"           },
    { LLM_ARCH_GROK,             "grok"             },
@ -26,6 +27,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_QWEN2,            "qwen2"            },
    { LLM_ARCH_QWEN2MOE,         "qwen2moe"         },
    { LLM_ARCH_QWEN2VL,          "qwen2vl"          },
+    { LLM_ARCH_QWEN3,            "qwen3"            },
+    { LLM_ARCH_QWEN3MOE,         "qwen3moe"         },
    { LLM_ARCH_PHI2,             "phi2"             },
    { LLM_ARCH_PHI3,             "phi3"             },
    { LLM_ARCH_PHIMOE,           "phimoe"           },
@ -52,6 +55,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_DEEPSEEK,         "deepseek"         },
    { LLM_ARCH_DEEPSEEK2,        "deepseek2"        },
    { LLM_ARCH_CHATGLM,          "chatglm"          },
+    { LLM_ARCH_GLM4,             "glm4"             },
    { LLM_ARCH_BITNET,           "bitnet"           },
    { LLM_ARCH_T5,               "t5"               },
    { LLM_ARCH_T5ENCODER,        "t5encoder"        },
@ -60,11 +64,15 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_EXAONE,           "exaone"           },
    { LLM_ARCH_RWKV6,            "rwkv6"            },
    { LLM_ARCH_RWKV6QWEN2,       "rwkv6qwen2"       },
+    { LLM_ARCH_RWKV7,            "rwkv7"            },
+    { LLM_ARCH_ARWKV7,           "arwkv7"           },
    { LLM_ARCH_GRANITE,          "granite"          },
    { LLM_ARCH_GRANITE_MOE,      "granitemoe"       },
    { LLM_ARCH_CHAMELEON,        "chameleon"        },
    { LLM_ARCH_SOLAR,            "solar"            },
    { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
+    { LLM_ARCH_PLM,              "plm"              },
+    { LLM_ARCH_BAILINGMOE,       "bailingmoe"       },
    { LLM_ARCH_MISTRAL3,         "mistral3"         },
    { LLM_ARCH_UNKNOWN,          "(unknown)"        },
 };
@ -74,6 +82,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_GENERAL_ARCHITECTURE,         "general.architecture"                  },
    { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version"          },
    { LLM_KV_GENERAL_ALIGNMENT,            "general.alignment"                     },
+    { LLM_KV_GENERAL_FILE_TYPE,            "general.file_type"                     },
    { LLM_KV_GENERAL_NAME,                 "general.name"                          },
    { LLM_KV_GENERAL_AUTHOR,               "general.author"                        },
    { LLM_KV_GENERAL_VERSION,              "general.version"                       },
@ -112,25 +121,30 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_RESIDUAL_SCALE,                    "%s.residual_scale"                    },
    { LLM_KV_EMBEDDING_SCALE,                   "%s.embedding_scale"                   },
    { LLM_KV_TOKEN_SHIFT_COUNT,                 "%s.token_shift_count"                 },
+    { LLM_KV_INTERLEAVE_MOE_LAYER_STEP,         "%s.interleave_moe_layer_step"         },

-    { LLM_KV_ATTENTION_HEAD_COUNT,             "%s.attention.head_count"             },
-    { LLM_KV_ATTENTION_HEAD_COUNT_KV,          "%s.attention.head_count_kv"          },
-    { LLM_KV_ATTENTION_MAX_ALIBI_BIAS,         "%s.attention.max_alibi_bias"         },
-    { LLM_KV_ATTENTION_CLAMP_KQV,              "%s.attention.clamp_kqv"              },
-    { LLM_KV_ATTENTION_KEY_LENGTH,             "%s.attention.key_length"             },
-    { LLM_KV_ATTENTION_VALUE_LENGTH,           "%s.attention.value_length"           },
-    { LLM_KV_ATTENTION_LAYERNORM_EPS,          "%s.attention.layer_norm_epsilon"     },
-    { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,      "%s.attention.layer_norm_rms_epsilon" },
-    { LLM_KV_ATTENTION_GROUPNORM_EPS,          "%s.attention.group_norm_epsilon"     },
-    { LLM_KV_ATTENTION_GROUPNORM_GROUPS,       "%s.attention.group_norm_groups"      },
-    { LLM_KV_ATTENTION_CAUSAL,                 "%s.attention.causal"                 },
-    { LLM_KV_ATTENTION_Q_LORA_RANK,            "%s.attention.q_lora_rank"            },
-    { LLM_KV_ATTENTION_KV_LORA_RANK,           "%s.attention.kv_lora_rank"           },
-    { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
-    { LLM_KV_ATTENTION_SLIDING_WINDOW,         "%s.attention.sliding_window"         },
-    { LLM_KV_ATTENTION_SCALE,                  "%s.attention.scale"                  },
-    { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,  "%s.attention.block_skip_connection"  },
-    { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
+    { LLM_KV_ATTENTION_HEAD_COUNT,                   "%s.attention.head_count"                   },
+    { LLM_KV_ATTENTION_HEAD_COUNT_KV,                "%s.attention.head_count_kv"                },
+    { LLM_KV_ATTENTION_MAX_ALIBI_BIAS,               "%s.attention.max_alibi_bias"               },
+    { LLM_KV_ATTENTION_CLAMP_KQV,                    "%s.attention.clamp_kqv"                    },
+    { LLM_KV_ATTENTION_KEY_LENGTH,                   "%s.attention.key_length"                   },
+    { LLM_KV_ATTENTION_VALUE_LENGTH,                 "%s.attention.value_length"                 },
+    { LLM_KV_ATTENTION_LAYERNORM_EPS,                "%s.attention.layer_norm_epsilon"           },
+    { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,            "%s.attention.layer_norm_rms_epsilon"       },
+    { LLM_KV_ATTENTION_GROUPNORM_EPS,                "%s.attention.group_norm_epsilon"           },
+    { LLM_KV_ATTENTION_GROUPNORM_GROUPS,             "%s.attention.group_norm_groups"            },
+    { LLM_KV_ATTENTION_CAUSAL,                       "%s.attention.causal"                       },
+    { LLM_KV_ATTENTION_Q_LORA_RANK,                  "%s.attention.q_lora_rank"                  },
+    { LLM_KV_ATTENTION_KV_LORA_RANK,                 "%s.attention.kv_lora_rank"                 },
+    { LLM_KV_ATTENTION_DECAY_LORA_RANK,              "%s.attention.decay_lora_rank"              },
+    { LLM_KV_ATTENTION_ICLR_LORA_RANK,               "%s.attention.iclr_lora_rank"               },
+    { LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK, "%s.attention.value_residual_mix_lora_rank" },
+    { LLM_KV_ATTENTION_GATE_LORA_RANK,               "%s.attention.gate_lora_rank"               },
+    { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,       "%s.attention.relative_buckets_count"       },
+    { LLM_KV_ATTENTION_SLIDING_WINDOW,               "%s.attention.sliding_window"               },
+    { LLM_KV_ATTENTION_SCALE,                        "%s.attention.scale"                        },
+    { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,        "%s.attention.block_skip_connection"        },
+    { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,       "%s.attention.cross_attention_layers"       },

    { LLM_KV_ROPE_DIMENSION_COUNT,      "%s.rope.dimension_count"                 },
    { LLM_KV_ROPE_DIMENSION_SECTIONS,   "%s.rope.dimension_sections"              },
@ -229,6 +243,35 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
        },
    },
+    {
+        LLM_ARCH_LLAMA4,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_GATE_INP,    "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_EXP,    "blk.%d.ffn_gate.%d" },
+            { LLM_TENSOR_FFN_DOWN_EXP,    "blk.%d.ffn_down.%d" },
+            { LLM_TENSOR_FFN_UP_EXP,      "blk.%d.ffn_up.%d" },
+            { LLM_TENSOR_FFN_GATE_EXPS,   "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,   "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_SHEXP,  "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP,  "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP,    "blk.%d.ffn_up_shexp" },
+        },
+    },
    {
        LLM_ARCH_MLLAMA,
        {
@ -594,6 +637,45 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_FFN_UP_SHEXP,       "blk.%d.ffn_up_shexp" },
        },
    },
+    {
+        LLM_ARCH_QWEN3,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_QWEN3MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,         "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,        "output_norm" },
+            { LLM_TENSOR_OUTPUT,             "output" },
+            { LLM_TENSOR_ATTN_NORM,          "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,             "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM,        "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K,             "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM,        "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V,             "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,           "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,           "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP,       "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS,      "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,      "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,        "blk.%d.ffn_up_exps" },
+        },
+    },
    {
        LLM_ARCH_PHI2,
        {
@ -811,9 +893,12 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
        {
            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm" },
            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
            { LLM_TENSOR_ATTN_POST_NORM,  "blk.%d.post_attention_norm" },
@ -1073,6 +1158,22 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_FFN_EXP_PROBS_B,    "blk.%d.exp_probs_b" },
        },
    },
+    {
+        LLM_ARCH_PLM,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,         "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,        "output_norm" },
+            { LLM_TENSOR_ATTN_NORM,          "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,             "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_KV_A_MQA,      "blk.%d.attn_kv_a_mqa" },
+            { LLM_TENSOR_ATTN_KV_A_NORM,     "blk.%d.attn_kv_a_norm" },
+            { LLM_TENSOR_ATTN_KV_B,          "blk.%d.attn_kv_b" },
+            { LLM_TENSOR_ATTN_OUT,           "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,           "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_DOWN,           "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,             "blk.%d.ffn_up" },
+        },
+    },
    {
        LLM_ARCH_CHATGLM,
        {
@ -1091,6 +1192,25 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
        },
    },
+    {
+        LLM_ARCH_GLM4,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_ATTN_POST_NORM,  "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_FFN_POST_NORM,   "blk.%d.post_ffw_norm" },
+        },
+    },
    {
        LLM_ARCH_BITNET,
        {
@ -1275,6 +1395,74 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_FFN_UP,                    "blk.%d.ffn_up" },
        },
    },
+    {
+        LLM_ARCH_RWKV7,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,                "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM,           "token_embd_norm" },
+            { LLM_TENSOR_OUTPUT_NORM,               "output_norm" },
+            { LLM_TENSOR_OUTPUT,                    "output" },
+            { LLM_TENSOR_ATTN_NORM,                 "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_NORM_2,               "blk.%d.attn_norm_2" },
+            { LLM_TENSOR_TIME_MIX_W0,               "blk.%d.time_mix_w0" },
+            { LLM_TENSOR_TIME_MIX_W1,               "blk.%d.time_mix_w1" },
+            { LLM_TENSOR_TIME_MIX_W2,               "blk.%d.time_mix_w2" },
+            { LLM_TENSOR_TIME_MIX_A0,               "blk.%d.time_mix_a0" },
+            { LLM_TENSOR_TIME_MIX_A1,               "blk.%d.time_mix_a1" },
+            { LLM_TENSOR_TIME_MIX_A2,               "blk.%d.time_mix_a2" },
+            { LLM_TENSOR_TIME_MIX_V0,               "blk.%d.time_mix_v0" },
+            { LLM_TENSOR_TIME_MIX_V1,               "blk.%d.time_mix_v1" },
+            { LLM_TENSOR_TIME_MIX_V2,               "blk.%d.time_mix_v2" },
+            { LLM_TENSOR_TIME_MIX_G1,               "blk.%d.time_mix_g1" },
+            { LLM_TENSOR_TIME_MIX_G2,               "blk.%d.time_mix_g2" },
+            { LLM_TENSOR_TIME_MIX_K_K,              "blk.%d.time_mix_k_k" },
+            { LLM_TENSOR_TIME_MIX_K_A,              "blk.%d.time_mix_k_a" },
+            { LLM_TENSOR_TIME_MIX_R_K,              "blk.%d.time_mix_r_k" },
+            { LLM_TENSOR_TIME_MIX_LERP_FUSED,       "blk.%d.time_mix_lerp_fused" },
+            { LLM_TENSOR_TIME_MIX_KEY,              "blk.%d.time_mix_key" },
+            { LLM_TENSOR_TIME_MIX_VALUE,            "blk.%d.time_mix_value" },
+            { LLM_TENSOR_TIME_MIX_RECEPTANCE,       "blk.%d.time_mix_receptance" },
+            { LLM_TENSOR_TIME_MIX_LN,               "blk.%d.time_mix_ln" },
+            { LLM_TENSOR_TIME_MIX_OUTPUT,           "blk.%d.time_mix_output" },
+            { LLM_TENSOR_CHANNEL_MIX_LERP_K,        "blk.%d.channel_mix_lerp_k" },
+            { LLM_TENSOR_CHANNEL_MIX_KEY,           "blk.%d.channel_mix_key" },
+            { LLM_TENSOR_CHANNEL_MIX_VALUE,         "blk.%d.channel_mix_value" },
+        },
+    },
+    {
+        LLM_ARCH_ARWKV7,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,                "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM,           "token_embd_norm" },
+            { LLM_TENSOR_OUTPUT_NORM,               "output_norm" },
+            { LLM_TENSOR_OUTPUT,                    "output" },
+            { LLM_TENSOR_ATTN_NORM,                 "blk.%d.attn_norm" },
+            { LLM_TENSOR_TIME_MIX_W0,               "blk.%d.time_mix_w0" },
+            { LLM_TENSOR_TIME_MIX_W1,               "blk.%d.time_mix_w1" },
+            { LLM_TENSOR_TIME_MIX_W2,               "blk.%d.time_mix_w2" },
+            { LLM_TENSOR_TIME_MIX_A0,               "blk.%d.time_mix_a0" },
+            { LLM_TENSOR_TIME_MIX_A1,               "blk.%d.time_mix_a1" },
+            { LLM_TENSOR_TIME_MIX_A2,               "blk.%d.time_mix_a2" },
+            { LLM_TENSOR_TIME_MIX_V0,               "blk.%d.time_mix_v0" },
+            { LLM_TENSOR_TIME_MIX_V1,               "blk.%d.time_mix_v1" },
+            { LLM_TENSOR_TIME_MIX_V2,               "blk.%d.time_mix_v2" },
+            { LLM_TENSOR_TIME_MIX_G1,               "blk.%d.time_mix_g1" },
+            { LLM_TENSOR_TIME_MIX_G2,               "blk.%d.time_mix_g2" },
+            { LLM_TENSOR_TIME_MIX_K_K,              "blk.%d.time_mix_k_k" },
+            { LLM_TENSOR_TIME_MIX_K_A,              "blk.%d.time_mix_k_a" },
+            { LLM_TENSOR_TIME_MIX_R_K,              "blk.%d.time_mix_r_k" },
+            { LLM_TENSOR_TIME_MIX_LERP_FUSED,       "blk.%d.time_mix_lerp_fused" },
+            { LLM_TENSOR_TIME_MIX_KEY,              "blk.%d.time_mix_key" },
+            { LLM_TENSOR_TIME_MIX_VALUE,            "blk.%d.time_mix_value" },
+            { LLM_TENSOR_TIME_MIX_RECEPTANCE,       "blk.%d.time_mix_receptance" },
+            { LLM_TENSOR_TIME_MIX_LN,               "blk.%d.time_mix_ln" },
+            { LLM_TENSOR_TIME_MIX_OUTPUT,           "blk.%d.time_mix_output" },
+            { LLM_TENSOR_FFN_NORM,                  "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,                  "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,                  "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,                    "blk.%d.ffn_up" },
+        },
+    },
    {
        LLM_ARCH_GRANITE,
        {
@ -1372,6 +1560,29 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_POS_NET_ATTN_OUT,  "posnet.%d.attn_output" },
        },
    },
+    {
+        LLM_ARCH_BAILINGMOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,         "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,        "output_norm" },
+            { LLM_TENSOR_OUTPUT,             "output" },
+            { LLM_TENSOR_ROPE_FREQS,         "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,          "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,             "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,             "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,             "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,           "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_GATE_INP,       "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM,           "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_EXPS,      "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,      "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,        "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+            { LLM_TENSOR_FFN_GATE_SHEXP,     "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP,     "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP,       "blk.%d.ffn_up_shexp" },
+        },
+    },
    {
        LLM_ARCH_MISTRAL3,
        {
@ -1468,6 +1679,12 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
    {LLM_TENSOR_SSM_OUT,                    {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_TIME_MIX_W1,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_TIME_MIX_W2,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_TIME_MIX_A1,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_TIME_MIX_A2,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_TIME_MIX_V1,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_TIME_MIX_V2,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_TIME_MIX_G1,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_TIME_MIX_G2,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_TIME_MIX_DECAY_W1,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_TIME_MIX_DECAY_W2,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_TIME_MIX_KEY,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
@ -1486,6 +1703,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
    {LLM_TENSOR_TIME_MIX_LN,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
    {LLM_TENSOR_CHANNEL_MIX_LERP_K,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
    {LLM_TENSOR_CHANNEL_MIX_LERP_R,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_TIME_MIX_K_K,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_TIME_MIX_K_A,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_TIME_MIX_R_K,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
    {LLM_TENSOR_TIME_MIX_LERP_W,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
    {LLM_TENSOR_TIME_MIX_LERP_K,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
    {LLM_TENSOR_TIME_MIX_LERP_V,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
@ -1493,6 +1713,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
    {LLM_TENSOR_TIME_MIX_LERP_G,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
    {LLM_TENSOR_TIME_MIX_LERP_FUSED,        {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
    {LLM_TENSOR_TIME_MIX_DECAY,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+    {LLM_TENSOR_TIME_MIX_W0,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+    {LLM_TENSOR_TIME_MIX_A0,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+    {LLM_TENSOR_TIME_MIX_V0,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
    {LLM_TENSOR_TIME_MIX_FIRST,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_RWKV_WKV6}},
    {LLM_TENSOR_ATTN_NORM,                  {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
    {LLM_TENSOR_ATTN_NORM_2,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
--- a/llama/llama.cpp/src/llama-arch.h
+++ b/llama/llama.cpp/src/llama-arch.h
@ -10,6 +10,7 @@

 enum llm_arch {
    LLM_ARCH_LLAMA,
+    LLM_ARCH_LLAMA4,
    LLM_ARCH_MLLAMA,
    LLM_ARCH_DECI,
    LLM_ARCH_FALCON,
@ -30,6 +31,8 @@ enum llm_arch {
    LLM_ARCH_QWEN2,
    LLM_ARCH_QWEN2MOE,
    LLM_ARCH_QWEN2VL,
+    LLM_ARCH_QWEN3,
+    LLM_ARCH_QWEN3MOE,
    LLM_ARCH_PHI2,
    LLM_ARCH_PHI3,
    LLM_ARCH_PHIMOE,
@ -56,6 +59,7 @@ enum llm_arch {
    LLM_ARCH_DEEPSEEK,
    LLM_ARCH_DEEPSEEK2,
    LLM_ARCH_CHATGLM,
+    LLM_ARCH_GLM4,
    LLM_ARCH_BITNET,
    LLM_ARCH_T5,
    LLM_ARCH_T5ENCODER,
@ -64,12 +68,16 @@ enum llm_arch {
    LLM_ARCH_EXAONE,
    LLM_ARCH_RWKV6,
    LLM_ARCH_RWKV6QWEN2,
+    LLM_ARCH_RWKV7,
+    LLM_ARCH_ARWKV7,
    LLM_ARCH_GRANITE,
    LLM_ARCH_GRANITE_MOE,
    LLM_ARCH_CHAMELEON,
    LLM_ARCH_SOLAR,
    LLM_ARCH_WAVTOKENIZER_DEC,
    LLM_ARCH_MISTRAL3,
+    LLM_ARCH_PLM,
+    LLM_ARCH_BAILINGMOE,
    LLM_ARCH_UNKNOWN,
 };

@ -78,6 +86,7 @@ enum llm_kv {
    LLM_KV_GENERAL_ARCHITECTURE,
    LLM_KV_GENERAL_QUANTIZATION_VERSION,
    LLM_KV_GENERAL_ALIGNMENT,
+    LLM_KV_GENERAL_FILE_TYPE,
    LLM_KV_GENERAL_NAME,
    LLM_KV_GENERAL_AUTHOR,
    LLM_KV_GENERAL_VERSION,
@ -116,6 +125,7 @@ enum llm_kv {
    LLM_KV_RESIDUAL_SCALE,
    LLM_KV_EMBEDDING_SCALE,
    LLM_KV_TOKEN_SHIFT_COUNT,
+    LLM_KV_INTERLEAVE_MOE_LAYER_STEP,

    LLM_KV_ATTENTION_HEAD_COUNT,
    LLM_KV_ATTENTION_HEAD_COUNT_KV,
@ -130,6 +140,10 @@ enum llm_kv {
    LLM_KV_ATTENTION_CAUSAL,
    LLM_KV_ATTENTION_Q_LORA_RANK,
    LLM_KV_ATTENTION_KV_LORA_RANK,
+    LLM_KV_ATTENTION_DECAY_LORA_RANK,
+    LLM_KV_ATTENTION_ICLR_LORA_RANK,
+    LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK,
+    LLM_KV_ATTENTION_GATE_LORA_RANK,
    LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
    LLM_KV_ATTENTION_SLIDING_WINDOW,
    LLM_KV_ATTENTION_SCALE,
@ -248,6 +262,8 @@ enum llm_tensor {
    LLM_TENSOR_ATTN_Q_NORM,
    LLM_TENSOR_ATTN_K_NORM,
    LLM_TENSOR_LAYER_OUT_NORM,
+    LLM_TENSOR_POST_ATTN_NORM,
+    LLM_TENSOR_POST_MLP_NORM,
    LLM_TENSOR_SSM_IN,
    LLM_TENSOR_SSM_CONV1D,
    LLM_TENSOR_SSM_X,
@ -255,8 +271,20 @@ enum llm_tensor {
    LLM_TENSOR_SSM_A,
    LLM_TENSOR_SSM_D,
    LLM_TENSOR_SSM_OUT,
+    LLM_TENSOR_TIME_MIX_W0,
    LLM_TENSOR_TIME_MIX_W1,
    LLM_TENSOR_TIME_MIX_W2,
+    LLM_TENSOR_TIME_MIX_A0,
+    LLM_TENSOR_TIME_MIX_A1,
+    LLM_TENSOR_TIME_MIX_A2,
+    LLM_TENSOR_TIME_MIX_V0,
+    LLM_TENSOR_TIME_MIX_V1,
+    LLM_TENSOR_TIME_MIX_V2,
+    LLM_TENSOR_TIME_MIX_G1,
+    LLM_TENSOR_TIME_MIX_G2,
+    LLM_TENSOR_TIME_MIX_K_K,
+    LLM_TENSOR_TIME_MIX_K_A,
+    LLM_TENSOR_TIME_MIX_R_K,
    LLM_TENSOR_TIME_MIX_LERP_X,
    LLM_TENSOR_TIME_MIX_LERP_W,
    LLM_TENSOR_TIME_MIX_LERP_K,
--- a/llama/llama.cpp/src/llama-batch.h
+++ b/llama/llama.cpp/src/llama-batch.h
@ -42,9 +42,9 @@ struct llama_sbatch {
    bool logits_all; // TODO: remove once lctx.logits_all is removed too

    // sorted indices into the batch
-    std::vector<size_t> ids;
+    std::vector<int64_t> ids;
    // batch indices of the output
-    std::vector<size_t> out_ids;
+    std::vector<int64_t> out_ids;
    std::vector<llama_sbatch_seq> seq;

    const llama_batch * batch = nullptr;
--- a/llama/llama.cpp/src/llama-chat.cpp
+++ b/llama/llama.cpp/src/llama-chat.cpp
@ -4,6 +4,7 @@

 #include <map>
 #include <sstream>
+#include <algorithm>

 #if __cplusplus >= 202000L
    #define LU8(x) (const char*)(u8##x)
@ -58,6 +59,9 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
    { "granite",           LLM_CHAT_TEMPLATE_GRANITE           },
    { "gigachat",          LLM_CHAT_TEMPLATE_GIGACHAT          },
    { "megrez",            LLM_CHAT_TEMPLATE_MEGREZ            },
+    { "yandex",            LLM_CHAT_TEMPLATE_YANDEX            },
+    { "bailing",           LLM_CHAT_TEMPLATE_BAILING           },
+    { "llama4",            LLM_CHAT_TEMPLATE_LLAMA4            },
 };

 llm_chat_template llm_chat_template_from_str(const std::string & name) {
@ -167,6 +171,12 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
        return LLM_CHAT_TEMPLATE_GIGACHAT;
    } else if (tmpl_contains("<|role_start|>")) {
        return LLM_CHAT_TEMPLATE_MEGREZ;
+    } else if (tmpl_contains(" Ассистент:")) {
+        return LLM_CHAT_TEMPLATE_YANDEX;
+    } else if (tmpl_contains("<role>ASSISTANT</role>") && tmpl_contains("'HUMAN'")) {
+        return LLM_CHAT_TEMPLATE_BAILING;
+    } else if (tmpl_contains("<|header_start|>") && tmpl_contains("<|header_end|>")) {
+        return LLM_CHAT_TEMPLATE_LLAMA4;
    }
    return LLM_CHAT_TEMPLATE_UNKNOWN;
 }
@ -566,7 +576,51 @@ int32_t llm_chat_apply_template(
        if (add_ass) {
            ss << "<|role_start|>assistant<|role_end|>";
        }
-    } else {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_YANDEX) {
+        // Yandex template ("\n\n" is defined as EOT token)
+
+        ss << "<s>";
+
+        for (size_t i = 0; i < chat.size(); i++) {
+            std::string role(chat[i]->role);
+            if (role == "user") {
+                ss << " Пользователь: " << chat[i]->content << "\n\n";
+            } else if (role == "assistant") {
+                ss << " Ассистент: " << chat[i]->content << "\n\n";
+            }
+        }
+
+        // Add generation prompt if needed
+        if (add_ass) {
+            ss << " Ассистент:[SEP]";
+        }
+    }  else if (tmpl == LLM_CHAT_TEMPLATE_BAILING) {
+        // Bailing (Ling) template
+        for (auto message : chat) {
+            std::string role(message->role);
+
+            if (role == "user") {
+                role = "HUMAN";
+            } else {
+                std::transform(role.begin(), role.end(), role.begin(), ::toupper);
+            }
+
+            ss << "<role>" << role << "</role>" << message->content;
+        }
+
+        if (add_ass) {
+            ss << "<role>ASSISTANT</role>";
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_LLAMA4) {
+        // Llama 4
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|header_start|>" << role << "<|header_end|>\n\n" << trim(message->content) << "<|eot|>";
+        }
+        if (add_ass) {
+            ss << "<|header_start|>assistant<|header_end|>\n\n";
+        }
+    }  else {
        // template not supported
        return -1;
    }
@ -584,4 +638,3 @@ int32_t llama_chat_builtin_templates(const char ** output, size_t len) {
    }
    return (int32_t) LLM_CHAT_TEMPLATES.size();
 }
-
--- a/llama/llama.cpp/src/llama-chat.h
+++ b/llama/llama.cpp/src/llama-chat.h
@ -38,6 +38,9 @@ enum llm_chat_template {
    LLM_CHAT_TEMPLATE_GRANITE,
    LLM_CHAT_TEMPLATE_GIGACHAT,
    LLM_CHAT_TEMPLATE_MEGREZ,
+    LLM_CHAT_TEMPLATE_YANDEX,
+    LLM_CHAT_TEMPLATE_BAILING,
+    LLM_CHAT_TEMPLATE_LLAMA4,
    LLM_CHAT_TEMPLATE_UNKNOWN,
 };

--- a/llama/llama.cpp/src/llama-context.cpp
+++ b/llama/llama.cpp/src/llama-context.cpp
--- a/llama/llama.cpp/src/llama-context.h
+++ b/llama/llama.cpp/src/llama-context.h
@ -3,66 +3,216 @@
 #include "llama.h"
 #include "llama-batch.h"
 #include "llama-cparams.h"
-#include "llama-model.h"
-#include "llama-kv-cache.h"
+#include "llama-graph.h"
 #include "llama-adapter.h"
+#include "llama-kv-cache.h"

 #include "ggml-cpp.h"

 #include <map>
-#include <unordered_map>
 #include <vector>
-#include <set>
+
+struct llama_model;
+struct llama_kv_cache;
+
+class llama_io_read_i;
+class llama_io_write_i;

 struct llama_context {
-    llama_context(const llama_model & model)
-        : model(model)
-        , t_start_us(model.t_start_us)
-        , t_load_us(model.t_load_us) {}
+    // init scheduler and compute buffers, reserve worst-case graphs
+    llama_context(
+            const llama_model & model,
+                  llama_context_params params);

-    const struct llama_model & model;
+    ~llama_context();

-    struct llama_cparams      cparams;
-    struct llama_sbatch       sbatch;  // TODO: revisit if needed
-    struct llama_kv_cache     kv_self;
-    struct llama_adapter_cvec cvec;
+    void synchronize();

-    std::unordered_map<struct llama_adapter_lora *, float> lora;
+    const llama_model & get_model() const;

-    std::vector<ggml_backend_ptr> backends;
-    std::vector<std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>> set_n_threads_fns;
+    uint32_t n_ctx()         const;
+    uint32_t n_ctx_per_seq() const;
+    uint32_t n_batch()       const;
+    uint32_t n_ubatch()      const;
+    uint32_t n_seq_max()     const;

-    ggml_backend_t backend_cpu = nullptr;
+    uint32_t n_threads()       const;
+    uint32_t n_threads_batch() const;

-    ggml_threadpool_t threadpool       = nullptr;
-    ggml_threadpool_t threadpool_batch = nullptr;
+          llama_kv_cache * get_kv_self();
+    const llama_kv_cache * get_kv_self() const;

-    bool has_evaluated_once = false;
+    void kv_self_update();

-    mutable int64_t t_start_us;
-    mutable int64_t t_load_us;
-    mutable int64_t t_p_eval_us = 0;
-    mutable int64_t t_eval_us   = 0;
+    enum llama_pooling_type pooling_type() const;

-    mutable int64_t t_compute_start_us = 0;
-    mutable int64_t n_queued_tokens = 0;
+    float * get_logits();
+    float * get_logits_ith(int32_t i);

-    mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
-    mutable int32_t n_eval   = 0; // number of eval calls
+    float * get_embeddings();
+    float * get_embeddings_ith(int32_t i);
+    float * get_embeddings_seq(llama_seq_id seq_id);

-    // host buffer for the model output (logits and embeddings)
-    ggml_backend_buffer_ptr buf_output;
+    void attach_threadpool(
+            ggml_threadpool_t threadpool,
+            ggml_threadpool_t threadpool_batch);
+
+    void detach_threadpool();
+
+    void set_n_threads(int32_t n_threads, int32_t n_threads_batch);
+
+    void set_abort_callback(bool (*abort_callback)(void * data), void * abort_callback_data);
+
+    void set_embeddings (bool value);
+    void set_causal_attn(bool value);
+    void set_warmup(bool value);
+    void set_cross_attn(bool value);
+
+    void set_adapter_lora(
+            llama_adapter_lora * adapter,
+            float scale);
+
+    bool rm_adapter_lora(
+            llama_adapter_lora * adapter);
+
+    void clear_adapter_lora();
+
+    bool apply_adapter_cvec(
+            const float * data,
+                 size_t   len,
+                int32_t   n_embd,
+                int32_t   il_start,
+                int32_t   il_end);
+
+    int encode(llama_batch & inp_batch);
+    int decode(llama_batch & inp_batch);
+
+    //
+    // state save/load
+    //
+
+    size_t state_get_size();
+    size_t state_get_data(      uint8_t * dst, size_t size);
+    size_t state_set_data(const uint8_t * src, size_t size);
+
+    size_t state_seq_get_size(llama_seq_id seq_id);
+    size_t state_seq_get_data(llama_seq_id seq_id,       uint8_t * dst, size_t size);
+    size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size);
+
+    bool state_load_file(
+            const char * filepath,
+           llama_token * tokens_out,
+                size_t   n_token_capacity,
+                size_t * n_token_count_out);
+
+    bool state_save_file(
+            const char * filepath,
+     const llama_token * tokens,
+                size_t   n_token_count);
+
+    size_t state_seq_load_file(
+          llama_seq_id   seq_id,
+            const char * filepath,
+           llama_token * tokens_out,
+                size_t   n_token_capacity,
+                size_t * n_token_count_out);
+
+    size_t state_seq_save_file(
+          llama_seq_id   seq_id,
+            const char * filepath,
+     const llama_token * tokens,
+                size_t   n_token_count);
+
+    //
+    // perf
+    //
+
+    llama_perf_context_data perf_get_data() const;
+    void perf_reset();
+
+private:
+    //
+    // output
+    //
+
+    // Make sure enough space is available for outputs.
+    // Returns max number of outputs for which space was reserved.
+    int32_t output_reserve(int32_t n_outputs);
+
+    // make the outputs have the same order they had in the user-provided batch
+    // TODO: maybe remove this
+    void output_reorder();
+
+    //
+    // graph
+    //
+
+    int32_t graph_max_nodes() const;
+
+    // zero-out inputs and create the ctx_compute for the compute graph
+    ggml_cgraph * graph_init();
+
+    llm_graph_result_ptr graph_build(
+            ggml_context * ctx,
+             ggml_cgraph * gf,
+      const llama_ubatch & ubatch,
+          llm_graph_type   gtype);
+
+    // returns the result of ggml_backend_sched_graph_compute_async execution
+    ggml_status graph_compute(
+            ggml_cgraph * gf,
+                   bool   batched);
+
+    llm_graph_cb graph_get_cb() const;
+
+    // used by kv_self_update()
+    ggml_tensor * build_rope_shift(
+        ggml_context * ctx0,
+        ggml_tensor * cur,
+        ggml_tensor * shift,
+        ggml_tensor * factors,
+              float   freq_base,
+              float   freq_scale,
+        ggml_backend_buffer * bbuf) const;
+
+    llm_graph_result_ptr build_kv_self_shift(
+            ggml_context * ctx0,
+            ggml_cgraph * gf) const;
+
+    llm_graph_result_ptr build_kv_self_defrag(
+            ggml_context * ctx0,
+            ggml_cgraph * gf,
+            const std::vector<struct llama_kv_defrag_move> & moves) const;
+
+    // TODO: read/write lora adapters and cvec
+    size_t state_write_data(llama_io_write_i & io);
+    size_t state_read_data (llama_io_read_i  & io);
+
+    size_t state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id);
+    size_t state_seq_read_data (llama_io_read_i  & io, llama_seq_id seq_id);
+
+    //
+    // members
+    //
+
+    const llama_model & model;
+
+    llama_cparams       cparams;
+    llama_adapter_cvec  cvec;
+    llama_adapter_loras loras;
+    llama_sbatch        sbatch;
+
+    llama_cross cross; // TODO: tmp for handling cross-attention - need something better probably
+
+    std::unique_ptr<llama_kv_cache_unified> kv_self;
+
+    // TODO: remove
+    bool logits_all = false;

    // decode output (2-dimensional array: [n_outputs][n_vocab])
    size_t  logits_size = 0; // capacity (of floats) for logits
    float * logits      = nullptr;

-    std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
-    size_t  output_size = 0; // capacity (of tokens positions) for the output buffers
-    int32_t n_outputs   = 0; // number of actually-used outputs in the current ubatch or last logical batch
-
-    bool logits_all = false;
-
    // embeddings output (2-dimensional array: [n_outputs][n_embd])
    // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
    size_t  embd_size = 0; // capacity (of floats) for embeddings
@ -72,59 +222,47 @@ struct llama_context {
    // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
    std::map<llama_seq_id, std::vector<float>> embd_seq;

-    // whether we are computing encoder output or decoder output
-    bool is_encoding = false;
+    int32_t n_outputs     = 0; // number of actually-used outputs in the current ubatch or last logical batch
+    int32_t n_outputs_max = 0; // capacity (of tokens positions) for the output buffers

-    // TODO: find a better way to accommodate mutli-dimension position encoding methods
-    // number of position id each token get, 1 for each token in most cases.
-    // when using m-rope, it will be 3 position ids per token to representing 3 dimension coordinate.
-    int n_pos_per_token = 1;
+    std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers

-    // output of the encoder part of the encoder-decoder models
-    std::vector<float> embd_enc;
-    std::vector<std::set<llama_seq_id>> seq_ids_enc;
-
-    // memory buffers used to evaluate the model
-    std::vector<uint8_t> buf_compute_meta;
    ggml_backend_sched_ptr sched;

+    ggml_backend_t backend_cpu = nullptr;
+    std::vector<ggml_backend_ptr> backends;
+
+    ggml_context_ptr ctx_compute;
+
+    ggml_threadpool_t threadpool       = nullptr;
+    ggml_threadpool_t threadpool_batch = nullptr;
+
    ggml_abort_callback abort_callback      = nullptr;
    void *              abort_callback_data = nullptr;

-    // input tensors
-    struct ggml_tensor * inp_tokens;        // I32 [n_batch]
-    struct ggml_tensor * inp_embd;          // F32 [n_embd, n_batch]
-    struct ggml_tensor * inp_pos;           // I32 [n_batch]
-    struct ggml_tensor * inp_out_ids;       // I32 [n_outputs]
-    struct ggml_tensor * inp_KQ_mask;       // F32 [kv_size, n_batch]
-    struct ggml_tensor * inp_KQ_mask_swa;   // F32 [kv_size, n_batch]
-    struct ggml_tensor * inp_K_shift;       // I32 [kv_size]
-    struct ggml_tensor * inp_mean;          // F32 [n_batch, n_batch]
-    struct ggml_tensor * inp_cls;           // I32 [n_batch]
-    struct ggml_tensor * inp_s_copy;        // I32 [kv_size]
-    struct ggml_tensor * inp_s_mask;        // F32 [1, n_kv]
-    struct ggml_tensor * inp_s_seq;         // I32 [n_kv, n_batch]
-    struct ggml_tensor * inp_pos_bucket;    // I32 [n_batch|n_kv, n_batch]
-    struct ggml_tensor * inp_embd_enc;      // F32 [n_embd, n_outputs_enc]
-    struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
+    std::vector<std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>> set_n_threads_fns;

-    struct ggml_tensor * inp_cross_attn_state; // F32 [4, n_embd, 1061]
+    // buffer types used for the compute buffer of each backend
+    std::vector<ggml_backend_t>             backend_ptrs;
+    std::vector<ggml_backend_buffer_type_t> backend_buft;
+
+    // memory buffers used to evaluate the model
+    std::vector<uint8_t> buf_compute_meta;
+
+    // host buffer for the model output (logits and embeddings)
+    ggml_backend_buffer_ptr buf_output;
+
+    bool has_evaluated_once = false;
+
+    // perf
+    mutable int64_t t_start_us  = 0;
+    mutable int64_t t_load_us   = 0;
+    mutable int64_t t_p_eval_us = 0;
+    mutable int64_t t_eval_us   = 0;
+
+    mutable int64_t t_compute_start_us = 0;
+    mutable int64_t n_queued_tokens    = 0;
+
+    mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
+    mutable int32_t n_eval   = 0; // number of eval calls
 };
-
-// TODO: make these methods of llama_context
-void llama_set_k_shift(struct llama_context & lctx);
-
-void llama_set_s_copy(struct llama_context & lctx);
-
-void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch);
-
-// Make sure enough space is available for outputs.
-// Returns max number of outputs for which space was reserved.
-size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs);
-
-// make the outputs have the same order they had in the user-provided batch
-void llama_output_reorder(struct llama_context & ctx);
-
-// For internal test use
-// TODO: remove
-const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(struct llama_context * ctx);
--- a/llama/llama.cpp/src/llama-cparams.h
+++ b/llama/llama.cpp/src/llama-cparams.h
@ -30,6 +30,7 @@ struct llama_cparams {
    bool flash_attn;
    bool no_perf;
    bool cross_attn;
+    bool warmup;

    enum llama_pooling_type pooling_type;

--- a/llama/llama.cpp/src/llama-grammar.cpp
+++ b/llama/llama.cpp/src/llama-grammar.cpp
@ -969,7 +969,7 @@ struct llama_grammar * llama_grammar_init_impl(
        /* .awaiting_trigger = */ false,
        /* .trigger_buffer = */   "",
        /* .trigger_tokens   = */ {},
-        /* .trigger_words    = */ {},
+        /* .trigger_patterns    = */ {},
    };
 }

@ -978,19 +978,15 @@ struct llama_grammar * llama_grammar_init_impl(
                      const char * grammar_str,
                      const char * grammar_root,
                              bool lazy,
-                     const char ** trigger_words,
-                            size_t num_trigger_words,
+                     const char ** trigger_patterns,
+                            size_t num_trigger_patterns,
               const llama_token * trigger_tokens,
                            size_t num_trigger_tokens) {
    llama_grammar_parser parser;

    // if there is a grammar, parse it
-    if (!parser.parse(grammar_str)) {
-        return nullptr;
-    }
-
-    // will be empty (default) if there are parse errors
-    if (parser.rules.empty()) {
+    // rules will be empty (default) if there are parse errors
+    if (!parser.parse(grammar_str) || parser.rules.empty()) {
        fprintf(stderr, "%s: failed to parse grammar\n", __func__);
        return nullptr;
    }
@ -1054,14 +1050,16 @@ struct llama_grammar * llama_grammar_init_impl(
    } while (true);

    std::vector<llama_token>    vec_trigger_tokens;
-    std::vector<std::string> vec_trigger_words;
+    std::vector<llama_grammar_trigger_pattern> vec_trigger_patterns;
    for (size_t i = 0; i < num_trigger_tokens; i++) {
        GGML_ASSERT(trigger_tokens != nullptr);
        vec_trigger_tokens.push_back(trigger_tokens[i]);
    }
-    for (size_t i = 0; i < num_trigger_words; i++) {
-        GGML_ASSERT(trigger_words != nullptr);
-        vec_trigger_words.push_back(trigger_words[i]);
+    for (size_t i = 0; i < num_trigger_patterns; i++) {
+        GGML_ASSERT(trigger_patterns != nullptr);
+        auto & trigger = vec_trigger_patterns.emplace_back();
+        trigger.pattern = trigger_patterns[i];
+        trigger.regex = std::regex(trigger.pattern);
    }

    // Important: vec_rules has to be moved here, not copied, because stacks contains
@ -1076,7 +1074,7 @@ struct llama_grammar * llama_grammar_init_impl(
        /* .awaiting_trigger = */ lazy,
        /* .trigger_buffer = */   "",
        std::move(vec_trigger_tokens),
-        std::move(vec_trigger_words),
+        std::move(vec_trigger_patterns),
    };
 }

@ -1089,7 +1087,7 @@ void llama_grammar_free_impl(struct llama_grammar * grammar) {
 }

 struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & grammar) {
-    llama_grammar * result = new llama_grammar {
+    auto * result = new llama_grammar {
        grammar.vocab,
        grammar.rules,
        grammar.stacks,
@ -1098,7 +1096,7 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
        grammar.awaiting_trigger,
        grammar.trigger_buffer,
        grammar.trigger_tokens,
-        grammar.trigger_words,
+        grammar.trigger_patterns,
    };

    // redirect elements in stacks to point to new rules
@ -1173,16 +1171,18 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
            LLAMA_LOG_DEBUG("Grammar triggered on token %u (`%s`)", token, piece.c_str());
            return;
        } else {
-            // TODO: consider a smarter incremental substring search algorithm (store last position to search from).
            grammar.trigger_buffer += piece;
-            for (const auto & word : grammar.trigger_words) {
-                auto pos = grammar.trigger_buffer.find(word);
-                if (pos != std::string::npos) {
+
+            std::smatch match;
+            for (const auto & trigger_pattern : grammar.trigger_patterns) {
+                if (std::regex_match(grammar.trigger_buffer, match, trigger_pattern.regex)) {
                    grammar.awaiting_trigger = false;
-                    auto constrained_str = grammar.trigger_buffer.substr(pos);
+                    // take everything from the start of the first capture group to the end of the string
+                    auto constrained_str = grammar.trigger_buffer.substr(match.position(1));
+                    // std::string constrained_str(match[1].first, grammar.trigger_buffer.end());
                    grammar.trigger_buffer.clear();
                    llama_grammar_accept_str(grammar, constrained_str);
-                    LLAMA_LOG_DEBUG("Grammar triggered on word `%s`", word.c_str());
+                    LLAMA_LOG_DEBUG("Grammar triggered on regex: '%s'\n", constrained_str.c_str());
                    return;
                }
            }
--- a/llama/llama.cpp/src/llama-grammar.h
+++ b/llama/llama.cpp/src/llama-grammar.h
@ -3,6 +3,7 @@
 #include "llama.h"

 #include <map>
+#include <regex>
 #include <string>
 #include <vector>

@ -105,6 +106,11 @@ struct llama_grammar_parser {
    void print(FILE * file);
 };

+struct llama_grammar_trigger_pattern { // one lazy-grammar trigger: the pattern text together with its compiled regex
+    std::string pattern; // pattern text, copied from the trigger_patterns argument of llama_grammar_init_impl
+    std::regex  regex;   // std::regex built from `pattern`; matched against the trigger buffer in llama_grammar_accept_impl
+};
+
 struct llama_grammar {
    // note: allow null vocab for testing (not great)
    const llama_vocab * vocab;
@ -122,7 +128,10 @@ struct llama_grammar {
    bool                     awaiting_trigger = false; // Initialized to true for lazy grammars only
    std::string              trigger_buffer;           // Output buffered by lazy grammar. Will be cleared once trigger is found.
    std::vector<llama_token> trigger_tokens;           // Tokens that trigger a lazy grammar, or tokens to force printing of (even if special).
-    std::vector<std::string> trigger_words;
+    std::vector<llama_grammar_trigger_pattern>
+                             trigger_patterns;         // Regular expressions that trigger a lazy grammar. Must be a full match of the entire generated
+                                                       // string, and the grammar will be given the string from the first match group onwards.
+
 };

 //
@ -141,8 +150,8 @@ struct llama_grammar * llama_grammar_init_impl(
                      const char * grammar_str,
                      const char * grammar_root,
                              bool lazy,
-                     const char ** trigger_words,
-                            size_t num_trigger_words,
+                     const char ** trigger_patterns,
+                            size_t num_trigger_patterns,
               const llama_token * trigger_tokens,
                            size_t num_trigger_tokens);

--- a/llama/llama.cpp/src/llama-graph.cpp
+++ b/llama/llama.cpp/src/llama-graph.cpp
--- a/llama/llama.cpp/src/llama-graph.h
+++ b/llama/llama.cpp/src/llama-graph.h
@ -0,0 +1,604 @@
+#pragma once
+
+#include "llama-arch.h"
+#include "llama-hparams.h"
+#include "llama-adapter.h"
+
+#include <cstdint>
+#include <vector>
+#include <memory>
+#include <set>
+#include <functional>
+
+struct ggml_cgraph;
+struct ggml_context;
+struct ggml_tensor;
+
+struct llama_ubatch;
+struct llama_cparams;
+
+class llama_memory_i;
+class llama_kv_cache_unified;
+
+// certain models (typically multi-modal) can produce different types of graphs
+enum llm_graph_type { // passed as the gtype argument of llama_context::graph_build
+    LLM_GRAPH_TYPE_DEFAULT, // first enumerator (value 0)
+    LLM_GRAPH_TYPE_ENCODER, // presumably the encoder pass of an encoder-decoder model - TODO confirm
+    LLM_GRAPH_TYPE_DECODER, // presumably the decoder pass of an encoder-decoder model - TODO confirm
+};
+
+enum llm_ffn_op_type { // feed-forward op selector; names suggest activation functions - NOTE(review): confirm against build_ffn
+    LLM_FFN_SILU,
+    LLM_FFN_GELU,
+    LLM_FFN_RELU,
+    LLM_FFN_RELU_SQR, // presumably squared ReLU - TODO confirm
+    LLM_FFN_SWIGLU,
+};
+
+enum llm_ffn_gate_type { // how ffn_gate is arranged relative to ffn_up
+    LLM_FFN_SEQ, // presumably: ffn_gate is applied in sequence rather than in parallel - TODO confirm
+    LLM_FFN_PAR, // ffn_gate is parallel to ffn_up
+};
+
+enum llm_norm_type { // normalization selector taken by llm_graph_context::build_norm
+    LLM_NORM,       // presumably standard layer normalization - TODO confirm
+    LLM_NORM_RMS,   // presumably RMS normalization - TODO confirm
+    LLM_NORM_GROUP, // presumably group normalization - TODO confirm
+};
+
+// TODO: tmp - need something better to pass the data from the encoder to the decoder
+struct llama_cross { // held by const pointer in llm_graph_input_cross_embd and llm_graph_input_attn_cross
+    // the output embeddings from the encoder as a ggml tensor
+    // TODO: this needs more work to be correct, for now copy the embeddings data to host memory
+    //       ref: https://github.com/ggml-org/llama.cpp/pull/11213#discussion_r1969892524
+    //ggml_tensor * t_embd = nullptr;
+
+    int64_t n_embd = 0; // presumably the embedding width of the encoder output - TODO confirm
+    int64_t n_enc  = 0; // presumably the number of encoder output rows (cf. n_outputs_enc in the cross_embd shape) - TODO confirm
+
+    // embeddings data copied to host memory (tmp)
+    std::vector<float> v_embd;
+
+    // needed to construct the cross-attention mask in the decoder
+    std::vector<std::set<llama_seq_id>> seq_ids_enc;
+};
+
+//
+// llm_graph_input
+//
+
+class llm_graph_input_i { // abstract base for graph inputs; each implementation provides set_input()
+public:
+    virtual ~llm_graph_input_i() = default; // virtual so inputs can be destroyed through llm_graph_input_ptr
+
+    virtual void set_input(const llama_ubatch * ubatch) = 0; // invoked for every stored input by llm_graph_result::set_inputs()
+};
+
+using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>; // owning handle; llm_graph_result::inputs stores these
+
+
+class llm_graph_input_embd : public llm_graph_input_i { // graph input holding the token-id / embedding tensors
+public:
+    llm_graph_input_embd()          = default;
+    virtual ~llm_graph_input_embd() = default;
+
+    void set_input(const llama_ubatch * ubatch) override; // declared here, defined out of line
+
+    ggml_tensor * tokens = nullptr; // I32 [n_batch]
+    ggml_tensor * embd   = nullptr; // F32 [n_embd, n_batch]
+    ggml_tensor * cross_attn_state = nullptr; // F32 [4, n_embd, 1061]; defaults to null like the sibling tensors
+};
+
+class llm_graph_input_pos : public llm_graph_input_i { // graph input holding the position tensor
+public:
+    llm_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {} // stores the per-token position count
+    virtual ~llm_graph_input_pos() = default;
+
+    void set_input(const llama_ubatch * ubatch) override; // declared here, defined out of line
+
+    ggml_tensor * pos = nullptr; // I32 [n_batch]
+
+    const int64_t n_pos_per_token = 1; // always set by the constructor; the in-class default of 1 is never used
+};
+
+// temperature tuning, used by llama4
+class llm_graph_input_attn_temp : public llm_graph_input_i { // graph input holding the attention scale tensor
+public:
+    llm_graph_input_attn_temp(int64_t n_pos_per_token, uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
+        : n_pos_per_token(n_pos_per_token), n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
+    virtual ~llm_graph_input_attn_temp() = default;
+
+    void set_input(const llama_ubatch * ubatch) override; // declared here, defined out of line
+
+    ggml_tensor * attn_scale = nullptr; // F32 [n_batch]
+
+    const int64_t n_pos_per_token = 1; // always set by the constructor; the in-class default of 1 is never used
+
+    const uint32_t n_attn_temp_floor_scale; // fixed at construction
+    const float    f_attn_temp_scale;       // fixed at construction
+};
+
+class llm_graph_input_pos_bucket : public llm_graph_input_i { // graph input holding the position-bucket tensor
+public:
+    llm_graph_input_pos_bucket(const llama_hparams & hparams) : hparams(hparams) {} // binds the reference member
+    virtual ~llm_graph_input_pos_bucket() = default;
+
+    void set_input(const llama_ubatch * ubatch) override; // declared here, defined out of line
+
+    ggml_tensor * pos_bucket = nullptr; // I32 [n_batch, n_batch]
+
+    const llama_hparams & hparams; // stored as a reference, not a copy: the referenced object must outlive this input
+};
+
+class llm_graph_input_pos_bucket_kv : public llm_graph_input_i { // position-bucket input variant that also holds a KV cache pointer
+public:
+    llm_graph_input_pos_bucket_kv(
+            const llama_hparams & hparams,
+            const llama_kv_cache_unified * kv_self) : hparams(hparams), kv_self(kv_self) {}
+    virtual ~llm_graph_input_pos_bucket_kv() = default;
+
+    void set_input(const llama_ubatch * ubatch) override; // declared here, defined out of line
+
+    ggml_tensor * pos_bucket = nullptr; // I32 [n_kv, n_batch]
+
+    const llama_hparams & hparams;          // stored as a reference, not a copy
+    const llama_kv_cache_unified * kv_self; // raw pointer as passed to the constructor; never deleted in this class
+};
+
+class llm_graph_input_out_ids : public llm_graph_input_i { // graph input holding the output-id tensor
+public:
+    llm_graph_input_out_ids(
+            const llama_hparams & hparams,
+            const llama_cparams & cparams,
+            int32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {}
+    virtual ~llm_graph_input_out_ids() = default;
+
+    void set_input(const llama_ubatch * ubatch) override; // declared here, defined out of line
+
+    ggml_tensor * out_ids = nullptr; // I32 [n_outputs]; null by default so an unassigned tensor is detectable
+
+    const llama_hparams & hparams; // stored as a reference, not a copy
+    const llama_cparams & cparams; // stored as a reference, not a copy
+
+    const int32_t n_outputs; // fixed at construction
+};
+
+class llm_graph_input_mean : public llm_graph_input_i { // graph input holding the `mean` tensor
+public:
+    llm_graph_input_mean(const llama_cparams & cparams) : cparams(cparams) {} // binds the reference member
+    virtual ~llm_graph_input_mean() = default;
+
+    void set_input(const llama_ubatch * ubatch) override; // declared here, defined out of line
+
+    ggml_tensor * mean = nullptr; // F32 [n_batch, n_batch]; null by default so an unassigned tensor is detectable
+
+    const llama_cparams & cparams; // stored as a reference, not a copy
+};
+
+class llm_graph_input_cls : public llm_graph_input_i { // graph input holding the `cls` tensor
+public:
+    llm_graph_input_cls(const llama_cparams & cparams) : cparams(cparams) {} // binds the reference member
+    virtual ~llm_graph_input_cls() = default;
+
+    void set_input(const llama_ubatch * ubatch) override; // declared here, defined out of line
+
+    ggml_tensor * cls = nullptr; // I32 [n_batch]; null by default so an unassigned tensor is detectable
+
+    const llama_cparams & cparams; // stored as a reference, not a copy
+};
+
+class llm_graph_input_s_copy : public llm_graph_input_i { // graph input holding the `s_copy` tensor
+public:
+    llm_graph_input_s_copy(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {} // stores the pointer as given
+    virtual ~llm_graph_input_s_copy() = default;
+
+    void set_input(const llama_ubatch * ubatch) override; // declared here, defined out of line
+
+    ggml_tensor * s_copy = nullptr; // I32 [kv_size]; null by default so an unassigned tensor is detectable
+
+    const llama_kv_cache_unified * kv_self; // raw pointer as passed to the constructor; never deleted in this class
+};
+
+class llm_graph_input_s_mask : public llm_graph_input_i { // graph input holding the `s_mask` tensor
+public:
+    llm_graph_input_s_mask(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {} // stores the pointer as given
+    virtual ~llm_graph_input_s_mask() = default;
+
+    void set_input(const llama_ubatch * ubatch) override; // declared here, defined out of line
+
+    ggml_tensor * s_mask = nullptr; // F32 [1, n_kv]; null by default so an unassigned tensor is detectable
+
+    const llama_kv_cache_unified * kv_self; // raw pointer as passed to the constructor; never deleted in this class
+};
+
+class llm_graph_input_cross_embd : public llm_graph_input_i { // graph input holding the `cross_embd` tensor
+public:
+    llm_graph_input_cross_embd(
+            const llama_cross * cross) : cross(cross) {}
+    virtual ~llm_graph_input_cross_embd() = default;
+
+    void set_input(const llama_ubatch * ubatch) override; // declared here, defined out of line
+
+    ggml_tensor * cross_embd = nullptr; // F32 [n_embd, n_outputs_enc]; null by default so an unassigned tensor is detectable
+
+    const llama_cross * cross; // raw pointer as passed to the constructor; never deleted in this class
+};
+
+class llm_graph_input_attn_no_cache : public llm_graph_input_i { // attention-mask input that holds no KV cache pointer
+public:
+    llm_graph_input_attn_no_cache(const llama_hparams & hparams, const llama_cparams & cparams) :
+        hparams(hparams),
+        cparams(cparams) {
+    }
+    ~llm_graph_input_attn_no_cache() = default; // still virtual: the base class destructor is virtual
+
+    void set_input(const llama_ubatch * ubatch) override; // declared here, defined out of line
+
+    ggml_tensor * get_kq_mask() const { return kq_mask_cnv; } // returns the _cnv tensor, not kq_mask itself
+
+    ggml_tensor * kq_mask     = nullptr; // F32 [n_tokens, n_batch]
+    ggml_tensor * kq_mask_cnv = nullptr; //     [n_tokens, n_batch] (presumably a type-converted view of kq_mask - TODO confirm)
+
+    const llama_hparams & hparams; // stored as a reference, not a copy
+    const llama_cparams & cparams; // stored as a reference, not a copy
+};
+
+class llm_graph_input_attn_kv_unified : public llm_graph_input_i { // attention-mask input that also holds a unified KV cache pointer
+public:
+    llm_graph_input_attn_kv_unified(
+            const llama_hparams & hparams,
+            const llama_cparams & cparams,
+            const llama_kv_cache_unified * kv_self) :
+        hparams(hparams),
+        cparams(cparams),
+        kv_self(kv_self) {
+    }
+    ~llm_graph_input_attn_kv_unified() = default; // still virtual: the base class destructor is virtual
+
+    void set_input(const llama_ubatch * ubatch) override; // declared here, defined out of line
+
+    ggml_tensor * get_kq_mask()     const { return self_kq_mask_cnv; }     // returns the _cnv tensor, not self_kq_mask
+    ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; } // returns the _cnv tensor, not self_kq_mask_swa
+
+    ggml_tensor * self_kq_mask         = nullptr; // F32 [n_kv, n_batch]
+    ggml_tensor * self_kq_mask_cnv     = nullptr; //     [n_kv, n_batch]
+    ggml_tensor * self_kq_mask_swa     = nullptr; // F32 [n_kv, n_batch]
+    ggml_tensor * self_kq_mask_swa_cnv = nullptr; //     [n_kv, n_batch]
+
+    const llama_hparams & hparams; // stored as a reference, not a copy
+    const llama_cparams & cparams; // stored as a reference, not a copy
+
+    const llama_kv_cache_unified * kv_self; // raw pointer as passed to the constructor; never deleted in this class
+};
+
+class llm_graph_input_attn_cross : public llm_graph_input_i { // cross-attention mask input backed by a llama_cross
+public:
+    llm_graph_input_attn_cross(const llama_cross * cross) : cross(cross) {} // stores the pointer as given
+    ~llm_graph_input_attn_cross() = default; // still virtual: the base class destructor is virtual
+
+    void set_input(const llama_ubatch * ubatch) override; // declared here, defined out of line
+
+    ggml_tensor * get_kq_mask_cross() const { return cross_kq_mask_cnv; } // returns the _cnv tensor, not cross_kq_mask
+
+    ggml_tensor * cross_kq_mask     = nullptr; // F32 [n_outputs_enc, n_batch]
+    ggml_tensor * cross_kq_mask_cnv = nullptr; // F32 [n_outputs_enc, n_batch]
+
+    const llama_cross * cross = nullptr; // always overwritten by the constructor
+};
+
+class llm_graph_input_cross_attn_state : public llm_graph_input_i { // graph input holding the `cross_attn_state` tensor
+public:
+    llm_graph_input_cross_attn_state()          = default;
+    virtual ~llm_graph_input_cross_attn_state() = default;
+
+    void set_input(const llama_ubatch * ubatch) override; // declared here, defined out of line
+
+    ggml_tensor * cross_attn_state = nullptr; // F32 [4, n_embd, 1061]; null by default so an unassigned tensor is detectable
+};
+
+//
+// llm_graph_result
+//
+
+// these objects deliver the result from the graph build process back to the llama_context
+// note that the input tensors created for the graph are referenced here - the goal is to be able to populate their
+//   specific data, by calling the set_inputs() method
+// along with the input tensors, the object also provides commonly used outputs tensors, such as logits, embeddings, etc.
+//   these are used by the llama_context to extract the relevant data, based on the compute parameters
+
+class llm_graph_result_i { // abstract view of a built graph: accessors for output tensors plus input population
+public:
+    virtual ~llm_graph_result_i() = default; // virtual so results can be destroyed through llm_graph_result_ptr
+
+    virtual ggml_tensor * get_logits()      = 0; // logits output tensor
+    virtual ggml_tensor * get_embd()        = 0; // embeddings output tensor
+    virtual ggml_tensor * get_embd_pooled() = 0; // pooled embeddings output tensor
+
+    virtual void set_inputs(const llama_ubatch * ubatch) = 0; // populate the graph's input tensors from the given ubatch
+};
+
+using llm_graph_result_ptr = std::unique_ptr<llm_graph_result_i>; // owning handle; returned by llama_context::graph_build
+
+
+class llm_graph_result : public llm_graph_result_i { // concrete result: owns the inputs and exposes the output nodes
+public:
+    virtual ~llm_graph_result() = default;
+
+    ggml_tensor * get_logits() override { return this->t_logits; }
+    ggml_tensor * get_embd() override { return this->t_embd; }
+    ggml_tensor * get_embd_pooled() override { return this->t_embd_pooled; }
+
+    void set_inputs(const llama_ubatch * ubatch) override { // forward the ubatch to every registered input, in insertion order
+        for (size_t i = 0; i < inputs.size(); ++i) {
+            inputs[i]->set_input(ubatch);
+        }
+    }
+
+    llm_graph_input_i * add_input(llm_graph_input_ptr input) { // take ownership; hand back a non-owning pointer
+        inputs.push_back(std::move(input));
+        return inputs.back().get();
+    }
+
+    // output nodes of interest; each stays null unless assigned
+    ggml_tensor * t_logits      = nullptr;
+    ggml_tensor * t_embd        = nullptr;
+    ggml_tensor * t_embd_pooled = nullptr;
+
+    std::vector<llm_graph_input_ptr> inputs; // owned inputs, appended by add_input()
+};
+
+//
+// llm_graph_context
+//
+
+// callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.); `il` is presumably the layer index - TODO confirm
+using llm_graph_cb = std::function<void(const llama_ubatch & ubatch, ggml_tensor * cur, const char * name, int il)>;
+
+struct llm_graph_params { // argument bundle accepted by the llm_graph_context constructor
+    ggml_context * ctx;
+
+    const llm_arch arch;
+
+    const llama_hparams & hparams; // reference members: the referenced objects must outlive this struct
+    const llama_cparams & cparams;
+    const llama_ubatch  & ubatch;
+
+    ggml_backend_sched * sched;
+    ggml_backend * backend_cpu;
+
+    const llama_adapter_cvec  * cvec;   // the four pointers below are raw pointers to const; nothing here deletes them
+    const llama_adapter_loras * loras;
+    const llama_memory_i      * memory;
+    const llama_cross         * cross;
+
+    int32_t n_outputs;
+
+    const llm_graph_cb & cb; // held by reference, not copied
+};
+
+struct llm_graph_context { // graph-building state plus the build_* helpers; constructed from llm_graph_params
+    const llm_arch arch;
+
+    const llama_hparams & hparams; // reference members: the referenced objects must outlive this context
+    const llama_cparams & cparams;
+    const llama_ubatch  & ubatch;
+
+    const int64_t n_embd;
+    const int64_t n_layer;
+    const int64_t n_rot;
+    const int64_t n_ctx;       // user-specified context size (can be different from n_ctx_train)
+    const int64_t n_ctx_per_seq;
+    const int64_t n_head;
+    const int64_t n_head_kv;
+    const int64_t n_embd_head_k;
+    const int64_t n_embd_k_gqa;
+    const int64_t n_embd_head_v;
+    const int64_t n_embd_v_gqa;
+    const int64_t n_expert;
+    const int64_t n_expert_used;
+
+    const float freq_base;
+    const float freq_scale;
+    const float ext_factor;
+    const float attn_factor;
+    const float beta_fast;
+    const float beta_slow;
+    const float norm_eps;
+    const float norm_rms_eps;
+
+    const int32_t n_tokens;
+    const int32_t n_outputs;
+    const int32_t n_ctx_orig; // yarn
+
+    const enum llama_pooling_type pooling_type;
+    const enum llama_rope_type    rope_type;
+
+    ggml_context * ctx0 = nullptr;
+
+    ggml_backend_sched * sched;
+
+    ggml_backend * backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?
+
+    const llama_adapter_cvec  * cvec;
+    const llama_adapter_loras * loras;
+    const llama_memory_i      * memory;
+    const llama_cross         * cross;
+
+    const llm_graph_cb & cb_func; // stored by reference, not copied
+
+    std::unique_ptr<llm_graph_result> res; // owned by this context
+
+    llm_graph_context(const llm_graph_params & params);
+
+    int64_t n_pos_per_token() const;
+
+    void cb(ggml_tensor * cur, const char * name, int il) const; // NOTE(review): presumably forwards to cb_func together with ubatch - confirm in the .cpp
+
+    //
+    // common
+    //
+
+    ggml_tensor * build_cvec(
+             ggml_tensor * cur,
+                     int   il) const;
+
+    // do mat_mul, while optionally apply lora
+    ggml_tensor * build_lora_mm(
+              ggml_tensor * w,
+              ggml_tensor * cur) const;
+
+    // do mat_mul_id, while optionally apply lora
+    ggml_tensor * build_lora_mm_id(
+              ggml_tensor * w,   // ggml_tensor * as
+              ggml_tensor * cur, // ggml_tensor * b
+              ggml_tensor * ids) const;
+
+    ggml_tensor * build_norm(
+             ggml_tensor * cur,
+             ggml_tensor * mw,
+             ggml_tensor * mb,
+           llm_norm_type   type,
+                     int   il) const;
+
+    ggml_tensor * build_ffn(
+             ggml_tensor * cur,
+             ggml_tensor * up,
+             ggml_tensor * up_b,
+             ggml_tensor * up_s,
+             ggml_tensor * gate,
+             ggml_tensor * gate_b,
+             ggml_tensor * gate_s,
+             ggml_tensor * down,
+             ggml_tensor * down_b,
+             ggml_tensor * down_s,
+             ggml_tensor * act_scales,
+         llm_ffn_op_type   type_op,
+       llm_ffn_gate_type   type_gate,
+                     int   il) const;
+
+    ggml_tensor * build_moe_ffn(
+             ggml_tensor * cur,
+             ggml_tensor * gate_inp,
+             ggml_tensor * up_exps,
+             ggml_tensor * gate_exps,
+             ggml_tensor * down_exps,
+             ggml_tensor * exp_probs_b,
+                 int64_t   n_expert,
+                 int64_t   n_expert_used,
+         llm_ffn_op_type   type_op,
+                    bool   norm_w,
+                    bool   scale_w,
+                   float   w_scale,
+            llama_expert_gating_func_type gating_op,
+                     int   il) const;
+
+    //
+    // inputs
+    //
+
+    ggml_tensor * build_inp_embd(ggml_tensor * tok_embd) const;
+    ggml_tensor * build_inp_pos() const;
+    ggml_tensor * build_inp_attn_scale() const;
+    ggml_tensor * build_inp_out_ids() const;
+    ggml_tensor * build_inp_mean() const;
+    ggml_tensor * build_inp_cls() const;
+    ggml_tensor * build_inp_s_copy() const;
+    ggml_tensor * build_inp_s_mask() const;
+    ggml_tensor * build_inp_cross_attn_state() const;
+
+    ggml_tensor * build_inp_cross_embd() const;
+    ggml_tensor * build_inp_pos_bucket_enc() const;
+    ggml_tensor * build_inp_pos_bucket_dec() const;
+    ggml_tensor * build_pos_bias(ggml_tensor * pos_bucket, ggml_tensor * attn_rel_b) const;
+
+    //
+    // attention
+    //
+
+    ggml_tensor * build_attn_mha(
+             ggml_cgraph * gf,
+             ggml_tensor * q, // [n_embd_head_q, n_tokens, n_head_q]
+             ggml_tensor * k, // [n_embd_head_k, n_tokens, n_head_k]
+             ggml_tensor * v, // [n_embd_head_v, n_tokens, n_head_v] (v_trans == false)
+             ggml_tensor * kq_b,
+             ggml_tensor * kq_mask,
+                    bool   v_trans,
+                   float   kq_scale) const;
+
+    llm_graph_input_attn_no_cache * build_attn_inp_no_cache() const;
+
+    ggml_tensor * build_attn( // overload taking a llm_graph_input_attn_no_cache input
+            llm_graph_input_attn_no_cache * inp,
+            ggml_cgraph * gf,
+            ggml_tensor * wo,
+            ggml_tensor * wo_b,
+            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
+            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
+            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
+            ggml_tensor * kq_b,
+                  float   kq_scale,
+                    int   il) const;
+
+    llm_graph_input_attn_kv_unified * build_attn_inp_kv_unified() const;
+
+    ggml_tensor * build_attn( // overload taking a llm_graph_input_attn_kv_unified input
+            llm_graph_input_attn_kv_unified * inp,
+            ggml_cgraph * gf,
+            ggml_tensor * wo,
+            ggml_tensor * wo_b,
+            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
+            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
+            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
+            ggml_tensor * kq_b,
+                  float   kq_scale,
+                    int   il) const;
+
+    llm_graph_input_attn_cross * build_attn_inp_cross() const;
+
+    ggml_tensor * build_attn( // overload taking a llm_graph_input_attn_cross input
+            llm_graph_input_attn_cross * inp,
+            ggml_cgraph * gf,
+            ggml_tensor * wo,
+            ggml_tensor * wo_b,
+            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
+            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
+            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
+            ggml_tensor * kq_b,
+                  float   kq_scale,
+                    int   il) const;
+
+    //
+    // recurrent
+    //
+
+    ggml_tensor * build_copy_mask_state(
+             ggml_cgraph * gf,
+             ggml_tensor * s,
+             ggml_tensor * state_copy,
+             ggml_tensor * state_mask,
+                 int32_t   n_state,
+                 int32_t   n_seqs) const;
+
+    ggml_tensor * build_rwkv_token_shift_load(
+             ggml_cgraph * gf,
+             ggml_tensor * state_copy,
+             ggml_tensor * state_mask,
+      const llama_ubatch & ubatch,
+                     int   il) const;
+
+    ggml_tensor * build_rwkv_token_shift_store(
+             ggml_tensor * token_shift,
+      const llama_ubatch & ubatch,
+                     int   il) const;
+
+    //
+    // pooling
+    //
+
+    void build_pooling(
+            ggml_cgraph * gf,
+            ggml_tensor * cls,
+            ggml_tensor * cls_b,
+            ggml_tensor * cls_out,
+            ggml_tensor * cls_out_b) const;
+};
--- a/llama/llama.cpp/src/llama-hparams.cpp
+++ b/llama/llama.cpp/src/llama-hparams.cpp
@ -2,8 +2,6 @@

 #include "ggml.h"

-#include <algorithm>
-
 uint32_t llama_hparams::n_head(uint32_t il) const {
    if (il < n_layer) {
        return n_head_arr[il];
@ -80,6 +78,14 @@ bool llama_hparams::n_bskcn(uint32_t n, uint32_t il) const {
    GGML_ABORT("fatal error");
 }

+bool llama_hparams::is_swa(uint32_t il) const { // true if layer il uses sliding-window attention
+    if (il >= n_layer) {
+        GGML_ABORT("fatal error");
+    }
+    const bool swa_on = n_swa > 0 && n_swa_pattern > 0; // checked first so the modulo below never divides by zero
+    return swa_on && (il % n_swa_pattern) < (n_swa_pattern - 1);
+}
+
 bool llama_hparams::cross_attention_layers(uint32_t il) const {
    return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
-}
+}
--- a/llama/llama.cpp/src/llama-hparams.h
+++ b/llama/llama.cpp/src/llama-hparams.h
@ -2,6 +2,8 @@

 #include "llama.h"

+#include <algorithm>
+
 #include <array>

 // bump if necessary
@ -36,6 +38,7 @@ struct llama_hparams {
    uint32_t n_layer;
    uint32_t n_rot;
    uint32_t n_swa = 0; // sliding window attention (SWA)
+    uint32_t n_swa_pattern = 1; // by default, all layers use non-sliding-window attention
    uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
    uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
    uint32_t n_expert = 0;
@ -79,10 +82,16 @@ struct llama_hparams {
    uint32_t time_decay_extra_dim   = 0;
    uint32_t wkv_head_size          = 0;
    uint32_t token_shift_count      = 2;
+    uint32_t n_lora_decay           = 0;
+    uint32_t n_lora_iclr            = 0;
+    uint32_t n_lora_value_res_mix   = 0;
+    uint32_t n_lora_gate            = 0;

    float    rope_attn_factor = 1.0f;
    float    rope_freq_base_train;
+    float    rope_freq_base_train_swa;
    float    rope_freq_scale_train;
+    float    rope_freq_scale_train_swa;
    uint32_t n_ctx_orig_yarn;
    float    rope_yarn_log_mul;

@ -109,6 +118,14 @@ struct llama_hparams {
    bool use_alibi     = false;
    bool attn_soft_cap = false;

+    uint32_t n_moe_layer_step        = 0;
+    bool     use_kq_norm             = true;
+    uint32_t n_attn_chunk            = 0;
+    // values below seem to be fixed on llama4
+    uint32_t n_no_rope_layer_step    = 4;
+    uint32_t n_attn_temp_floor_scale = 8192;
+    float    f_attn_temp_scale       = 0.1;
+
    // needed by encoder-decoder models (e.g. T5, FLAN-T5)
    // ref: https://github.com/ggerganov/llama.cpp/pull/8141
    llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
@ -143,6 +160,8 @@ struct llama_hparams {

    // cross attention layers
    bool cross_attention_layers(uint32_t il) const;
+
+    bool is_swa(uint32_t il) const;
 };

 static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
--- a/llama/llama.cpp/src/llama-io.cpp
+++ b/llama/llama.cpp/src/llama-io.cpp
@ -0,0 +1,15 @@
+#include "llama-io.h"
+
+// emit str as a 32-bit length prefix followed by its raw bytes (no terminator)
+void llama_io_write_i::write_string(const std::string & str) {
+    const uint32_t n_chars = str.size(); // length is narrowed to 32 bits
+    write(&n_chars, sizeof(n_chars));
+    write(str.data(), n_chars);
+}
+
+void llama_io_read_i::read_string(std::string & str) { // reads a 32-bit length, then that many bytes into str
+    uint32_t n_chars;
+    read_to(&n_chars, sizeof(n_chars));
+    const uint8_t * bytes = read(n_chars);
+    str.assign(reinterpret_cast<const char *>(bytes), n_chars);
+}
--- a/llama/llama.cpp/src/llama-io.h
+++ b/llama/llama.cpp/src/llama-io.h
@ -0,0 +1,35 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <string>
+
+struct ggml_tensor;
+
+class llama_io_write_i { // abstract write interface: write, write_tensor and n_bytes are pure virtual
+public:
+    llama_io_write_i() = default;
+    virtual ~llama_io_write_i() = default;
+
+    virtual void write(const void * src, size_t size) = 0; // write_string calls this with a pointer and a byte count
+    virtual void write_tensor(const ggml_tensor * tensor, size_t offset, size_t size) = 0;
+
+    // bytes written so far
+    virtual size_t n_bytes() = 0;
+
+    void write_string(const std::string & str); // non-virtual: writes a uint32_t length, then the characters, via write()
+};
+
+class llama_io_read_i { // abstract read interface: read, read_to and n_bytes are pure virtual
+public:
+    llama_io_read_i() = default;
+    virtual ~llama_io_read_i() = default;
+
+    virtual const uint8_t * read(size_t size) = 0; // read_string treats the result as `size` readable bytes
+    virtual void read_to(void * dst, size_t size) = 0; // read_string uses this to fill a uint32_t length field
+
+    // bytes read so far
+    virtual size_t n_bytes() = 0;
+
+    void read_string(std::string & str); // non-virtual: reads a uint32_t length via read_to(), then the bytes via read()
+};
--- a/llama/llama.cpp/src/llama-kv-cache.cpp
+++ b/llama/llama.cpp/src/llama-kv-cache.cpp
--- a/llama/llama.cpp/src/llama-kv-cache.h
+++ b/llama/llama.cpp/src/llama-kv-cache.h
@ -1,15 +1,58 @@
 #pragma once

 #include "llama.h"
+#include "llama-io.h"
+#include "llama-memory.h"

 #include "ggml-cpp.h"

+#include <functional>
 #include <set>
 #include <vector>

+struct llama_cparams;
+struct llama_hparams;
+struct llama_ubatch;
+
+struct llama_kv_cache : public llama_memory_i { // KV-cache interface layered on top of llama_memory_i
+    using llama_memory_i::llama_memory_i; // inherit the base-class constructors
+
+    virtual void restore() = 0; // call if batch processing fails - restores the cache state
+    virtual void commit() = 0;  // call after successful batch processing - clears any pending state
+
+    virtual int32_t get_n_tokens()   const = 0;
+    virtual int32_t get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache
+
+    virtual bool get_can_shift() const = 0;
+
+    bool get_can_edit() const override { return get_can_shift(); } // editing is reported as possible exactly when shifting is
+};
+
+struct llama_kv_cache_guard { // scope guard: restore() always runs on destruction, commit() only when called explicitly
+    llama_kv_cache_guard(llama_kv_cache * kv) : kv(kv) {} // kv is not owned; it is dereferenced unconditionally, so it must be non-null
+
+    ~llama_kv_cache_guard() {
+        kv->restore(); // also runs after commit(); NOTE(review): presumably a no-op then, since commit() clears pending state - confirm
+    }
+
+    void commit() {
+        kv->commit(); // forwards to the wrapped cache
+    }
+
+private:
+    llama_kv_cache * kv; // non-owning pointer, never deleted here
+};
+
+// block of KV slots to move when defragging
+struct llama_kv_defrag_move { // plain aggregate, no default member initializers
+    uint32_t src; // presumably the first source slot of the block - TODO confirm
+    uint32_t dst; // presumably the first destination slot - TODO confirm
+    uint32_t len; // presumably the number of slots in the block - TODO confirm
+};
+
 struct llama_kv_cell {
    llama_pos pos   = -1;
-    llama_pos delta = 0;
+    llama_pos delta =  0;
    int32_t   src   = -1; // used by recurrent state models to copy states
    int32_t   tail  = -1;

@ -29,10 +72,107 @@ struct llama_kv_cell {
 };

 // ring-buffer of cached KV data
-struct llama_kv_cache {
+// TODO: pimpl
+// TODO: add notion of max sequences
+class llama_kv_cache_unified : public llama_kv_cache {
+public:
+    // can be used to query data from the model if needed
+    struct callbacks {
+        std::function<ggml_tensor * (uint32_t n_ctx_per_seq, int il)> get_rope_factors;
+    };
+
+    llama_kv_cache_unified(
+            const llama_hparams & hparams,
+            callbacks             cbs);
+
+    virtual ~llama_kv_cache_unified() = default;
+
+    // TODO: become constructor
+    bool init(
+            const llama_model & model,   // TODO: do not reference the model
+          const llama_cparams & cparams,
+                    ggml_type   type_k,
+                    ggml_type   type_v,
+                     uint32_t   kv_size,
+                         bool   offload);
+
+    int32_t get_n_tokens()   const override;
+    int32_t get_used_cells() const override;
+
+    size_t total_size() const;
+
+    // TODO: better data structures to reduce the cost of this operation
+    llama_pos pos_max() const;
+
+    void clear() override;
+    void defrag() override;
+
+    virtual void restore() override;
+    virtual void commit() override;
+
+    bool seq_rm  (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1) override;
+    void seq_cp  (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
+    void seq_keep(llama_seq_id seq_id) override;
+    void seq_add (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, llama_pos delta) override;
+    void seq_div (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, int d) override;
+
+    llama_pos seq_pos_max(llama_seq_id seq_id) const override;
+
+    bool get_can_shift() const override;
+
+    // find an empty slot of size "n_tokens" in the cache
+    // updates the cache head
+    // Note: On success, it's important that cache.head points
+    // to the first cell of the slot.
+    bool find_slot(const llama_ubatch & batch);
+
+    // TODO: maybe not needed
+    uint32_t get_padding(const llama_cparams & cparams) const;
+
+    // find how many cells are currently in use
+    uint32_t cell_max() const;
+
+    size_t size_k_bytes() const;
+    size_t size_v_bytes() const;
+
+    // defrag
+
+    struct {
+        std::vector<llama_kv_defrag_move> moves;
+    } defrag_info;
+
+    // return true if cells have been moved
+    bool defrag_prepare(int32_t n_max_nodes);
+
+    // commit/restore cache
+
+    struct slot_range {
+        uint32_t c0 = 0; // note: these are cell indices, not sequence positions
+        uint32_t c1 = 0;
+    };
+
+    // pending cell updates that are not yet committed
+    struct {
+        std::vector<slot_range> ranges;
+    } pending;
+
+    // state write/load
+
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const;
+    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1);
+
+    // members
+
+    const llama_hparams & hparams;
+
+    callbacks cbs;
+
    bool has_shift = false;
    bool do_defrag = false;
+
+    // TODO: remove this and implement llama_kv_cache_recurrent instead
    bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token
+
    bool v_trans   = true;  // the value tensor is transposed
    bool can_shift = false;

@ -46,173 +186,35 @@ struct llama_kv_cache {
    // computed before each graph build
    uint32_t n = 0;

+    std::vector<llama_kv_cell> cells;
+
+    std::vector<ggml_tensor *> k_l; // per layer
+    std::vector<ggml_tensor *> v_l;
+
+private:
    ggml_type type_k = GGML_TYPE_F16;
    ggml_type type_v = GGML_TYPE_F16;

-    std::vector<llama_kv_cell> cells;
-
-    std::vector<struct ggml_tensor *> k_l; // per layer
-    std::vector<struct ggml_tensor *> v_l;
-
-    std::vector<ggml_context_ptr> ctxs;
+    std::vector<ggml_context_ptr>        ctxs;
    std::vector<ggml_backend_buffer_ptr> bufs;

-    size_t total_size() const {
-        size_t size = 0;
-        for (const auto & buf : bufs) {
-            size += ggml_backend_buffer_get_size(buf.get());
-        }
+    void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
+    void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;

-        return size;
-    }
-
-    // TODO: better data structures to reduce the cost of this operation
-    llama_pos max_pos() const {
-        llama_pos max_pos = -1;
-        for (const auto & cell : cells) {
-            max_pos = std::max(max_pos, cell.pos);
-        }
-
-        return max_pos;
-    }
+    bool state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
+    bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
 };

-// a structure holds information about the slot found in llama_kv_cache_find_slot
-struct llama_kv_cache_slot_info {
-    std::pair<uint32_t, uint32_t> boundaries; // slot boundaries [begin, end)
-    bool found = false;                       // the slot was found
-
-    explicit llama_kv_cache_slot_info(bool found_) : found{found_} {}
-    llama_kv_cache_slot_info(uint32_t begin, uint32_t end) : boundaries{begin, end}, found{true} {}
-
-    operator bool() const { return found; }
-};
-
-// TODO: maybe not needed
-uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams);
-
-bool llama_kv_cache_init(
-        struct llama_kv_cache & cache,
-            const llama_model & model,
-          const llama_cparams & cparams,
-                    ggml_type   type_k,
-                    ggml_type   type_v,
-                     uint32_t   kv_size,
-                         bool   offload);
-
-// find an empty slot of size "n_tokens" in the cache
-// updates the cache head
-// returns a structure holding information about the slot found
-// Note: On success, it's important that cache.head points
-// to the first cell of the slot.
-struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
-           struct llama_kv_cache & cache,
-       const struct llama_ubatch & batch);
-
-// find how many cells are currently in use
-uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache);
-
-void llama_kv_cache_clear(struct llama_kv_cache & cache);
-
-bool llama_kv_cache_seq_rm(
-        struct llama_kv_cache & cache,
-                 llama_seq_id   seq_id,
-                    llama_pos   p0,
-                    llama_pos   p1);
-
-void llama_kv_cache_seq_cp(
-        struct llama_kv_cache & cache,
-                 llama_seq_id   seq_id_src,
-                 llama_seq_id   seq_id_dst,
-                    llama_pos   p0,
-                    llama_pos   p1);
-
-void llama_kv_cache_seq_keep(
-        struct llama_kv_cache & cache,
-                 llama_seq_id   seq_id);
-
-void llama_kv_cache_seq_add(
-        struct llama_kv_cache & cache,
-                 llama_seq_id   seq_id,
-                    llama_pos   p0,
-                    llama_pos   p1,
-                    llama_pos   delta);
-
-void llama_kv_cache_seq_div(
-        struct llama_kv_cache & cache,
-                 llama_seq_id   seq_id,
-                    llama_pos   p0,
-                    llama_pos   p1,
-                          int   d);
-
-llama_pos llama_kv_cache_seq_pos_max(
-        struct llama_kv_cache & cache,
-                 llama_seq_id   seq_id);
-
-void llama_kv_cache_defrag(struct llama_kv_cache & cache);
-
-int32_t llama_get_kv_cache_token_count(const struct llama_kv_cache & kv);
-
-int32_t llama_get_kv_cache_used_cells(const struct llama_kv_cache & kv);
-
-bool llama_kv_cache_can_shift(const struct llama_kv_cache & kv);
+// TODO: temporary reusing llama_kv_cache_unified -- implement recurrent cache and simplify llama_kv_cache_unified
+//class llama_kv_cache_recurrent : public llama_kv_cache_unified {
+//public:
+//    using llama_kv_cache_unified::llama_kv_cache_unified;
+//};

 //
 // kv cache view
 //

-struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_kv_cache & kv, int32_t n_seq_max);
-
-void llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_kv_cache & kv);
-
-//
-// kv cache restore
-//
-
-// saves the kv_cache state for future recovery.
-// used to rollback llama_kv_cache_find_slot changes.
-struct llama_kv_slot_restorer {
-    struct llama_kv_cache_state {
-        uint32_t head = 0;
-        uint32_t n    = 0;
-    } old_state;
-
-    // for non-recurrent models only
-    // list of slots to restore
-    std::vector<std::pair<uint32_t, uint32_t>> slot_boundaries;
-
-    bool do_restore = false;
-
-    explicit llama_kv_slot_restorer(const struct llama_kv_cache & cache) {
-        old_state.head = cache.head;
-        old_state.n    = cache.n;
-    }
-
-    // saves a slot information for future restoration
-    void save(const struct llama_kv_cache_slot_info & slot) {
-        if (slot) {
-            do_restore = true;
-            if (slot.boundaries.first != slot.boundaries.second) {
-                slot_boundaries.push_back(slot.boundaries);
-            }
-        }
-    }
-
-    // must be explicitly called to restore the kv_cache state
-    // and rollback changes from all llama_kv_cache_find_slot calls
-    void restore(struct llama_kv_cache & cache) {
-        if (do_restore) {
-            cache.head = old_state.head;
-            cache.n    = old_state.n;
-
-            if (cache.recurrent) { // recurrent models like Mamba or RWKV can't have a state partially erased
-                llama_kv_cache_seq_rm(cache, -1, -1, -1);
-            } else {
-                for (auto & slot : slot_boundaries) {
-                    llama_kv_cache_seq_rm(cache, -1, slot.first, slot.second);
-                }
-            }
-        }
-    }
-};
+llama_kv_cache_view llama_kv_cache_view_init(const llama_kv_cache & kv, int32_t n_seq_max);

+void llama_kv_cache_view_update(llama_kv_cache_view * view, const llama_kv_cache * kv);
--- a/llama/llama.cpp/src/llama-memory.cpp
+++ b/llama/llama.cpp/src/llama-memory.cpp
@ -0,0 +1 @@
+#include "llama-memory.h"
--- a/llama/llama.cpp/src/llama-memory.h
+++ b/llama/llama.cpp/src/llama-memory.h
@ -0,0 +1,24 @@
+#pragma once
+
+#include "llama.h"
+
+// general concept of LLM memory
+// the KV cache is a type of LLM memory, but there can be other types
+class llama_memory_i {
+public:
+    // virtual destructor: implementations may be destroyed through a llama_memory_i pointer
+    virtual ~llama_memory_i() = default;
+
+    virtual void clear() = 0;
+    virtual void defrag() = 0;
+
+    virtual bool seq_rm  (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1) = 0;
+    virtual void seq_cp  (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) = 0;
+    virtual void seq_keep(llama_seq_id seq_id) = 0;
+    virtual void seq_add (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, llama_pos delta) = 0;
+    virtual void seq_div (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, int d) = 0;
+
+    virtual llama_pos seq_pos_max(llama_seq_id seq_id) const = 0;
+
+    virtual bool get_can_edit() const = 0; // llama_kv_cache implements this as get_can_shift()
+};
--- a/llama/llama.cpp/src/llama-mmap.cpp
+++ b/llama/llama.cpp/src/llama-mmap.cpp
@ -8,6 +8,7 @@
 #include <climits>
 #include <stdexcept>
 #include <cerrno>
+#include <algorithm>

 #ifdef __has_include
    #if __has_include(<unistd.h>)
@ -34,6 +35,10 @@
    #include <io.h>
 #endif

+#if defined(__APPLE__)
+#include <TargetConditionals.h>
+#endif
+
 // TODO: consider moving to llama-impl.h if needed in more places
 #if defined(_WIN32)
 static std::string llama_format_win_err(DWORD err) {
@ -471,7 +476,11 @@ struct llama_mlock::impl {

        char* errmsg = std::strerror(errno);
        bool suggest = (errno == ENOMEM);
-
+#if defined(TARGET_OS_VISION) || defined(TARGET_OS_TV) || defined(_AIX)
+        // visionOS/tvOS don't support RLIMIT_MEMLOCK
+        // Skip resource limit checks on visionOS/tvOS
+        suggest = false;
+#else
        struct rlimit lock_limit;
        if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) {
            suggest = false;
@ -479,6 +488,7 @@ struct llama_mlock::impl {
        if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) {
            suggest = false;
        }
+#endif

        LLAMA_LOG_WARN("warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
                size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
--- a/llama/llama.cpp/src/llama-model-loader.cpp
+++ b/llama/llama.cpp/src/llama-model-loader.cpp
@ -448,7 +448,8 @@ llama_model_loader::llama_model_loader(
        std::vector<std::string> & splits,
        bool use_mmap,
        bool check_tensors,
-        const struct llama_model_kv_override * param_overrides_p) {
+        const llama_model_kv_override * param_overrides_p,
+        const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) {
    int trace = 0;
    if (getenv("LLAMA_TRACE")) {
        trace = atoi(getenv("LLAMA_TRACE"));
@ -460,6 +461,8 @@ llama_model_loader::llama_model_loader(
        }
    }

+    tensor_buft_overrides = param_tensor_buft_overrides_p;
+
    // Load the main GGUF
    struct ggml_context * ctx = NULL;
    struct gguf_init_params params = {
@ -603,7 +606,9 @@ llama_model_loader::llama_model_loader(

            if (trace > 0) {
                const uint16_t sid = w.idx;
-                LLAMA_LOG_INFO("%s: - tensor split %2d: %32s %-8s [ %s ]\n", __func__, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str());
+                LLAMA_LOG_INFO("%s: - tensor split %2d: %32s %-8s [ %s ] %8.2f MiB\n", __func__,
+                        sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str(),
+                        ggml_nbytes(tensor)/1024.0f/1024.0f);
            }
        }

@ -643,9 +648,9 @@ llama_model_loader::llama_model_loader(
        ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);

        {
-            const int kid = gguf_find_key(meta.get(), "general.file_type"); // TODO: use LLM_KV
-            if (kid >= 0) {
-                ftype = (llama_ftype) gguf_get_val_u32(meta.get(), kid);
+            uint32_t ftype_val = 0;
+            if (get_key(LLM_KV_GENERAL_FILE_TYPE, ftype_val, false)) {
+                ftype = (llama_ftype) ftype_val;
            }
        }

--- a/llama/llama.cpp/src/llama-model-loader.h
+++ b/llama/llama.cpp/src/llama-model-loader.h
@ -77,8 +77,9 @@ struct llama_model_loader {

    llama_mmaps mappings;

-    std::map<std::string, struct llama_tensor_weight, weight_name_comparer> weights_map;
-    std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;
+    std::map<std::string, llama_tensor_weight, weight_name_comparer> weights_map;
+    std::unordered_map<std::string, llama_model_kv_override> kv_overrides;
+    const llama_model_tensor_buft_override * tensor_buft_overrides;

    gguf_context_ptr meta;
    std::vector<ggml_context_ptr> contexts;
@ -95,7 +96,8 @@ struct llama_model_loader {
        std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
        bool use_mmap,
        bool check_tensors,
-        const struct llama_model_kv_override * param_overrides_p);
+        const llama_model_kv_override * param_overrides_p,
+        const llama_model_tensor_buft_override * param_tensor_buft_overrides_p);

    template<typename T>
    typename std::enable_if<std::is_integral<T>::value, bool>::type
--- a/llama/llama.cpp/src/llama-model.cpp
+++ b/llama/llama.cpp/src/llama-model.cpp
--- a/llama/llama.cpp/src/llama-model.h
+++ b/llama/llama.cpp/src/llama-model.h
@ -2,7 +2,9 @@

 #include "llama.h"
 #include "llama-arch.h"
+#include "llama-graph.h"
 #include "llama-hparams.h"
+#include "llama-memory.h"
 #include "llama-vocab.h"

 #include <memory>
@ -11,6 +13,8 @@
 #include <vector>
 #include <stdexcept>

+struct llama_cparams;
+struct llama_ubatch;
 struct llama_model_loader;

 // available models
@ -26,6 +30,7 @@ enum llm_type {
    LLM_TYPE_109M,
    LLM_TYPE_137M,
    LLM_TYPE_160M,
+    LLM_TYPE_190M,
    LLM_TYPE_220M,
    LLM_TYPE_250M,
    LLM_TYPE_270M,
@ -40,8 +45,10 @@ enum llm_type {
    LLM_TYPE_1_4B,
    LLM_TYPE_1_5B,
    LLM_TYPE_1_6B,
+    LLM_TYPE_1_8B,
    LLM_TYPE_2B,
    LLM_TYPE_2_8B,
+    LLM_TYPE_2_9B,
    LLM_TYPE_3B,
    LLM_TYPE_4B,
    LLM_TYPE_6B,
@ -81,6 +88,9 @@ enum llm_type {
    LLM_TYPE_10B_128x3_66B,
    LLM_TYPE_57B_A14B,
    LLM_TYPE_27B,
+    LLM_TYPE_290B,
+    LLM_TYPE_17B_16E, // llama4 Scout
+    LLM_TYPE_17B_128E, // llama4 Maverick
 };

 struct llama_layer_posnet {
@ -259,6 +269,20 @@ struct llama_layer {
    struct ggml_tensor * time_mix_receptance_b = nullptr;
    struct ggml_tensor * time_mix_gate         = nullptr;

+    // rwkv7
+    struct ggml_tensor * time_mix_w0         = nullptr;
+    struct ggml_tensor * time_mix_a0         = nullptr;
+    struct ggml_tensor * time_mix_a1         = nullptr;
+    struct ggml_tensor * time_mix_a2         = nullptr;
+    struct ggml_tensor * time_mix_v0         = nullptr;
+    struct ggml_tensor * time_mix_v1         = nullptr;
+    struct ggml_tensor * time_mix_v2         = nullptr;
+    struct ggml_tensor * time_mix_g1         = nullptr;
+    struct ggml_tensor * time_mix_g2         = nullptr;
+    struct ggml_tensor * time_mix_k_k        = nullptr;
+    struct ggml_tensor * time_mix_k_a        = nullptr;
+    struct ggml_tensor * time_mix_r_k        = nullptr;
+
    struct ggml_tensor * time_mix_ln     = nullptr;
    struct ggml_tensor * time_mix_ln_b   = nullptr;
    struct ggml_tensor * time_mix_output = nullptr;
@ -362,7 +386,7 @@ struct llama_model {
    std::string desc() const;

    size_t size() const;
-    size_t max_nodes() const;
+    size_t n_tensors() const;
    size_t n_devices() const;

    // total number of parameters in the model
@ -375,11 +399,26 @@ struct llama_model {

    ggml_backend_buffer_type_t select_buft(int il) const;

+    bool has_tensor_overrides() const;
+
    const struct ggml_tensor * get_tensor(const char * name) const;

+    // TODO: move this to new llm_arch_model_i interface
+    llama_memory_i * create_memory() const; // TODO: params
+
+    // TODO: move this to new llm_arch_model_i interface
+    llm_graph_result_ptr build_graph(
+            const llm_graph_params & params,
+                       ggml_cgraph * gf,
+                    llm_graph_type   type) const;
+
 private:
    struct impl;
    std::unique_ptr<impl> pimpl;
 };

 const char * llm_type_name(llm_type type);
+
+// For internal test use
+// TODO: remove
+const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model);
--- a/llama/llama.cpp/src/llama-quant.cpp
+++ b/llama/llama.cpp/src/llama-quant.cpp
@ -10,6 +10,7 @@
 #include <cinttypes>
 #include <fstream>
 #include <mutex>
+#include <regex>
 #include <thread>
 #include <unordered_map>

@ -47,8 +48,14 @@ struct quantize_state_impl {
        {}
 };

+// per-tensor quantization override entry (params->tensor_types is cast to a std::vector of these in llama_model_quantize_impl); changes to this struct must be replicated in quantize.cpp
+struct tensor_quantization {
+    std::string name; // compiled as a std::regex and searched (std::regex_search) in each tensor's name
+    ggml_type quant = GGML_TYPE_COUNT; // type that replaces the computed new_type for the first entry whose pattern matches
+};
+
 static void llama_tensor_dequantize_impl(
-    struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
+    ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
    const size_t nelements, const int nthread
 ) {
    if (output.size() < nelements) {
@ -527,7 +534,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
    }

    std::vector<std::string> splits = {};
-    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides);
+    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, nullptr);
    ml.init_mappings(false); // no prefetching

    llama_model model(llama_model_default_params());
@ -536,7 +543,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
    model.load_hparams(ml);
    model.load_stats  (ml);

-    struct quantize_state_impl qs(model, params);
+    quantize_state_impl qs(model, params);

    if (params->only_copy) {
        ftype = ml.ftype;
@ -663,7 +670,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
    // populate the original tensors so we get an initial meta data
    for (const auto * it : tensors) {
        uint16_t i_split = params->keep_split ? it->idx : 0;
-        struct ggml_tensor * tensor = it->tensor;
+        ggml_tensor * tensor = it->tensor;
        if (!ctx_outs[i_split]) {
            ctx_outs[i_split].reset(gguf_init_empty());
        }
@ -712,7 +719,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
    new_ofstream(0);
    for (const auto * it : tensors) {
        const auto & weight = *it;
-        struct ggml_tensor * tensor = weight.tensor;
+        ggml_tensor * tensor = weight.tensor;
        if (weight.idx != cur_split && params->keep_split) {
            close_ofstream();
            new_ofstream(weight.idx);
@ -762,10 +769,19 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
        // NOTE: can't use LLM_TN here because the layer number is not known
        quantize &= name.find("ssm_conv1d.weight") == std::string::npos;

-        // do not quantize RWKV's time_mix_first tensors
+        // do not quantize RWKV's small yet 2D weights
        quantize &= name.find("time_mix_first.weight") == std::string::npos;
+        quantize &= name.find("time_mix_w0.weight") == std::string::npos;
        quantize &= name.find("time_mix_w1.weight") == std::string::npos;
        quantize &= name.find("time_mix_w2.weight") == std::string::npos;
+        quantize &= name.find("time_mix_v0.weight") == std::string::npos;
+        quantize &= name.find("time_mix_v1.weight") == std::string::npos;
+        quantize &= name.find("time_mix_v2.weight") == std::string::npos;
+        quantize &= name.find("time_mix_a0.weight") == std::string::npos;
+        quantize &= name.find("time_mix_a1.weight") == std::string::npos;
+        quantize &= name.find("time_mix_a2.weight") == std::string::npos;
+        quantize &= name.find("time_mix_g1.weight") == std::string::npos;
+        quantize &= name.find("time_mix_g2.weight") == std::string::npos;
        quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
        quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
        quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos;
@ -773,7 +789,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
        // do not quantize relative position bias (T5)
        quantize &= name.find("attn_rel_b.weight") == std::string::npos;

-        enum ggml_type new_type;
+        ggml_type new_type;
        void * new_data;
        size_t new_size;

@ -783,6 +799,19 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
            // get more optimal quantization type based on the tensor shape, layer, etc.
            if (!params->pure && ggml_is_quantized(default_type)) {
                new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
+                // unless the user specifies a type
+                if (params->tensor_types) {
+                    const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
+                    for (const auto & [tname, qtype] : tensor_types) {
+                        if (std::regex pattern(tname); std::regex_search(tensor->name, pattern)) {
+                            if (qtype != new_type) {
+                                LLAMA_LOG_DEBUG("(overriding %s -> %s), ", ggml_type_name(new_type), ggml_type_name(qtype));
+                            }
+                            new_type = qtype;
+                            break;
+                        }
+                    }
+                }
            }
            if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
                new_type = params->token_embedding_type;
@ -907,8 +936,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 // interface implementation
 //

-struct llama_model_quantize_params llama_model_quantize_default_params() {
-    struct llama_model_quantize_params result = {
+llama_model_quantize_params llama_model_quantize_default_params() {
+    llama_model_quantize_params result = {
        /*.nthread                     =*/ 0,
        /*.ftype                       =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
        /*.output_tensor_type          =*/ GGML_TYPE_COUNT,
@ -920,6 +949,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
        /*.keep_split                  =*/ false,
        /*.imatrix                     =*/ nullptr,
        /*.kv_overrides                =*/ nullptr,
+        /*.tensor_types                =*/ nullptr,
    };

    return result;
--- a/llama/llama.cpp/src/llama-sampling.cpp
+++ b/llama/llama.cpp/src/llama-sampling.cpp
@ -1449,7 +1449,9 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
                     const char ** trigger_words,
                            size_t num_trigger_words,
               const llama_token * trigger_tokens,
-                            size_t num_trigger_tokens);
+                            size_t num_trigger_tokens,
+                     const char ** trigger_patterns,
+                            size_t num_trigger_patterns);

 static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
    auto * ctx = (llama_sampler_grammar *) smpl->ctx;
@ -1457,12 +1459,14 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
        return;
    }

-    std::vector<const char *>  trigger_words;
-    for (auto & word : ctx->grammar->trigger_words) {
-        trigger_words.push_back(word.c_str());
+    std::vector<const char *>  trigger_patterns_c;
+    trigger_patterns_c.reserve(ctx->grammar->trigger_patterns.size());
+    for (auto & trigger_pattern : ctx->grammar->trigger_patterns) {
+        trigger_patterns_c.push_back(trigger_pattern.pattern.c_str());
    }
+
    auto * grammar_new = llama_grammar_init_impl(ctx->grammar->vocab, ctx->grammar_str.c_str(), ctx->grammar_root.c_str(),
-                                                 ctx->grammar->lazy, trigger_words.data(), trigger_words.size(),
+                                                 ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(),
                                                 ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size());

    llama_grammar_free_impl(ctx->grammar);
@ -1472,7 +1476,8 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
 static struct llama_sampler * llama_sampler_grammar_clone(const struct llama_sampler * smpl) {
    const auto * ctx = (const llama_sampler_grammar *) smpl->ctx;

-    auto * result = llama_sampler_init_grammar_impl(ctx->vocab, nullptr, nullptr, false, nullptr, 0, nullptr, 0);
+    auto * result = llama_sampler_init_grammar_impl(ctx->vocab, nullptr, nullptr, false, nullptr, 0, nullptr, 0, nullptr, 0);
+    GGML_ASSERT(result);

    // copy the state
    {
@ -1516,16 +1521,38 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
                     const char ** trigger_words,
                            size_t num_trigger_words,
               const llama_token * trigger_tokens,
-                            size_t num_trigger_tokens) {
+                            size_t num_trigger_tokens,
+                     const char ** trigger_patterns,
+                            size_t num_trigger_patterns) {
    auto * ctx = new llama_sampler_grammar;

    if (grammar_str != nullptr && grammar_str[0] != '\0') {
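+        // TODO: remove trigger_words support.
+        // NOTE(review): trigger_pattern and trigger_pattern_c are locals of this if-block, but
+        // trigger_patterns is left pointing at trigger_pattern_c and is only read by
+        // llama_grammar_init_impl() below, after both locals have gone out of scope (dangling
+        // pointer). Declare them before the `if` so they outlive that call.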
+        if (trigger_words != nullptr && num_trigger_words > 0) {
+            GGML_ASSERT(trigger_patterns == nullptr && num_trigger_patterns == 0);
+            std::string trigger_pattern("[\\s\\S]*?(");
+            for (size_t i = 0; i < num_trigger_words; ++i) {
+                static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
+                if (i > 0) {
+                    trigger_pattern += "|";
+                }
+                trigger_pattern += std::regex_replace(trigger_words[i], special_chars, "\\$0");
+            }
+            trigger_pattern += ")[\\s\\S]*";
+            auto trigger_pattern_c = trigger_pattern.c_str();
+            trigger_patterns = &trigger_pattern_c;
+            num_trigger_patterns = 1;
+        }
        *ctx = {
            /* .vocab        = */ vocab,
            /* .grammar_str  = */ grammar_str,
            /* .grammar_root = */ grammar_root,
-            /* .grammar      = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens),
+            /* .grammar      = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_patterns, num_trigger_patterns, trigger_tokens, num_trigger_tokens),
        };
+        if (!ctx->grammar) {
+            delete ctx;
+            return nullptr;
+        }
    } else {
        *ctx = {
            /* .vocab        = */ vocab,
@ -1545,7 +1572,7 @@ struct llama_sampler * llama_sampler_init_grammar(
        const struct llama_vocab * vocab,
                      const char * grammar_str,
                      const char * grammar_root) {
-    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ false, nullptr, 0, nullptr, 0);
+    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ false, nullptr, 0, nullptr, 0, nullptr, 0);
 }

 struct llama_sampler * llama_sampler_init_grammar_lazy(
@ -1556,7 +1583,18 @@ struct llama_sampler * llama_sampler_init_grammar_lazy(
                            size_t num_trigger_words,
               const llama_token * trigger_tokens,
                            size_t num_trigger_tokens) {
-    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens);
+    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens, nullptr, 0);
+}
+
+struct llama_sampler * llama_sampler_init_grammar_lazy_patterns( // lazy-grammar sampler entry point that takes trigger patterns instead of trigger words
+        const struct llama_vocab * vocab,
+                      const char * grammar_str,
+                      const char * grammar_root,
+                     const char ** trigger_patterns, // array of num_trigger_patterns C strings, forwarded unchanged
+                            size_t num_trigger_patterns,
+               const llama_token * trigger_tokens,
+                            size_t num_trigger_tokens) {
+    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, nullptr, 0, trigger_tokens, num_trigger_tokens, trigger_patterns, num_trigger_patterns); // trigger_words slot is nullptr/0; the impl returns nullptr when grammar_str is non-empty and llama_grammar_init_impl yields no grammar
 }

 // penalties
--- a/llama/llama.cpp/src/llama-vocab.cpp
+++ b/llama/llama.cpp/src/llama-vocab.cpp
@ -16,6 +16,7 @@
 #include <queue>
 #include <set>
 #include <unordered_map>
+#include <cctype>

 //
 // helpers
@ -341,6 +342,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
            case LLAMA_VOCAB_PRE_TYPE_MPT:
            case LLAMA_VOCAB_PRE_TYPE_OLMO:
            case LLAMA_VOCAB_PRE_TYPE_JAIS:
+            case LLAMA_VOCAB_PRE_TYPE_TRILLION:
                regex_exprs = {
                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                };
@ -393,10 +395,24 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_GPT4O:
-                // original regex from tokenizer.json
-                // [^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+
                regex_exprs = {
-                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+                    // original regex from tokenizer.json
+                    // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_SUPERBPE:
+                regex_exprs = {
+                    "\\p{N}+",
+                    "(?=(\\d{3})+(?!\\d))",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_BAILINGMOE:
+                regex_exprs = {
+                    // original regex from tokenizer.json
+                    // "'(?i:[sdmt]|ll|ve|re)|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
+                    // FIXME? Changed possessive quantifiers (?+ and ++) to greedy to avoid errors and imatrix hanging (tried atomic grouping but it's not supported?)
+                    "'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
                };
                break;
            default:
@ -1547,6 +1563,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                pre_type = LLAMA_VOCAB_PRE_TYPE_PORO;
                clean_spaces = false;
            } else if (
+                tokenizer_pre == "glm4" ||
                tokenizer_pre == "chatglm-bpe") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
                special_bos_id = LLAMA_TOKEN_NULL;
@ -1591,9 +1608,22 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                tokenizer_pre == "megrez") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
            } else if (
-                tokenizer_pre == "gpt-4o") {
+                    tokenizer_pre == "gpt-4o" ||
+                    tokenizer_pre == "llama4") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O;
                clean_spaces = false;
+            } else if (
+                tokenizer_pre == "superbpe") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_SUPERBPE;
+                clean_spaces = false;
+            } else if (
+                tokenizer_pre == "trillion") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION;
+                clean_spaces = false;
+            } else if (
+                tokenizer_pre == "bailingmoe") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
+                clean_spaces = false;
            } else {
                LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
                pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
@ -1772,6 +1802,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                        || t.first == "<end_of_turn>"
                        || t.first == "<|endoftext|>"
                        || t.first == "<EOT>"
+                        || t.first == "_<EOT>"
                        || t.first == "<｜end▁of▁sentence｜>" // DeepSeek
                   ) {
                    special_eot_id = t.second;
@ -1804,6 +1835,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                        || t.first == "<fim-prefix>"
                        || t.first == "<｜fim▁begin｜>" // DeepSeek
                        || t.first == "<PRE>"
+                        || t.first == "▁<PRE>"          // CodeLlama
                        ) {
                    special_fim_pre_id = t.second;
                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@ -1821,6 +1853,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                        || t.first == "<fim-suffix>"
                        || t.first == "<｜fim▁hole｜>" // DeepSeek
                        || t.first == "<SUF>"
+                        || t.first == "▁<SUF>"         // CodeLlama
                        ) {
                    special_fim_suf_id = t.second;
                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@ -1838,6 +1871,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                        || t.first == "<fim-middle>"
                        || t.first == "<｜fim▁end｜>"  // DeepSeek
                        || t.first == "<MID>"
+                        || t.first == "▁<MID>"         // CodeLlama
                        ) {
                    special_fim_mid_id = t.second;
                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@ -1922,6 +1956,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                    || t.first == "<|endoftext|>"
                    || t.first == "<|eom_id|>"
                    || t.first == "<EOT>"
+                    || t.first == "_<EOT>"
               ) {
                special_eog_ids.insert(t.second);
                if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@ -2180,14 +2215,12 @@ void llama_vocab::impl::tokenizer_st_partition(std::forward_list<fragment_buffer
                    // find the first occurrence of a given special token in this fragment
                    //  passing offset argument only limit the "search area" but match coordinates
                    //  are still relative to the source full raw_text
-                    auto match = raw_text.find(text, raw_text_base_offset);
+                    //  string_view begins at pos 0 for the same reason
+                    auto match = std::string_view(raw_text.data(), raw_text_base_offset + raw_text_base_length).find(text, raw_text_base_offset);

                    // no occurrences found, stop processing this fragment for a given special token
                    if (match == std::string::npos) break;

-                    // check if match is within bounds of offset <-> length
-                    if (match + text.length() > raw_text_base_offset + raw_text_base_length) break;
-
 #ifdef PRETOKENIZERDEBUG
                    LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
 #endif
--- a/llama/llama.cpp/src/llama.cpp
+++ b/llama/llama.cpp/src/llama.cpp
--- a/llama/llama.cpp/src/unicode.cpp
+++ b/llama/llama.cpp/src/unicode.cpp
@ -220,7 +220,6 @@ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
    free(wbuf);
    return ret;
 #else
-
 #if defined(__clang__)
    // disable C++17 deprecation warning for std::codecvt_utf8
 #    pragma clang diagnostic push
--- a/llama/llama.go
+++ b/llama/llama.go
@ -147,27 +147,27 @@ func (c *Context) Model() *Model {
 }

 func (c *Context) KvCacheSeqAdd(seqId int, p0 int, p1 int, delta int) {
-	C.llama_kv_cache_seq_add(c.c, C.int(seqId), C.int(p0), C.int(p1), C.int(delta))
+	C.llama_kv_self_seq_add(c.c, C.int(seqId), C.int(p0), C.int(p1), C.int(delta)) // replaces llama_kv_cache_seq_add; Go ints are converted to C.int
 }

 func (c *Context) KvCacheSeqRm(seqId int, p0 int, p1 int) bool {
-	return bool(C.llama_kv_cache_seq_rm(c.c, C.int(seqId), C.int(p0), C.int(p1)))
+	return bool(C.llama_kv_self_seq_rm(c.c, C.int(seqId), C.int(p0), C.int(p1))) // replaces llama_kv_cache_seq_rm; the C result is converted to a Go bool
 }

 func (c *Context) KvCacheSeqCp(srcSeqId int, dstSeqId int, p0 int, p1 int) {
-	C.llama_kv_cache_seq_cp(c.c, C.int(srcSeqId), C.int(dstSeqId), C.int(p0), C.int(p1))
+	C.llama_kv_self_seq_cp(c.c, C.int(srcSeqId), C.int(dstSeqId), C.int(p0), C.int(p1)) // replaces llama_kv_cache_seq_cp; source sequence id is passed before destination
 }

 func (c *Context) KvCacheClear() {
-	C.llama_kv_cache_clear(c.c)
+	C.llama_kv_self_clear(c.c) // replaces llama_kv_cache_clear
 }

 func (c *Context) KvCacheDefrag() {
-	C.llama_kv_cache_defrag(c.c)
+	C.llama_kv_self_defrag(c.c) // replaces llama_kv_cache_defrag
 }

 func (c *Context) KvCacheCanShift() bool {
-	return bool(C.llama_kv_cache_can_shift(c.c))
+	return bool(C.llama_kv_self_can_shift(c.c)) // replaces llama_kv_cache_can_shift; the C result is converted to a Go bool
 }

 // Get the embeddings for a sequence id
--- a/llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch
+++ b/llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch
@ -24,10 +24,10 @@ problem.
 9 files changed, 21 insertions(+), 2 deletions(-)

 diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
-index dba7be33..65e150d6 100644
+index 273075f4..dd11f304 100644
 --- a/ggml/src/ggml-backend.cpp
 +++ b/ggml/src/ggml-backend.cpp
-@@ -106,7 +106,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
+@@ -107,7 +107,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
     if (buffer->iface.free_buffer != NULL) {
         buffer->iface.free_buffer(buffer);
     }
@ -35,7 +35,7 @@ index dba7be33..65e150d6 100644
 }
 
 size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
-@@ -542,6 +541,7 @@ static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer)
+@@ -544,6 +543,7 @@ static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer)
 
     free(ctx->buffers);
     free(ctx);
@ -43,7 +43,7 @@ index dba7be33..65e150d6 100644
 }
 
 static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
-@@ -1865,6 +1865,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
+@@ -1867,6 +1867,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
 
 static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_aligned_free(buffer->context, buffer->size);
@ -55,7 +55,7 @@ index dba7be33..65e150d6 100644
 }
 
 static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
-@@ -1912,7 +1917,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
+@@ -1914,7 +1919,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
 };
 
 static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
@ -65,7 +65,7 @@ index dba7be33..65e150d6 100644
     /* .init_tensor     = */ NULL, // no initialization required
     /* .memset_tensor   = */ ggml_backend_cpu_buffer_memset_tensor,
 diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
-index d410c024..a207ab1e 100644
+index cec36b36..4b057973 100644
 --- a/ggml/src/ggml-cann/ggml-cann.cpp
 +++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -530,6 +530,7 @@ static void ggml_backend_cann_buffer_free_buffer(
@ -76,7 +76,7 @@ index d410c024..a207ab1e 100644
 }
 
 /**
-@@ -1198,6 +1199,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf
+@@ -1199,6 +1200,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf
  */
 static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
     ACL_CHECK(aclrtFreeHost(buffer->context));
@ -85,10 +85,10 @@ index d410c024..a207ab1e 100644
 
 /**
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index ebb2ccae..dfff21a2 100644
+index fafe9633..59a49560 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
-@@ -529,6 +529,7 @@ struct ggml_backend_cuda_buffer_context {
+@@ -533,6 +533,7 @@ struct ggml_backend_cuda_buffer_context {
 static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
     delete ctx;
@ -96,7 +96,7 @@ index ebb2ccae..dfff21a2 100644
 }
 
 static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
-@@ -783,6 +784,7 @@ struct ggml_backend_cuda_split_buffer_context {
+@@ -788,6 +789,7 @@ struct ggml_backend_cuda_split_buffer_context {
 static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
     delete ctx;
@ -104,7 +104,7 @@ index ebb2ccae..dfff21a2 100644
 }
 
 static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
-@@ -1055,6 +1057,7 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_
+@@ -1061,6 +1063,7 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_
 
 static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     CUDA_CHECK(cudaFreeHost(buffer->context));
@ -125,10 +125,10 @@ index 50579227..2799a0a5 100644
 
 static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
-index c550142a..fd9a4e77 100644
+index 9f1c6c6c..310afe8a 100644
 --- a/ggml/src/ggml-metal/ggml-metal.m
 +++ b/ggml/src/ggml-metal/ggml-metal.m
-@@ -4350,6 +4350,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
+@@ -4641,6 +4641,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
     }
 
     free(ctx);
@ -137,10 +137,10 @@ index c550142a..fd9a4e77 100644
 
 static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
-index f5906246..062e93b8 100644
+index b8b5cbd3..14d4561b 100644
 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp
 +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
-@@ -1203,6 +1203,7 @@ static void * const cl_ptr_base = (void *)(uintptr_t) 0x1000;
+@@ -1443,6 +1443,7 @@ struct ggml_backend_opencl_buffer_context {
 static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
     delete ctx;
@ -149,10 +149,10 @@ index f5906246..062e93b8 100644
 
 static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
-index 97873acc..893ee0b9 100644
+index 862b9b66..34536681 100644
 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp
 +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
-@@ -419,6 +419,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+@@ -443,6 +443,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
     GGML_ASSERT(status);
     delete ctx;
@ -161,10 +161,10 @@ index 97873acc..893ee0b9 100644
 
 static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
 diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
-index 792e0569..5e233e8b 100644
+index 3e48a924..a3d182fc 100644
 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp
 +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
-@@ -311,6 +311,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
+@@ -316,6 +316,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
     ggml_sycl_set_device(ctx->device);
 
     delete ctx;
@ -172,7 +172,7 @@ index 792e0569..5e233e8b 100644
 }
 catch (sycl::exception const &exc) {
   std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-@@ -720,6 +721,7 @@ struct ggml_backend_sycl_split_buffer_context {
+@@ -761,6 +762,7 @@ struct ggml_backend_sycl_split_buffer_context {
 static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
     delete ctx;
@ -180,7 +180,7 @@ index 792e0569..5e233e8b 100644
 }
 
 static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) {
-@@ -1053,6 +1055,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
+@@ -1095,6 +1097,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
 
 static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_sycl_host_free(buffer->context);
@ -189,10 +189,10 @@ index 792e0569..5e233e8b 100644
 
 static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
 diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index abe3e790..1dad714b 100644
+index 783a0ff8..8ac1e07e 100644
 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-@@ -7914,6 +7914,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+@@ -8639,6 +8639,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
     ggml_vk_destroy_buffer(ctx->dev_buffer);
     delete ctx;
@ -200,7 +200,7 @@ index abe3e790..1dad714b 100644
 }
 
 static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
-@@ -8056,6 +8057,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
+@@ -8782,6 +8783,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
 static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
     ggml_vk_host_free(vk_instance.devices[0], buffer->context);
--- a/llama/patches/0002-pretokenizer.patch
+++ b/llama/patches/0002-pretokenizer.patch
@ -3,15 +3,17 @@ From: Michael Yang <mxyng@pm.me>
 Date: Mon, 16 Sep 2024 15:53:13 -0700
 Subject: [PATCH] pretokenizer

+allow for an unset pretokenizer with a warning in the
+logs instead of throwing an error
 ---
 src/llama-vocab.cpp | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

 diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
-index ad9ffe66..a4eee9b8 100644
+index 464ff01e..0125ee53 100644
 --- a/src/llama-vocab.cpp
 +++ b/src/llama-vocab.cpp
-@@ -1468,16 +1468,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
+@@ -1491,16 +1491,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
         if (type == LLAMA_VOCAB_TYPE_BPE) {
             add_space_prefix = false;
             clean_spaces = true;
@ -29,9 +31,9 @@ index ad9ffe66..a4eee9b8 100644
                 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
             } else if (
                     tokenizer_pre == "llama3"   ||
-@@ -1593,7 +1584,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
-                 tokenizer_pre == "megrez") {
-                 pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+@@ -1634,7 +1625,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
+                 pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
+                 clean_spaces = false;
             } else {
 -                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
 +                LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
--- a/llama/patches/0003-embeddings.patch
+++ b/llama/patches/0003-embeddings.patch
@ -1,52 +1,43 @@
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Mon, 16 Sep 2024 15:53:14 -0700
+From: jmorganca <jmorganca@gmail.com>
+Date: Tue, 8 Apr 2025 15:28:34 -0700
 Subject: [PATCH] embeddings

+allow a loaded model in llama.cpp to be used for
+both embeddings and causal attention text generation
+instead of forcing one or the other
 ---
- src/llama-context.cpp | 2 +-
- src/llama.cpp         | 6 ++++--
- 2 files changed, 5 insertions(+), 3 deletions(-)
+ src/llama-context.cpp | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)

 diff --git a/src/llama-context.cpp b/src/llama-context.cpp
-index 671d2a81..47e79ed4 100644
+index 4735e98e..65135172 100644
 --- a/src/llama-context.cpp
 +++ b/src/llama-context.cpp
-@@ -479,7 +479,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
+@@ -1232,7 +1232,7 @@ int llama_context::decode(llama_batch & inp_batch) {
+     int64_t n_outputs_all = 0;
+ 
+     // count outputs
+-    if (batch.logits && !embd_pooled) {
+    if (batch.logits) {
+         for (uint32_t i = 0; i < n_tokens_all; ++i) {
+             n_outputs_all += batch.logits[i] != 0;
+         }
+@@ -1344,7 +1344,7 @@ int llama_context::decode(llama_batch & inp_batch) {
+         //    ggml_graph_dump_dot(gf, NULL, "llama.dot");
+         //}
+ 
+-        auto * t_logits = cparams.embeddings ? nullptr         : res->get_logits();
+        auto * t_logits = cparams.causal_attn ? res->get_logits() : nullptr;
+         auto * t_embd   = cparams.embeddings ? res->get_embd() : nullptr;
+ 
+         if (t_embd && res->get_embd_pooled()) {
+@@ -1488,7 +1488,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
     const auto n_embd  = hparams.n_embd;
 
     // TODO: use a per-batch flag for logits presence instead
-    const bool has_logits = !cparams.embeddings;
-+    const bool has_logits =  cparams.causal_attn;
-     const bool has_embd   =  cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
+-    bool has_logits = !cparams.embeddings;
+    bool has_logits =  cparams.causal_attn;
+     bool has_embd   =  cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
 
-     const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
-diff --git a/src/llama.cpp b/src/llama.cpp
-index 607f2786..ac85bfed 100644
--- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -8652,7 +8652,6 @@ static int llama_decode_impl(
-             res  = nullptr;
-             embd = nullptr;
-         } else if (cparams.embeddings) {
-            res  = nullptr; // do not extract logits for embedding case
-             embd = nullptr;
-             for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
-                 if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
-@@ -8660,12 +8659,15 @@ static int llama_decode_impl(
-                     break;
-                 }
-             }
-            GGML_ASSERT(embd != nullptr && "missing embeddings tensor");
-         } else {
-             embd = nullptr; // do not extract embeddings when not needed
-             GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
-         }
- 
-+        if (!cparams.causal_attn) {
-+            res = nullptr; // do not extract logits when not needed
-+        }
-+
-         // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
- 
-         ggml_backend_sched_alloc_graph(lctx.sched.get(), gf);
+     // TODO: hacky enc-dec support
--- a/llama/patches/0004-clip-unicode.patch
+++ b/llama/patches/0004-clip-unicode.patch
@ -1,19 +1,21 @@
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Mon, 16 Sep 2024 15:53:15 -0700
+From: jmorganca <jmorganca@gmail.com>
+Date: Tue, 8 Apr 2025 15:34:37 -0700
 Subject: [PATCH] clip-unicode

+fixes loading vision models in llama.cpp on windows
+filesystems for paths that include wide characters
 ---
- examples/llava/clip.cpp | 40 +++++++++++++++++++++++++++++++++++++++-
- 1 file changed, 39 insertions(+), 1 deletion(-)
+ examples/llava/clip.cpp | 39 +++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 39 insertions(+)

 diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
-index 76d4a785..205af1eb 100644
+index 49c90b75..4b72ea9f 100644
 --- a/examples/llava/clip.cpp
 +++ b/examples/llava/clip.cpp
-@@ -58,6 +58,19 @@
- #   define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
- #endif // defined(LLAVA_LOG_OFF)
+@@ -28,6 +28,19 @@
+ #include <cinttypes>
+ #include <limits>
 
 +#if defined(_WIN32)
 +#define WIN32_LEAN_AND_MEAN
@ -28,49 +30,48 @@ index 76d4a785..205af1eb 100644
 +#endif
 +#endif
 +
+ struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
+ 
 //#define CLIP_DEBUG_FUNCTIONS
+@@ -1429,7 +1442,29 @@ struct clip_model_loader {
+         {
+             std::vector<uint8_t> read_buf;
 
- // RGB uint8 image
-@@ -1402,8 +1415,29 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
-             gguf_free(ctx);
-             return nullptr;
-         }
-
 +#ifdef _WIN32
-+        int wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0);
-+        if (!wlen) {
-+            return NULL;
-+        }
-+        wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t));
-+        wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, wbuf, wlen);
-+        if (!wlen) {
-+            free(wbuf);
-+            return NULL;
-+        }
+            int wlen = MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, NULL, 0);
+            if (!wlen) {
+                throw std::runtime_error(string_format("%s: failed to convert filename to wide string\n", __func__));
+            }
+            wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t));
+            wlen = MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, wbuf, wlen);
+            if (!wlen) {
+                free(wbuf);
+                throw std::runtime_error(string_format("%s: failed to convert filename to wide string\n", __func__));
+            }
 +#if __GLIBCXX__
-+        int fd = _wopen(wbuf, _O_RDONLY | _O_BINARY);
-+        __gnu_cxx::stdio_filebuf<char> buffer(fd, std::ios_base::in);
-+        std::istream fin(&buffer);
+            int fd = _wopen(wbuf, _O_RDONLY | _O_BINARY);
+            __gnu_cxx::stdio_filebuf<char> buffer(fd, std::ios_base::in);
+            std::istream fin(&buffer);
 +#else // MSVC
-+        // unused in our current build
-+        auto fin = std::ifstream(wbuf, std::ios::binary);
+            // unused in our current build
+            auto fin = std::ifstream(wbuf, std::ios::binary);
 +#endif
-+        free(wbuf);
+            free(wbuf);
 +#else
-         auto fin = std::ifstream(fname, std::ios::binary);
+             auto fin = std::ifstream(fname, std::ios::binary);
 +#endif
-         if (!fin) {
-             LOG_ERR("cannot open model file for loading tensors\n");
-             clip_free(new_clip);
-@@ -1443,7 +1477,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
-                 ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
+             if (!fin) {
+                 throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
+             }
+@@ -1456,7 +1491,11 @@ struct clip_model_loader {
+                     ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
+                 }
             }
-         }
 +#if defined(_WIN32) && defined(__GLIBCXX__)
-+        close(fd);
+            close(fd);
 +#else
-         fin.close();
+             fin.close();
 +#endif
-     }
 
-     // vision model
+             LOG_DBG("%s: loaded %zu tensors from %s\n", __func__, tensors_to_load.size(), fname.c_str());
+         }
--- a/llama/patches/0005-solar-pro.patch
+++ b/llama/patches/0005-solar-pro.patch
@ -1,47 +1,40 @@
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Mon, 16 Sep 2024 15:53:16 -0700
+From: jmorganca <jmorganca@gmail.com>
+Date: Tue, 8 Apr 2025 16:03:51 -0700
 Subject: [PATCH] solar-pro

-solar-pro introduces block skip connections where blocks are connected
-to other, non-sequential blocks with a scale multiple
-
-this change adds 4 new keys to store the skip connections and one new
-tensor to store the scalar. the scalar is implemented a 1-dimensional
-tensor with 2 elements dervied from the model's bskcn_tv configuration.
-in general, the values are (bskcn_tv, 1 - bskcn_tv)
+adds support for the Solar Pro architecture
 ---
- src/llama-arch.cpp         |  21 +++++
+ src/llama-arch.cpp         |  21 ++++
 src/llama-arch.h           |   3 +
 src/llama-hparams.cpp      |   8 ++
- src/llama-hparams.h        |   5 ++
+ src/llama-hparams.h        |   5 +
 src/llama-model-loader.cpp |   1 +
- src/llama-model.cpp        |  44 +++++++++++
+ src/llama-model.cpp        | 207 +++++++++++++++++++++++++++++++++++++
 src/llama-model.h          |   3 +
- src/llama.cpp              | 152 ++++++++++++++++++++++++++++++++++++-
- 8 files changed, 236 insertions(+), 1 deletion(-)
+ 7 files changed, 248 insertions(+)

 diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
-index 97a1e7e5..a1e0ebcc 100644
+index a6fddc7f..0b0fedcd 100644
 --- a/src/llama-arch.cpp
 +++ b/src/llama-arch.cpp
-@@ -61,6 +61,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+@@ -68,6 +68,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_GRANITE,          "granite"          },
     { LLM_ARCH_GRANITE_MOE,      "granitemoe"       },
     { LLM_ARCH_CHAMELEON,        "chameleon"        },
 +    { LLM_ARCH_SOLAR,            "solar"            },
     { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
-     { LLM_ARCH_UNKNOWN,          "(unknown)"        },
- };
-@@ -125,6 +126,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
-     { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
-     { LLM_KV_ATTENTION_SLIDING_WINDOW,         "%s.attention.sliding_window"         },
-     { LLM_KV_ATTENTION_SCALE,                  "%s.attention.scale"                  },
-+    { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,  "%s.attention.block_skip_connection"  },
+     { LLM_ARCH_PLM,              "plm"              },
+     { LLM_ARCH_BAILINGMOE,       "bailingmoe"       },
+@@ -140,6 +141,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
+     { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,       "%s.attention.relative_buckets_count"       },
+     { LLM_KV_ATTENTION_SLIDING_WINDOW,               "%s.attention.sliding_window"               },
+     { LLM_KV_ATTENTION_SCALE,                        "%s.attention.scale"                        },
+    { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,        "%s.attention.block_skip_connection"        },
 
     { LLM_KV_ROPE_DIMENSION_COUNT,      "%s.rope.dimension_count"                 },
     { LLM_KV_ROPE_DIMENSION_SECTIONS,   "%s.rope.dimension_sections"              },
-@@ -1271,6 +1273,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
+@@ -1478,6 +1480,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
         },
     },
@ -66,7 +59,7 @@ index 97a1e7e5..a1e0ebcc 100644
     {
         LLM_ARCH_WAVTOKENIZER_DEC,
         {
-@@ -1429,6 +1449,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
+@@ -1671,6 +1691,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_FFN_EXP_PROBS_B,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     // this tensor is loaded for T5, but never used
     {LLM_TENSOR_DEC_CROSS_ATTN_REL_B,       {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
@ -75,18 +68,18 @@ index 97a1e7e5..a1e0ebcc 100644
     {LLM_TENSOR_POS_NET_NORM,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_POS_NET_NORM1,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
 diff --git a/src/llama-arch.h b/src/llama-arch.h
-index 122fdceb..77919578 100644
+index 2c2099b3..74aa3dd0 100644
 --- a/src/llama-arch.h
 +++ b/src/llama-arch.h
-@@ -65,6 +65,7 @@ enum llm_arch {
+@@ -72,6 +72,7 @@ enum llm_arch {
     LLM_ARCH_GRANITE,
     LLM_ARCH_GRANITE_MOE,
     LLM_ARCH_CHAMELEON,
 +    LLM_ARCH_SOLAR,
     LLM_ARCH_WAVTOKENIZER_DEC,
-     LLM_ARCH_UNKNOWN,
- };
-@@ -129,6 +130,7 @@ enum llm_kv {
+     LLM_ARCH_PLM,
+     LLM_ARCH_BAILINGMOE,
+@@ -144,6 +145,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
     LLM_KV_ATTENTION_SLIDING_WINDOW,
     LLM_KV_ATTENTION_SCALE,
@ -94,7 +87,7 @@ index 122fdceb..77919578 100644
 
     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_DIMENSION_SECTIONS,
-@@ -311,6 +313,7 @@ enum llm_tensor {
+@@ -340,6 +342,7 @@ enum llm_tensor {
     LLM_TENSOR_ENC_OUTPUT_NORM,
     LLM_TENSOR_CLS,
     LLM_TENSOR_CLS_OUT,
@ -103,14 +96,13 @@ index 122fdceb..77919578 100644
     LLM_TENSOR_CONVNEXT_DW,
     LLM_TENSOR_CONVNEXT_NORM,
 diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
-index ea87b295..f3955de9 100644
+index 90dfe7a7..8a667960 100644
 --- a/src/llama-hparams.cpp
 +++ b/src/llama-hparams.cpp
-@@ -69,3 +69,11 @@ uint32_t llama_hparams::n_embd_v_s() const {
-     // corresponds to Mamba's ssm_states size
+@@ -70,6 +70,14 @@ uint32_t llama_hparams::n_embd_v_s() const {
     return ssm_d_state * ssm_d_inner;
 }
-+
+ 
 +bool llama_hparams::n_bskcn(uint32_t n, uint32_t il) const {
 +    if (il < n_layer) {
 +        return n_bskcn_arr[n][il] > 0;
@ -118,12 +110,15 @@ index ea87b295..f3955de9 100644
 +
 +    GGML_ABORT("fatal error");
 +}
-\ No newline at end of file
+
+ bool llama_hparams::is_swa(uint32_t il) const {
+     if (il < n_layer) {
+         return n_swa > 0 && n_swa_pattern > 0 && il % n_swa_pattern < (n_swa_pattern - 1);
 diff --git a/src/llama-hparams.h b/src/llama-hparams.h
-index 1fe45410..1bdcdfd5 100644
+index 4e0b5719..c3147cbc 100644
 --- a/src/llama-hparams.h
 +++ b/src/llama-hparams.h
-@@ -50,6 +50,8 @@ struct llama_hparams {
+@@ -51,6 +51,8 @@ struct llama_hparams {
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
 
@ -132,18 +127,18 @@ index 1fe45410..1bdcdfd5 100644
     uint32_t n_layer_dense_lead = 0;
     uint32_t n_lora_q           = 0;
     uint32_t n_lora_kv          = 0;
-@@ -133,6 +135,9 @@ struct llama_hparams {
- 
+@@ -149,6 +151,9 @@ struct llama_hparams {
     // dimension of the recurrent state embeddings
     uint32_t n_embd_v_s() const;
-+
+ 
 +    // Block skip connection
 +    bool n_bskcn(uint32_t n, uint32_t il) const;
+
+     bool is_swa(uint32_t il) const;
 };
 
- static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
 diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
-index 05d58ad9..1252aca1 100644
+index ea73a8a7..a012aeae 100644
 --- a/src/llama-model-loader.cpp
 +++ b/src/llama-model-loader.cpp
@@ -439,6 +439,7 @@ namespace GGUFMeta {
@ -155,10 +150,10 @@ index 05d58ad9..1252aca1 100644
 llama_model_loader::llama_model_loader(
         const std::string & fname,
 diff --git a/src/llama-model.cpp b/src/llama-model.cpp
-index 36a0a009..ad1315c6 100644
+index b74dd72c..5fbd0055 100644
 --- a/src/llama-model.cpp
 +++ b/src/llama-model.cpp
-@@ -1238,6 +1238,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
+@@ -1372,6 +1372,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                }
             } break;
@ -180,7 +175,7 @@ index 36a0a009..ad1315c6 100644
         case LLM_ARCH_WAVTOKENIZER_DEC:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
-@@ -3316,6 +3331,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
+@@ -3701,6 +3716,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
                         layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
 
@ -215,54 +210,12 @@ index 36a0a009..ad1315c6 100644
                         layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
                         layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
                         layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-@@ -3900,6 +3943,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
-         case LLM_ARCH_GRANITE:
-         case LLM_ARCH_GRANITE_MOE:
-         case LLM_ARCH_CHAMELEON:
-+        case LLM_ARCH_SOLAR:
-             return LLAMA_ROPE_TYPE_NORM;
+@@ -12244,6 +12287,165 @@ struct llm_build_chameleon : public llm_graph_context {
+     }
+ };
 
-         // the pairs of head values are offset by n_rot/2
-diff --git a/src/llama-model.h b/src/llama-model.h
-index a7c30444..1afb0024 100644
--- a/src/llama-model.h
-+++ b/src/llama-model.h
-@@ -55,6 +55,7 @@ enum llm_type {
-     LLM_TYPE_15B,
-     LLM_TYPE_16B,
-     LLM_TYPE_20B,
-+    LLM_TYPE_22B,
-     LLM_TYPE_30B,
-     LLM_TYPE_32B,
-     LLM_TYPE_34B,
-@@ -281,6 +282,8 @@ struct llama_layer {
-     struct ggml_tensor * ffn_up_scale   = nullptr;
-     struct ggml_tensor * ffn_down_scale = nullptr;
- 
-+    struct ggml_tensor * bskcn_tv = nullptr;
-+
-     struct llama_layer_posnet posnet;
- 
-     struct llama_layer_convnext convnext;
-diff --git a/src/llama.cpp b/src/llama.cpp
-index ac85bfed..6d320ea4 100644
--- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -7953,9 +7953,155 @@ struct llm_build_context {
-         cb(img_logits, "img_logits", -1);
-         cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx);
-         cb(cur, "result_output", -1);
-
-         ggml_build_forward_expand(gf, cur);
-+        return gf;
-+   }
-+
-+   ggml_cgraph * build_solar() {
-+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
-+
-+        // mutable variable, needed during the last layer of the computation to skip unused tokens
-+        int32_t n_tokens = this->n_tokens;
-+
+struct llm_build_solar : public llm_graph_context {
+    llm_build_solar(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
 +        const int64_t n_embd_head = hparams.n_embd_head_v;
 +        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 +        GGML_ASSERT(n_embd_head == hparams.n_rot);
@ -270,13 +223,15 @@ index ac85bfed..6d320ea4 100644
 +        struct ggml_tensor * cur;
 +        struct ggml_tensor * inpL;
 +
-+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+        inpL = build_inp_embd(model.tok_embd);
 +
 +        // inp_pos - contains the positions
 +        struct ggml_tensor * inp_pos = build_inp_pos();
 +
 +        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
 +
 +        struct ggml_tensor * bskcn_1;
 +        struct ggml_tensor * bskcn_2;
@ -305,88 +260,94 @@ index ac85bfed..6d320ea4 100644
 +                   ggml_mul(ctx0, bskcn_2, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, 0)),
 +                   ggml_mul(ctx0, inpSA, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, ggml_element_size(model.layers[il].bskcn_tv))));
 +            }
- 
+
 +            // norm
-+            cur = llm_build_norm(ctx0, inpL, hparams,
+            cur = build_norm(inpL,
 +                    model.layers[il].attn_norm, NULL,
-+                    LLM_NORM_RMS, cb, il);
+                    LLM_NORM_RMS, il);
 +            cb(cur, "attn_norm", il);
 +
 +            // self-attention
 +            {
 +                // rope freq factors for llama3; may return nullptr for llama2 and other models
-+                struct ggml_tensor * rope_factors = build_rope_factors(il);
+                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
 +
 +                // compute Q and K and RoPE them
-+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
 +                cb(Qcur, "Qcur", il);
 +                if (model.layers[il].bq) {
 +                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
 +                    cb(Qcur, "Qcur", il);
 +                }
 +
-+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
 +                cb(Kcur, "Kcur", il);
 +                if (model.layers[il].bk) {
 +                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
 +                    cb(Kcur, "Kcur", il);
 +                }
 +
-+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
 +                cb(Vcur, "Vcur", il);
 +                if (model.layers[il].bv) {
 +                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
 +                    cb(Vcur, "Vcur", il);
 +                }
 +
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
 +                Qcur = ggml_rope_ext(
-+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
-+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-+                    ext_factor, attn_factor, beta_fast, beta_slow
-+                );
-+                cb(Qcur, "Qcur", il);
+                        ctx0, Qcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
 +
 +                Kcur = ggml_rope_ext(
-+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
-+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-+                    ext_factor, attn_factor, beta_fast, beta_slow
-+                );
+                        ctx0, Kcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+                
+                cb(Qcur, "Qcur", il);
 +                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
 +
-+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                cur = build_attn(inp_attn, gf,
 +                        model.layers[il].wo, model.layers[il].bo,
-+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Qcur, Kcur, Vcur, nullptr, kq_scale, il);
+                cb(cur, "attn_out", il);
 +            }
 +
 +            if (il == n_layer - 1) {
 +                // skip computing output for unused tokens
-+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-+                n_tokens = n_outputs;
+                ggml_tensor * inp_out_ids = build_inp_out_ids();
 +                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
 +                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
 +            }
 +
-+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
 +            cb(ffn_inp, "ffn_inp", il);
 +
 +            // feed-forward network
-+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+            cur = build_norm(ffn_inp,
 +                    model.layers[il].ffn_norm, NULL,
-+                    LLM_NORM_RMS, cb, il);
+                    LLM_NORM_RMS, il);
 +            cb(cur, "ffn_norm", il);
 +
-+            cur = llm_build_ffn(ctx0, lctx, cur,
+            cur = build_ffn(cur,
 +                    model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
 +                    model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
 +                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
 +                    NULL,
-+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
 +            cb(cur, "ffn_out", il);
 +
 +            cur = ggml_add(ctx0, cur, ffn_inp);
 +            cb(cur, "ffn_out", il);
 +
-+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cur = build_cvec(cur, il);
 +            cb(cur, "l_out", il);
 +
 +            // input for next layer
@ -394,25 +355,64 @@ index ac85bfed..6d320ea4 100644
 +        }
 +
 +        cur = inpL;
-+        cur = llm_build_norm(ctx0, cur, hparams,
+
+        cur = build_norm(cur,
 +                model.output_norm, NULL,
-+                LLM_NORM_RMS, cb, -1);
+                LLM_NORM_RMS, -1);
+
 +        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
 +        // lm_head
-+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cur = build_lora_mm(model.output, cur);
+
 +        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
 +        ggml_build_forward_expand(gf, cur);
-         return gf;
-     }
- 
-@@ -8398,6 +8544,10 @@ static struct ggml_cgraph * llama_build_graph(
+    }
+};
+
+ struct llm_build_wavtokenizer_dec : public llm_graph_context {
+     llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+         ggml_tensor * cur;
+@@ -12993,6 +13195,10 @@ llm_graph_result_ptr llama_model::build_graph(
             {
-                 result = llm.build_chameleon();
+                 llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
             } break;
 +        case LLM_ARCH_SOLAR:
 +            {
-+                result = llm.build_solar();
+                llm = std::make_unique<llm_build_solar>(*this, params, gf);
 +            } break;
         case LLM_ARCH_WAVTOKENIZER_DEC:
             {
-                 result = llm.build_wavtokenizer_dec();
+                 llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf);
+@@ -13139,6 +13345,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
+         case LLM_ARCH_GRANITE:
+         case LLM_ARCH_GRANITE_MOE:
+         case LLM_ARCH_CHAMELEON:
+        case LLM_ARCH_SOLAR:
+         case LLM_ARCH_BAILINGMOE:
+             return LLAMA_ROPE_TYPE_NORM;
+ 
+diff --git a/src/llama-model.h b/src/llama-model.h
+index 0f18dac1..e08d4ae4 100644
+--- a/src/llama-model.h
+++ b/src/llama-model.h
+@@ -62,6 +62,7 @@ enum llm_type {
+     LLM_TYPE_15B,
+     LLM_TYPE_16B,
+     LLM_TYPE_20B,
+    LLM_TYPE_22B,
+     LLM_TYPE_30B,
+     LLM_TYPE_32B,
+     LLM_TYPE_34B,
+@@ -305,6 +306,8 @@ struct llama_layer {
+     struct ggml_tensor * ffn_up_scale   = nullptr;
+     struct ggml_tensor * ffn_down_scale = nullptr;
+ 
+    struct ggml_tensor * bskcn_tv = nullptr;
+
+     struct llama_layer_posnet posnet;
+ 
+     struct llama_layer_convnext convnext;
--- a/llama/patches/0006-conditional-fattn.patch
+++ b/llama/patches/0006-conditional-fattn.patch
@ -8,10 +8,10 @@ Subject: [PATCH] conditional-fattn
 1 file changed, 2 insertions(+)

 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index dfff21a2..1b0d074b 100644
+index 59a49560..b70c6a32 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
-@@ -2284,9 +2284,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
+@@ -2338,9 +2338,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_ARGSORT:
             ggml_cuda_op_argsort(ctx, dst);
             break;
--- a/llama/patches/0007-add-mllama-support.patch
+++ b/llama/patches/0007-add-mllama-support.patch
--- a/llama/patches/0008-add-unpad-operator.patch
+++ b/llama/patches/0008-add-unpad-operator.patch
@ -1,24 +1,27 @@
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Thu, 17 Oct 2024 17:19:25 -0700
+From: jmorganca <jmorganca@gmail.com>
+Date: Sun, 13 Apr 2025 22:10:06 -0400
 Subject: [PATCH] add unpad operator

+adds the unpad operator to GGML
 ---
 ggml/include/ggml.h                  | 10 +++++
- ggml/src/ggml-cpu/ggml-cpu.c         | 58 ++++++++++++++++++++++++++++
+ ggml/src/ggml-cpu/ggml-cpu.c         |  5 +++
+ ggml/src/ggml-cpu/ops.cpp            | 55 ++++++++++++++++++++++++++++
+ ggml/src/ggml-cpu/ops.h              |  1 +
 ggml/src/ggml-cuda/ggml-cuda.cu      |  4 ++
- ggml/src/ggml-cuda/pad.cu            | 46 ++++++++++++++++++++++
+ ggml/src/ggml-cuda/pad.cu            | 46 +++++++++++++++++++++++
 ggml/src/ggml-cuda/pad.cuh           |  1 +
- ggml/src/ggml-metal/ggml-metal.m     | 33 ++++++++++++++++
- ggml/src/ggml-metal/ggml-metal.metal | 45 +++++++++++++++++++++
- ggml/src/ggml.c                      | 25 +++++++++++-
- 8 files changed, 220 insertions(+), 2 deletions(-)
+ ggml/src/ggml-metal/ggml-metal.m     | 33 +++++++++++++++++
+ ggml/src/ggml-metal/ggml-metal.metal | 45 +++++++++++++++++++++++
+ ggml/src/ggml.c                      | 25 ++++++++++++-
+ 10 files changed, 223 insertions(+), 2 deletions(-)

 diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
-index dd0c6a96..8d269a9c 100644
+index 8fcc16df..d19fc167 100644
 --- a/ggml/include/ggml.h
 +++ b/ggml/include/ggml.h
-@@ -487,6 +487,7 @@ extern "C" {
+@@ -488,6 +488,7 @@ extern "C" {
         GGML_OP_UPSCALE, // nearest interpolate
         GGML_OP_PAD,
         GGML_OP_PAD_REFLECT_1D,
@ -26,7 +29,7 @@ index dd0c6a96..8d269a9c 100644
         GGML_OP_ARANGE,
         GGML_OP_TIMESTEP_EMBEDDING,
         GGML_OP_ARGSORT,
-@@ -1743,6 +1744,15 @@ extern "C" {
+@@ -1757,6 +1758,15 @@ extern "C" {
             int                   p0,
             int                   p1);
 
@ -43,13 +46,38 @@ index dd0c6a96..8d269a9c 100644
     // timesteps: [N,]
     // return: [N, dim]
 diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
-index 72325349..2f606d82 100644
+index 50400328..432942bf 100644
 --- a/ggml/src/ggml-cpu/ggml-cpu.c
 +++ b/ggml/src/ggml-cpu/ggml-cpu.c
-@@ -10844,6 +10844,59 @@ static void ggml_compute_forward_pad_reflect_1d(
+@@ -1960,6 +1960,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
+             {
+                 ggml_compute_forward_pad_reflect_1d(params, tensor);
+             } break;
+        case GGML_OP_UNPAD:
+            {
+                ggml_compute_forward_unpad(params, tensor);
+            } break;
+         case GGML_OP_ARANGE:
+             {
+                 ggml_compute_forward_arange(params, tensor);
+@@ -2282,6 +2286,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
+         case GGML_OP_UPSCALE:
+         case GGML_OP_PAD:
+         case GGML_OP_PAD_REFLECT_1D:
+        case GGML_OP_UNPAD:
+         case GGML_OP_ARANGE:
+         case GGML_OP_TIMESTEP_EMBEDDING:
+         case GGML_OP_ARGSORT:
+diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
+index 6050147b..66b8da68 100644
+--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
+@@ -6531,6 +6531,61 @@ void ggml_compute_forward_pad_reflect_1d(
     }
 }
 
+// ggml_compute_forward_unpad
+
 +static void ggml_compute_forward_unpad_f32(
 +    const struct ggml_compute_params *params,
 +    struct ggml_tensor *dst) {
@ -85,7 +113,7 @@ index 72325349..2f606d82 100644
 +    }
 +}
 +
-+static void ggml_compute_forward_unpad(
+void ggml_compute_forward_unpad(
 +    const struct ggml_compute_params * params,
 +    struct ggml_tensor * dst) {
 +
@ -106,30 +134,23 @@ index 72325349..2f606d82 100644
 // ggml_compute_forward_arange
 
 static void ggml_compute_forward_arange_f32(
-@@ -13137,6 +13190,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
-             {
-                 ggml_compute_forward_pad_reflect_1d(params, tensor);
-             } break;
-+        case GGML_OP_UNPAD:
-+            {
-+                ggml_compute_forward_unpad(params, tensor);
-+            } break;
-         case GGML_OP_ARANGE:
-             {
-                 ggml_compute_forward_arange(params, tensor);
-@@ -13484,6 +13541,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
-         case GGML_OP_UPSCALE:
-         case GGML_OP_PAD:
-         case GGML_OP_PAD_REFLECT_1D:
-+        case GGML_OP_UNPAD:
-         case GGML_OP_ARANGE:
-         case GGML_OP_TIMESTEP_EMBEDDING:
-         case GGML_OP_ARGSORT:
+diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h
+index 410a3720..3eca1cf8 100644
+--- a/ggml/src/ggml-cpu/ops.h
+++ b/ggml/src/ggml-cpu/ops.h
+@@ -71,6 +71,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params
+ void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+ void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+ void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_unpad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+ void ggml_compute_forward_arange(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+ void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+ void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 1b0d074b..c7a957c8 100644
+index b70c6a32..67208cba 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
-@@ -2200,6 +2200,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
+@@ -2245,6 +2245,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_PAD:
             ggml_cuda_op_pad(ctx, dst);
             break;
@ -139,16 +160,16 @@ index 1b0d074b..c7a957c8 100644
         case GGML_OP_ARANGE:
             ggml_cuda_op_arange(ctx, dst);
             break;
-@@ -3199,6 +3202,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
-             return ggml_is_contiguous(op->src[0]);
+@@ -3223,6 +3226,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_UPSCALE:
+             return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
         case GGML_OP_PAD:
 +        case GGML_OP_UNPAD:
         case GGML_OP_ARANGE:
         case GGML_OP_TIMESTEP_EMBEDDING:
         case GGML_OP_LEAKY_RELU:
 diff --git a/ggml/src/ggml-cuda/pad.cu b/ggml/src/ggml-cuda/pad.cu
-index aba539e8..b4b87409 100644
+index 77432b04..7d45a7e1 100644
 --- a/ggml/src/ggml-cuda/pad.cu
 +++ b/ggml/src/ggml-cuda/pad.cu
@@ -47,3 +47,49 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@ -212,10 +233,10 @@ index 8fd386b0..e2ededc3 100644
 void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 +void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
-index fd9a4e77..e4c093f9 100644
+index 310afe8a..b121ab9e 100644
 --- a/ggml/src/ggml-metal/ggml-metal.m
 +++ b/ggml/src/ggml-metal/ggml-metal.m
-@@ -331,6 +331,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
+@@ -341,6 +341,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
     GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
     GGML_METAL_KERNEL_TYPE_PAD_F32,
     GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,
@ -223,23 +244,23 @@ index fd9a4e77..e4c093f9 100644
     GGML_METAL_KERNEL_TYPE_ARANGE_F32,
     GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
     GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
-@@ -946,6 +947,7 @@ @implementation GGMLMetalClass
-         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32,                   upscale_f32,                    true);
-         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32,                       pad_f32,                        true);
-         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,            pad_reflect_1d_f32,             true);
-+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UNPAD_F32,                     unpad_f32,                      true);
-         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,        timestep_embedding_f32,         true);
-         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32,                    arange_f32,                     true);
-         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,           argsort_f32_i32_asc,            true);
-@@ -1254,6 +1256,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
-         case GGML_OP_UPSCALE:
+@@ -998,6 +999,7 @@ @implementation GGMLMetalClass
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32,                     upscale_f32,                     true);
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32,                         pad_f32,                         true);
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,              pad_reflect_1d_f32,              true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UNPAD_F32,                       unpad_f32,                       true);
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,          timestep_embedding_f32,          true);
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32,                      arange_f32,                      true);
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,             argsort_f32_i32_asc,             true);
+@@ -1339,6 +1341,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
+         case GGML_OP_POOL_2D:
         case GGML_OP_PAD:
         case GGML_OP_PAD_REFLECT_1D:
 +        case GGML_OP_UNPAD:
-         case GGML_OP_ARANGE:
         case GGML_OP_TIMESTEP_EMBEDDING:
         case GGML_OP_ARGSORT:
-@@ -3469,6 +3472,36 @@ static void ggml_metal_encode_node(
+         case GGML_OP_LEAKY_RELU:
+@@ -3669,6 +3672,36 @@ static void ggml_metal_encode_node(
 
                 const int nth = MIN(1024, ne0);
 
@ -277,10 +298,10 @@ index fd9a4e77..e4c093f9 100644
             } break;
         case GGML_OP_ARANGE:
 diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
-index d092a169..f38909d0 100644
+index b08666e2..e3185e5b 100644
 --- a/ggml/src/ggml-metal/ggml-metal.metal
 +++ b/ggml/src/ggml-metal/ggml-metal.metal
-@@ -2953,6 +2953,51 @@ kernel void kernel_pad_reflect_1d_f32(
+@@ -2968,6 +2968,51 @@ kernel void kernel_pad_reflect_1d_f32(
     }
 }
 
@ -331,12 +352,12 @@ index d092a169..f38909d0 100644
 +
 kernel void kernel_arange_f32(
     device        char * dst,
-     constant   int64_t & ne0,
+     constant   ggml_metal_kargs_arange & args,
 diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
-index 7fc06724..635aa299 100644
+index 950772c7..2276b631 100644
 --- a/ggml/src/ggml.c
 +++ b/ggml/src/ggml.c
-@@ -962,6 +962,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
+@@ -963,6 +963,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "UPSCALE",
     "PAD",
     "PAD_REFLECT_1D",
@ -344,16 +365,16 @@ index 7fc06724..635aa299 100644
     "ARANGE",
     "TIMESTEP_EMBEDDING",
     "ARGSORT",
-@@ -996,7 +997,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
+@@ -993,7 +994,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "OPT_STEP_ADAMW",
 };
 
-static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
-+static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84");
+-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
-@@ -1059,6 +1060,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
+@@ -1057,6 +1058,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "upscale(x)",
     "pad(x)",
     "pad_reflect_1d(x)",
@ -361,16 +382,16 @@ index 7fc06724..635aa299 100644
     "arange(start, stop, step)",
     "timestep_embedding(timesteps, dim, max_period)",
     "argsort(x)",
-@@ -1093,7 +1095,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
+@@ -1087,7 +1089,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "adamw(x)",
 };
 
-static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
-+static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84");
+-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
-@@ -4225,6 +4227,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
+@@ -4262,6 +4264,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
     return result;
 }
 
--- a/llama/patches/0009-fix-deepseek-deseret-regex.patch
+++ b/llama/patches/0009-fix-deepseek-deseret-regex.patch
@ -1,20 +1,21 @@
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Daniel Hiltgen <daniel@ollama.com>
-Date: Fri, 25 Oct 2024 16:25:18 -0700
+From: jmorganca <jmorganca@gmail.com>
+Date: Tue, 8 Apr 2025 19:43:06 -0700
 Subject: [PATCH] fix deepseek deseret regex

-On windows compiled with gcc the c++ regex library failed to handle
-the characters
+on some Windows systems, deepseek's regex would throw an
+error due to the Deseret characters in the matching
+regex
 ---
 src/llama-vocab.cpp |  2 +-
- src/unicode.cpp     | 22 ++++++++++++++++++++++
- 2 files changed, 23 insertions(+), 1 deletion(-)
+ src/unicode.cpp     | 21 +++++++++++++++++++++
+ 2 files changed, 22 insertions(+), 1 deletion(-)

 diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
-index a4eee9b8..1ca827eb 100644
+index 0125ee53..d74919d2 100644
 --- a/src/llama-vocab.cpp
 +++ b/src/llama-vocab.cpp
-@@ -295,7 +295,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
+@@ -296,7 +296,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
             case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
                 regex_exprs = {
                     "[\r\n]",
@ -24,7 +25,7 @@ index a4eee9b8..1ca827eb 100644
                     "\\s+$",
                     "[一-龥ࠀ-一가-퟿]+",
 diff --git a/src/unicode.cpp b/src/unicode.cpp
-index e63bb4ab..9dd53b9a 100644
+index e63bb4ab..73cb2b1a 100644
 --- a/src/unicode.cpp
 +++ b/src/unicode.cpp
@@ -2,6 +2,11 @@
@ -39,7 +40,7 @@ index e63bb4ab..9dd53b9a 100644
 #include "unicode.h"
 #include "unicode-data.h"
 
-@@ -200,6 +205,22 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
+@@ -200,6 +205,21 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
 }
 
 static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
@ -58,11 +59,10 @@ index e63bb4ab..9dd53b9a 100644
 +    free(wbuf);
 +    return ret;
 +#else
-+
 #if defined(__clang__)
     // disable C++17 deprecation warning for std::codecvt_utf8
 #    pragma clang diagnostic push
-@@ -213,6 +234,7 @@ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
+@@ -213,6 +233,7 @@ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
 #endif
 
     return conv.from_bytes(s);
--- a/llama/patches/0010-Maintain-ordering-for-rules-for-grammar.patch
+++ b/llama/patches/0010-Maintain-ordering-for-rules-for-grammar.patch
@ -1,14 +1,14 @@
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: ParthSareen <parth.sareen@ollama.com>
-Date: Wed, 11 Dec 2024 15:37:32 -0800
-Subject: [PATCH] Maintain ordering for rules for grammar
+From: jmorganca <jmorganca@gmail.com>
+Date: Tue, 8 Apr 2025 19:43:40 -0700
+Subject: [PATCH] maintain ordering for rules for grammar

 ---
 common/json-schema-to-grammar.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

 diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
-index 3ebcc3d9..30c28808 100644
+index 90679822..56043678 100644
 --- a/common/json-schema-to-grammar.cpp
 +++ b/common/json-schema-to-grammar.cpp
@@ -346,7 +346,7 @@ private:
--- a/llama/patches/0011-ensure-KV-cache-is-fully-defragmented.patch
+++ b/llama/patches/0011-ensure-KV-cache-is-fully-defragmented.patch
@ -0,0 +1,361 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: jmorganca <jmorganca@gmail.com>
+Date: Tue, 15 Apr 2025 14:27:40 -0400
+Subject: [PATCH] ensure KV cache is fully defragmented
+
+Sometimes the KV cache requires defragmentation even without
+triggering the threshold heuristic. In this case, decoding
+will not be able to find a KV cache slot. This is particularly
+difficult for the caller to handle if it happens in between
+ubatches. To avoid this, we should immediately trigger a defrag.
+
+In addition, a heavily fragmented cache can require more than
+max_moves to defragment. Currently, we stop when we hit the limit
+but this can leave a cache that still does not have adequate space
+even after defragmentation is triggered. Instead, we should do
+multiple batches of processing until everything is complete.
+---
+ src/llama-context.cpp  | 105 +++++++++++++----------------------------
+ src/llama-context.h    |   4 +-
+ src/llama-kv-cache.cpp |  39 +++------------
+ src/llama-kv-cache.h   |   9 +++-
+ 4 files changed, 51 insertions(+), 106 deletions(-)
+
+diff --git a/src/llama-context.cpp b/src/llama-context.cpp
+index afe6f552..d6e7b3af 100644
+--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
+@@ -590,13 +590,12 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
+ 
+ llm_graph_result_ptr llama_context::build_kv_self_defrag(
+         ggml_context * ctx0,
+-        ggml_cgraph * gf) const {
+        ggml_cgraph * gf,
+        const std::vector<struct llama_kv_defrag_move> & moves) const {
+     auto res = std::make_unique<llm_graph_result>();
+ 
+     const auto & hparams = model.hparams;
+ 
+-    const auto & ids = kv_self->defrag_info.ids;
+-
+ #if 0
+     // CPU defrag
+     //
+@@ -668,32 +667,20 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
+         ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
+     }
+ #else
+-    for (uint32_t i = 0; i < ids.size(); ++i) {
+-        const uint32_t id = ids[i];
+-
+-        if (i == id || id == ids.size()) {
+-            continue;
+-        }
+-
+-        uint32_t nm = 1;
+-
+-        while (i + nm < ids.size() && ids[i + nm] == id + nm) {
+-            nm++;
+-        }
+-
+    for (const auto & move : moves) {
+         for (uint32_t il = 0; il < hparams.n_layer; ++il) { // NOLINT
+             const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
+             const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
+ 
+             ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self->k_l[il],
+-                    n_embd_k_gqa, nm,
+                    n_embd_k_gqa, move.len,
+                     ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
+-                    ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*i));
+                    ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*move.src));
+ 
+             ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self->k_l[il],
+-                    n_embd_k_gqa, nm,
+                    n_embd_k_gqa, move.len,
+                     ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
+-                    ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*id));
+                    ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*move.dst));
+ 
+             ggml_tensor * view_v_src;
+             ggml_tensor * view_v_dst;
+@@ -701,34 +688,30 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
+             if (cparams.flash_attn) {
+                 // NOTE: the V cache is not transposed when using flash attention
+                 view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il],
+-                        n_embd_v_gqa, nm,
+                        n_embd_v_gqa, move.len,
+                         ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa),
+-                        ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*i));
+                        ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*move.src));
+ 
+                 view_v_dst = ggml_view_2d(ctx0, kv_self->v_l[il],
+-                        n_embd_v_gqa, nm,
+                        n_embd_v_gqa, move.len,
+                         ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa),
+-                        ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*id));
+                        ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*move.dst));
+             } else {
+                 view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il],
+-                        nm, n_embd_v_gqa,
+                        move.len, n_embd_v_gqa,
+                         ggml_row_size(kv_self->v_l[il]->type, kv_self->size),
+-                        ggml_row_size(kv_self->v_l[il]->type, i));
+                        ggml_row_size(kv_self->v_l[il]->type, move.src));
+ 
+                 view_v_dst = ggml_view_2d(ctx0, kv_self->v_l[il],
+-                        nm, n_embd_v_gqa,
+                        move.len, n_embd_v_gqa,
+                         ggml_row_size(kv_self->v_l[il]->type, kv_self->size),
+-                        ggml_row_size(kv_self->v_l[il]->type, id));
+                        ggml_row_size(kv_self->v_l[il]->type, move.dst));
+             }
+ 
+             ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
+             ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
+         }
+-
+-        i += nm - 1;
+     }
+-
+-    //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
+ #endif
+ 
+     return res;
+@@ -737,8 +720,6 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
+ void llama_context::kv_self_update() {
+     auto & kv = kv_self;
+ 
+-    bool need_reserve = false;
+-
+     if (kv->has_shift) {
+         if (!kv->get_can_shift()) {
+             GGML_ABORT("The current context does not support K-shift");
+@@ -759,8 +740,6 @@ void llama_context::kv_self_update() {
+             res->set_inputs(nullptr);
+ 
+             graph_compute(gf, false);
+-
+-            need_reserve = true;
+         }
+ 
+         {
+@@ -775,49 +754,28 @@ void llama_context::kv_self_update() {
+     // defragment the KV cache if needed
+     if (kv->do_defrag) {
+         LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
+        const uint32_t n_max_nodes = graph_max_nodes();
+        const uint32_t max_moves = (n_max_nodes - 2*model.hparams.n_layer)/(6*model.hparams.n_layer);
+        if (!kv->defrag_prepare(n_max_nodes)) {
+            LLAMA_LOG_ERROR("%s: failed to prepare defragmentation\n", __func__);
+            return;
+        }
+ 
+-        if (kv->defrag_prepare(graph_max_nodes())) {
+-            ggml_backend_sched_reset(sched.get());
+        for (std::size_t i = 0; i < kv_self->defrag_info.moves.size(); i += max_moves) {
+            std::vector<struct llama_kv_defrag_move> chunk;
+            auto end = std::min(i + max_moves, kv_self->defrag_info.moves.size());
+            chunk.assign(kv_self->defrag_info.moves.begin() + i, kv_self->defrag_info.moves.begin() + end);
+ 
+            ggml_backend_sched_reset(sched.get());
+             auto * gf = graph_init();
+-
+-            auto res = build_kv_self_defrag(ctx_compute.get(), gf);
+-
+            auto res = build_kv_self_defrag(ctx_compute.get(), gf, chunk);
+             ggml_backend_sched_alloc_graph(sched.get(), gf);
+-
+             res->set_inputs(nullptr);
+-
+             graph_compute(gf, false);
+-
+-            need_reserve = true;
+         }
+ 
+         kv->do_defrag = false;
+     }
+-
+-    // reserve a worst case graph if needed
+-    if (need_reserve) {
+-        LLAMA_LOG_DEBUG("%s: reserving a worst case graph\n", __func__);
+-
+-        // build worst-case graph
+-        uint32_t n_seqs = 1; // TODO: worst-case number of sequences
+-        uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
+-
+-        // simulate full KV cache
+-        kv_self->n = kv_self->size;
+-
+-        llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
+-        llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
+-
+-        auto * gf = graph_init();
+-        graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT);
+-
+-        // initialize scheduler with the worst-case graph
+-        ggml_backend_sched_reset(sched.get());
+-        if (!ggml_backend_sched_reserve(sched.get(), gf)) {
+-            LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
+-        }
+-    }
+ }
+ 
+ enum llama_pooling_type llama_context::pooling_type() const {
+@@ -1301,9 +1259,12 @@ int llama_context::decode(llama_batch & inp_batch) {
+         // find KV slot
+         {
+             if (!kv_self->find_slot(ubatch)) {
+-                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+-
+-                return 1;
+                kv_self->defrag();
+                kv_self_update();
+                if (!kv_self->find_slot(ubatch)) {
+                    LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+                    return 1;
+                }
+             }
+ 
+             if (!kv_self->recurrent) {
+diff --git a/src/llama-context.h b/src/llama-context.h
+index baa03276..a59ff8fd 100644
+--- a/src/llama-context.h
+++ b/src/llama-context.h
+@@ -5,6 +5,7 @@
+ #include "llama-cparams.h"
+ #include "llama-graph.h"
+ #include "llama-adapter.h"
+#include "llama-kv-cache.h"
+ 
+ #include "ggml-cpp.h"
+ 
+@@ -180,7 +181,8 @@ private:
+ 
+     llm_graph_result_ptr build_kv_self_defrag(
+             ggml_context * ctx0,
+-            ggml_cgraph * gf) const;
+            ggml_cgraph * gf,
+            const std::vector<struct llama_kv_defrag_move> & moves) const;
+ 
+     // TODO: read/write lora adapters and cvec
+     size_t state_write_data(llama_io_write_i & io);
+diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
+index 9310f262..5c941e7c 100644
+--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
+@@ -781,17 +781,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
+ 
+     assert(n_used <= n_kv);
+ 
+-    //const int64_t t_start = ggml_time_us();
+-
+-    // number of cells moved
+-    uint32_t n_moves = 0;
+-
+-    // each move requires 6*n_layer tensors (see graph_build_kv_self_defrag)
+-    //   - source view, destination view, copy operation
+-    //   - x2 for keys and values
+-    //const uint32_t max_moves = max_nodes()/(6*n_layer);
+-    // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
+-    const uint32_t max_moves = (n_max_nodes - 2*n_layer)/(6*n_layer);
+    defrag_info.moves.clear();
+ 
+     // determine which KV cells to move where
+     //
+@@ -799,10 +789,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
+     //
+     //  if ids[i] == i || ids[i] == n_kv, then cell i is not moved
+     //
+-    auto & ids = defrag_info.ids;
+-
+-    ids.clear();
+-    ids.resize(n_kv, n_kv);
+    std::vector<uint32_t> ids(n_kv, n_kv);
+ 
+     for (uint32_t i0 = 0; i0 < n_used; ++i0) {
+         const auto & cell0 = cells[i0];
+@@ -851,19 +838,11 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
+         // are we moving a continuous block of memory?
+         bool cont = false;
+ 
+-        // should we stop searching for the next move?
+-        bool stop = false;
+-
+         // go back and move the nf cells to the hole
+         for (; i1 < n_kv; ++i1) {
+             auto & cell1 = cells[i1];
+ 
+             if (cell1.is_empty() || ids[i1] != n_kv) {
+-                if (n_moves == max_moves) {
+-                    stop = true;
+-                    break;
+-                }
+-
+                 cont = false;
+                 continue;
+             }
+@@ -879,8 +858,10 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
+             head = n_used;
+ 
+             if (!cont) {
+-                n_moves++;
+                defrag_info.moves.push_back({i1, i0 + nf, 1});
+                 cont = true;
+            } else {
+                defrag_info.moves.back().len++;
+             }
+ 
+             nf++;
+@@ -890,22 +871,16 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
+             }
+         }
+ 
+-        if (stop || n_moves == max_moves) {
+-            break;
+-        }
+-
+         //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
+ 
+         i0 += nh - 1;
+     }
+ 
+-    if (n_moves == 0) {
+    if (defrag_info.moves.size() == 0) {
+         return false;
+     }
+ 
+-    LLAMA_LOG_DEBUG("(tmp log) KV defrag cell moves: %u\n", n_moves);
+-
+-    LLAMA_LOG_DEBUG("expected gf nodes: %u\n", 6*n_moves*n_layer);
+    // LLAMA_LOG_DEBUG("(tmp log) KV defrag cell moves: %u\n", n_moves);
+ 
+     return true;
+ }
+diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h
+index 56c74035..25cbcb56 100644
+--- a/src/llama-kv-cache.h
+++ b/src/llama-kv-cache.h
+@@ -43,6 +43,13 @@ private:
+     llama_kv_cache * kv;
+ };
+ 
+// block of KV slots to move when defragging
+struct llama_kv_defrag_move {
+    uint32_t src;
+    uint32_t dst;
+    uint32_t len;
+};
+
+ struct llama_kv_cell {
+     llama_pos pos   = -1;
+     llama_pos delta =  0;
+@@ -131,7 +138,7 @@ public:
+     // defrag
+ 
+     struct {
+-        std::vector<uint32_t> ids;
+        std::vector<llama_kv_defrag_move> moves;
+     } defrag_info;
+ 
+     // return true if cells have been moved
--- a/llama/patches/0011-llama-Ensure-KV-cache-is-fully-defragmented.patch
+++ b/llama/patches/0011-llama-Ensure-KV-cache-is-fully-defragmented.patch
@ -1,242 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Jesse Gross <jesse@ollama.com>
-Date: Fri, 13 Dec 2024 16:11:59 -0800
-Subject: [PATCH] llama: Ensure KV cache is fully defragmented.
-
-Sometimes the KV cache requires defragmentation even without
-triggering the threshold heuristic. In this case, decoding
-will not be able to find a KV cache slot. This is particularly
-difficult for the caller to handle if it happens in between
-ubatches. To avoid this, we should immediately trigger a defrag.
-
-In addition, a heavily fragmented cache can require more than
-max_moves to defragment. Currently, we stop when we hit the limit
-but this can leave a cache that still does not have adequate space
-even after defragmentation is triggered. Instead, we should do
-multiple batches of processing until everything is complete.
---
- src/llama.cpp | 99 ++++++++++++++++++++++++---------------------------
- 1 file changed, 46 insertions(+), 53 deletions(-)
-
-diff --git a/src/llama.cpp b/src/llama.cpp
-index 8f7902df..01854fce 100644
--- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -1054,6 +1054,13 @@ static struct ggml_tensor * llm_build_rwkv6_channel_mix(
-     return ggml_mul(ctx, r, llm_build_lora_mm(lctx, ctx, layer->channel_mix_value, k));
- }
- 
-+// block of KV slots to move when defragging
-+struct llama_kv_defrag_move {
-+    uint32_t src;
-+    uint32_t dst;
-+    uint32_t len;
-+};
-+
- struct llm_build_context {
-     const llama_model    & model;
-           llama_context  & lctx;
-@@ -1230,35 +1237,23 @@ struct llm_build_context {
-         return gf;
-     }
- 
-    struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
-+    struct ggml_cgraph * build_defrag(const std::vector<struct llama_kv_defrag_move> & moves) {
-         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
- 
-        for (uint32_t i = 0; i < ids.size(); ++i) {
-            const uint32_t id = ids[i];
-
-            if (i == id || id == ids.size()) {
-                continue;
-            }
-
-            uint32_t nm = 1;
-
-            while (i + nm < ids.size() && ids[i + nm] == id + nm) {
-                nm++;
-            }
-
-+        for (const auto & move : moves) {
-             for (int il = 0; il < n_layer; ++il) {
-                 const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
-                 const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
- 
-                 ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il],
-                        n_embd_k_gqa, nm,
-+                        n_embd_k_gqa, move.len,
-                         ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
-                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i));
-+                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*move.src));
- 
-                 ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il],
-                        n_embd_k_gqa, nm,
-+                        n_embd_k_gqa, move.len,
-                         ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
-                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
-+                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*move.dst));
- 
-                 ggml_tensor * view_v_src;
-                 ggml_tensor * view_v_dst;
-@@ -1266,31 +1261,29 @@ struct llm_build_context {
-                 if (flash_attn) {
-                     // NOTE: the V cache is not transposed when using flash attention
-                     view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
-                            n_embd_v_gqa, nm,
-+                            n_embd_v_gqa, move.len,
-                             ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
-                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i));
-+                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*move.src));
- 
-                     view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
-                            n_embd_v_gqa, nm,
-+                            n_embd_v_gqa, move.len,
-                             ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
-                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id));
-+                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*move.dst));
-                 } else {
-                     view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
-                            nm, n_embd_v_gqa,
-+                            move.len, n_embd_v_gqa,
-                             ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
-                            ggml_row_size(kv_self.v_l[il]->type, i));
-+                            ggml_row_size(kv_self.v_l[il]->type, move.src));
- 
-                     view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
-                            nm, n_embd_v_gqa,
-+                            move.len, n_embd_v_gqa,
-                             ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
-                            ggml_row_size(kv_self.v_l[il]->type, id));
-+                            ggml_row_size(kv_self.v_l[il]->type, move.dst));
-                 }
- 
-                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
-                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
-             }
-
-            i += nm - 1;
-         }
- 
-         //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
-@@ -8508,7 +8501,7 @@ struct llm_build_context {
-     }
- };
- 
-static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
-+static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<struct llama_kv_defrag_move> & moves) {
-     llama_ubatch dummy = {};
-     dummy.equal_seqs = true;
- 
-@@ -8518,7 +8511,7 @@ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const
- 
-     llm.init();
- 
-    struct ggml_cgraph * result = llm.build_defrag(ids);
-+    struct ggml_cgraph * result = llm.build_defrag(moves);
- 
-     llm.free();
- 
-@@ -8956,7 +8949,12 @@ static int llama_prepare_ubatch(
-             kv_self.head = 0;
-         }
- 
-        const auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
-+        auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
-+        if (!slot) {
-+            llama_kv_cache_defrag(kv_self);
-+            llama_kv_cache_update(&lctx);
-+            slot = llama_kv_cache_find_slot(kv_self, ubatch);
-+        }
-         if (!slot) {
-             return 1;
-         }
-@@ -9431,8 +9429,8 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
- 
-     //const int64_t t_start = ggml_time_us();
- 
-    // number of cells moved
-    uint32_t n_moves = 0;
-+    // groups of cells moved
-+    std::vector<struct llama_kv_defrag_move> moves;
- 
-     // each move requires 6*n_layer tensors (see build_defrag)
-     //   - source view, destination view, copy operation
-@@ -9496,19 +9494,11 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
-         // are we moving a continuous block of memory?
-         bool cont = false;
- 
-        // should we stop searching for the next move?
-        bool stop = false;
-
-         // go back and move the nf cells to the hole
-         for (; i1 < n_kv; ++i1) {
-             auto & cell1 = kv_self.cells[i1];
- 
-             if (cell1.is_empty() || ids[i1] != n_kv) {
-                if (n_moves == max_moves) {
-                    stop = true;
-                    break;
-                }
-
-                 cont = false;
-                 continue;
-             }
-@@ -9524,8 +9514,10 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
-             kv_self.head = n_used;
- 
-             if (!cont) {
-                n_moves++;
-+                moves.push_back({i1, i0 + nf, 1});
-                 cont = true;
-+            } else {
-+                moves.back().len++;
-             }
- 
-             nf++;
-@@ -9535,22 +9527,16 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
-             }
-         }
- 
-        if (stop || n_moves == max_moves) {
-            break;
-        }
-
-         //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
- 
-         i0 += nh - 1;
-     }
- 
-    if (n_moves == 0) {
-+    if (moves.size() == 0) {
-         return;
-     }
- 
-    //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves);
-
-    //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer);
-+    //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n",  moves.size());
- 
- #if 0
-     // CPU defrag
-@@ -9625,11 +9611,18 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
- #else
-     // ggml_graph defrag
- 
-    ggml_backend_sched_reset(lctx.sched.get());
-+    for (std::size_t i = 0; i < moves.size(); i += max_moves) {
-+        std::vector<struct llama_kv_defrag_move> chunk;
-+        auto end = std::min(i + max_moves, moves.size());
-+        chunk.assign(moves.begin() + i, moves.begin() + end);
- 
-    ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
-+        ggml_backend_sched_reset(lctx.sched.get());
-+
-+        //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*chunk.size()*n_layer);
-+        ggml_cgraph * gf = llama_build_graph_defrag(lctx, chunk);
- 
-    llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
-+        llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
-+    }
- #endif
- 
-     //const int64_t t_end = ggml_time_us();
--- a/llama/patches/0012-sort-devices-by-score.patch
+++ b/llama/patches/0012-sort-devices-by-score.patch
@ -1,17 +1,20 @@
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Tue, 14 Jan 2025 12:01:24 -0800
+From: jmorganca <jmorganca@gmail.com>
+Date: Tue, 8 Apr 2025 20:31:38 -0700
 Subject: [PATCH] sort devices by score

+in the ggml backend loading code, devices
+are now sorted by score, ensuring the device
+with the fastest acceleration is loaded
 ---
 ggml/src/ggml-backend-reg.cpp | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

 diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
-index 95036ef8..98d5e14d 100644
+index 82ae1b5b..1487f322 100644
 --- a/ggml/src/ggml-backend-reg.cpp
 +++ b/ggml/src/ggml-backend-reg.cpp
-@@ -150,7 +150,7 @@ struct ggml_backend_reg_entry {
+@@ -157,7 +157,7 @@ struct ggml_backend_reg_entry {
 
 struct ggml_backend_registry {
     std::vector<ggml_backend_reg_entry> backends;
@ -20,7 +23,7 @@ index 95036ef8..98d5e14d 100644
 
     ggml_backend_registry() {
 #ifdef GGML_USE_CUDA
-@@ -195,7 +195,7 @@ struct ggml_backend_registry {
+@@ -202,7 +202,7 @@ struct ggml_backend_registry {
         }
     }
 
@ -29,7 +32,7 @@ index 95036ef8..98d5e14d 100644
         if (!reg) {
             return;
         }
-@@ -206,15 +206,20 @@ struct ggml_backend_registry {
+@@ -213,15 +213,20 @@ struct ggml_backend_registry {
 #endif
         backends.push_back({ reg, std::move(handle) });
         for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
@ -52,17 +55,17 @@ index 95036ef8..98d5e14d 100644
 +        );
     }
 
-     ggml_backend_reg_t load_backend(const std::wstring & path, bool silent) {
-@@ -257,7 +262,7 @@ struct ggml_backend_registry {
+     ggml_backend_reg_t load_backend(const fs::path & path, bool silent) {
+@@ -265,7 +270,7 @@ struct ggml_backend_registry {
 
-         GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), utf16_to_utf8(path).c_str());
+         GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path_str(path).c_str());
 
 -        register_backend(reg, std::move(handle));
 +        register_backend(reg, score_fn ? score_fn() : -1, std::move(handle));
 
         return reg;
     }
-@@ -280,7 +285,7 @@ struct ggml_backend_registry {
+@@ -288,7 +293,7 @@ struct ggml_backend_registry {
         // remove devices
         devices.erase(
             std::remove_if(devices.begin(), devices.end(),
@ -71,7 +74,7 @@ index 95036ef8..98d5e14d 100644
             devices.end());
 
         // remove backend
-@@ -338,7 +343,7 @@ size_t ggml_backend_dev_count() {
+@@ -346,7 +351,7 @@ size_t ggml_backend_dev_count() {
 
 ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
     GGML_ASSERT(index < ggml_backend_dev_count());
--- a/llama/patches/0012-use-dynamic-backend-loading-for-clip.patch
+++ b/llama/patches/0012-use-dynamic-backend-loading-for-clip.patch
@ -1,102 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: jmorganca <jmorganca@gmail.com>
-Date: Sat, 4 Jan 2025 22:52:48 -0800
-Subject: [PATCH] use dynamic backend loading for clip
-
---
- examples/llava/clip.cpp | 74 +++++++++++++++--------------------------
- 1 file changed, 27 insertions(+), 47 deletions(-)
-
-diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
-index 205af1eb..560021c7 100644
--- a/examples/llava/clip.cpp
-+++ b/examples/llava/clip.cpp
-@@ -9,25 +9,25 @@
- #include "ggml-backend.h"
- #include "gguf.h"
- 
-//#ifdef GGML_USE_CUDA
-//#include "ggml-cuda.h"
-//#endif
-//
-//#ifdef GGML_USE_SYCL
-//#include "ggml-sycl.h"
-//#endif
-//
-//#ifdef GGML_USE_METAL
-//#include "ggml-metal.h"
-//#endif
-//
-//#ifdef GGML_USE_CANN
-//#include "ggml-cann.h"
-//#endif
-//
-//#ifdef GGML_USE_VULKAN
-//#include "ggml-vulkan.h"
-//#endif
-+#ifdef GGML_USE_CUDA
-+#include "ggml-cuda.h"
-+#endif
-+
-+#ifdef GGML_USE_SYCL
-+#include "ggml-sycl.h"
-+#endif
-+
-+#ifdef GGML_USE_METAL
-+#include "ggml-metal.h"
-+#endif
-+
-+#ifdef GGML_USE_CANN
-+#include "ggml-cann.h"
-+#endif
-+
-+#ifdef GGML_USE_VULKAN
-+#include "ggml-vulkan.h"
-+#endif
- 
- #define STB_IMAGE_IMPLEMENTATION
- #include "stb_image.h"
-@@ -1309,35 +1309,15 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
-         }
-     }
- 
-//#ifdef GGML_USE_CUDA
-//    new_clip->backend = ggml_backend_cuda_init(0);
-//    LOG_INF("%s: CLIP using CUDA backend\n", __func__);
-//#endif
-//
-//#ifdef GGML_USE_METAL
-//    new_clip->backend = ggml_backend_metal_init();
-//    LOG_INF("%s: CLIP using Metal backend\n", __func__);
-//#endif
-//
-//#ifdef GGML_USE_CANN
-//    new_clip->backend = ggml_backend_cann_init(0);
-//    LOG_INF("%s: CLIP using CANN backend\n", __func__);
-//#endif
-//
-//#ifdef GGML_USE_VULKAN
-//    new_clip->backend = ggml_backend_vk_init(0);
-//    LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
-//#endif
-//
-//#ifdef GGML_USE_SYCL
-//    new_clip->backend = ggml_backend_sycl_init(0);
-//    LOG_INF("%s: CLIP using SYCL backend\n", __func__);
-//#endif
-
-    if (!new_clip->backend) {
-        new_clip->backend = ggml_backend_cpu_init();
-        LOG_INF("%s: CLIP using CPU backend\n", __func__);
-+    ggml_backend_t backend = ggml_backend_init_best();
-+    if (backend == nullptr) {
-+        LOG_ERR("%s: failed to initialize backend\n", __func__);
-+        clip_free(new_clip);
-+        gguf_free(ctx);
-+        return nullptr;
-     }
-+    LOG_INF("%s: using %s backend\n", __func__, ggml_backend_name(backend));
-+    new_clip->backend = backend;
- 
-     // model size and capabilities
-     {
--- a/llama/patches/0013-add-phony-target-ggml-cpu-for-all-cpu-variants.patch
+++ b/llama/patches/0013-add-phony-target-ggml-cpu-for-all-cpu-variants.patch
@ -1,6 +1,6 @@
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Tue, 14 Jan 2025 15:59:04 -0800
+From: jmorganca <jmorganca@gmail.com>
+Date: Tue, 8 Apr 2025 20:32:07 -0700
 Subject: [PATCH] add phony target ggml-cpu for all cpu variants

 ---
@ -8,10 +8,10 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants
 1 file changed, 2 insertions(+)

 diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
-index 0002ac18..0a8d1092 100644
+index f00700da..91d6a7d5 100644
 --- a/ggml/src/CMakeLists.txt
 +++ b/ggml/src/CMakeLists.txt
-@@ -297,6 +297,7 @@ function(ggml_add_cpu_backend_variant tag_name)
+@@ -278,6 +278,7 @@ function(ggml_add_cpu_backend_variant tag_name)
     endforeach()
 
     ggml_add_cpu_backend_variant_impl(${tag_name})
@ -19,11 +19,11 @@ index 0002ac18..0a8d1092 100644
 endfunction()
 
 ggml_add_backend(CPU)
-@@ -305,6 +306,7 @@ if (GGML_CPU_ALL_VARIANTS)
+@@ -286,6 +287,7 @@ if (GGML_CPU_ALL_VARIANTS)
     if (NOT GGML_BACKEND_DL)
         message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
     endif()
 +    add_custom_target(ggml-cpu)
     ggml_add_cpu_backend_variant(sandybridge    AVX)
-     ggml_add_cpu_backend_variant(haswell        AVX F16C AVX2 FMA)
-     ggml_add_cpu_backend_variant(skylakex       AVX F16C AVX2 FMA AVX512)
+     ggml_add_cpu_backend_variant(haswell        AVX F16C AVX2 BMI2 FMA)
+     ggml_add_cpu_backend_variant(skylakex       AVX F16C AVX2 BMI2 FMA AVX512)
--- a/llama/patches/0014-remove-amx.patch
+++ b/llama/patches/0014-remove-amx.patch
@ -0,0 +1,25 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: jmorganca <jmorganca@gmail.com>
+Date: Tue, 8 Apr 2025 20:33:01 -0700
+Subject: [PATCH] remove amx
+
+disable amx as it reduces performance on some systems
+---
+ ggml/src/CMakeLists.txt | 4 ----
+ 1 file changed, 4 deletions(-)
+
+diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
+index 91d6a7d5..d6b393a2 100644
+--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
+@@ -293,10 +293,6 @@ if (GGML_CPU_ALL_VARIANTS)
+     ggml_add_cpu_backend_variant(skylakex       AVX F16C AVX2 BMI2 FMA AVX512)
+     ggml_add_cpu_backend_variant(icelake        AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
+     ggml_add_cpu_backend_variant(alderlake      AVX F16C AVX2 BMI2 FMA AVX_VNNI)
+-    if (NOT MSVC)
+-        # MSVC doesn't support AMX
+-        ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
+-    endif()
+ elseif (GGML_CPU)
+     ggml_add_cpu_backend_variant_impl("")
+ endif()
--- a/llama/patches/0015-fix-string-arr-kv-loading.patch
+++ b/llama/patches/0015-fix-string-arr-kv-loading.patch
@ -1,8 +1,11 @@
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
 From: jmorganca <jmorganca@gmail.com>
-Date: Wed, 5 Mar 2025 17:41:07 -0800
+Date: Tue, 8 Apr 2025 20:35:53 -0700
 Subject: [PATCH] fix string arr kv loading

+certain models would error when loading
+kv metadata fields that contain an array of strings
+such as vocab fields
 ---
 ggml/include/gguf.h | 1 +
 ggml/src/gguf.cpp   | 7 +++++--
@ -22,7 +25,7 @@ index 79ee2020..3efb22f0 100644
     // get ith C string from array with given key_id
     GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int64_t key_id, size_t i);
 diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp
-index ab13669c..f75b923f 100644
+index 381a9c7d..e45b453d 100644
 --- a/ggml/src/gguf.cpp
 +++ b/ggml/src/gguf.cpp
@@ -777,10 +777,14 @@ enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int64_t key_id
@ -50,10 +53,10 @@ index ab13669c..f75b923f 100644
 }
 
 diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
-index c7ff28be..7a185443 100644
+index d74919d2..c90f636c 100644
 --- a/src/llama-vocab.cpp
 +++ b/src/llama-vocab.cpp
-@@ -1443,7 +1443,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
+@@ -1459,7 +1459,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
 
             const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
             if (precompiled_charsmap_keyidx != -1) {
--- a/llama/patches/0015-use-std-filesystem-path-instead-of-wstring.patch
+++ b/llama/patches/0015-use-std-filesystem-path-instead-of-wstring.patch
@ -1,369 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: jmorganca <jmorganca@gmail.com>
-Date: Sun, 16 Feb 2025 20:00:22 -0500
-Subject: [PATCH] use std::filesystem::path instead of wstring
-
---
- ggml/src/ggml-backend-reg.cpp | 199 +++++++++++++++-------------------
- 1 file changed, 88 insertions(+), 111 deletions(-)
-
-diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
-index 98d5e14d..799af5f3 100644
--- a/ggml/src/ggml-backend-reg.cpp
-+++ b/ggml/src/ggml-backend-reg.cpp
-@@ -66,26 +66,6 @@
- #include "ggml-kompute.h"
- #endif
- 
-// disable C++17 deprecation warning for std::codecvt_utf8
-#if defined(__clang__)
-#    pragma clang diagnostic push
-#    pragma clang diagnostic ignored "-Wdeprecated-declarations"
-#endif
-
-static std::wstring utf8_to_utf16(const std::string & str) {
-    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
-    return converter.from_bytes(str);
-}
-
-static std::string utf16_to_utf8(const std::wstring & str) {
-    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
-    return converter.to_bytes(str);
-}
-
-#if defined(__clang__)
-#    pragma clang diagnostic pop
-#endif
-
- #ifdef _WIN32
- 
- using dl_handle = std::remove_pointer_t<HMODULE>;
-@@ -96,7 +76,7 @@ struct dl_handle_deleter {
-     }
- };
- 
-static dl_handle * dl_load_library(const std::wstring & path) {
-+static dl_handle * dl_load_library(const std::filesystem::path & path) {
-     // suppress error dialogs for missing DLLs
-     DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
-     SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
-@@ -129,8 +109,8 @@ struct dl_handle_deleter {
-     }
- };
- 
-static void * dl_load_library(const std::wstring & path) {
-    dl_handle * handle = dlopen(utf16_to_utf8(path).c_str(), RTLD_NOW | RTLD_LOCAL);
-+static void * dl_load_library(const std::filesystem::path & path) {
-+    dl_handle * handle = dlopen(path.c_str(), RTLD_NOW | RTLD_LOCAL);
- 
-     return handle;
- }
-@@ -141,6 +121,25 @@ static void * dl_get_sym(dl_handle * handle, const char * name) {
- 
- #endif
- 
-+static std::string path_to_string(const std::filesystem::path & path)
-+{
-+#ifdef _WIN32
-+    const std::wstring wstr = path.wstring();
-+    const int size_needed = WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), -1, nullptr, 0, nullptr, nullptr);
-+    if (size_needed <= 0) {
-+        return std::string();
-+    }
-+
-+    // size_needed includes the null terminator
-+    std::string str(size_needed - 1, '\0');
-+    WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), -1, str.data(), size_needed, nullptr, nullptr);
-+    return str;
-+#else
-+    return path.string();
-+#endif
-+}
-+
-+
- using dl_handle_ptr = std::unique_ptr<dl_handle, dl_handle_deleter>;
- 
- struct ggml_backend_reg_entry {
-@@ -222,11 +221,11 @@ struct ggml_backend_registry {
-         );
-     }
- 
-    ggml_backend_reg_t load_backend(const std::wstring & path, bool silent) {
-+    ggml_backend_reg_t load_backend(const std::filesystem::path & path, bool silent) {
-         dl_handle_ptr handle { dl_load_library(path) };
-         if (!handle) {
-             if (!silent) {
-                GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(path).c_str());
-+                GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_to_string(path).c_str());
-             }
-             return nullptr;
-         }
-@@ -234,7 +233,7 @@ struct ggml_backend_registry {
-         auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
-         if (score_fn && score_fn() == 0) {
-             if (!silent) {
-                GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, utf16_to_utf8(path).c_str());
-+                GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path_to_string(path).c_str());
-             }
-             return nullptr;
-         }
-@@ -242,7 +241,7 @@ struct ggml_backend_registry {
-         auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init");
-         if (!backend_init_fn) {
-             if (!silent) {
-                GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, utf16_to_utf8(path).c_str());
-+                GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, path_to_string(path).c_str());
-             }
-             return nullptr;
-         }
-@@ -251,16 +250,16 @@ struct ggml_backend_registry {
-         if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
-             if (!silent) {
-                 if (!reg) {
-                    GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, utf16_to_utf8(path).c_str());
-+                    GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path_to_string(path).c_str());
-                 } else {
-                     GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
-                        __func__, utf16_to_utf8(path).c_str(), reg->api_version, GGML_BACKEND_API_VERSION);
-+                        __func__, path_to_string(path).c_str(), reg->api_version, GGML_BACKEND_API_VERSION);
-                 }
-             }
-             return nullptr;
-         }
- 
-        GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), utf16_to_utf8(path).c_str());
-+        GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path_to_string(path).c_str());
- 
-         register_backend(reg, score_fn ? score_fn() : -1, std::move(handle));
- 
-@@ -396,14 +395,14 @@ ggml_backend_t ggml_backend_init_best(void) {
- 
- // Dynamic loading
- ggml_backend_reg_t ggml_backend_load(const char * path) {
-    return get_reg().load_backend(utf8_to_utf16(path), false);
-+    return get_reg().load_backend(path, false);
- }
- 
- void ggml_backend_unload(ggml_backend_reg_t reg) {
-     get_reg().unload_backend(reg, true);
- }
- 
-static std::wstring get_executable_path() {
-+static std::filesystem::path get_executable_path() {
- #if defined(__APPLE__)
-     // get executable path
-     std::vector<char> path;
-@@ -415,15 +414,9 @@ static std::wstring get_executable_path() {
-         }
-         path.resize(size);
-     }
-    std::string base_path(path.data(), size);
-    // remove executable name
-    auto last_slash = base_path.find_last_of('/');
-    if (last_slash != std::string::npos) {
-        base_path = base_path.substr(0, last_slash);
-    }
-    return utf8_to_utf16(base_path + "/");
-+
-+    return std::filesystem::path(path.data()).parent_path();
- #elif defined(__linux__) || defined(__FreeBSD__)
-    std::string base_path = ".";
-     std::vector<char> path(1024);
-     while (true) {
-         // get executable path
-@@ -436,76 +429,55 @@ static std::wstring get_executable_path() {
-             break;
-         }
-         if (len < (ssize_t) path.size()) {
-            base_path = std::string(path.data(), len);
-            // remove executable name
-            auto last_slash = base_path.find_last_of('/');
-            if (last_slash != std::string::npos) {
-                base_path = base_path.substr(0, last_slash);
-            }
-            break;
-+            return std::filesystem::path(path.data()).parent_path();
-         }
-         path.resize(path.size() * 2);
-     }
-
-    return utf8_to_utf16(base_path + "/");
- #elif defined(_WIN32)
-     std::vector<wchar_t> path(MAX_PATH);
-     DWORD len = GetModuleFileNameW(NULL, path.data(), path.size());
-     if (len == 0) {
-         return {};
-     }
-    std::wstring base_path(path.data(), len);
-    // remove executable name
-    auto last_slash = base_path.find_last_of('\\');
-    if (last_slash != std::string::npos) {
-        base_path = base_path.substr(0, last_slash);
-    }
-    return base_path + L"\\";
-#else
-    return {};
-#endif
-}
- 
-static std::wstring backend_filename_prefix() {
-#ifdef _WIN32
-    return L"ggml-";
-#else
-    return L"libggml-";
-+    return std::filesystem::path(path.data()).parent_path();
- #endif
-+    return {};
- }
- 
-static std::wstring backend_filename_suffix() {
-+static std::string backend_filename_prefix() {
- #ifdef _WIN32
-    return L".dll";
-+    return "ggml-";
- #else
-    return L".so";
-+    return "libggml-";
- #endif
- }
- 
-static std::wstring path_separator() {
-+static std::string backend_filename_suffix() {
- #ifdef _WIN32
-    return L"\\";
-+    return ".dll";
- #else
-    return L"/";
-+    return ".so";
- #endif
- }
- 
- static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) {
-     // enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
-      // TODO: search system paths
-    std::wstring file_prefix = backend_filename_prefix() + utf8_to_utf16(name) + L"-";
-    std::vector<std::wstring> search_paths;
-+    namespace fs = std::filesystem;
-+    std::string file_prefix = backend_filename_prefix() + name + "-";
-+    std::vector<fs::path> search_paths;
-+
-     if (user_search_path == nullptr) {
-        search_paths.push_back(L"." + path_separator());
-+        search_paths.push_back(fs::current_path());
-         search_paths.push_back(get_executable_path());
-     } else {
-        search_paths.push_back(utf8_to_utf16(user_search_path) + path_separator());
-+        search_paths.push_back(fs::u8path(user_search_path));
-     }
- 
-     int best_score = 0;
-    std::wstring best_path;
-+    fs::path best_path;
- 
-    namespace fs = std::filesystem;
-     for (const auto & search_path : search_paths) {
-         if (!fs::exists(search_path)) {
-             continue;
-@@ -513,29 +485,26 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
-         fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
-         for (const auto & entry : dir_it) {
-             if (entry.is_regular_file()) {
-                std::wstring filename = entry.path().filename().wstring();
-                std::wstring ext = entry.path().extension().wstring();
-+                std::string filename = entry.path().filename().string();
-+                std::string ext = entry.path().extension().string();
-                 if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) {
-                    dl_handle_ptr handle { dl_load_library(entry.path().wstring()) };
-                    if (!handle && !silent) {
-                        GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
-+                    dl_handle_ptr handle { dl_load_library(entry.path()) };
-+                    if (!handle) {
-+                        GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_to_string(entry.path()).c_str());
-+                        continue;
-                     }
-                    if (handle) {
-                        auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
-                        if (score_fn) {
-                            int s = score_fn();
-#ifndef NDEBUG
-                            GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str(), s);
-#endif
-                            if (s > best_score) {
-                                best_score = s;
-                                best_path = entry.path().wstring();
-                            }
-                        } else {
-                            if (!silent) {
-                                GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
-                            }
-                        }
-+
-+                    auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
-+                    if (!score_fn) {
-+                        GGML_LOG_DEBUG("%s: failed to find ggml_backend_score in %s\n", __func__, path_to_string(entry.path()).c_str());
-+                        continue;
-+                    }
-+
-+                    int s = score_fn();
-+                    GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, path_to_string(entry.path()).c_str(), s);
-+                    if (s > best_score) {
-+                        best_score = s;
-+                        best_path = entry.path();
-                     }
-                 }
-             }
-@@ -545,7 +514,7 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
-     if (best_score == 0) {
-         // try to load the base backend
-         for (const auto & search_path : search_paths) {
-            std::wstring path = search_path + backend_filename_prefix() + utf8_to_utf16(name) + backend_filename_suffix();
-+            fs::path path = fs::path(search_path) / (backend_filename_prefix() + name + backend_filename_suffix());
-             if (fs::exists(path)) {
-                 return get_reg().load_backend(path, silent);
-             }
-@@ -560,6 +529,14 @@ void ggml_backend_load_all() {
-     ggml_backend_load_all_from_path(nullptr);
- }
- 
-+static void ggml_backend_try_load_best(const char * name, bool silent, const char * user_search_path) {
-+    try {
-+        ggml_backend_load_best(name, silent, user_search_path);
-+    } catch (const std::exception & e) {
-+        GGML_LOG_DEBUG("%s: failed to load %s: %s\n", __func__, name, e.what());
-+    }
-+}
-+
- void ggml_backend_load_all_from_path(const char * dir_path) {
- #ifdef NDEBUG
-     bool silent = true;
-@@ -567,18 +544,18 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
-     bool silent = false;
- #endif
- 
-    ggml_backend_load_best("blas", silent, dir_path);
-    ggml_backend_load_best("cann", silent, dir_path);
-    ggml_backend_load_best("cuda", silent, dir_path);
-    ggml_backend_load_best("hip", silent, dir_path);
-    ggml_backend_load_best("kompute", silent, dir_path);
-    ggml_backend_load_best("metal", silent, dir_path);
-    ggml_backend_load_best("rpc", silent, dir_path);
-    ggml_backend_load_best("sycl", silent, dir_path);
-    ggml_backend_load_best("vulkan", silent, dir_path);
-    ggml_backend_load_best("opencl", silent, dir_path);
-    ggml_backend_load_best("musa", silent, dir_path);
-    ggml_backend_load_best("cpu", silent, dir_path);
-+    ggml_backend_try_load_best("blas", silent, dir_path);
-+    ggml_backend_try_load_best("cann", silent, dir_path);
-+    ggml_backend_try_load_best("cuda", silent, dir_path);
-+    ggml_backend_try_load_best("hip", silent, dir_path);
-+    ggml_backend_try_load_best("kompute", silent, dir_path);
-+    ggml_backend_try_load_best("metal", silent, dir_path);
-+    ggml_backend_try_load_best("rpc", silent, dir_path);
-+    ggml_backend_try_load_best("sycl", silent, dir_path);
-+    ggml_backend_try_load_best("vulkan", silent, dir_path);
-+    ggml_backend_try_load_best("opencl", silent, dir_path);
-+    ggml_backend_try_load_best("musa", silent, dir_path);
-+    ggml_backend_try_load_best("cpu", silent, dir_path);
-     // check the environment variable GGML_BACKEND_PATH to load an out-of-tree backend
-     const char * backend_path = std::getenv("GGML_BACKEND_PATH");
-     if (backend_path) {
--- a/llama/patches/0016-ollama-debug-tensor.patch
+++ b/llama/patches/0016-ollama-debug-tensor.patch
@ -1,6 +1,6 @@
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Sun, 9 Mar 2025 14:44:16 -0700
+From: jmorganca <jmorganca@gmail.com>
+Date: Tue, 8 Apr 2025 20:36:41 -0700
 Subject: [PATCH] ollama debug tensor

 ---
@ -8,11 +8,11 @@ Subject: [PATCH] ollama debug tensor
 1 file changed, 6 insertions(+)

 diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
-index 2f606d82..ec60e8fc 100644
+index 432942bf..6d4abe4c 100644
 --- a/ggml/src/ggml-cpu/ggml-cpu.c
 +++ b/ggml/src/ggml-cpu/ggml-cpu.c
-@@ -11,6 +11,8 @@
- #include "ggml-threading.h"
+@@ -15,6 +15,8 @@
+ #include "ops.h"
 #include "ggml.h"
 
 +#include "ollama-debug.h"
@ -20,7 +20,7 @@ index 2f606d82..ec60e8fc 100644
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
 #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
-@@ -14103,6 +14105,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
+@@ -2854,6 +2856,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
         ggml_compute_forward(&params, node);
 
--- a/llama/patches/0016-remove-amx.patch
+++ b/llama/patches/0016-remove-amx.patch
@ -1,24 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Tue, 18 Feb 2025 14:47:21 -0800
-Subject: [PATCH] remove amx
-
---
- ggml/src/CMakeLists.txt | 4 ----
- 1 file changed, 4 deletions(-)
-
-diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
-index 0a8d1092..4564df91 100644
--- a/ggml/src/CMakeLists.txt
-+++ b/ggml/src/CMakeLists.txt
-@@ -312,10 +312,6 @@ if (GGML_CPU_ALL_VARIANTS)
-     ggml_add_cpu_backend_variant(skylakex       AVX F16C AVX2 FMA AVX512)
-     ggml_add_cpu_backend_variant(icelake        AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
-     ggml_add_cpu_backend_variant(alderlake      AVX F16C AVX2 FMA AVX_VNNI)
-    if (NOT MSVC)
-        # MSVC doesn't support AMX
-        ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
-    endif()
- elseif (GGML_CPU)
-     ggml_add_cpu_backend_variant_impl("")
- endif()
--- a/llama/patches/0017-add-model-quantizations.patch
+++ b/llama/patches/0017-add-model-quantizations.patch
@ -0,0 +1,96 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: jmorganca <jmorganca@gmail.com>
+Date: Tue, 8 Apr 2025 20:39:32 -0700
+Subject: [PATCH] add model quantizations
+
+a temporary patch to add model quantization for
+models not supported in llama.cpp
+---
+ src/llama-arch.cpp  | 17 +++++++++++++++++
+ src/llama-arch.h    |  1 +
+ src/llama-model.cpp |  2 ++
+ src/llama-quant.cpp |  4 ++++
+ 4 files changed, 24 insertions(+)
+
+diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
+index c1f78618..bdf3d898 100644
+--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
+@@ -73,6 +73,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+     { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
+     { LLM_ARCH_PLM,              "plm"              },
+     { LLM_ARCH_BAILINGMOE,       "bailingmoe"       },
+    { LLM_ARCH_MISTRAL3,         "mistral3"         },
+     { LLM_ARCH_UNKNOWN,          "(unknown)"        },
+ };
+ 
+@@ -1582,6 +1583,22 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
+             { LLM_TENSOR_FFN_UP_SHEXP,       "blk.%d.ffn_up_shexp" },
+         },
+     },
+    {
+        LLM_ARCH_MISTRAL3,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,      "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,      "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,      "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,    "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,    "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,    "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_UP,      "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_DOWN,    "blk.%d.ffn_down" },
+        }
+    },
+     {
+         LLM_ARCH_UNKNOWN,
+         {
+diff --git a/src/llama-arch.h b/src/llama-arch.h
+index f987844d..ee081fbf 100644
+--- a/src/llama-arch.h
+++ b/src/llama-arch.h
+@@ -75,6 +75,7 @@ enum llm_arch {
+     LLM_ARCH_CHAMELEON,
+     LLM_ARCH_SOLAR,
+     LLM_ARCH_WAVTOKENIZER_DEC,
+    LLM_ARCH_MISTRAL3,
+     LLM_ARCH_PLM,
+     LLM_ARCH_BAILINGMOE,
+     LLM_ARCH_UNKNOWN,
+diff --git a/src/llama-model.cpp b/src/llama-model.cpp
+index d5ad466e..cd1d239c 100644
+--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
+@@ -1423,6 +1423,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
+                     default: type = LLM_TYPE_UNKNOWN;
+                 }
+             } break;
+        case LLM_ARCH_MISTRAL3: break;
+         default: throw std::runtime_error("unsupported model architecture");
+     }
+ 
+@@ -13652,6 +13653,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
+         case LLM_ARCH_CHAMELEON:
+         case LLM_ARCH_SOLAR:
+         case LLM_ARCH_BAILINGMOE:
+        case LLM_ARCH_MISTRAL3:
+             return LLAMA_ROPE_TYPE_NORM;
+ 
+         // the pairs of head values are offset by n_rot/2
+diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
+index 223e1f3f..8ae6dde8 100644
+--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
+@@ -744,6 +744,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
+         // This used to be a regex, but <regex> has an extreme cost to compile times.
+         bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
+ 
+        // don't quantize vision stuff
+        quantize &= name.find("v.") == std::string::npos;
+        quantize &= name.find("mm.") == std::string::npos;
+
+         // quantize only 2D and 3D tensors (experts)
+         quantize &= (ggml_n_dims(tensor) >= 2);
+ 
--- a/llama/patches/0017-fix-clip-compiler-error.patch
+++ b/llama/patches/0017-fix-clip-compiler-error.patch
@ -1,36 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: jmorganca <jmorganca@gmail.com>
-Date: Tue, 25 Feb 2025 19:14:51 -0800
-Subject: [PATCH] fix-clip-compiler-error
-
---
- examples/llava/clip.cpp | 2 +-
- examples/llava/clip.h   | 2 +-
- 2 files changed, 2 insertions(+), 2 deletions(-)
-
-diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
-index 560021c7..54265beb 100644
--- a/examples/llava/clip.cpp
-+++ b/examples/llava/clip.cpp
-@@ -1788,7 +1788,7 @@ void clip_image_f32_batch_free(struct clip_image_f32_batch  * batch) {
-     }
- }
- 
-void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img) {
-+void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img) {
-     img->nx = nx;
-     img->ny = ny;
-     img->buf.resize(3 * nx * ny);
-diff --git a/examples/llava/clip.h b/examples/llava/clip.h
-index ce6f6194..f9f80d7d 100644
--- a/examples/llava/clip.h
-+++ b/examples/llava/clip.h
-@@ -75,7 +75,7 @@ CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch  * batch);
- CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
- 
- /** build image from pixels decoded by other libraries instead of stb_image.h for better performance. The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes */
-CLIP_API void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img);
-+CLIP_API void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);
- 
- CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
- 
--- a/llama/patches/0022-metal-add-op_neg.patch
+++ b/llama/patches/0022-metal-add-op_neg.patch
@ -1,18 +1,19 @@
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Michael Yang <git@mxy.ng>
-Date: Wed, 2 Apr 2025 15:26:15 -0700
-Subject: [PATCH] metal: add op_neg
+From: jmorganca <jmorganca@gmail.com>
+Date: Tue, 8 Apr 2025 20:41:24 -0700
+Subject: [PATCH] add op_neg

+adds the neg operator to ggml
 ---
 ggml/src/ggml-metal/ggml-metal.m     | 15 +++++++++++++++
 ggml/src/ggml-metal/ggml-metal.metal |  7 +++++++
 2 files changed, 22 insertions(+)

 diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
-index e4c093f9..d8422f1b 100644
+index b121ab9e..fea50521 100644
 --- a/ggml/src/ggml-metal/ggml-metal.m
 +++ b/ggml/src/ggml-metal/ggml-metal.m
-@@ -423,6 +423,7 @@ enum ggml_metal_kernel_type {
+@@ -461,6 +461,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
     GGML_METAL_KERNEL_TYPE_SQRT,
     GGML_METAL_KERNEL_TYPE_SIN,
     GGML_METAL_KERNEL_TYPE_COS,
@ -20,23 +21,23 @@ index e4c093f9..d8422f1b 100644
     GGML_METAL_KERNEL_TYPE_SUM_ROWS,
     GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32,
     GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32,
-@@ -1039,6 +1040,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
-         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQRT,                          sqrt,                           true);
-         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SIN,                           sin,                            true);
-         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_COS,                           cos,                            true);
-+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NEG,                           neg,                            true);
-         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS,                      sum_rows,                       true);
-         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGMAX,                        argmax,                         true);
-         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32,               pool_2d_avg_f32,                true);
-@@ -1202,6 +1204,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
+@@ -1119,6 +1120,7 @@ @implementation GGMLMetalClass
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQRT,                            sqrt,                            true);
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SIN,                             sin,                             true);
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_COS,                             cos,                             true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NEG,                             neg,                             true);
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS,                        sum_rows,                        true);
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGMAX,                          argmax,                          true);
+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32,                 pool_2d_avg_f32,                 true);
+@@ -1280,6 +1282,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
                 case GGML_UNARY_OP_GELU_QUICK:
                 case GGML_UNARY_OP_SILU:
                 case GGML_UNARY_OP_ELU:
 +                case GGML_UNARY_OP_NEG:
-                     return ggml_is_contiguous(op->src[0]);
+                     return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
                 default:
                     return false;
-@@ -1873,6 +1876,18 @@ static void ggml_metal_encode_node(
+@@ -1966,6 +1969,18 @@ static void ggml_metal_encode_node(
 
                     [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                 } break;
@ -56,10 +57,10 @@ index e4c093f9..d8422f1b 100644
                 {
                     GGML_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op));
 diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
-index f38909d0..bb0ff668 100644
+index e3185e5b..ede9d1e6 100644
 --- a/ggml/src/ggml-metal/ggml-metal.metal
 +++ b/ggml/src/ggml-metal/ggml-metal.metal
-@@ -945,6 +945,13 @@ kernel void kernel_cos(
+@@ -949,6 +949,13 @@ kernel void kernel_cos(
     dst[tpig] = cos(src0[tpig]);
 }
 
--- a/llama/patches/0018-add-phi4-support.patch
+++ b/llama/patches/0018-add-phi4-support.patch
@ -1,80 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: jmorganca <jmorganca@gmail.com>
-Date: Thu, 27 Feb 2025 15:12:26 -0800
-Subject: [PATCH] add phi4 support
-
---
- include/llama.h     |  1 +
- src/llama-model.cpp | 10 +++++++---
- src/llama-vocab.cpp | 11 +++++++++++
- 3 files changed, 19 insertions(+), 3 deletions(-)
-
-diff --git a/include/llama.h b/include/llama.h
-index cc948005..16774711 100644
--- a/include/llama.h
-+++ b/include/llama.h
-@@ -105,6 +105,7 @@ extern "C" {
-         LLAMA_VOCAB_PRE_TYPE_CHAMELEON      = 26,
-         LLAMA_VOCAB_PRE_TYPE_MINERVA        = 27,
-         LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM  = 28,
-+        LLAMA_VOCAB_PRE_TYPE_GPT4O          = 29,
-     };
- 
-     enum llama_rope_type {
-diff --git a/src/llama-model.cpp b/src/llama-model.cpp
-index 21819080..ab1a07d1 100644
--- a/src/llama-model.cpp
-+++ b/src/llama-model.cpp
-@@ -2283,7 +2283,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
- 
-                     // output
-                     output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
-                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
-+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-+                    // if output is NULL, init from the input tok embed
-+                    if (output == NULL) {
-+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-+                    }
- 
-                     for (int i = 0; i < n_layer; ++i) {
-                         auto & layer = layers[i];
-@@ -2298,8 +2302,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
-                         layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
-                         layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff }, 0);
- 
-                        layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
-                        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
-+                        layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
-+                        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
-                     }
-                 } break;
-             case LLM_ARCH_PHIMOE:
-diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
-index 1ca827eb..c7ff28be 100644
--- a/src/llama-vocab.cpp
-+++ b/src/llama-vocab.cpp
-@@ -392,6 +392,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
-                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
-                 };
-                 break;
-+            case LLAMA_VOCAB_PRE_TYPE_GPT4O:
-+                // original regex from tokenizer.json
-+                // [^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+
-+                regex_exprs = {
-+                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
-+                };
-+                break;
-             default:
-                 // default regex for BPE tokenization pre-processing
-                 regex_exprs = {
-@@ -1583,6 +1590,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
-             } else if (
-                 tokenizer_pre == "megrez") {
-                 pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
-+            } else if (
-+                tokenizer_pre == "gpt-4o") {
-+                pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O;
-+                clean_spaces = false;
-             } else {
-                 LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
-                 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
--- a/llama/patches/0019-fix-compiler-error-in-clip.h.patch
+++ b/llama/patches/0019-fix-compiler-error-in-clip.h.patch
@ -0,0 +1,39 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: jmorganca <jmorganca@gmail.com>
+Date: Tue, 8 Apr 2025 20:49:50 -0700
+Subject: [PATCH] fix compiler error in clip.h
+
+fixes an error that occurs in clip.h when compiling
+using CGo
+---
+ examples/llava/clip.h | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+diff --git a/examples/llava/clip.h b/examples/llava/clip.h
+index cc133a58..5fc45d3e 100644
+--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
+@@ -30,12 +30,13 @@ struct clip_image_size {
+     int height;
+ };
+ 
+struct clip_image_f32;
+ struct clip_image_u8_batch;
+ struct clip_image_f32_batch;
+ 
+ struct clip_context_params {
+     bool use_gpu;
+-    ggml_log_level verbosity;
+    enum ggml_log_level verbosity;
+ };
+ 
+ // deprecated, use clip_init
+@@ -84,7 +85,7 @@ CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
+ CLIP_API size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch); // equivalent to batch->size()
+ CLIP_API size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->nx
+ CLIP_API size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny
+-CLIP_API clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data
+CLIP_API struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data
+ 
+ /**
+  * Build image from pixels decoded by other libraries instead of stb_image.h for better performance.
--- a/llama/patches/0020-Revert-Simplify-and-improve-CUDA-graphs-through-use-.patch
+++ b/llama/patches/0020-Revert-Simplify-and-improve-CUDA-graphs-through-use-.patch
@ -0,0 +1,600 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: jmorganca <jmorganca@gmail.com>
+Date: Sat, 12 Apr 2025 13:06:57 -0700
+Subject: [PATCH] Revert "Simplify and improve CUDA graphs through use of
+ indirect copy pointers (#9017)"
+
+this commit in llama.cpp causes errors when running llama 3.2
+vision - temporarily revert it
+
+This reverts commit 3f9da22c2b21a2cef216de50006436ef1cab8764.
+---
+ ggml/src/ggml-cuda/common.cuh   |   8 +-
+ ggml/src/ggml-cuda/cpy.cu       | 149 ++++++++++++--------------------
+ ggml/src/ggml-cuda/cpy.cuh      |   2 -
+ ggml/src/ggml-cuda/ggml-cuda.cu |  93 +++++++++++++++-----
+ 4 files changed, 124 insertions(+), 128 deletions(-)
+
+diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
+index 8284a001..a718b6a1 100644
+--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
+@@ -729,13 +729,7 @@ struct ggml_cuda_graph {
+     bool disable_due_to_failed_graph_capture = false;
+     int number_consecutive_updates = 0;
+     std::vector<ggml_graph_node_properties> ggml_graph_properties;
+-    bool use_cpy_indirection = false;
+-    std::vector<char *> cpy_dest_ptrs;
+-    char ** dest_ptrs_d;
+-    int dest_ptrs_size = 0;
+-    // Index to allow each cpy kernel to be aware of it's position within the graph
+-    // relative to other cpy nodes.
+-    int graph_cpynode_index = -1;
+    std::vector<char **> updated_kernel_arg;
+ #endif
+ };
+ 
+diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
+index 4f4faa3e..8396df28 100644
+--- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu
+@@ -39,18 +39,16 @@ static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
+ }
+ 
+ template <cpy_kernel_t cpy_1>
+-static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const int ne,
+static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
+                                    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+                                    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+-                                   const int nb12, const int nb13, char ** cdst_indirect, int graph_cpynode_index) {
+                                   const int nb12, const int nb13) {
+     const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
+ 
+     if (i >= ne) {
+         return;
+     }
+ 
+-    char * cdst = (cdst_indirect != nullptr) ? cdst_indirect[graph_cpynode_index]: cdst_direct;
+-
+     // determine indices i03/i13, i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
+     // then combine those indices with the corresponding byte offsets to get the total offsets
+     const int64_t i03 = i/(ne00 * ne01 * ne02);
+@@ -297,18 +295,16 @@ static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
+ }
+ 
+ template <cpy_kernel_t cpy_blck, int qk>
+-static __global__ void cpy_f32_q(const char * cx, char * cdst_direct, const int ne,
+static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
+                                  const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+                                  const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+-                                 const int nb12, const int nb13, char ** cdst_indirect, int graph_cpynode_index) {
+                                 const int nb12, const int nb13) {
+     const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;
+ 
+     if (i >= ne) {
+         return;
+     }
+ 
+-    char * cdst = (cdst_indirect != nullptr) ? cdst_indirect[graph_cpynode_index]: cdst_direct;
+-
+     const int i03 = i/(ne00 * ne01 * ne02);
+     const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
+     const int i01 = (i - i03*ne00*ne01*ne02  -  i02*ne01*ne00) / ne00;
+@@ -325,18 +321,16 @@ static __global__ void cpy_f32_q(const char * cx, char * cdst_direct, const int
+ }
+ 
+ template <cpy_kernel_t cpy_blck, int qk>
+-static __global__ void cpy_q_f32(const char * cx, char * cdst_direct, const int ne,
+static __global__ void cpy_q_f32(const char * cx, char * cdst, const int ne,
+                                  const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+                                  const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+-                                 const int nb12, const int nb13, char ** cdst_indirect, int graph_cpynode_index) {
+                                 const int nb12, const int nb13) {
+     const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;
+ 
+     if (i >= ne) {
+         return;
+     }
+ 
+-    char * cdst = (cdst_indirect != nullptr) ? cdst_indirect[graph_cpynode_index]: cdst_direct;
+-
+     const int i03 = i/(ne00 * ne01 * ne02);
+     const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
+     const int i01 = (i - i03*ne00*ne01*ne02  -  i02*ne01*ne00) / ne00;
+@@ -352,97 +346,76 @@ static __global__ void cpy_q_f32(const char * cx, char * cdst_direct, const int
+     cpy_blck(cx + x_offset, cdst + dst_offset);
+ }
+ 
+-// Copy destination pointers to GPU to be available when pointer indirection is in use
+-
+-void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_dest_ptrs, const int host_dest_ptrs_size, cudaStream_t stream) {
+-#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
+-    if (cuda_graph->dest_ptrs_size < host_dest_ptrs_size) { // (re-)allocate GPU memory for destination pointers
+-        CUDA_CHECK(cudaStreamSynchronize(stream));
+-        if (cuda_graph->dest_ptrs_d != nullptr) {
+-            CUDA_CHECK(cudaFree(cuda_graph->dest_ptrs_d));
+-        }
+-        CUDA_CHECK(cudaMalloc(&cuda_graph->dest_ptrs_d, host_dest_ptrs_size*sizeof(char *)));
+-        cuda_graph->dest_ptrs_size = host_dest_ptrs_size;
+-    }
+-    // copy destination pointers to GPU
+-    CUDA_CHECK(cudaMemcpyAsync(cuda_graph->dest_ptrs_d, host_dest_ptrs, host_dest_ptrs_size*sizeof(char *), cudaMemcpyHostToDevice, stream));
+-    cuda_graph->graph_cpynode_index = 0; // reset index
+-#else
+-    GGML_UNUSED(cuda_graph); GGML_UNUSED(host_dest_ptrs);
+-    GGML_UNUSED(host_dest_ptrs_size); GGML_UNUSED(stream);
+-#endif
+-}
+-
+ static void ggml_cpy_f16_f32_cuda(
+     const char * cx, char * cdst, const int ne,
+     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+ 
+     const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+     cpy_f32_f16<cpy_1_f16_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+ }
+ 
+ static void ggml_cpy_f32_f32_cuda(
+     const char * cx, char * cdst, const int ne,
+     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+ 
+     const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+     cpy_f32_f16<cpy_1_f32_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+ }
+ 
+ static void ggml_cpy_f32_bf16_cuda(
+     const char * cx, char * cdst, const int ne,
+     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+ 
+     const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+     cpy_f32_f16<cpy_1_f32_bf16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+ }
+ 
+ static void ggml_cpy_f32_f16_cuda(
+     const char * cx, char * cdst, const int ne,
+     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+ 
+     const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+     cpy_f32_f16<cpy_1_f32_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+ }
+ 
+ static void ggml_cpy_f32_q8_0_cuda(
+     const char * cx, char * cdst, const int ne,
+     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+ 
+     GGML_ASSERT(ne % QK8_0 == 0);
+     const int num_blocks = ne / QK8_0;
+     cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
+-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+ }
+ 
+ static void ggml_cpy_q8_0_f32_cuda(
+     const char * cx, char * cdst, const int ne,
+     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+ 
+     const int num_blocks = ne;
+     cpy_q_f32<cpy_blck_q8_0_f32, QK8_0><<<num_blocks, 1, 0, stream>>>
+-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+ }
+ 
+ static void ggml_cpy_f32_q4_0_cuda(
+     const char * cx, char * cdst, const int ne,
+     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+ 
+     GGML_ASSERT(ne % QK4_0 == 0);
+     const int num_blocks = ne / QK4_0;
+     cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
+-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+ }
+ 
+ static void ggml_cpy_q4_0_f32_cuda(
+@@ -451,22 +424,22 @@ static void ggml_cpy_q4_0_f32_cuda(
+     const int nb00, const int nb01, const int nb02,
+     const int nb03, const int ne10, const int ne11, const int ne12,
+     const int nb10, const int nb11, const int nb12, const int nb13,
+-    cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    cudaStream_t stream) {
+     const int num_blocks = ne;
+     cpy_q_f32<cpy_blck_q_f32<dequantize_q4_0, QK4_0>, QK4_0><<<num_blocks, 1, 0, stream>>>(
+         cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
+-         ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+         ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+ }
+ 
+ static void ggml_cpy_f32_q4_1_cuda(
+     const char * cx, char * cdst, const int ne,
+     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+ 
+     GGML_ASSERT(ne % QK4_1 == 0);
+     const int num_blocks = ne / QK4_1;
+     cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
+-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+ }
+ 
+ static void ggml_cpy_q4_1_f32_cuda(
+@@ -475,22 +448,22 @@ static void ggml_cpy_q4_1_f32_cuda(
+     const int nb00, const int nb01, const int nb02,
+     const int nb03, const int ne10, const int ne11, const int ne12,
+     const int nb10, const int nb11, const int nb12, const int nb13,
+-    cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    cudaStream_t stream) {
+     const int num_blocks = ne;
+     cpy_q_f32<cpy_blck_q_f32<dequantize_q4_1, QK4_1>, QK4_1><<<num_blocks, 1, 0, stream>>>(
+         cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
+-         ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+         ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+ }
+ 
+ static void ggml_cpy_f32_q5_0_cuda(
+     const char * cx, char * cdst, const int ne,
+     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+ 
+     GGML_ASSERT(ne % QK5_0 == 0);
+     const int num_blocks = ne / QK5_0;
+     cpy_f32_q<cpy_blck_f32_q5_0, QK5_0><<<num_blocks, 1, 0, stream>>>
+-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+ }
+ 
+ static void ggml_cpy_q5_0_f32_cuda(
+@@ -499,22 +472,22 @@ static void ggml_cpy_q5_0_f32_cuda(
+     const int nb00, const int nb01, const int nb02,
+     const int nb03, const int ne10, const int ne11, const int ne12,
+     const int nb10, const int nb11, const int nb12, const int nb13,
+-    cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    cudaStream_t stream) {
+     const int num_blocks = ne;
+     cpy_q_f32<cpy_blck_q_f32<dequantize_q5_0, QK5_0>, QK5_0><<<num_blocks, 1, 0, stream>>>(
+         cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
+-        ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+ }
+ 
+ static void ggml_cpy_f32_q5_1_cuda(
+     const char * cx, char * cdst, const int ne,
+     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+ 
+     GGML_ASSERT(ne % QK5_1 == 0);
+     const int num_blocks = ne / QK5_1;
+     cpy_f32_q<cpy_blck_f32_q5_1, QK5_1><<<num_blocks, 1, 0, stream>>>
+-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+ }
+ 
+ static void ggml_cpy_q5_1_f32_cuda(
+@@ -523,32 +496,32 @@ static void ggml_cpy_q5_1_f32_cuda(
+     const int nb00, const int nb01, const int nb02,
+     const int nb03, const int ne10, const int ne11, const int ne12,
+     const int nb10, const int nb11, const int nb12, const int nb13,
+-    cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    cudaStream_t stream) {
+     const int num_blocks = ne;
+     cpy_q_f32<cpy_blck_q_f32<dequantize_q5_1, QK5_1>, QK5_1><<<num_blocks, 1, 0, stream>>>(
+         cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
+-        ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+ }
+ 
+ static void ggml_cpy_f32_iq4_nl_cuda(
+     const char * cx, char * cdst, const int ne,
+     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+ 
+     GGML_ASSERT(ne % QK4_NL == 0);
+     const int num_blocks = ne / QK4_NL;
+     cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL><<<num_blocks, 1, 0, stream>>>
+-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+ }
+ 
+ static void ggml_cpy_f16_f16_cuda(
+     const char * cx, char * cdst, const int ne,
+     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+ 
+     const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+     cpy_f32_f16<cpy_1_f16_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+ }
+ 
+ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1) {
+@@ -585,62 +558,48 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
+     char * src0_ddc = (char *) src0->data;
+     char * src1_ddc = (char *) src1->data;
+ 
+-    char ** dest_ptrs_d = nullptr;
+-    int graph_cpynode_index = -1;
+-#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
+-    if(ctx.cuda_graph->use_cpy_indirection) {
+-        dest_ptrs_d = ctx.cuda_graph->dest_ptrs_d;
+-        graph_cpynode_index = ctx.cuda_graph->graph_cpynode_index;
+-    }
+-#endif
+     if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
+         GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
+         CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
+     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
+-        ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_f32_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
+-        ggml_cpy_f32_bf16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_f32_bf16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
+-        ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
+-        ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+     } else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) {
+-        ggml_cpy_q8_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_q8_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
+-        ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+     } else if (src0->type == GGML_TYPE_Q4_0 && src1->type == GGML_TYPE_F32) {
+         ggml_cpy_q4_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02,
+-            nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+            nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
+-        ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+     } else if (src0->type == GGML_TYPE_Q4_1 && src1->type == GGML_TYPE_F32) {
+         ggml_cpy_q4_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02,
+-            nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+            nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) {
+-        ggml_cpy_f32_q5_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_f32_q5_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+     } else if (src0->type == GGML_TYPE_Q5_0 && src1->type == GGML_TYPE_F32) {
+         ggml_cpy_q5_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02,
+-            nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+            nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
+-        ggml_cpy_f32_iq4_nl_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_f32_iq4_nl_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {
+-        ggml_cpy_f32_q5_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_f32_q5_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+     } else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_F32) {
+-        ggml_cpy_q5_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_q5_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
+-        ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
+-        ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+     } else {
+         GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
+                 ggml_type_name(src0->type), ggml_type_name(src1->type));
+     }
+-#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
+-    if(ctx.cuda_graph->use_cpy_indirection) {
+-        ctx.cuda_graph->graph_cpynode_index = graph_cpynode_index;
+-    }
+-#endif
+-
+ }
+ 
+ void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+diff --git a/ggml/src/ggml-cuda/cpy.cuh b/ggml/src/ggml-cuda/cpy.cuh
+index 6bed0564..28b06cdd 100644
+--- a/ggml/src/ggml-cuda/cpy.cuh
+++ b/ggml/src/ggml-cuda/cpy.cuh
+@@ -7,5 +7,3 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
+ void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+ 
+ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1);
+-
+-void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_dest_ptrs, const int host_dest_ptrs_size, cudaStream_t stream);
+diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
+index 67208cba..a44788db 100644
+--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
+@@ -2477,11 +2477,10 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
+ 
+ #ifdef USE_CUDA_GRAPH
+ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
+-    bool use_cuda_graph) {
+    std::vector<void *> & ggml_cuda_cpy_fn_ptrs, bool use_cuda_graph) {
+ 
+     // Loop over nodes in GGML graph to obtain info needed for CUDA graph
+-    cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
+-
+    cuda_ctx->cuda_graph->updated_kernel_arg.clear();
+     for (int i = 0; i < cgraph->n_nodes; i++) {
+         ggml_tensor * node = cgraph->nodes[i];
+ 
+@@ -2513,11 +2512,8 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
+         }
+ 
+         if (node->op == GGML_OP_CPY) {
+-
+-            // Store the pointers which are updated for each token, such that these can be sent
+-            // to the device and accessed using indirection from CUDA graph
+-            cuda_ctx->cuda_graph->cpy_dest_ptrs.push_back((char *) node->src[1]->data);
+-
+            // store the copy op parameter which changes with each token.
+            cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
+             // store a pointer to each copy op CUDA kernel to identify it later
+             void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
+             if (!ptr) {
+@@ -2525,6 +2521,10 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
+ #ifndef NDEBUG
+                 GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
+ #endif
+            } else {
+                if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
+                    ggml_cuda_cpy_fn_ptrs.push_back(ptr);
+                }
+             }
+         }
+ 
+@@ -2533,12 +2533,6 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
+         }
+     }
+ 
+-    if (use_cuda_graph) {
+-        cuda_ctx->cuda_graph->use_cpy_indirection = true;
+-        // copy pointers to GPU so they can be accessed via indirection within CUDA graph
+-        ggml_cuda_cpy_dest_ptrs_copy(cuda_ctx->cuda_graph.get(), cuda_ctx->cuda_graph->cpy_dest_ptrs.data(), cuda_ctx->cuda_graph->cpy_dest_ptrs.size(), cuda_ctx->stream());
+-    }
+-
+     return use_cuda_graph;
+ }
+ 
+@@ -2593,6 +2587,51 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra
+     return true;
+ }
+ 
+static void maintain_cuda_graph(ggml_backend_cuda_context * cuda_ctx, std::vector<void *> & ggml_cuda_cpy_fn_ptrs, bool cuda_graph_update_required) {
+
+    if (cuda_graph_update_required) {
+        // Extract nodes from graph
+        // First call with null argument gets number of nodes in graph
+        CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes));
+        // Subsequent call with non-null argument gets nodes
+        cuda_ctx->cuda_graph->nodes.clear();
+        cuda_ctx->cuda_graph->nodes.resize(cuda_ctx->cuda_graph->num_nodes);
+        cuda_ctx->cuda_graph->params.clear();
+        cuda_ctx->cuda_graph->params.resize(cuda_ctx->cuda_graph->num_nodes);
+        if (cuda_ctx->cuda_graph->num_nodes > 0) {
+            CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, cuda_ctx->cuda_graph->nodes.data(), &cuda_ctx->cuda_graph->num_nodes));
+
+            // Loop over nodes, and extract kernel parameters from each node
+            for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
+                cudaGraphNodeType node_type;
+                CUDA_CHECK(cudaGraphNodeGetType(cuda_ctx->cuda_graph->nodes[i], &node_type));
+                if (node_type == cudaGraphNodeTypeKernel) {
+                    cudaError_t stat = cudaGraphKernelNodeGetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]); // Get params using runtime
+                    if (stat == cudaErrorInvalidDeviceFunction) {
+                        // Fails due to incorrect handling by CUDA runtime of CUDA BLAS node.
+                        // We don't need to update blas nodes, so clear error and move on.
+                        (void)cudaGetLastError();
+                    } else {
+                        GGML_ASSERT(stat == cudaSuccess);
+                    }
+                }
+            }
+        }
+    } else {
+        // One of the arguments to the copy kernel is updated for each token, hence we need to
+        // replace that argument with the updated value in the CUDA graph
+        // on update steps, the live parameters will already be captured
+        int k = 0;
+        for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
+            if(count(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), cuda_ctx->cuda_graph->params[i].func) > 0) {
+                char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++);
+                *(void**)cuda_ctx->cuda_graph->params[i].kernelParams[1] = *(void**)updated_kernel_arg_ptr;
+                CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]));
+            }
+        }
+    }
+}
+
+ static bool is_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph) {
+ 
+     bool cuda_graph_update_required = false;
+@@ -2652,7 +2691,8 @@ static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
+ #endif
+ 
+ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
+-    bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
+   [[maybe_unused]] std::vector<void *> & ggml_cuda_cpy_fn_ptrs,  bool & graph_evaluated_or_captured, bool & use_cuda_graph,
+    bool & cuda_graph_update_required) {
+ 
+     while (!graph_evaluated_or_captured) {
+         // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
+@@ -2702,9 +2742,13 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
+         if (cuda_ctx->cuda_graph->instance == nullptr) { // Create executable graph from captured graph.
+             CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
+         }
+-        if (cuda_graph_update_required) { // Update graph executable
+-            update_cuda_graph_executable(cuda_ctx);
+-        }
+
+        // Perform update to graph (if required for this token), and change copy parameter (required for every token)
+        maintain_cuda_graph(cuda_ctx, ggml_cuda_cpy_fn_ptrs, cuda_graph_update_required);
+
+        // Update graph executable
+        update_cuda_graph_executable(cuda_ctx);
+
+         // Launch graph
+         CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream()));
+ #else
+@@ -2718,6 +2762,10 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
+ 
+     ggml_cuda_set_device(cuda_ctx->device);
+ 
+    // vector of pointers to CUDA cpy kernels, which are required to identify
+    // kernel parameters which need updated in the graph for each token
+    std::vector<void *> ggml_cuda_cpy_fn_ptrs;
+
+ #ifdef USE_CUDA_GRAPH
+     static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
+ 
+@@ -2751,7 +2799,8 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
+     if (use_cuda_graph) {
+         cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);
+ 
+-        use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph, use_cuda_graph);
+        use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph,
+                             ggml_cuda_cpy_fn_ptrs, use_cuda_graph);
+ 
+         // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
+         if (use_cuda_graph && cuda_graph_update_required) {
+@@ -2772,10 +2821,6 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
+         CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
+     }
+ 
+-    if (!use_cuda_graph) {
+-        cuda_ctx->cuda_graph->use_cpy_indirection = false;
+-    }
+-
+ #else
+     bool use_cuda_graph = false;
+     bool cuda_graph_update_required = false;
+@@ -2783,7 +2828,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
+ 
+     bool graph_evaluated_or_captured = false;
+ 
+-    evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
+    evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, ggml_cuda_cpy_fn_ptrs, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
+ 
+     return GGML_STATUS_SUCCESS;
+ }
--- a/llama/patches/0021-add-model-quantizations.patch
+++ b/llama/patches/0021-add-model-quantizations.patch
@ -1,173 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Patrick Devine <patrick@infrahq.com>
-Date: Fri, 14 Mar 2025 16:33:23 -0700
-Subject: [PATCH] add model quantizations
-
- gemma3
- mistral3
---
- src/llama-arch.cpp  | 36 ++++++++++++++++++++++++++++++++++++
- src/llama-arch.h    |  2 ++
- src/llama-model.cpp | 10 ++++++++++
- src/llama-quant.cpp |  4 ++++
- 4 files changed, 52 insertions(+)
-
-diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
-index b6f20286..13a0a988 100644
--- a/src/llama-arch.cpp
-+++ b/src/llama-arch.cpp
-@@ -37,6 +37,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
-     { LLM_ARCH_MINICPM3,         "minicpm3"         },
-     { LLM_ARCH_GEMMA,            "gemma"            },
-     { LLM_ARCH_GEMMA2,           "gemma2"           },
-+    { LLM_ARCH_GEMMA3,           "gemma3"           },
-     { LLM_ARCH_STARCODER2,       "starcoder2"       },
-     { LLM_ARCH_MAMBA,            "mamba"            },
-     { LLM_ARCH_XVERSE,           "xverse"           },
-@@ -64,6 +65,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
-     { LLM_ARCH_CHAMELEON,        "chameleon"        },
-     { LLM_ARCH_SOLAR,            "solar"            },
-     { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
-+    { LLM_ARCH_MISTRAL3,         "mistral3"         },
-     { LLM_ARCH_UNKNOWN,          "(unknown)"        },
- };
- 
-@@ -804,6 +806,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
-             { LLM_TENSOR_FFN_POST_NORM,   "blk.%d.post_ffw_norm" },
-         },
-     },
-+    {
-+        LLM_ARCH_GEMMA3,
-+        {
-+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
-+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
-+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
-+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
-+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
-+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
-+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
-+            { LLM_TENSOR_ATTN_POST_NORM,  "blk.%d.post_attention_norm" },
-+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
-+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
-+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
-+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
-+            { LLM_TENSOR_FFN_POST_NORM,   "blk.%d.post_ffw_norm" },
-+        },
-+    },
-     {
-         LLM_ARCH_STARCODER2,
-         {
-@@ -1352,6 +1372,22 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
-             { LLM_TENSOR_POS_NET_ATTN_OUT,  "posnet.%d.attn_output" },
-         },
-     },
-+    {
-+        LLM_ARCH_MISTRAL3,
-+        {
-+            { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
-+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
-+            { LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
-+            { LLM_TENSOR_ATTN_Q,      "blk.%d.attn_q" },
-+            { LLM_TENSOR_ATTN_K,      "blk.%d.attn_k" },
-+            { LLM_TENSOR_ATTN_V,      "blk.%d.attn_v" },
-+            { LLM_TENSOR_ATTN_OUT,    "blk.%d.attn_output" },
-+            { LLM_TENSOR_FFN_NORM,    "blk.%d.ffn_norm" },
-+            { LLM_TENSOR_FFN_GATE,    "blk.%d.ffn_gate" },
-+            { LLM_TENSOR_FFN_UP,      "blk.%d.ffn_up" },
-+            { LLM_TENSOR_FFN_DOWN,    "blk.%d.ffn_down" },
-+        }
-+    },
-     {
-         LLM_ARCH_UNKNOWN,
-         {
-diff --git a/src/llama-arch.h b/src/llama-arch.h
-index ec742224..8476ae0a 100644
--- a/src/llama-arch.h
-+++ b/src/llama-arch.h
-@@ -41,6 +41,7 @@ enum llm_arch {
-     LLM_ARCH_MINICPM3,
-     LLM_ARCH_GEMMA,
-     LLM_ARCH_GEMMA2,
-+    LLM_ARCH_GEMMA3,
-     LLM_ARCH_STARCODER2,
-     LLM_ARCH_MAMBA,
-     LLM_ARCH_XVERSE,
-@@ -68,6 +69,7 @@ enum llm_arch {
-     LLM_ARCH_CHAMELEON,
-     LLM_ARCH_SOLAR,
-     LLM_ARCH_WAVTOKENIZER_DEC,
-+    LLM_ARCH_MISTRAL3,
-     LLM_ARCH_UNKNOWN,
- };
- 
-diff --git a/src/llama-model.cpp b/src/llama-model.cpp
-index ab1a07d1..db4f2685 100644
--- a/src/llama-model.cpp
-+++ b/src/llama-model.cpp
-@@ -878,6 +878,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
-                     default: type = LLM_TYPE_UNKNOWN;
-                }
-             } break;
-+        case LLM_ARCH_GEMMA3:
-+            {
-+            } break;
-         case LLM_ARCH_STARCODER2:
-             {
-                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-@@ -1274,6 +1277,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
-                 ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
-                 ml.get_key(LLM_KV_ATTENTION_CAUSAL,           hparams.causal_attn);
-             } break;
-+        case LLM_ARCH_MISTRAL3: break;
-         default: throw std::runtime_error("unsupported model architecture");
-     }
- 
-@@ -2537,6 +2541,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
-                         layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
-                     }
-                 } break;
-+            case LLM_ARCH_GEMMA3:
-+                {
-+                } break;
-             case LLM_ARCH_STARCODER2:
-                 {
-                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-@@ -3531,6 +3538,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
-                     output   = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
-                     output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"),   {n_embd}, 0);
-                 } break;
-+            case LLM_ARCH_MISTRAL3: break;
-             default:
-                 throw std::runtime_error("unknown architecture");
-         }
-@@ -4009,6 +4017,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
-         case LLM_ARCH_GRANITE_MOE:
-         case LLM_ARCH_CHAMELEON:
-         case LLM_ARCH_SOLAR:
-+        case LLM_ARCH_MISTRAL3:
-             return LLAMA_ROPE_TYPE_NORM;
- 
-         // the pairs of head values are offset by n_rot/2
-@@ -4029,6 +4038,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
-         case LLM_ARCH_PHIMOE:
-         case LLM_ARCH_GEMMA:
-         case LLM_ARCH_GEMMA2:
-+        case LLM_ARCH_GEMMA3:
-         case LLM_ARCH_STARCODER2:
-         case LLM_ARCH_OPENELM:
-         case LLM_ARCH_GPTNEOX:
-diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
-index 6eb1da08..ebcbafa1 100644
--- a/src/llama-quant.cpp
-+++ b/src/llama-quant.cpp
-@@ -737,6 +737,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
-         // This used to be a regex, but <regex> has an extreme cost to compile times.
-         bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
- 
-+        // don't quantize vision stuff
-+        quantize &= name.find("v.") == std::string::npos;
-+        quantize &= name.find("mm.") == std::string::npos;
-+
-         // quantize only 2D and 3D tensors (experts)
-         quantize &= (ggml_n_dims(tensor) >= 2);
- 
--- a/llama/patches/0021-remove-ggml-git-build-info.patch
+++ b/llama/patches/0021-remove-ggml-git-build-info.patch
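@ -0,0 +1,52 @
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: jmorganca <jmorganca@gmail.com>
+Date: Sat, 12 Apr 2025 21:13:44 -0400
+Subject: [PATCH] remove ggml git build info
+
+Drop the configure-time block that shells out to git (rev-list --count /
+rev-parse --short) to derive GGML_BUILD_NUMBER and GGML_BUILD_COMMIT, so
+configuring ggml no longer requires a git executable.
+
+Side effect: nothing defines GGML_BUILD_NUMBER by default any more, so
+GGML_INSTALL_VERSION, which is still set to 0.0.${GGML_BUILD_NUMBER}, expands
+to "0.0." unless the caller passes -DGGML_BUILD_NUMBER=<n>.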
+---
+ ggml/CMakeLists.txt | 25 -------------------------
+ 1 file changed, 25 deletions(-)
+
+diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
+index d33f843b..a6c59f22 100644
+--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
+@@ -287,31 +287,6 @@ if (GGML_STANDALONE)
+         DESTINATION share/pkgconfig)
+ endif()
+ 
+-#
+-# Create CMake package
+-#
+-
+-# Generate version info based on git commit.
+-
+-if(NOT DEFINED GGML_BUILD_NUMBER)
+-    find_program(GIT_EXE NAMES git git.exe REQUIRED NO_CMAKE_FIND_ROOT_PATH)
+-    execute_process(COMMAND ${GIT_EXE} rev-list --count HEAD
+-        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+-        OUTPUT_VARIABLE GGML_BUILD_NUMBER
+-        OUTPUT_STRIP_TRAILING_WHITESPACE
+-    )
+-
+-    if(GGML_BUILD_NUMBER EQUAL 1)
+-        message(WARNING "GGML build version fixed at 1 likely due to a shallow clone.")
+-    endif()
+-
+-    execute_process(COMMAND ${GIT_EXE} rev-parse --short HEAD
+-        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+-        OUTPUT_VARIABLE GGML_BUILD_COMMIT
+-        OUTPUT_STRIP_TRAILING_WHITESPACE
+-    )
+-endif()
+-
+ 
+ # Capture variables prefixed with GGML_.
+ 
--- a/llama/patches/0022-add-rdna4-support.patch
+++ b/llama/patches/0022-add-rdna4-support.patch
@ -1,103 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Saman <saman.khatir@amd.com>
-Date: Wed, 19 Mar 2025 14:02:26 -0700
-Subject: [PATCH] add rdna4 support
-
---
- ggml/src/ggml-cuda/common.cuh    | 6 ++++--
- ggml/src/ggml-cuda/mmq.cu        | 2 +-
- ggml/src/ggml-cuda/mmq.cuh       | 4 ++--
- ggml/src/ggml-cuda/mmvq.cu       | 4 ++--
- ggml/src/ggml-cuda/vendors/hip.h | 4 ++++
- 5 files changed, 13 insertions(+), 7 deletions(-)
-
-diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
-index adf0d3ec..b24593fc 100644
--- a/ggml/src/ggml-cuda/common.cuh
-+++ b/ggml/src/ggml-cuda/common.cuh
-@@ -61,11 +61,13 @@
- #define GGML_CUDA_CC_RDNA1      (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
- #define GGML_CUDA_CC_RDNA2      (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a
- #define GGML_CUDA_CC_RDNA3      (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA
-+#define GGML_CUDA_CC_RDNA4      (GGML_CUDA_CC_OFFSET_AMD + 0x1200) // RX 9000
- 
- #define GGML_CUDA_CC_IS_RDNA(cc)  (cc >= GGML_CUDA_CC_RDNA1)
- #define GGML_CUDA_CC_IS_RDNA1(cc) (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2)
- #define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
-#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3)
-+#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA4)
-+#define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4)
- #define GGML_CUDA_CC_IS_GCN(cc)   (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA)
- #define GGML_CUDA_CC_IS_CDNA(cc)  (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1)
- 
-@@ -386,7 +388,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
- #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
- #if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2)
-     c = __builtin_amdgcn_sdot4(a, b, c, false);
-#elif defined(RDNA3)
-+#elif defined(RDNA3) || defined(RDNA4)
-     c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
- #elif defined(__gfx1010__) || defined(__gfx900__)
-     int tmp1;
-diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
-index 10f2ebb1..933d945c 100644
--- a/ggml/src/ggml-cuda/mmq.cu
-+++ b/ggml/src/ggml-cuda/mmq.cu
-@@ -149,5 +149,5 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
-         return !fp16_mma_hardware_available(cc) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
-     }
- 
-    return (!GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
-+    return (!GGML_CUDA_CC_IS_RDNA4(cc) && !GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
- }
-diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
-index 0451c65f..66ce2bc9 100644
--- a/ggml/src/ggml-cuda/mmq.cuh
-+++ b/ggml/src/ggml-cuda/mmq.cuh
-@@ -2577,9 +2577,9 @@ static __device__ void mul_mat_q_process_tile(
- 
- template <ggml_type type, int mmq_x, int nwarps, bool need_check>
- #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
-+#if defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
-     __launch_bounds__(WARP_SIZE*nwarps, 2)
-#endif // defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
-+#endif // defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
- #else
- #if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
-     __launch_bounds__(WARP_SIZE*nwarps, 1)
-diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu
-index 4fb466ca..23ae7abc 100644
--- a/ggml/src/ggml-cuda/mmvq.cu
-+++ b/ggml/src/ggml-cuda/mmvq.cu
-@@ -62,13 +62,13 @@ static __global__ void mul_mat_vec_q(
- 
-     constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type);
- 
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
-+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3) || defined(RDNA4))
-     constexpr int nwarps              = 1;
-     constexpr int rows_per_cuda_block = 1;
- #else
-     constexpr int nwarps              = ncols_y <= 4 ? 4 : 2;
-     constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
-+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3) && !defined(RDNA4)
- 
-     const     int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
-     const     int row0 = rows_per_cuda_block*blockIdx.x;
-diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
-index 81964611..a62544b5 100644
--- a/ggml/src/ggml-cuda/vendors/hip.h
-+++ b/ggml/src/ggml-cuda/vendors/hip.h
-@@ -150,6 +150,10 @@
- #define CDNA
- #endif
- 
-+#if defined(__gfx1200__) || defined(__gfx1201__)
-+#define RDNA4
-+#endif
-+
- #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
-     defined(__gfx1150__) || defined(__gfx1151__)
- #define RDNA3
--- a/llama/sampling_ext.cpp
+++ b/llama/sampling_ext.cpp
@ -73,7 +73,7 @@ struct llama_vocab * llama_load_vocab_from_file(const char * fname) {
    try {
        const auto kv = LLM_KV(LLM_ARCH_UNKNOWN);
        std::vector<std::string> splits = {};
-        llama_model_loader ml(std::string(fname), splits, false, false, nullptr);
+        llama_model_loader ml(std::string(fname), splits, false, false, nullptr, nullptr);
        vocab->load(ml, kv);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
--- a/ml/backend/ggml/ggml/.rsync-filter
+++ b/ml/backend/ggml/ggml/.rsync-filter
@ -1,5 +1,6 @@
 protect *.go
 protect *-embed.*
+include cmake/
 include include/
 include src/
 include src/CMakeLists.txt
@ -13,6 +14,7 @@ include src/ggml-cuda/vendors/
 include src/ggml-cuda/template-instances/
 include src/ggml-hip/
 include src/ggml-metal/
+include CMakeLists.txt
 include *.c
 include *.h
 include *.cpp
@ -20,4 +22,6 @@ include *.cu
 include *.cuh
 include *.m
 include *.metal
+include common.cmake
+include ggml-config.cmake.in
 exclude *
--- a/ml/backend/ggml/ggml/CMakeLists.txt
+++ b/ml/backend/ggml/ggml/CMakeLists.txt
@ -0,0 +1,337 @@
+cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
+project("ggml" C CXX)
+include(CheckIncludeFileCXX)
+
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
+    set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
+endif()
+
+if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
+    set(GGML_STANDALONE ON)
+
+    set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
+
+    # configure project version
+    # TODO
+else()
+    set(GGML_STANDALONE OFF)
+endif()
+
+if (EMSCRIPTEN)
+    set(BUILD_SHARED_LIBS_DEFAULT OFF)
+
+    option(GGML_WASM_SINGLE_FILE "ggml: embed WASM inside the generated ggml.js" ON)
+else()
+    if (MINGW)
+        set(BUILD_SHARED_LIBS_DEFAULT OFF)
+    else()
+        set(BUILD_SHARED_LIBS_DEFAULT ON)
+    endif()
+endif()
+
+# remove the lib prefix on win32 mingw
+if (WIN32)
+    set(CMAKE_STATIC_LIBRARY_PREFIX "")
+    set(CMAKE_SHARED_LIBRARY_PREFIX "")
+    set(CMAKE_SHARED_MODULE_PREFIX  "")
+endif()
+
+option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
+option(GGML_BACKEND_DL   "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)
+
+#
+# option list
+#
+
+# TODO: mark all options as advanced when not GGML_STANDALONE
+
+if (APPLE)
+    set(GGML_METAL_DEFAULT ON)
+    set(GGML_BLAS_DEFAULT ON)
+    set(GGML_BLAS_VENDOR_DEFAULT "Apple")
+else()
+    set(GGML_METAL_DEFAULT OFF)
+    set(GGML_BLAS_DEFAULT OFF)
+    set(GGML_BLAS_VENDOR_DEFAULT "Generic")
+endif()
+
+if (CMAKE_CROSSCOMPILING OR DEFINED ENV{SOURCE_DATE_EPOCH})
+    message(STATUS "Setting GGML_NATIVE_DEFAULT to OFF")
+    set(GGML_NATIVE_DEFAULT OFF)
+else()
+    set(GGML_NATIVE_DEFAULT ON)
+endif()
+
+# defaults
+if (NOT GGML_LLAMAFILE_DEFAULT)
+    set(GGML_LLAMAFILE_DEFAULT OFF)
+endif()
+
+if (NOT GGML_CUDA_GRAPHS_DEFAULT)
+    set(GGML_CUDA_GRAPHS_DEFAULT OFF)
+endif()
+
+# general
+option(GGML_STATIC "ggml: static link libraries"                     OFF)
+option(GGML_NATIVE "ggml: optimize the build for the current system" ${GGML_NATIVE_DEFAULT})
+option(GGML_LTO    "ggml: enable link time optimization"             OFF)
+option(GGML_CCACHE "ggml: use ccache if available"                   ON)
+
+# debug
+option(GGML_ALL_WARNINGS           "ggml: enable all compiler warnings"                   ON)
+option(GGML_ALL_WARNINGS_3RD_PARTY "ggml: enable all compiler warnings in 3rd party libs" OFF)
+option(GGML_GPROF                  "ggml: enable gprof"                                   OFF)
+
+# build
+option(GGML_FATAL_WARNINGS    "ggml: enable -Werror flag"    OFF)
+
+# sanitizers
+option(GGML_SANITIZE_THREAD    "ggml: enable thread sanitizer"    OFF)
+option(GGML_SANITIZE_ADDRESS   "ggml: enable address sanitizer"   OFF)
+option(GGML_SANITIZE_UNDEFINED "ggml: enable undefined sanitizer" OFF)
+
+# instruction set specific
+if (GGML_NATIVE OR NOT GGML_NATIVE_DEFAULT)
+    set(INS_ENB OFF)
+else()
+    set(INS_ENB ON)
+endif()
+
+message(DEBUG "GGML_NATIVE         : ${GGML_NATIVE}")
+message(DEBUG "GGML_NATIVE_DEFAULT : ${GGML_NATIVE_DEFAULT}")
+message(DEBUG "INS_ENB             : ${INS_ENB}")
+
+option(GGML_CPU_HBM          "ggml: use memkind for CPU HBM" OFF)
+option(GGML_CPU_AARCH64      "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
+option(GGML_CPU_KLEIDIAI     "ggml: use KleidiAI optimized kernels if applicable" OFF)
+option(GGML_AVX              "ggml: enable AVX"              ${INS_ENB})
+option(GGML_AVX_VNNI         "ggml: enable AVX-VNNI"         OFF)
+option(GGML_AVX2             "ggml: enable AVX2"             ${INS_ENB})
+option(GGML_BMI2             "ggml: enable BMI2"             ${INS_ENB})
+option(GGML_AVX512           "ggml: enable AVX512F"          OFF)
+option(GGML_AVX512_VBMI      "ggml: enable AVX512-VBMI"      OFF)
+option(GGML_AVX512_VNNI      "ggml: enable AVX512-VNNI"      OFF)
+option(GGML_AVX512_BF16      "ggml: enable AVX512-BF16"      OFF)
+if (NOT MSVC)
+    # in MSVC F16C and FMA is implied with AVX2/AVX512
+    option(GGML_FMA          "ggml: enable FMA"              ${INS_ENB})
+    option(GGML_F16C         "ggml: enable F16C"             ${INS_ENB})
+    # MSVC does not seem to support AMX
+    option(GGML_AMX_TILE     "ggml: enable AMX-TILE"         OFF)
+    option(GGML_AMX_INT8     "ggml: enable AMX-INT8"         OFF)
+    option(GGML_AMX_BF16     "ggml: enable AMX-BF16"         OFF)
+endif()
+option(GGML_LASX             "ggml: enable lasx"             ON)
+option(GGML_LSX              "ggml: enable lsx"              ON)
+option(GGML_RVV              "ggml: enable rvv"              ON)
+option(GGML_RV_ZFH           "ggml: enable riscv zfh"        OFF)
+option(GGML_VXE              "ggml: enable vxe"              ON)
+
+option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
+set(GGML_CPU_ARM_ARCH        "" CACHE STRING "ggml: CPU architecture for ARM")
+set(GGML_CPU_POWERPC_CPUTYPE "" CACHE STRING "ggml: CPU type for PowerPC")
+
+
+if (WIN32)
+    set(GGML_WIN_VER "0x602" CACHE STRING   "ggml: Windows version")
+endif()
+
+# ggml core
+set(GGML_SCHED_MAX_COPIES  "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
+option(GGML_CPU                             "ggml: enable CPU backend"                        ON)
+
+# 3rd party libs / backends
+option(GGML_ACCELERATE                      "ggml: enable Accelerate framework"               ON)
+option(GGML_BLAS                            "ggml: use BLAS"                                  ${GGML_BLAS_DEFAULT})
+set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
+                                            "ggml: BLAS library vendor")
+option(GGML_LLAMAFILE                       "ggml: use LLAMAFILE"                             ${GGML_LLAMAFILE_DEFAULT})
+
+option(GGML_CUDA                            "ggml: use CUDA"                                  OFF)
+option(GGML_MUSA                            "ggml: use MUSA"                                  OFF)
+option(GGML_CUDA_FORCE_MMQ                  "ggml: use mmq kernels instead of cuBLAS"         OFF)
+option(GGML_CUDA_FORCE_CUBLAS               "ggml: always use cuBLAS instead of mmq kernels"  OFF)
+option(GGML_CUDA_F16                        "ggml: use 16 bit floats for some calculations"   OFF)
+set   (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
+                                            "ggml: max. batch size for using peer access")
+option(GGML_CUDA_NO_PEER_COPY               "ggml: do not use peer to peer copies"            OFF)
+option(GGML_CUDA_NO_VMM                     "ggml: do not try to use CUDA VMM"                OFF)
+option(GGML_CUDA_FA                         "ggml: compile ggml FlashAttention CUDA kernels"  ON)
+option(GGML_CUDA_FA_ALL_QUANTS              "ggml: compile all quants for FlashAttention"     OFF)
+option(GGML_CUDA_GRAPHS                     "ggml: use CUDA graphs (llama.cpp only)"          ${GGML_CUDA_GRAPHS_DEFAULT})
+set   (GGML_CUDA_COMPRESSION_MODE "size" CACHE STRING
+                                            "ggml: cuda link binary compression mode; requires cuda 12.8+")
+set_property(CACHE GGML_CUDA_COMPRESSION_MODE PROPERTY STRINGS "none;speed;balance;size")
+
+option(GGML_HIP                             "ggml: use HIP"                                   OFF)
+option(GGML_HIP_GRAPHS                      "ggml: use HIP graph, experimental, slow"         OFF)
+option(GGML_HIP_NO_VMM                      "ggml: do not try to use HIP VMM"                 ON)
+option(GGML_HIP_ROCWMMA_FATTN               "ggml: enable rocWMMA for FlashAttention"         OFF)
+option(GGML_HIP_UMA                         "ggml: use HIP unified memory architecture"       OFF)
+option(GGML_VULKAN                          "ggml: use Vulkan"                                OFF)
+option(GGML_VULKAN_CHECK_RESULTS            "ggml: run Vulkan op checks"                      OFF)
+option(GGML_VULKAN_DEBUG                    "ggml: enable Vulkan debug output"                OFF)
+option(GGML_VULKAN_MEMORY_DEBUG             "ggml: enable Vulkan memory debug output"         OFF)
+option(GGML_VULKAN_SHADER_DEBUG_INFO        "ggml: enable Vulkan shader debug info"           OFF)
+option(GGML_VULKAN_PERF                     "ggml: enable Vulkan perf output"                 OFF)
+option(GGML_VULKAN_VALIDATE                 "ggml: enable Vulkan validation"                  OFF)
+option(GGML_VULKAN_RUN_TESTS                "ggml: run Vulkan tests"                          OFF)
+option(GGML_KOMPUTE                         "ggml: use Kompute"                               OFF)
+option(GGML_METAL                           "ggml: use Metal"                                 ${GGML_METAL_DEFAULT})
+option(GGML_METAL_USE_BF16                  "ggml: use bfloat if available"                   OFF)
+option(GGML_METAL_NDEBUG                    "ggml: disable Metal debugging"                   OFF)
+option(GGML_METAL_SHADER_DEBUG              "ggml: compile Metal with -fno-fast-math"         OFF)
+option(GGML_METAL_EMBED_LIBRARY             "ggml: embed Metal library"                       ${GGML_METAL})
+set   (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
+                                            "ggml: metal minimum macOS version")
+set   (GGML_METAL_STD "" CACHE STRING       "ggml: metal standard version (-std flag)")
+option(GGML_OPENMP                          "ggml: use OpenMP"                                ON)
+option(GGML_RPC                             "ggml: use RPC"                                   OFF)
+option(GGML_SYCL                            "ggml: use SYCL"                                  OFF)
+option(GGML_SYCL_F16                        "ggml: use 16 bit floats for sycl calculations"   OFF)
+option(GGML_SYCL_GRAPH                      "ggml: enable graphs in the SYCL backend"         ON)
+set   (GGML_SYCL_TARGET "INTEL" CACHE STRING
+                                            "ggml: sycl target device")
+set   (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
+                                            "ggml: sycl device architecture")
+
+option(GGML_OPENCL                          "ggml: use OpenCL"                                OFF)
+option(GGML_OPENCL_PROFILING                "ggml: use OpenCL profiling (increases overhead)" OFF)
+option(GGML_OPENCL_EMBED_KERNELS            "ggml: embed kernels"                             ON)
+option(GGML_OPENCL_USE_ADRENO_KERNELS       "ggml: use optimized kernels for Adreno"          ON)
+set   (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING
+                                            "gmml: OpenCL API version to target")
+
+# toolchain for vulkan-shaders-gen
+set   (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")
+
+# extra artifacts
+option(GGML_BUILD_TESTS    "ggml: build tests"    ${GGML_STANDALONE})
+option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
+
+#
+# dependencies
+#
+
+set(CMAKE_C_STANDARD 11)
+set(CMAKE_C_STANDARD_REQUIRED true)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED true)
+
+set(THREADS_PREFER_PTHREAD_FLAG ON)
+
+find_package(Threads REQUIRED)
+
+include(GNUInstallDirs)
+
+#
+# build the library
+#
+
+add_subdirectory(src)
+
+#
+# tests and examples
+#
+
+if (GGML_BUILD_TESTS)
+    enable_testing()
+    add_subdirectory(tests)
+endif ()
+
+if (GGML_BUILD_EXAMPLES)
+    add_subdirectory(examples)
+endif ()
+
+#
+# install
+#
+
+include(CMakePackageConfigHelpers)
+
+# all public headers
+set(GGML_PUBLIC_HEADERS
+    include/ggml.h
+    include/ggml-cpu.h
+    include/ggml-alloc.h
+    include/ggml-backend.h
+    include/ggml-blas.h
+    include/ggml-cann.h
+    include/ggml-cpp.h
+    include/ggml-cuda.h
+    include/ggml-kompute.h
+    include/ggml-opt.h
+    include/ggml-metal.h
+    include/ggml-rpc.h
+    include/ggml-sycl.h
+    include/ggml-vulkan.h
+    include/gguf.h)
+
+set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
+#if (GGML_METAL)
+#    set_target_properties(ggml PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/src/ggml-metal.metal")
+#endif()
+install(TARGETS ggml LIBRARY PUBLIC_HEADER)
+install(TARGETS ggml-base LIBRARY)
+
+if (GGML_STANDALONE)
+    configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ggml.pc.in
+        ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc
+        @ONLY)
+
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc
+        DESTINATION share/pkgconfig)
+endif()
+
+
+# Capture variables prefixed with GGML_.
+
+set(variable_set_statements
+"
+####### Expanded from @GGML_VARIABLES_EXPANED@ by configure_package_config_file() #######
+####### Any changes to this file will be overwritten by the next CMake run        #######
+
+")
+
+set(GGML_SHARED_LIB ${BUILD_SHARED_LIBS})
+
+get_cmake_property(all_variables VARIABLES)
+foreach(variable_name IN LISTS all_variables)
+    if(variable_name MATCHES "^GGML_")
+        string(REPLACE ";" "\\;"
+               variable_value "${${variable_name}}")
+
+        set(variable_set_statements
+            "${variable_set_statements}set(${variable_name} \"${variable_value}\")\n")
+    endif()
+endforeach()
+
+set(GGML_VARIABLES_EXPANDED ${variable_set_statements})
+
+# Create the CMake package and set install location.
+
+set(GGML_INSTALL_VERSION 0.0.${GGML_BUILD_NUMBER})
+set(GGML_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header  files")
+set(GGML_LIB_INSTALL_DIR     ${CMAKE_INSTALL_LIBDIR}     CACHE PATH "Location of library files")
+set(GGML_BIN_INSTALL_DIR     ${CMAKE_INSTALL_BINDIR}     CACHE PATH "Location of binary  files")
+
+configure_package_config_file(
+        ${CMAKE_CURRENT_SOURCE_DIR}/cmake/ggml-config.cmake.in
+        ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake
+    INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml
+    PATH_VARS GGML_INCLUDE_INSTALL_DIR
+              GGML_LIB_INSTALL_DIR
+              GGML_BIN_INSTALL_DIR)
+
+write_basic_package_version_file(
+        ${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake
+    VERSION ${GGML_INSTALL_VERSION}
+    COMPATIBILITY SameMajorVersion)
+
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake
+              ${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake
+        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml)
--- a/ml/backend/ggml/ggml/cmake/common.cmake
+++ b/ml/backend/ggml/ggml/cmake/common.cmake
@ -0,0 +1,26 @@
+function(ggml_get_flags CCID CCVER)
+    set(C_FLAGS "")
+    set(CXX_FLAGS "")
+
+    if (CCID MATCHES "Clang")
+        set(C_FLAGS   -Wunreachable-code-break -Wunreachable-code-return)
+        set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi)
+
+        if (
+            (CCID STREQUAL "Clang"      AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
+            (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
+        )
+            list(APPEND C_FLAGS -Wdouble-promotion)
+        endif()
+    elseif (CCID STREQUAL "GNU")
+        set(C_FLAGS   -Wdouble-promotion)
+        set(CXX_FLAGS -Wno-array-bounds)
+
+        if (CCVER VERSION_GREATER_EQUAL 8.1.0)
+            list(APPEND CXX_FLAGS -Wextra-semi)
+        endif()
+    endif()
+
+    set(GF_C_FLAGS   ${C_FLAGS}   PARENT_SCOPE)
+    set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
+endfunction()
--- a/ml/backend/ggml/ggml/cmake/ggml-config.cmake.in
+++ b/ml/backend/ggml/ggml/cmake/ggml-config.cmake.in
@ -0,0 +1,152 @@
+
+@GGML_VARIABLES_EXPANDED@
+
+@PACKAGE_INIT@
+
+set_and_check(GGML_INCLUDE_DIR "@PACKAGE_GGML_INCLUDE_INSTALL_DIR@")
+set_and_check(GGML_LIB_DIR "@PACKAGE_GGML_LIB_INSTALL_DIR@")
+#set_and_check(GGML_BIN_DIR "@PACKAGE_GGML_BIN_INSTALL_DIR@")
+
+find_package(Threads REQUIRED)
+
+find_library(GGML_LIBRARY ggml
+    REQUIRED
+    HINTS ${GGML_LIB_DIR}
+    NO_CMAKE_FIND_ROOT_PATH)
+
+add_library(ggml::ggml UNKNOWN IMPORTED)
+set_target_properties(ggml::ggml
+    PROPERTIES
+        IMPORTED_LOCATION "${GGML_LIBRARY}")
+
+find_library(GGML_BASE_LIBRARY ggml-base
+    REQUIRED
+    HINTS ${GGML_LIB_DIR}
+    NO_CMAKE_FIND_ROOT_PATH)
+
+add_library(ggml::ggml-base UNKNOWN IMPORTED)
+set_target_properties(ggml::ggml-base
+    PROPERTIES
+        IMPORTED_LOCATION "${GGML_BASE_LIBRARY}")
+
+if (NOT GGML_SHARED_LIB)
+    if (APPLE AND GGML_ACCELERATE)
+        find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
+        list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${ACCELERATE_FRAMEWORK})
+    endif()
+
+    if (GGML_OPENMP)
+        find_package(OpenMP REQUIRED)
+        list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
+    endif()
+
+    if (GGML_CPU_HBM)
+        find_library(memkind memkind REQUIRED)
+        list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES memkind)
+    endif()
+
+    if (GGML_BLAS)
+        find_package(BLAS REQUIRED)
+        list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${BLAS_LIBRARIES})
+        list(APPEND GGML_CPU_INTERFACE_LINK_OPTIONS   ${BLAS_LINKER_FLAGS})
+    endif()
+
+    if (GGML_CUDA)
+        find_package(CUDAToolkit REQUIRED)
+    endif()
+
+    if (GGML_METAL)
+        find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
+        find_library(METAL_FRAMEWORK    Metal REQUIRED)
+        find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
+
+        list(APPEND GGML_METAL_INTERFACE_LINK_LIBRARIES
+                    ${FOUNDATION_LIBRARY} ${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK})
+    endif()
+
+    if (GGML_VULKAN)
+        find_package(Vulkan REQUIRED)
+        list(APPEND GGML_VULKAN_INTERFACE_LINK_LIBRARIES Vulkan::Vulkan)
+    endif()
+
+    if (GGML_HIP)
+        find_package(hip     REQUIRED)
+        find_package(hipblas REQUIRED)
+        find_package(rocblas REQUIRED)
+        list(APPEND GGML_HIP_INTERFACE_LINK_LIBRARIES hip::host roc::rocblas roc::hipblas)
+    endif()
+
+    if (GGML_SYCL)
+        find_package(DNNL)
+        if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
+            list(APPEND GGML_SYCL_INTERFACE_LINK_LIBRARIES DNNL::dnnl)
+        endif()
+        if (WIN32)
+            find_package(IntelSYCL REQUIRED)
+            find_package(MKL       REQUIRED)
+            list(APPEND GGML_SYCL_INTERFACE_LINK_LIBRARIES IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
+        endif()
+    endif()
+endif()
+
+set(_ggml_all_targets "")
+foreach(_ggml_backend ${GGML_AVAILABLE_BACKENDS})
+    string(REPLACE "-" "_" _ggml_backend_pfx "${_ggml_backend}")
+    string(TOUPPER "${_ggml_backend_pfx}" _ggml_backend_pfx)
+
+    find_library(${_ggml_backend_pfx}_LIBRARY ${_ggml_backend}
+        REQUIRED
+        HINTS ${GGML_LIB_DIR}
+        NO_CMAKE_FIND_ROOT_PATH)
+
+    message(STATUS "Found ${${_ggml_backend_pfx}_LIBRARY}")
+
+    add_library(ggml::${_ggml_backend} UNKNOWN IMPORTED)
+    set_target_properties(ggml::${_ggml_backend}
+        PROPERTIES
+            INTERFACE_INCLUDE_DIRECTORIES "${GGML_INCLUDE_DIR}"
+            IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
+            IMPORTED_LOCATION "${${_ggml_backend_pfx}_LIBRARY}"
+            INTERFACE_COMPILE_FEATURES c_std_90
+            POSITION_INDEPENDENT_CODE ON)
+
+    string(REGEX MATCH "^ggml-cpu" is_cpu_variant "${_ggml_backend}")
+    if(is_cpu_variant)
+        list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES "ggml::ggml-base")
+        set_target_properties(ggml::${_ggml_backend}
+           PROPERTIES
+               INTERFACE_LINK_LIBRARIES "${GGML_CPU_INTERFACE_LINK_LIBRARIES}")
+
+        if(GGML_CPU_INTERFACE_LINK_OPTIONS)
+            set_target_properties(ggml::${_ggml_backend}
+                PROPERTIES
+                    INTERFACE_LINK_OPTIONS "${GGML_CPU_INTERFACE_LINK_OPTIONS}")
+        endif()
+
+    else()
+        list(APPEND ${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES "ggml::ggml-base")
+        set_target_properties(ggml::${_ggml_backend}
+            PROPERTIES
+                INTERFACE_LINK_LIBRARIES "${${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES}")
+
+        if(${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS)
+            set_target_properties(ggml::${_ggml_backend}
+                PROPERTIES
+                    INTERFACE_LINK_OPTIONS "${${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS}")
+        endif()
+    endif()
+
+    list(APPEND _ggml_all_targets ggml::${_ggml_backend})
+endforeach()
+
+list(APPEND GGML_INTERFACE_LINK_LIBRARIES ggml::ggml-base "${_ggml_all_targets}")
+set_target_properties(ggml::ggml
+    PROPERTIES
+        INTERFACE_LINK_LIBRARIES "${GGML_INTERFACE_LINK_LIBRARIES}")
+
+add_library(ggml::all INTERFACE IMPORTED)
+set_target_properties(ggml::all
+    PROPERTIES
+        INTERFACE_LINK_LIBRARIES "${_ggml_all_targets}")
+
+check_required_components(ggml)
--- a/ml/backend/ggml/ggml/include/ggml-alloc.h
+++ b/ml/backend/ggml/ggml/include/ggml-alloc.h
@ -19,7 +19,7 @@ struct ggml_tallocr {
 };

 GGML_API struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer);
-GGML_API void                ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor);
+GGML_API enum ggml_status    ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor);

 // Graph allocator
 /*
--- a/ml/backend/ggml/ggml/include/ggml-backend.h
+++ b/ml/backend/ggml/ggml/include/ggml-backend.h
@ -56,7 +56,7 @@ extern "C" {
    GGML_API void                           ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
    GGML_API void *                         ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
    GGML_API size_t                         ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
-    GGML_API void                           ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API enum ggml_status               ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
    GGML_API size_t                         ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
    GGML_API size_t                         ggml_backend_buffer_get_max_size  (ggml_backend_buffer_t buffer);
    GGML_API size_t                         ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
@ -342,8 +342,8 @@ extern "C" {
    GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);

    // Tensor initialization
-    GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
-    GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
+    GGML_API enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
+    GGML_API enum ggml_status ggml_backend_view_init(struct ggml_tensor * tensor);

    // CPU buffer types are always available
    GGML_API ggml_backend_buffer_t      ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
--- a/ml/backend/ggml/ggml/include/ggml-cpu.h
+++ b/ml/backend/ggml/ggml/include/ggml-cpu.h
@ -80,6 +80,7 @@ extern "C" {
    GGML_BACKEND_API int ggml_cpu_has_avx        (void);
    GGML_BACKEND_API int ggml_cpu_has_avx_vnni   (void);
    GGML_BACKEND_API int ggml_cpu_has_avx2       (void);
+    GGML_BACKEND_API int ggml_cpu_has_bmi2       (void);
    GGML_BACKEND_API int ggml_cpu_has_f16c       (void);
    GGML_BACKEND_API int ggml_cpu_has_fma        (void);
    GGML_BACKEND_API int ggml_cpu_has_avx512     (void);
--- a/ml/backend/ggml/ggml/include/ggml-rpc.h
+++ b/ml/backend/ggml/ggml/include/ggml-rpc.h
@ -17,7 +17,9 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const c

 GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);

-GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
+GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint,
+                                                    const char * cache_dir,
+                                                    size_t free_mem, size_t total_mem);

 GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);

--- a/ml/backend/ggml/ggml/include/ggml.h
+++ b/ml/backend/ggml/ggml/include/ggml.h
@ -454,6 +454,7 @@ extern "C" {
        GGML_OP_RMS_NORM,
        GGML_OP_RMS_NORM_BACK,
        GGML_OP_GROUP_NORM,
+        GGML_OP_L2_NORM,

        GGML_OP_MUL_MAT,
        GGML_OP_MUL_MAT_ID,
@ -503,20 +504,16 @@ extern "C" {
        GGML_OP_ADD_REL_POS,
        GGML_OP_RWKV_WKV6,
        GGML_OP_GATED_LINEAR_ATTN,
+        GGML_OP_RWKV_WKV7,

        GGML_OP_UNARY,

-        GGML_OP_MAP_UNARY,
-        GGML_OP_MAP_BINARY,
-
-        GGML_OP_MAP_CUSTOM1_F32,
-        GGML_OP_MAP_CUSTOM2_F32,
-        GGML_OP_MAP_CUSTOM3_F32,
-
        GGML_OP_MAP_CUSTOM1,
        GGML_OP_MAP_CUSTOM2,
        GGML_OP_MAP_CUSTOM3,

+        GGML_OP_CUSTOM,
+
        GGML_OP_CROSS_ENTROPY_LOSS,
        GGML_OP_CROSS_ENTROPY_LOSS_BACK,
        GGML_OP_OPT_STEP_ADAMW,
@ -1096,6 +1093,18 @@ extern "C" {
            int                   n_groups,
            float                 eps);

+    // l2 normalize along rows
+    // used in rwkv v7
+    GGML_API struct ggml_tensor * ggml_l2_norm(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            float                 eps);
+
+    GGML_API struct ggml_tensor * ggml_l2_norm_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            float                 eps);
+
    // a - x
    // b - dy
    GGML_API struct ggml_tensor * ggml_rms_norm_back(
@ -1709,24 +1718,29 @@ extern "C" {
            float                 p0,
            float                 p1);

-    // nearest interpolate
+    // interpolation mode, passed as the `mode` argument of ggml_upscale and ggml_upscale_ext
+    enum ggml_scale_mode {
+        GGML_SCALE_MODE_NEAREST  = 0,
+        GGML_SCALE_MODE_BILINEAR = 1,
+    };
+
+    // interpolate
    // multiplies ne0 and ne1 by scale factor
-    // used in stable-diffusion
    GGML_API struct ggml_tensor * ggml_upscale(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
-            int                   scale_factor);
+            int                   scale_factor,
+            enum ggml_scale_mode  mode);

-    // nearest interpolate
-    // nearest interpolate to specified dimensions
-    // used in tortoise.cpp
+    // interpolate
+    // interpolate scale to specified dimensions
    GGML_API struct ggml_tensor * ggml_upscale_ext(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            int                   ne0,
            int                   ne1,
            int                   ne2,
-            int                   ne3);
+            int                   ne3,
+            enum ggml_scale_mode  mode);

    // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
    GGML_API struct ggml_tensor * ggml_pad(
@ -1787,11 +1801,11 @@ extern "C" {

 #define GGML_KQ_MASK_PAD 64

-    // q:    [n_embd, n_batch,     n_head,    1]
-    // k:    [n_embd, n_kv,        n_head_kv, 1]
-    // v:    [n_embd, n_kv,        n_head_kv, 1] !! not transposed !!
-    // mask: [n_kv,   n_batch_pad, 1,         1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
-    // res:  [n_embd, n_head,      n_batch,   1] !! permuted !!
+    // q:    [n_embd_k, n_batch,     n_head,    1]
+    // k:    [n_embd_k, n_kv,        n_head_kv, 1]
+    // v:    [n_embd_v, n_kv,        n_head_kv, 1] !! not transposed !!
+    // mask: [n_kv,     n_batch_pad, 1,         1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
+    // res:  [n_embd_v, n_head,      n_batch,   1] !! permuted !!
    GGML_API struct ggml_tensor * ggml_flash_attn_ext(
            struct ggml_context * ctx,
            struct ggml_tensor  * q,
@ -1900,85 +1914,18 @@ extern "C" {
            struct ggml_tensor  * state,
            float scale);

+    GGML_API struct ggml_tensor * ggml_rwkv_wkv7(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * r,
+            struct ggml_tensor  * w,
+            struct ggml_tensor  * k,
+            struct ggml_tensor  * v,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            struct ggml_tensor  * state);
+
    // custom operators

-    typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
-    typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
-
-    typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *);
-    typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
-    typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_f32(
-            struct ggml_context        * ctx,
-            struct ggml_tensor         * a,
-                   ggml_unary_op_f32_t   fun),
-        "use ggml_map_custom1 instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
-            struct ggml_context        * ctx,
-            struct ggml_tensor         * a,
-                   ggml_unary_op_f32_t   fun),
-        "use ggml_map_custom1_inplace instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_f32(
-            struct ggml_context         * ctx,
-            struct ggml_tensor          * a,
-            struct ggml_tensor          * b,
-                   ggml_binary_op_f32_t   fun),
-        "use ggml_map_custom2 instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
-            struct ggml_context         * ctx,
-            struct ggml_tensor          * a,
-            struct ggml_tensor          * b,
-                   ggml_binary_op_f32_t   fun),
-        "use ggml_map_custom2_inplace instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_f32(
-            struct ggml_context          * ctx,
-            struct ggml_tensor           * a,
-                   ggml_custom1_op_f32_t   fun),
-        "use ggml_map_custom1 instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
-            struct ggml_context          * ctx,
-            struct ggml_tensor           * a,
-                   ggml_custom1_op_f32_t   fun),
-        "use ggml_map_custom1_inplace instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_f32(
-            struct ggml_context          * ctx,
-            struct ggml_tensor           * a,
-            struct ggml_tensor           * b,
-                   ggml_custom2_op_f32_t   fun),
-        "use ggml_map_custom2 instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
-            struct ggml_context          * ctx,
-            struct ggml_tensor           * a,
-            struct ggml_tensor           * b,
-                   ggml_custom2_op_f32_t   fun),
-        "use ggml_map_custom2_inplace instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_f32(
-            struct ggml_context          * ctx,
-            struct ggml_tensor           * a,
-            struct ggml_tensor           * b,
-            struct ggml_tensor           * c,
-                   ggml_custom3_op_f32_t   fun),
-        "use ggml_map_custom3 instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
-            struct ggml_context          * ctx,
-            struct ggml_tensor           * a,
-            struct ggml_tensor           * b,
-            struct ggml_tensor           * c,
-                   ggml_custom3_op_f32_t   fun),
-        "use ggml_map_custom3_inplace instead");
-
-    // custom operators v2
-
    typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata);
    typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
    typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
@ -2034,6 +1981,30 @@ extern "C" {
            int                     n_tasks,
            void                  * userdata);

+    typedef void (*ggml_custom_op_t)(struct ggml_tensor * dst , int ith, int nth, void * userdata);
+
+    GGML_API struct ggml_tensor * ggml_custom_4d(
+            struct ggml_context * ctx,
+            enum ggml_type        type,
+            int64_t               ne0,
+            int64_t               ne1,
+            int64_t               ne2,
+            int64_t               ne3,
+            struct ggml_tensor ** args,
+            int                   n_args,
+            ggml_custom_op_t      fun,
+            int                   n_tasks,
+            void                * userdata);
+
+    GGML_API struct ggml_tensor * ggml_custom_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor ** args,
+            int                   n_args,
+            ggml_custom_op_t      fun,
+            int                   n_tasks,
+            void                * userdata);
+
    // loss function

    GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
@ -2150,7 +2121,11 @@ extern "C" {
 #        define GGML_RESTRICT
 #    endif
 #else
-#    define GGML_RESTRICT restrict
+#    if defined (_MSC_VER) && (__STDC_VERSION__ < 201112L)
+#        define GGML_RESTRICT __restrict
+#    else
+#        define GGML_RESTRICT restrict
+#    endif
 #endif
    typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
    typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int64_t k);
--- a/ml/backend/ggml/ggml/src/CMakeLists.txt
+++ b/ml/backend/ggml/ggml/src/CMakeLists.txt
@ -1,4 +1,5 @@
 include(CheckCXXCompilerFlag)
+include("../cmake/common.cmake")

 add_compile_definitions(GGML_SCHED_MAX_COPIES=${GGML_SCHED_MAX_COPIES})

@ -24,33 +25,6 @@ if (NOT MSVC)
    endif()
 endif()

-function(ggml_get_flags CCID CCVER)
-    set(C_FLAGS "")
-    set(CXX_FLAGS "")
-
-    if (CCID MATCHES "Clang")
-        set(C_FLAGS   -Wunreachable-code-break -Wunreachable-code-return)
-        set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi)
-
-        if (
-            (CCID STREQUAL "Clang"      AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
-            (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
-        )
-            list(APPEND C_FLAGS -Wdouble-promotion)
-        endif()
-    elseif (CCID STREQUAL "GNU")
-        set(C_FLAGS   -Wdouble-promotion)
-        set(CXX_FLAGS -Wno-array-bounds)
-
-        if (CCVER VERSION_GREATER_EQUAL 8.1.0)
-            list(APPEND CXX_FLAGS -Wextra-semi)
-        endif()
-    endif()
-
-    set(GF_C_FLAGS   ${C_FLAGS}   PARENT_SCOPE)
-    set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
-endfunction()
-
 if (GGML_FATAL_WARNINGS)
    if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
        list(APPEND C_FLAGS   -Werror)
@ -91,7 +65,7 @@ if (GGML_LTO)
    endif()
 endif()

-if (GGML_CCACHE)
+if (GGML_CCACHE AND NOT CMAKE_C_COMPILER_LAUNCHER AND NOT CMAKE_CXX_COMPILER_LAUNCHER)
    find_program(GGML_CCACHE_FOUND ccache)
    find_program(GGML_SCCACHE_FOUND sccache)

@ -102,7 +76,11 @@ if (GGML_CCACHE)
            set(GGML_CCACHE_VARIANT sccache)
        endif()
        # TODO: should not be set globally
-        set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${GGML_CCACHE_VARIANT}")
+        if (GGML_SYCL AND GGML_CCACHE_FOUND AND WIN32)
+            set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "ccache compiler_type=icl")
+        else ()
+            set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${GGML_CCACHE_VARIANT}")
+        endif ()
        set(ENV{CCACHE_SLOPPINESS} time_macros)
        message(STATUS "${GGML_CCACHE_VARIANT} found, compilation results will be cached. Disable with GGML_CCACHE=OFF.")
    else()
@ -226,6 +204,9 @@ add_library(ggml-base
            gguf.cpp)

 target_include_directories(ggml-base PRIVATE .)
+if (GGML_BACKEND_DL)
+    target_compile_definitions(ggml-base PUBLIC GGML_BACKEND_DL)
+endif()

 add_library(ggml
            ggml-backend-reg.cpp)
@ -233,7 +214,7 @@ add_library(ggml
 target_link_libraries(ggml PUBLIC ggml-base)

 if (CMAKE_SYSTEM_NAME MATCHES "Linux")
-    target_link_libraries(ggml PRIVATE dl)
+    target_link_libraries(ggml PRIVATE dl stdc++fs)
 endif()

 function(ggml_add_backend_library backend)
@ -286,7 +267,7 @@ function(ggml_add_cpu_backend_variant tag_name)
    set(GGML_CPU_TAG_NAME ${tag_name})
    # other: OPENMP LLAMAFILE CPU_HBM
    foreach (feat NATIVE
-                  AVX AVX2 AVX_VNNI FMA F16C
+                  AVX AVX2 BMI2 AVX_VNNI FMA F16C
                  AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
                  AMX_TILE AMX_INT8 AMX_BF16)
        set(GGML_${feat} OFF)
@ -308,10 +289,10 @@ if (GGML_CPU_ALL_VARIANTS)
    endif()
    add_custom_target(ggml-cpu)
    ggml_add_cpu_backend_variant(sandybridge    AVX)
-    ggml_add_cpu_backend_variant(haswell        AVX F16C AVX2 FMA)
-    ggml_add_cpu_backend_variant(skylakex       AVX F16C AVX2 FMA AVX512)
-    ggml_add_cpu_backend_variant(icelake        AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
-    ggml_add_cpu_backend_variant(alderlake      AVX F16C AVX2 FMA AVX_VNNI)
+    ggml_add_cpu_backend_variant(haswell        AVX F16C AVX2 BMI2 FMA)
+    ggml_add_cpu_backend_variant(skylakex       AVX F16C AVX2 BMI2 FMA AVX512)
+    ggml_add_cpu_backend_variant(icelake        AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
+    ggml_add_cpu_backend_variant(alderlake      AVX F16C AVX2 BMI2 FMA AVX_VNNI)
 elseif (GGML_CPU)
    ggml_add_cpu_backend_variant_impl("")
 endif()
@ -346,6 +327,10 @@ if (CMAKE_SYSTEM_NAME MATCHES "Android")
    target_link_libraries(ggml-base PRIVATE dl)
 endif()

+if(CMAKE_SYSTEM_NAME MATCHES "visionOS")
+    target_compile_definitions(ggml-base PUBLIC _DARWIN_C_SOURCE)
+endif()
+
 if (BUILD_SHARED_LIBS)
    foreach (target ggml-base ggml)
        set_target_properties(${target} PROPERTIES POSITION_INDEPENDENT_CODE ON)
--- a/ml/backend/ggml/ggml/src/ggml-alloc.c
+++ b/ml/backend/ggml/ggml/src/ggml-alloc.c
@ -89,7 +89,7 @@ struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer) {
    return talloc;
 }

-void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) {
+enum ggml_status ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) {
    size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor);
    size = GGML_PAD(size, talloc->alignment);

@ -104,7 +104,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso

    assert(((uintptr_t)addr % talloc->alignment) == 0);

-    ggml_backend_tensor_alloc(talloc->buffer, tensor, addr);
+    return ggml_backend_tensor_alloc(talloc->buffer, tensor, addr);
 }

 // dynamic tensor allocator
@ -933,42 +933,51 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {

 // utils

+// Release every backend buffer in the array *buffers (there are *n_buffers of
+// them) and then the array itself.
+//
+// buffers   - address of the heap-allocated array of buffer handles; set to
+//             NULL on return so the caller is not left with a dangling pointer
+// n_buffers - address of the element count; read only, never modified
+//
+// Calling this again on an already released array is a no-op.
+static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
+    if (*buffers == NULL) {
+        // nothing was allocated yet, or the array was already released
+        return;
+    }
+    for (size_t i = 0; i < *n_buffers; i++) {
+        ggml_backend_buffer_free((*buffers)[i]);
+    }
+    free(*buffers);
+    // the count is const here and cannot be reset, so clear the array pointer instead
+    *buffers = NULL;
+}
+
 static bool alloc_tensor_range(struct ggml_context * ctx,
        struct ggml_tensor * first, struct ggml_tensor * last,
        ggml_backend_buffer_type_t buft, size_t size,
        ggml_backend_buffer_t ** buffers, size_t * n_buffers) {
+
    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
    if (buffer == NULL) {
-#ifndef NDEBUG
-        GGML_LOG_DEBUG("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
-#endif
-        for (size_t i = 0; i < *n_buffers; i++) {
-            ggml_backend_buffer_free((*buffers)[i]);
-        }
-        free(*buffers);
+        GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
+        free_buffers(buffers, n_buffers);
        return false;
    }

-    struct ggml_tallocr tallocr = ggml_tallocr_new(buffer);
-
-    for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
-        if (t->data == NULL) {
-            if (t->view_src == NULL) {
-                ggml_tallocr_alloc(&tallocr, t);
-            } else if (t->buffer == NULL) {
-                ggml_backend_view_init(t);
-            }
-        } else {
-            if (t->view_src != NULL && t->buffer == NULL) {
-                // view of a pre-allocated tensor
-                ggml_backend_view_init(t);
-            }
-        }
-    }
-
    *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
    (*buffers)[(*n_buffers)++] = buffer;

+    struct ggml_tallocr tallocr = ggml_tallocr_new(buffer);
+
+    for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
+        enum ggml_status status = GGML_STATUS_SUCCESS;
+        if (t->data == NULL) {
+            if (t->view_src == NULL) {
+                status = ggml_tallocr_alloc(&tallocr, t);
+            } else if (t->buffer == NULL) {
+                status = ggml_backend_view_init(t);
+            }
+        } else {
+            if (t->view_src != NULL && t->buffer == NULL) {
+                // view of a pre-allocated tensor
+                status = ggml_backend_view_init(t);
+            }
+        }
+        if (status != GGML_STATUS_SUCCESS) {
+            GGML_LOG_ERROR("%s: failed to initialize tensor %s\n", __func__, t->name);
+            free_buffers(buffers, n_buffers);
+            return false;
+        }
+    }
+
    return true;
 }

--- a/ml/backend/ggml/ggml/src/ggml-backend-impl.h
+++ b/ml/backend/ggml/ggml/src/ggml-backend-impl.h
@ -44,7 +44,7 @@ extern "C" {
        // base address of the buffer
        void *       (*get_base)     (ggml_backend_buffer_t buffer);
        // (optional) initialize a tensor in the buffer (eg. add tensor extras)
-        void         (*init_tensor)  (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+        enum ggml_status (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
        // tensor data access
        void         (*memset_tensor)(ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor,     uint8_t value, size_t offset, size_t size);
        void         (*set_tensor)   (ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
--- a/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp
@ -2,14 +2,13 @@
 #include "ggml-backend.h"
 #include "ggml-impl.h"
 #include <algorithm>
-#include <codecvt>
 #include <cstring>
 #include <filesystem>
-#include <locale>
 #include <memory>
 #include <string>
 #include <type_traits>
 #include <vector>
+#include <cctype>

 #ifdef _WIN32
 #    define WIN32_LEAN_AND_MEAN
@ -66,6 +65,34 @@
 #include "ggml-kompute.h"
 #endif

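+// suppress clang's -Wdeprecated-declarations for the path conversion helper below
+// NOTE(review): the helper no longer references std::codecvt_utf8 - confirm this suppression is still required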
+#if defined(__clang__)
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#endif
+
+namespace fs = std::filesystem;
+
+// Convert a filesystem path to a UTF-8 encoded std::string.
+// Returns an empty string if the conversion throws.
+static std::string path_str(const fs::path & path) {
+    try {
+#if defined(__cpp_lib_char8_t)
+        // C++20 and later: u8string() returns std::u8string, so copy its bytes;
+        // pass the length explicitly instead of rescanning for a terminator
+        const std::u8string u8str = path.u8string();
+        return std::string(reinterpret_cast<const char *>(u8str.data()), u8str.size());
+#else
+        // C++17: u8string() already returns std::string
+        return path.u8string();
+#endif
+    } catch (...) {
+        // best effort: the callers visible here only use the result in log
+        // messages, so a failed conversion falls through to an empty string
+    }
+    return std::string();
+}
+
+#if defined(__clang__)
+#    pragma clang diagnostic pop
+#endif
+
 #ifdef _WIN32

 using dl_handle = std::remove_pointer_t<HMODULE>;
@ -76,12 +103,12 @@ struct dl_handle_deleter {
    }
 };

-static dl_handle * dl_load_library(const std::filesystem::path & path) {
+static dl_handle * dl_load_library(const fs::path & path) {
    // suppress error dialogs for missing DLLs
    DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
    SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);

-    HMODULE handle = LoadLibraryW(path.c_str());
+    HMODULE handle = LoadLibraryW(path.wstring().c_str());

    SetErrorMode(old_mode);

@ -109,8 +136,8 @@ struct dl_handle_deleter {
    }
 };

-static void * dl_load_library(const std::filesystem::path & path) {
-    dl_handle * handle = dlopen(path.c_str(), RTLD_NOW | RTLD_LOCAL);
+static void * dl_load_library(const fs::path & path) {
+    dl_handle * handle = dlopen(path.string().c_str(), RTLD_NOW | RTLD_LOCAL);

    return handle;
 }
@ -121,25 +148,6 @@ static void * dl_get_sym(dl_handle * handle, const char * name) {

 #endif

-static std::string path_to_string(const std::filesystem::path & path)
-{
-#ifdef _WIN32
-    const std::wstring wstr = path.wstring();
-    const int size_needed = WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), -1, nullptr, 0, nullptr, nullptr);
-    if (size_needed <= 0) {
-        return std::string();
-    }
-
-    // size_needed includes the null terminator
-    std::string str(size_needed - 1, '\0');
-    WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), -1, str.data(), size_needed, nullptr, nullptr);
-    return str;
-#else
-    return path.string();
-#endif
-}
-
-
 using dl_handle_ptr = std::unique_ptr<dl_handle, dl_handle_deleter>;

 struct ggml_backend_reg_entry {
@ -221,11 +229,11 @@ struct ggml_backend_registry {
        );
    }

-    ggml_backend_reg_t load_backend(const std::filesystem::path & path, bool silent) {
+    ggml_backend_reg_t load_backend(const fs::path & path, bool silent) {
        dl_handle_ptr handle { dl_load_library(path) };
        if (!handle) {
            if (!silent) {
-                GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_to_string(path).c_str());
+                GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_str(path).c_str());
            }
            return nullptr;
        }
@ -233,7 +241,7 @@ struct ggml_backend_registry {
        auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
        if (score_fn && score_fn() == 0) {
            if (!silent) {
-                GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path_to_string(path).c_str());
+                GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path_str(path).c_str());
            }
            return nullptr;
        }
@ -241,7 +249,7 @@ struct ggml_backend_registry {
        auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init");
        if (!backend_init_fn) {
            if (!silent) {
-                GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, path_to_string(path).c_str());
+                GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, path_str(path).c_str());
            }
            return nullptr;
        }
@ -250,16 +258,17 @@ struct ggml_backend_registry {
        if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
            if (!silent) {
                if (!reg) {
-                    GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path_to_string(path).c_str());
+                    GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n",
+                        __func__, path_str(path).c_str());
                } else {
                    GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
-                        __func__, path_to_string(path).c_str(), reg->api_version, GGML_BACKEND_API_VERSION);
+                        __func__, path_str(path).c_str(), reg->api_version, GGML_BACKEND_API_VERSION);
                }
            }
            return nullptr;
        }

-        GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path_to_string(path).c_str());
+        GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path_str(path).c_str());

        register_backend(reg, score_fn ? score_fn() : -1, std::move(handle));

@ -402,7 +411,7 @@ void ggml_backend_unload(ggml_backend_reg_t reg) {
    get_reg().unload_backend(reg, true);
 }

-static std::filesystem::path get_executable_path() {
+static fs::path get_executable_path() {
 #if defined(__APPLE__)
    // get executable path
    std::vector<char> path;
@ -414,9 +423,15 @@ static std::filesystem::path get_executable_path() {
        }
        path.resize(size);
    }
-
-    return std::filesystem::path(path.data()).parent_path();
+    std::string base_path(path.data(), size);
+    // remove executable name
+    auto last_slash = base_path.find_last_of('/');
+    if (last_slash != std::string::npos) {
+        base_path = base_path.substr(0, last_slash);
+    }
+    return base_path + "/";
 #elif defined(__linux__) || defined(__FreeBSD__)
+    std::string base_path = ".";
    std::vector<char> path(1024);
    while (true) {
        // get executable path
@ -429,48 +444,63 @@ static std::filesystem::path get_executable_path() {
            break;
        }
        if (len < (ssize_t) path.size()) {
-            return std::filesystem::path(path.data()).parent_path();
+            base_path = std::string(path.data(), len);
+            // remove executable name
+            auto last_slash = base_path.find_last_of('/');
+            if (last_slash != std::string::npos) {
+                base_path = base_path.substr(0, last_slash);
+            }
+            break;
        }
        path.resize(path.size() * 2);
    }
+
+    return base_path + "/";
 #elif defined(_WIN32)
    std::vector<wchar_t> path(MAX_PATH);
    DWORD len = GetModuleFileNameW(NULL, path.data(), path.size());
    if (len == 0) {
        return {};
    }
-
-    return std::filesystem::path(path.data()).parent_path();
-#endif
+    std::wstring base_path(path.data(), len);
+    // remove executable name
+    auto last_slash = base_path.find_last_of('\\');
+    if (last_slash != std::string::npos) {
+        base_path = base_path.substr(0, last_slash);
+    }
+    return base_path + L"\\";
+#else
    return {};
-}
-
-static std::string backend_filename_prefix() {
-#ifdef _WIN32
-    return "ggml-";
-#else
-    return "libggml-";
 #endif
 }

-static std::string backend_filename_suffix() {
+static fs::path backend_filename_prefix() {
 #ifdef _WIN32
-    return ".dll";
+    return fs::u8path("ggml-");
 #else
-    return ".so";
+    return fs::u8path("libggml-");
+#endif
+}
+
+static fs::path backend_filename_extension() {
+#ifdef _WIN32
+    return fs::u8path(".dll");
+#else
+    return fs::u8path(".so");
 #endif
 }

 static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) {
    // enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
-     // TODO: search system paths
-    namespace fs = std::filesystem;
-    std::string file_prefix = backend_filename_prefix() + name + "-";
-    std::vector<fs::path> search_paths;
+    const fs::path name_path = fs::u8path(name);
+    const fs::path file_prefix = backend_filename_prefix().native() + name_path.native() + fs::u8path("-").native();
+    const fs::path file_extension = backend_filename_extension();

+    std::vector<fs::path> search_paths;
    if (user_search_path == nullptr) {
-        search_paths.push_back(fs::current_path());
+        // default search paths: executable directory, current directory
        search_paths.push_back(get_executable_path());
+        search_paths.push_back(fs::current_path());
    } else {
        search_paths.push_back(fs::u8path(user_search_path));
    }
@ -480,31 +510,35 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,

    for (const auto & search_path : search_paths) {
        if (!fs::exists(search_path)) {
+            GGML_LOG_DEBUG("%s: search path %s does not exist\n", __func__, path_str(search_path).c_str());
            continue;
        }
        fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
        for (const auto & entry : dir_it) {
            if (entry.is_regular_file()) {
-                std::string filename = entry.path().filename().string();
-                std::string ext = entry.path().extension().string();
-                if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) {
-                    dl_handle_ptr handle { dl_load_library(entry.path()) };
-                    if (!handle) {
-                        GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_to_string(entry.path()).c_str());
-                        continue;
+                auto filename = entry.path().filename();
+                auto ext = entry.path().extension();
+                if (filename.native().find(file_prefix) == 0 && ext == file_extension) {
+                    dl_handle_ptr handle { dl_load_library(entry) };
+                    if (!handle && !silent) {
+                        GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_str(entry.path()).c_str());
                    }
-
-                    auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
-                    if (!score_fn) {
-                        GGML_LOG_DEBUG("%s: failed to find ggml_backend_score in %s\n", __func__, path_to_string(entry.path()).c_str());
-                        continue;
-                    }
-
-                    int s = score_fn();
-                    GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, path_to_string(entry.path()).c_str(), s);
-                    if (s > best_score) {
-                        best_score = s;
-                        best_path = entry.path();
+                    if (handle) {
+                        auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
+                        if (score_fn) {
+                            int s = score_fn();
+#ifndef NDEBUG
+                            GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, path_str(entry.path()).c_str(), s);
+#endif
+                            if (s > best_score) {
+                                best_score = s;
+                                best_path = entry.path();
+                            }
+                        } else {
+                            if (!silent) {
+                                GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, path_str(entry.path()).c_str());
+                            }
+                        }
                    }
                }
            }
@ -514,7 +548,8 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
    if (best_score == 0) {
        // try to load the base backend
        for (const auto & search_path : search_paths) {
-            fs::path path = fs::path(search_path) / (backend_filename_prefix() + name + backend_filename_suffix());
+            fs::path filename = backend_filename_prefix().native() + name_path.native() + backend_filename_extension().native();
+            fs::path path = search_path / filename;
            if (fs::exists(path)) {
                return get_reg().load_backend(path, silent);
            }
@ -529,14 +564,6 @@ void ggml_backend_load_all() {
    ggml_backend_load_all_from_path(nullptr);
 }

-static void ggml_backend_try_load_best(const char * name, bool silent, const char * user_search_path) {
-    try {
-        ggml_backend_load_best(name, silent, user_search_path);
-    } catch (const std::exception & e) {
-        GGML_LOG_DEBUG("%s: failed to load %s: %s\n", __func__, name, e.what());
-    }
-}
-
 void ggml_backend_load_all_from_path(const char * dir_path) {
 #ifdef NDEBUG
    bool silent = true;
@ -544,18 +571,18 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
    bool silent = false;
 #endif

-    ggml_backend_try_load_best("blas", silent, dir_path);
-    ggml_backend_try_load_best("cann", silent, dir_path);
-    ggml_backend_try_load_best("cuda", silent, dir_path);
-    ggml_backend_try_load_best("hip", silent, dir_path);
-    ggml_backend_try_load_best("kompute", silent, dir_path);
-    ggml_backend_try_load_best("metal", silent, dir_path);
-    ggml_backend_try_load_best("rpc", silent, dir_path);
-    ggml_backend_try_load_best("sycl", silent, dir_path);
-    ggml_backend_try_load_best("vulkan", silent, dir_path);
-    ggml_backend_try_load_best("opencl", silent, dir_path);
-    ggml_backend_try_load_best("musa", silent, dir_path);
-    ggml_backend_try_load_best("cpu", silent, dir_path);
+    ggml_backend_load_best("blas", silent, dir_path);
+    ggml_backend_load_best("cann", silent, dir_path);
+    ggml_backend_load_best("cuda", silent, dir_path);
+    ggml_backend_load_best("hip", silent, dir_path);
+    ggml_backend_load_best("kompute", silent, dir_path);
+    ggml_backend_load_best("metal", silent, dir_path);
+    ggml_backend_load_best("rpc", silent, dir_path);
+    ggml_backend_load_best("sycl", silent, dir_path);
+    ggml_backend_load_best("vulkan", silent, dir_path);
+    ggml_backend_load_best("opencl", silent, dir_path);
+    ggml_backend_load_best("musa", silent, dir_path);
+    ggml_backend_load_best("cpu", silent, dir_path);
    // check the environment variable GGML_BACKEND_PATH to load an out-of-tree backend
    const char * backend_path = std::getenv("GGML_BACKEND_PATH");
    if (backend_path) {
--- a/ml/backend/ggml/ggml/src/ggml-backend.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-backend.cpp
@ -21,6 +21,7 @@
 #include <string.h>
 #include <string>
 #include <vector>
+#include <algorithm>

 #ifdef __APPLE__
 #include <sys/types.h>
@ -125,11 +126,12 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
    return base;
 }

-void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+enum ggml_status ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
    // init_tensor is optional
    if (buffer->iface.init_tensor) {
-        buffer->iface.init_tensor(buffer, tensor);
+        return buffer->iface.init_tensor(buffer, tensor);
    }
+    return GGML_STATUS_SUCCESS;
 }

 void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@ -1641,7 +1643,7 @@ ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched,

 // utils

-void ggml_backend_view_init(struct ggml_tensor * tensor) {
+enum ggml_status ggml_backend_view_init(struct ggml_tensor * tensor) {
    GGML_ASSERT(tensor->buffer == NULL);
    GGML_ASSERT(tensor->view_src != NULL);
    GGML_ASSERT(tensor->view_src->buffer != NULL);
@ -1649,10 +1651,10 @@ void ggml_backend_view_init(struct ggml_tensor * tensor) {

    tensor->buffer = tensor->view_src->buffer;
    tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
-    ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
+    return ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
 }

-void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
+enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
    GGML_ASSERT(tensor->buffer == NULL);
    GGML_ASSERT(tensor->data == NULL);
    GGML_ASSERT(tensor->view_src == NULL);
@ -1662,7 +1664,7 @@ void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor

    tensor->buffer = buffer;
    tensor->data = addr;
-    ggml_backend_buffer_init_tensor(buffer, tensor);
+    return ggml_backend_buffer_init_tensor(buffer, tensor);
 }

 static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies,
@ -1708,7 +1710,8 @@ static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_
    struct ggml_tensor * dst = node_copies[id];
    if (dst->view_src != NULL) {
        graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
-        ggml_backend_view_init(dst);
+        enum ggml_status status = ggml_backend_view_init(dst);
+        GGML_ASSERT(status == GGML_STATUS_SUCCESS);
    }
    else {
        ggml_backend_tensor_copy(src, dst);
@ -1823,7 +1826,6 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
    assert(g1->n_nodes == g2->n_nodes);

    for (int i = 0; i < g1->n_nodes; i++) {
-        //printf("eval %d/%d\n", i, g1->n_nodes);
        struct ggml_tensor * t1 = g1->nodes[i];
        struct ggml_tensor * t2 = g2->nodes[i];

--- a/ml/backend/ggml/ggml/src/ggml-common.h
+++ b/ml/backend/ggml/ggml/src/ggml-common.h
@ -158,6 +158,12 @@ typedef sycl::half2 ggml_half2;

 #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP

+#ifdef _MSC_VER
+#define GGML_EXTENSION
+#else // _MSC_VER
+#define GGML_EXTENSION __extension__
+#endif // _MSC_VER
+
 #define QK4_0 32
 typedef struct {
    ggml_half d;           // delta
@ -167,7 +173,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_half) + QK4_0 / 2, "wrong q4_0 b

 #define QK4_1 32
 typedef struct {
-    union {
+    GGML_EXTENSION union {
        struct {
            ggml_half d; // delta
            ggml_half m; // min
@ -188,7 +194,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_half) + sizeof(uint32_t) + QK5_0

 #define QK5_1 32
 typedef struct {
-    union {
+    GGML_EXTENSION union {
        struct {
            ggml_half d; // delta
            ggml_half m; // min
@ -209,7 +215,7 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_half) + QK8_0, "wrong q8_0 block

 #define QK8_1 32
 typedef struct {
-    union {
+    GGML_EXTENSION union {
        struct {
            ggml_half d; // delta
            ggml_half s; // d * sum(qs[i])
@ -250,7 +256,7 @@ static_assert(sizeof(block_tq2_0) == sizeof(ggml_half) + QK_K / 4, "wrong tq2_0
 typedef struct {
    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
    uint8_t qs[QK_K/4];      // quants
-    union {
+    GGML_EXTENSION union {
        struct {
            ggml_half d;    // super-block scale for quantized scales
            ggml_half dmin; // super-block scale for quantized mins
@ -277,7 +283,7 @@ static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 12
 // weight is represented as x = a * q + b
 // Effectively 4.5 bits per weight
 typedef struct {
-    union {
+    GGML_EXTENSION union {
        struct {
            ggml_half d;    // super-block scale for quantized scales
            ggml_half dmin; // super-block scale for quantized mins
@ -294,7 +300,7 @@ static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2,
 // weight is represented as x = a * q + b
 // Effectively 5.5 bits per weight
 typedef struct {
-    union {
+    GGML_EXTENSION union {
        struct {
            ggml_half d;    // super-block scale for quantized scales
            ggml_half dmin; // super-block scale for quantized mins
--- a/ml/backend/ggml/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/CMakeLists.txt
@ -23,6 +23,16 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
        ggml-cpu/amx/mmq.cpp
        ggml-cpu/amx/mmq.h
        ggml-cpu/ggml-cpu-impl.h
+        ggml-cpu/common.h
+        ggml-cpu/binary-ops.h
+        ggml-cpu/binary-ops.cpp
+        ggml-cpu/unary-ops.h
+        ggml-cpu/unary-ops.cpp
+        ggml-cpu/simd-mappings.h
+        ggml-cpu/vec.h
+        ggml-cpu/vec.cpp
+        ggml-cpu/ops.h
+        ggml-cpu/ops.cpp
        )

    target_compile_features(${GGML_CPU_NAME} PRIVATE c_std_11 cxx_std_17)
@ -219,6 +229,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
            if (GGML_AVX_VNNI)
                list(APPEND ARCH_DEFINITIONS __AVXVNNI__ GGML_AVX_VNNI)
            endif()
+            if (GGML_BMI2)
+                # MSVC does not define macro __BMI2__
+                list(APPEND ARCH_DEFINITIONS __BMI2__ GGML_BMI2)
+            endif()
        else ()
            if (GGML_NATIVE)
                list(APPEND ARCH_FLAGS -march=native)
@ -233,6 +247,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
                    list(APPEND ARCH_FLAGS -mfma)
                    list(APPEND ARCH_DEFINITIONS GGML_FMA)
                endif()
+                if (GGML_BMI2)
+                    list(APPEND ARCH_FLAGS -mbmi2)
+                    list(APPEND ARCH_DEFINITIONS GGML_BMI2)
+                endif()
                if (GGML_AVX)
                    list(APPEND ARCH_FLAGS -mavx)
                    list(APPEND ARCH_DEFINITIONS GGML_AVX)
@ -279,21 +297,31 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
                endif()
            endif()
        endif()
-    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
+    elseif ("${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "ppc64le " OR "${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "powerpc ")
        message(STATUS "PowerPC detected")
-        execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER10_M)
-        string(FIND "${POWER10_M}" "POWER10" substring_index)
-        if (NOT DEFINED substring_index OR "${substring_index}" STREQUAL "")
-            set(substring_index -1)
-        endif()
+        if (GGML_NATIVE)
+            if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
+                file(READ "/proc/cpuinfo" POWER10_M)
+            elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "powerpc")
+                execute_process(COMMAND bash -c "prtconf |grep 'Implementation' | head -n 1" OUTPUT_VARIABLE POWER10_M)
+            endif()

-        if (${substring_index} GREATER_EQUAL 0)
-        list(APPEND ARCH_FLAGS -mcpu=power10)
-        elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
-        list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
+            string(REGEX MATCHALL "POWER *([0-9]+)" MATCHED_STRING "${POWER10_M}")
+            string(REGEX REPLACE "POWER *([0-9]+)" "\\1" EXTRACTED_NUMBER "${MATCHED_STRING}")
+
+            if (EXTRACTED_NUMBER GREATER_EQUAL 10)
+                list(APPEND ARCH_FLAGS -mcpu=power10 -mpowerpc64)
+            elseif (EXTRACTED_NUMBER EQUAL 9)
+                list(APPEND ARCH_FLAGS -mcpu=power9 -mpowerpc64)
+            elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
+                list(APPEND ARCH_FLAGS -mcpu=powerpc64le -mtune=native)
+            else()
+                list(APPEND ARCH_FLAGS -mcpu=native -mtune=native -mpowerpc64)
+            endif()
        else()
-            list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
-            # TODO: Add  targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
+            if (GGML_CPU_POWERPC_CPUTYPE)
+                list(APPEND ARCH_FLAGS -mcpu=${GGML_CPU_POWERPC_CPUTYPE})
+            endif()
        endif()
    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
        message(STATUS "loongarch64 detected")
@ -308,7 +336,11 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64")
        message(STATUS "RISC-V detected")
        if (GGML_RVV)
-            list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
+            if (GGML_RV_ZFH)
+                list(APPEND ARCH_FLAGS -march=rv64gcv_zfhmin -DGGML_RV_ZFH -mabi=lp64d)
+            else()
+                list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
+            endif()
        endif()
    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
        message(STATUS "s390x detected")
@ -347,9 +379,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)

        # Fetch KleidiAI sources:
        include(FetchContent)
-        set(KLEIDIAI_COMMIT_TAG "v1.3.0")
+        set(KLEIDIAI_COMMIT_TAG "v1.5.0")
        set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
-        set(KLEIDIAI_ARCHIVE_MD5  "060bd2dc64642b091f461cc8dd7426d9")
+        set(KLEIDIAI_ARCHIVE_MD5  "ea22e1aefb800e9bc8c74d91633cc58e")

        if (POLICY CMP0135)
            cmake_policy(SET CMP0135 NEW)
--- a/ml/backend/ggml/ggml/src/ggml-cpu/amx/amx.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/amx/amx.cpp
@ -50,10 +50,11 @@ static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
    return (void *) (buffer->context);
 }

-static void ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+static enum ggml_status ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
    tensor->extra = (void *) ggml::cpu::amx::get_tensor_traits(buffer, tensor);

    GGML_UNUSED(buffer);
+    return GGML_STATUS_SUCCESS;
 }

 static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
--- a/ml/backend/ggml/ggml/src/ggml-cpu/binary-ops.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/binary-ops.cpp
@ -0,0 +1,158 @@
+#include "binary-ops.h"
+
+#if defined(GGML_USE_ACCELERATE)
+#include <Accelerate/Accelerate.h>
+
+using vDSP_fn_t = void (*)(const float *, vDSP_Stride, const float *, vDSP_Stride, float *, vDSP_Stride, vDSP_Length);
+#endif
+
+static inline float op_add(float a, float b) { // scalar kernel a + b; used as the `op` template argument of binary_op
+    return a + b;
+}
+
+static inline float op_sub(float a, float b) { // scalar kernel a - b; used as the `op` template argument of binary_op
+    return a - b;
+}
+
+static inline float op_mul(float a, float b) { // scalar kernel a * b; used as the `op` template argument of binary_op
+    return a * b;
+}
+
+static inline float op_div(float a, float b) { // scalar kernel a / b; no check for b == 0 is made here
+    return a / b;
+}
+
+template <float (*op)(float, float), typename src0_t, typename src1_t, typename dst_t> // op: scalar kernel, always evaluated in f32
+static inline void vec_binary_op_contiguous(const int64_t n, dst_t * z, const src0_t * x, const src1_t * y) { // z[i] = op(x[i], y[i]) for i in [0, n); x, y and z are indexed as dense arrays
+    constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
+    constexpr auto src1_to_f32 = type_conversion_table<src1_t>::to_f32;
+    constexpr auto f32_to_dst  = type_conversion_table<dst_t >::from_f32;
+
+    for (int64_t i = 0; i < n; i++) { // index has the same type as n so it cannot overflow before reaching a large n
+        z[i] = f32_to_dst(op(src0_to_f32(x[i]), src1_to_f32(y[i]))); // convert both operands to f32, apply op, convert to dst_t
+    }
+}
+
+template <float (*op)(float, float), typename src0_t, typename src1_t, typename dst_t> // op: scalar kernel, always evaluated in f32
+static inline void vec_binary_op_non_contiguous(const int64_t n, const int64_t ne10, const int64_t nb10, dst_t * z, const src0_t * x, const src1_t * y) { // like the contiguous variant, but y is read through byte stride nb10 with its index wrapped modulo ne10
+    constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
+    constexpr auto src1_to_f32 = type_conversion_table<src1_t>::to_f32;
+    constexpr auto f32_to_dst  = type_conversion_table<dst_t >::from_f32;
+
+    for (int64_t i = 0; i < n; i++) { // index has the same type as n so it cannot overflow before reaching a large n
+        const int64_t i10 = i % ne10; // kept in int64_t: the modulo result is not narrowed before the byte-offset multiply
+        const src1_t * y_ptr = (const src1_t *)((const char *)y + i10*nb10); // nb10 is a stride in bytes, hence the char* arithmetic
+        z[i] = f32_to_dst(op(src0_to_f32(x[i]), src1_to_f32(*y_ptr)));
+    }
+}
+
+template <float (*op)(float, float), typename src0_t, typename src1_t, typename dst_t> // op: scalar f32 kernel; *_t: element types of src0, src1 and dst
+static void apply_binary_op(const ggml_compute_params * params, ggml_tensor * dst) { // computes dst = op(src0, src1) over this thread's rows, repeating src1 where its extents are smaller
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); // src1 must be repeatable to src0's shape; dst must have src0's shape
+
+    GGML_TENSOR_BINARY_OP_LOCALS // macro defined elsewhere; presumably introduces the ne*/nb* locals used below (TODO confirm)
+
+    GGML_ASSERT( nb0 == sizeof(dst_t)); // dst elements must be densely packed along dim 0
+    GGML_ASSERT(nb00 == sizeof(src0_t)); // same requirement for src0
+
+    const auto [ir0, ir1] = get_thread_range(params, src0); // half-open row range [ir0, ir1) processed by this thread
+    const bool is_src1_contiguous = (nb10 == sizeof(src1_t)); // true when src1 elements are densely packed along dim 0
+
+    if (!is_src1_contiguous) { // broadcast not implemented yet for non-contiguous
+        GGML_ASSERT(ggml_are_same_shape(src0, src1));
+    }
+
+#ifdef GGML_USE_ACCELERATE
+    vDSP_fn_t vDSP_op = nullptr; // stays null unless all three tensors are f32 and op is one of the four kernels tested below
+    // TODO - avoid the f32-only check using type 'trait' lookup tables and row-based src-to-float conversion functions
+    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+        if (op == op_add) {
+            vDSP_op = vDSP_vadd;
+        } else if (op == op_sub) {
+            vDSP_op = vDSP_vsub;
+        } else if (op == op_mul) {
+            vDSP_op = vDSP_vmul;
+        } else if (op == op_div) {
+            vDSP_op = vDSP_vdiv;
+        }
+    }
+#endif
+
+    for (int64_t ir = ir0; ir < ir1; ++ir) {
+        const int64_t i03 = ir/(ne02*ne01); // split the flat row index ir into src0 coordinates (i03, i02, i01)
+        const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+        const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+        const int64_t i13 = i03 % ne13; // wrap each coordinate into src1's extent so src1 repeats along dims 1..3
+        const int64_t i12 = i02 % ne12;
+        const int64_t i11 = i01 % ne11;
+
+        dst_t        * dst_ptr  = (dst_t  *)       ((char *)       dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+        const src0_t * src0_ptr = (const src0_t *) ((const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+        const src1_t * src1_ptr = (const src1_t *) ((const char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
+
+        if (is_src1_contiguous) {
+            // src1 is broadcastable across src0 and dst in i1, i2, i3
+            const int64_t nr0 = ne00 / ne10; // number of ne10-wide chunks in one src0 row; the src1 row is reused for each chunk
+
+            for (int64_t r = 0; r < nr0; ++r) {
+#ifdef GGML_USE_ACCELERATE
+                if constexpr (std::is_same_v<src0_t, float> && std::is_same_v<src1_t, float> && std::is_same_v<dst_t, float>) {
+                    if (vDSP_op != nullptr) {
+                        vDSP_op(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10); // NOTE(review): src1 is passed first; presumably vDSP_vsub/vDSP_vdiv compute second (op) first - confirm against Accelerate docs
+                        continue;
+                    }
+                }
+#endif
+                vec_binary_op_contiguous<op>(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); // generic path: apply the src1 row to the r-th chunk of the src0/dst row
+            }
+        } else {
+            vec_binary_op_non_contiguous<op>(ne0, ne10, nb10, dst_ptr, src0_ptr, src1_ptr); // strided src1 (same shape as src0, asserted above): read it through byte stride nb10
+        }
+    }
+}
+
+// TODO: Use the 'traits' lookup table (for type conversion fns), instead of a mass of 'if' conditions with long templates
+template <float (*op)(float, float)> // op: scalar f32 kernel (op_add, op_sub, op_mul or op_div in this file)
+static void binary_op(const ggml_compute_params * params, ggml_tensor * dst) { // selects the apply_binary_op instantiation matching the (src0, src1, dst) element types
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    /*  */ if (src0->type == GGML_TYPE_F32  && src1->type == GGML_TYPE_F32  && dst->type == GGML_TYPE_F32) { // all f32
+        apply_binary_op<op, float, float, float>(params, dst);
+    } else if (src0->type == GGML_TYPE_F16  && src1->type == GGML_TYPE_F16  && dst->type == GGML_TYPE_F16) { // all f16
+        apply_binary_op<op, ggml_fp16_t, ggml_fp16_t, ggml_fp16_t>(params, dst);
+    } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) { // all bf16
+        apply_binary_op<op, ggml_bf16_t, ggml_bf16_t, ggml_bf16_t>(params, dst);
+    } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32  && dst->type == GGML_TYPE_BF16) { // bf16 with f32 -> bf16
+        apply_binary_op<op, ggml_bf16_t, float, ggml_bf16_t>(params, dst);
+    } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32  && dst->type == GGML_TYPE_F32) { // bf16 with f32 -> f32
+        apply_binary_op<op, ggml_bf16_t, float, float>(params, dst);
+    } else if (src0->type == GGML_TYPE_F16  && src1->type == GGML_TYPE_F32  && dst->type == GGML_TYPE_F16) { // f16 with f32 -> f16
+        apply_binary_op<op, ggml_fp16_t, float, ggml_fp16_t>(params, dst);
+    } else if (src0->type == GGML_TYPE_F16  && src1->type == GGML_TYPE_F32  && dst->type == GGML_TYPE_F32) { // f16 with f32 -> f32
+        apply_binary_op<op, ggml_fp16_t, float, float>(params, dst);
+    } else { // every other type combination is rejected by aborting
+        GGML_ABORT("%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
+            ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
+    }
+}
+
+void ggml_compute_forward_add_non_quantized(const ggml_compute_params * params, ggml_tensor * dst) { // element-wise add; only the f32/f16/bf16 combinations accepted by binary_op are handled
+    binary_op<op_add>(params, dst);
+}
+
+void ggml_compute_forward_sub(const ggml_compute_params * params, ggml_tensor * dst) { // element-wise dst = src0 - src1 via binary_op
+    binary_op<op_sub>(params, dst);
+}
+
+void ggml_compute_forward_mul(const ggml_compute_params * params, ggml_tensor * dst) { // element-wise dst = src0 * src1 via binary_op
+    binary_op<op_mul>(params, dst);
+}
+
+void ggml_compute_forward_div(const ggml_compute_params * params, ggml_tensor * dst) { // element-wise dst = src0 / src1 via binary_op
+    binary_op<op_div>(params, dst);
+}
--- a/ml/backend/ggml/ggml/src/ggml-cpu/binary-ops.h
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/binary-ops.h
@ -0,0 +1,16 @@
+#pragma once
+
+#include "common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void ggml_compute_forward_add_non_quantized(const struct ggml_compute_params * params, struct ggml_tensor * dst); // element-wise add; defined in binary-ops.cpp
+void ggml_compute_forward_sub(const struct ggml_compute_params * params, struct ggml_tensor * dst); // element-wise subtract; defined in binary-ops.cpp
+void ggml_compute_forward_mul(const struct ggml_compute_params * params, struct ggml_tensor * dst); // element-wise multiply; defined in binary-ops.cpp
+void ggml_compute_forward_div(const struct ggml_compute_params * params, struct ggml_tensor * dst); // element-wise divide; defined in binary-ops.cpp
+
+#ifdef __cplusplus
+}
+#endif
--- a/ml/backend/ggml/ggml/src/ggml-cpu/common.h
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/common.h
@ -0,0 +1,72 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-cpu-traits.h"
+#include "ggml-cpu-impl.h"
+#include "ggml-impl.h"
+
+#ifdef __cplusplus
+
+#include <utility>
+
+// convenience functions/macros for use in template calls
+// note: these won't be required after the 'traits' lookup table is used.
+static inline ggml_fp16_t f32_to_f16(float x) { // function wrapper around the GGML_FP32_TO_FP16 macro so it can be stored as a function pointer
+    return GGML_FP32_TO_FP16(x);
+}
+
+static inline float f16_to_f32(ggml_fp16_t x) { // function wrapper around the GGML_FP16_TO_FP32 macro so it can be stored as a function pointer
+    return GGML_FP16_TO_FP32(x);
+}
+
+static inline ggml_bf16_t f32_to_bf16(float x) { // function wrapper around the GGML_FP32_TO_BF16 macro so it can be stored as a function pointer
+    return GGML_FP32_TO_BF16(x);
+}
+
+static inline float bf16_to_f32(ggml_bf16_t x) { // function wrapper around the GGML_BF16_TO_FP32 macro so it can be stored as a function pointer
+    return GGML_BF16_TO_FP32(x);
+}
+
+static inline float f32_to_f32(float x) { // identity conversion; gives float the same to_f32/from_f32 interface as the half-precision types
+    return x;
+}
+
+// TODO - merge this into the traits table, after using row-based conversions
+template <class T> // T: tensor element type
+struct type_conversion_table; // declared without a definition here; each explicit specialization in this header supplies to_f32 and from_f32
+
+template <>
+struct type_conversion_table<ggml_fp16_t> { // compile-time converters between ggml_fp16_t and float
+    static constexpr float (*to_f32)(ggml_fp16_t) = f16_to_f32; // element -> f32
+    static constexpr ggml_fp16_t (*from_f32)(float) = f32_to_f16; // f32 -> element
+};
+
+template <>
+struct type_conversion_table<float> { // float needs no conversion: both entries point at the identity function
+    static constexpr float (*to_f32)(float) = f32_to_f32;
+    static constexpr float (*from_f32)(float) = f32_to_f32;
+};
+
+template <>
+struct type_conversion_table<ggml_bf16_t> { // compile-time converters between ggml_bf16_t and float
+    static constexpr float (*to_f32)(ggml_bf16_t) = bf16_to_f32; // element -> f32
+    static constexpr ggml_bf16_t (*from_f32)(float) = f32_to_bf16; // f32 -> element
+};
+
+static std::pair<int64_t, int64_t> get_thread_range(const struct ggml_compute_params * params, const struct ggml_tensor * src0) { // returns the row range {ir0, ir1} of src0 assigned to thread params->ith out of params->nth
+    const int64_t ith = params->ith;
+    const int64_t nth = params->nth;
+
+    const int64_t nr  = ggml_nrows(src0);
+
+    // rows per thread
+    const int64_t dr = (nr + nth - 1)/nth; // ceiling division, so nth chunks of dr rows cover all nr rows
+
+    // row range for this thread
+    const int64_t ir0 = dr*ith;
+    const int64_t ir1 = MIN(ir0 + dr, nr); // clamp to nr; when ir0 >= nr the result has ir1 <= ir0, i.e. no rows for this thread
+
+    return {ir0, ir1};
+}
+
+#endif
--- a/ml/backend/ggml/ggml/src/ggml-cpu/cpu-feats-x86.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/cpu-feats-x86.cpp
@ -278,6 +278,10 @@ static int ggml_backend_cpu_x86_score() {
    if (!is.SSE42()) { return 0; }
    score += 1<<2;
 #endif
+#ifdef GGML_BMI2
+    if (!is.BMI2()) { return 0; }
+    score += 1<<3;
+#endif
 #ifdef GGML_AVX
    if (!is.AVX()) { return 0; }
    score += 1<<4;
--- a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
--- a/Show More
+++ b/Show More