Merge branch 'main' into drifkin/array-head-count-simple
commit 20c5fd39c8

22 .github/workflows/release.yaml (vendored)
@@ -103,11 +103,6 @@ jobs:
arch: [amd64]
preset: ['CPU']
include:
- os: windows
arch: amd64
preset: 'CUDA 11'
install: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe
cuda-version: '11.3'
- os: windows
arch: amd64
preset: 'CUDA 12'
@@ -324,7 +319,6 @@ jobs:
case "$COMPONENT" in
bin/ollama) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/*.so) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/cuda_v11) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/cuda_v12) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
lib/ollama/cuda_jetpack5) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;;
lib/ollama/cuda_jetpack6) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;;
@@ -432,6 +426,22 @@ jobs:
docker buildx imagetools inspect ollama/ollama:${{ steps.metadata.outputs.version }}
working-directory: ${{ runner.temp }}

# Trigger downstream release process
trigger:
runs-on: ubuntu-latest
environment: release
needs: [darwin-build, windows-build, windows-depends]
steps:
- name: Trigger downstream release process
run: |
curl -L \
-X POST \
-H "Accept: application/vnd.github+json" \
-H "Authorization: Bearer ${{ secrets.RELEASE_TOKEN }}" \
-H "X-GitHub-Api-Version: 2022-11-28" \
https://api.github.com/repos/ollama/${{ vars.RELEASE_REPO }}/dispatches \
-d "{\"event_type\": \"trigger-workflow\", \"client_payload\": {\"run_id\": \"${GITHUB_RUN_ID}\", \"version\": \"${GITHUB_REF_NAME#v}\"}}"

# Aggregate all the assets and ship a release
release:
needs: [darwin-sign, windows-sign, linux-build]
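The new `trigger` job above kicks off the downstream release by sending a `repository_dispatch` event with curl. For readers more comfortable in Go, a rough equivalent of that call might look like the sketch below; the environment variable names (RELEASE_TOKEN, RELEASE_REPO, RUN_ID, VERSION) are placeholders, not part of the workflow:

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
	"os"
)

func main() {
	// Same payload shape as the curl -d argument in the workflow above.
	payload := map[string]any{
		"event_type": "trigger-workflow",
		"client_payload": map[string]string{
			"run_id":  os.Getenv("RUN_ID"),
			"version": os.Getenv("VERSION"),
		},
	}
	body, _ := json.Marshal(payload)

	url := fmt.Sprintf("https://api.github.com/repos/ollama/%s/dispatches", os.Getenv("RELEASE_REPO"))
	req, err := http.NewRequest(http.MethodPost, url, bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	req.Header.Set("Accept", "application/vnd.github+json")
	req.Header.Set("Authorization", "Bearer "+os.Getenv("RELEASE_TOKEN"))
	req.Header.Set("X-GitHub-Api-Version", "2022-11-28")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Println("dispatch status:", resp.Status) // GitHub answers 204 No Content on success
}
```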
6 .github/workflows/test.yaml (vendored)
@ -46,7 +46,7 @@ jobs:
|
||||
include:
|
||||
- preset: CPU
|
||||
- preset: CUDA
|
||||
container: nvidia/cuda:11.8.0-devel-ubuntu22.04
|
||||
container: nvidia/cuda:12.8.1-devel-ubuntu22.04
|
||||
flags: '-DCMAKE_CUDA_ARCHITECTURES=87'
|
||||
- preset: ROCm
|
||||
container: rocm/dev-ubuntu-22.04:6.1.2
|
||||
@ -78,7 +78,7 @@ jobs:
|
||||
include:
|
||||
- preset: CPU
|
||||
- preset: CUDA
|
||||
install: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe
|
||||
install: https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_571.96_windows.exe
|
||||
flags: '-DCMAKE_CUDA_ARCHITECTURES=80'
|
||||
- preset: ROCm
|
||||
install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe
|
||||
@ -102,7 +102,7 @@ jobs:
|
||||
$ErrorActionPreference = "Stop"
|
||||
if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {
|
||||
Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
|
||||
Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_11.3", "nvcc_11.3", "cublas_11.3", "cublas_dev_11.3")) -NoNewWindow -Wait
|
||||
Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_12.8", "nvcc_12.8", "cublas_12.8", "cublas_dev_12.8")) -NoNewWindow -Wait
|
||||
}
|
||||
|
||||
$cudaPath = (Resolve-Path "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*").path
|
||||
|
@@ -19,8 +19,8 @@ linters:
- nolintlint
- nosprintfhostport
- staticcheck
- tenv
- unconvert
- usetesting
- wastedassign
- whitespace
disable:
@ -17,14 +17,6 @@
|
||||
"name": "CUDA",
|
||||
"inherits": [ "Default" ]
|
||||
},
|
||||
{
|
||||
"name": "CUDA 11",
|
||||
"inherits": [ "CUDA" ],
|
||||
"cacheVariables": {
|
||||
"CMAKE_CUDA_ARCHITECTURES": "50;52;53;60;61;70;75;80;86",
|
||||
"CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "CUDA 12",
|
||||
"inherits": [ "CUDA" ],
|
||||
@ -78,11 +70,6 @@
|
||||
"configurePreset": "CUDA",
|
||||
"targets": [ "ggml-cuda" ]
|
||||
},
|
||||
{
|
||||
"name": "CUDA 11",
|
||||
"inherits": [ "CUDA" ],
|
||||
"configurePreset": "CUDA 11"
|
||||
},
|
||||
{
|
||||
"name": "CUDA 12",
|
||||
"inherits": [ "CUDA" ],
|
||||
|
17 Dockerfile
@ -7,14 +7,10 @@ ARG JETPACK5VERSION=r35.4.1
|
||||
ARG JETPACK6VERSION=r36.4.0
|
||||
ARG CMAKEVERSION=3.31.2
|
||||
|
||||
# CUDA v11 requires gcc v10. v10.3 has regressions, so the rockylinux 8.5 AppStream has the latest compatible version
|
||||
FROM --platform=linux/amd64 rocm/dev-almalinux-8:${ROCMVERSION}-complete AS base-amd64
|
||||
RUN yum install -y yum-utils \
|
||||
&& yum-config-manager --add-repo https://dl.rockylinux.org/vault/rocky/8.5/AppStream/\$basearch/os/ \
|
||||
&& rpm --import https://dl.rockylinux.org/pub/rocky/RPM-GPG-KEY-Rocky-8 \
|
||||
&& dnf install -y yum-utils ccache gcc-toolset-10-gcc-10.2.1-8.2.el8 gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 gcc-toolset-10-binutils-2.35-11.el8 \
|
||||
&& dnf install -y ccache \
|
||||
&& yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
|
||||
ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH
|
||||
|
||||
FROM --platform=linux/arm64 almalinux:8 AS base-arm64
|
||||
# install epel-release for ccache
|
||||
@ -38,15 +34,6 @@ RUN --mount=type=cache,target=/root/.ccache \
|
||||
&& cmake --build --parallel --preset 'CPU' \
|
||||
&& cmake --install build --component CPU --strip --parallel 8
|
||||
|
||||
FROM base AS cuda-11
|
||||
ARG CUDA11VERSION=11.3
|
||||
RUN dnf install -y cuda-toolkit-${CUDA11VERSION//./-}
|
||||
ENV PATH=/usr/local/cuda-11/bin:$PATH
|
||||
RUN --mount=type=cache,target=/root/.ccache \
|
||||
cmake --preset 'CUDA 11' \
|
||||
&& cmake --build --parallel --preset 'CUDA 11' \
|
||||
&& cmake --install build --component CUDA --strip --parallel 8
|
||||
|
||||
FROM base AS cuda-12
|
||||
ARG CUDA12VERSION=12.8
|
||||
RUN dnf install -y cuda-toolkit-${CUDA12VERSION//./-}
|
||||
@ -98,11 +85,9 @@ RUN --mount=type=cache,target=/root/.cache/go-build \
|
||||
go build -trimpath -buildmode=pie -o /bin/ollama .
|
||||
|
||||
FROM --platform=linux/amd64 scratch AS amd64
|
||||
COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11
|
||||
COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12
|
||||
|
||||
FROM --platform=linux/arm64 scratch AS arm64
|
||||
COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11
|
||||
COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12
|
||||
COPY --from=jetpack-5 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_jetpack5
|
||||
COPY --from=jetpack-6 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_jetpack6
|
||||
|
@@ -1,6 +1,6 @@
UPSTREAM=https://github.com/ggerganov/llama.cpp.git
WORKDIR=llama/vendor
FETCH_HEAD=2016f07bd106c73699ecbaace80f55db5ed95dac
FETCH_HEAD=e1e8e0991ffd9e99a445c6812bb519d5bac9f4b5

.PHONY: help
help:
43 README.md
@ -61,6 +61,8 @@ Here are some example models that can be downloaded:
|
||||
| QwQ | 32B | 20GB | `ollama run qwq` |
|
||||
| DeepSeek-R1 | 7B | 4.7GB | `ollama run deepseek-r1` |
|
||||
| DeepSeek-R1 | 671B | 404GB | `ollama run deepseek-r1:671b` |
|
||||
| Llama 4 | 109B | 67GB | `ollama run llama4:scout` |
|
||||
| Llama 4 | 400B | 245GB | `ollama run llama4:maverick` |
|
||||
| Llama 3.3 | 70B | 43GB | `ollama run llama3.3` |
|
||||
| Llama 3.2 | 3B | 2.0GB | `ollama run llama3.2` |
|
||||
| Llama 3.2 | 1B | 1.3GB | `ollama run llama3.2:1b` |
|
||||
@ -77,7 +79,7 @@ Here are some example models that can be downloaded:
|
||||
| Code Llama | 7B | 3.8GB | `ollama run codellama` |
|
||||
| Llama 2 Uncensored | 7B | 3.8GB | `ollama run llama2-uncensored` |
|
||||
| LLaVA | 7B | 4.5GB | `ollama run llava` |
|
||||
| Granite-3.2 | 8B | 4.9GB | `ollama run granite3.2` |
|
||||
| Granite-3.3 | 8B | 4.9GB | `ollama run granite3.3` |
|
||||
|
||||
> [!NOTE]
|
||||
> You should have at least 8 GB of RAM available to run the 7B models, 16 GB to run the 13B models, and 32 GB to run the 33B models.
|
||||
@ -285,7 +287,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
- [Bionic GPT](https://github.com/bionic-gpt/bionic-gpt)
|
||||
- [HTML UI](https://github.com/rtcfirefly/ollama-ui)
|
||||
- [Saddle](https://github.com/jikkuatwork/saddle)
|
||||
- [TagSpaces](https://www.tagspaces.org) (A platform for file based apps, [utilizing Ollama](https://docs.tagspaces.org/ai/) for the generation of tags and descriptions)
|
||||
- [TagSpaces](https://www.tagspaces.org) (A platform for file-based apps, [utilizing Ollama](https://docs.tagspaces.org/ai/) for the generation of tags and descriptions)
|
||||
- [Chatbot UI](https://github.com/ivanfioravanti/chatbot-ollama)
|
||||
- [Chatbot UI v2](https://github.com/mckaywrigley/chatbot-ui)
|
||||
- [Typescript UI](https://github.com/ollama-interface/Ollama-Gui?tab=readme-ov-file)
|
||||
@ -312,6 +314,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
- [Ollama Basic Chat: Uses HyperDiv Reactive UI](https://github.com/rapidarchitect/ollama_basic_chat)
|
||||
- [Ollama-chats RPG](https://github.com/drazdra/ollama-chats)
|
||||
- [IntelliBar](https://intellibar.app/) (AI-powered assistant for macOS)
|
||||
- [Jirapt](https://github.com/AliAhmedNada/jirapt) (Jira Integration to generate issues, tasks, epics)
|
||||
- [QA-Pilot](https://github.com/reid41/QA-Pilot) (Interactive chat tool that can leverage Ollama models for rapid understanding and navigation of GitHub code repositories)
|
||||
- [ChatOllama](https://github.com/sugarforever/chat-ollama) (Open Source Chatbot based on Ollama with Knowledge Bases)
|
||||
- [CRAG Ollama Chat](https://github.com/Nagi-ovo/CRAG-Ollama-Chat) (Simple Web Search with Corrective RAG)
|
||||
@ -325,14 +328,14 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
- [RWKV-Runner](https://github.com/josStorer/RWKV-Runner) (RWKV offline LLM deployment tool, also usable as a client for ChatGPT and Ollama)
|
||||
- [Ollama Grid Search](https://github.com/dezoito/ollama-grid-search) (app to evaluate and compare models)
|
||||
- [Olpaka](https://github.com/Otacon/olpaka) (User-friendly Flutter Web App for Ollama)
|
||||
- [Casibase](https://casibase.org) (An open source AI knowledge base and dialogue system combining the latest RAG, SSO, ollama support and multiple large language models.)
|
||||
- [Casibase](https://casibase.org) (An open source AI knowledge base and dialogue system combining the latest RAG, SSO, ollama support, and multiple large language models.)
|
||||
- [OllamaSpring](https://github.com/CrazyNeil/OllamaSpring) (Ollama Client for macOS)
|
||||
- [LLocal.in](https://github.com/kartikm7/llocal) (Easy to use Electron Desktop Client for Ollama)
|
||||
- [Shinkai Desktop](https://github.com/dcSpark/shinkai-apps) (Two click install Local AI using Ollama + Files + RAG)
|
||||
- [AiLama](https://github.com/zeyoyt/ailama) (A Discord User App that allows you to interact with Ollama anywhere in discord )
|
||||
- [AiLama](https://github.com/zeyoyt/ailama) (A Discord User App that allows you to interact with Ollama anywhere in Discord)
|
||||
- [Ollama with Google Mesop](https://github.com/rapidarchitect/ollama_mesop/) (Mesop Chat Client implementation with Ollama)
|
||||
- [R2R](https://github.com/SciPhi-AI/R2R) (Open-source RAG engine)
|
||||
- [Ollama-Kis](https://github.com/elearningshow/ollama-kis) (A simple easy to use GUI with sample custom LLM for Drivers Education)
|
||||
- [Ollama-Kis](https://github.com/elearningshow/ollama-kis) (A simple easy-to-use GUI with sample custom LLM for Drivers Education)
|
||||
- [OpenGPA](https://opengpa.org) (Open-source offline-first Enterprise Agentic Application)
|
||||
- [Painting Droid](https://github.com/mateuszmigas/painting-droid) (Painting app with AI integrations)
|
||||
- [Kerlig AI](https://www.kerlig.com/) (AI writing assistant for macOS)
|
||||
@ -341,16 +344,16 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
- [LLMStack](https://github.com/trypromptly/LLMStack) (No-code multi-agent framework to build LLM agents and workflows)
|
||||
- [BoltAI for Mac](https://boltai.com) (AI Chat Client for Mac)
|
||||
- [Harbor](https://github.com/av/harbor) (Containerized LLM Toolkit with Ollama as default backend)
|
||||
- [PyGPT](https://github.com/szczyglis-dev/py-gpt) (AI desktop assistant for Linux, Windows and Mac)
|
||||
- [Alpaca](https://github.com/Jeffser/Alpaca) (An Ollama client application for linux and macos made with GTK4 and Adwaita)
|
||||
- [PyGPT](https://github.com/szczyglis-dev/py-gpt) (AI desktop assistant for Linux, Windows, and Mac)
|
||||
- [Alpaca](https://github.com/Jeffser/Alpaca) (An Ollama client application for Linux and macOS made with GTK4 and Adwaita)
|
||||
- [AutoGPT](https://github.com/Significant-Gravitas/AutoGPT/blob/master/docs/content/platform/ollama.md) (AutoGPT Ollama integration)
|
||||
- [Go-CREW](https://www.jonathanhecl.com/go-crew/) (Powerful Offline RAG in Golang)
|
||||
- [PartCAD](https://github.com/openvmp/partcad/) (CAD model generation with OpenSCAD and CadQuery)
|
||||
- [Ollama4j Web UI](https://github.com/ollama4j/ollama4j-web-ui) - Java-based Web UI for Ollama built with Vaadin, Spring Boot and Ollama4j
|
||||
- [Ollama4j Web UI](https://github.com/ollama4j/ollama4j-web-ui) - Java-based Web UI for Ollama built with Vaadin, Spring Boot, and Ollama4j
|
||||
- [PyOllaMx](https://github.com/kspviswa/pyOllaMx) - macOS application capable of chatting with both Ollama and Apple MLX models.
|
||||
- [Cline](https://github.com/cline/cline) - Formerly known as Claude Dev is a VSCode extension for multi-file/whole-repo coding
|
||||
- [Cherry Studio](https://github.com/kangfenmao/cherry-studio) (Desktop client with Ollama support)
|
||||
- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption)
|
||||
- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy-focused LLM chat interface with optional encryption)
|
||||
- [Archyve](https://github.com/nickthecook/archyve) (RAG-enabling document library)
|
||||
- [crewAI with Mesop](https://github.com/rapidarchitect/ollama-crew-mesop) (Mesop Web Interface to run crewAI with Ollama)
|
||||
- [Tkinter-based client](https://github.com/chyok/ollama-gui) (Python tkinter-based Client for Ollama)
|
||||
@ -368,7 +371,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
- [DualMind](https://github.com/tcsenpai/dualmind) (Experimental app allowing two models to talk to each other in the terminal or in a web interface)
|
||||
- [ollamarama-matrix](https://github.com/h1ddenpr0cess20/ollamarama-matrix) (Ollama chatbot for the Matrix chat protocol)
|
||||
- [ollama-chat-app](https://github.com/anan1213095357/ollama-chat-app) (Flutter-based chat app)
|
||||
- [Perfect Memory AI](https://www.perfectmemory.ai/) (Productivity AI assists personalized by what you have seen on your screen, heard and said in the meetings)
|
||||
- [Perfect Memory AI](https://www.perfectmemory.ai/) (Productivity AI assists personalized by what you have seen on your screen, heard, and said in the meetings)
|
||||
- [Hexabot](https://github.com/hexastack/hexabot) (A conversational AI builder)
|
||||
- [Reddit Rate](https://github.com/rapidarchitect/reddit_analyzer) (Search and Rate Reddit topics with a weighted summation)
|
||||
- [OpenTalkGpt](https://github.com/adarshM84/OpenTalkGpt) (Chrome Extension to manage open-source models supported by Ollama, create custom models, and chat with models from a user-friendly UI)
|
||||
@ -386,7 +389,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
- [ChibiChat](https://github.com/CosmicEventHorizon/ChibiChat) (Kotlin-based Android app to chat with Ollama and Koboldcpp API endpoints)
|
||||
- [LocalLLM](https://github.com/qusaismael/localllm) (Minimal Web-App to run ollama models on it with a GUI)
|
||||
- [Ollamazing](https://github.com/buiducnhat/ollamazing) (Web extension to run Ollama models)
|
||||
- [OpenDeepResearcher-via-searxng](https://github.com/benhaotang/OpenDeepResearcher-via-searxng) (A Deep Research equivent endpoint with Ollama support for running locally)
|
||||
- [OpenDeepResearcher-via-searxng](https://github.com/benhaotang/OpenDeepResearcher-via-searxng) (A Deep Research equivalent endpoint with Ollama support for running locally)
|
||||
- [AntSK](https://github.com/AIDotNet/AntSK) (Out-of-the-box & Adaptable RAG Chatbot)
|
||||
- [MaxKB](https://github.com/1Panel-dev/MaxKB/) (Ready-to-use & flexible RAG Chatbot)
|
||||
- [yla](https://github.com/danielekp/yla) (Web interface to freely interact with your customized models)
|
||||
@ -394,11 +397,13 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
- [1Panel](https://github.com/1Panel-dev/1Panel/) (Web-based Linux Server Management Tool)
|
||||
- [AstrBot](https://github.com/Soulter/AstrBot/) (User-friendly LLM-based multi-platform chatbot with a WebUI, supporting RAG, LLM agents, and plugins integration)
|
||||
- [Reins](https://github.com/ibrahimcetin/reins) (Easily tweak parameters, customize system prompts per chat, and enhance your AI experiments with reasoning model support.)
|
||||
- [Flufy](https://github.com/Aharon-Bensadoun/Flufy) (A beautiful chat interface for interacting with Ollama's API. Built with React, TypeScript, and Material-UI.)
|
||||
- [Ellama](https://github.com/zeozeozeo/ellama) (Friendly native app to chat with an Ollama instance)
|
||||
- [screenpipe](https://github.com/mediar-ai/screenpipe) Build agents powered by your screen history
|
||||
- [Ollamb](https://github.com/hengkysteen/ollamb) (Simple yet rich in features, cross-platform built with Flutter and designed for Ollama. Try the [web demo](https://hengkysteen.github.io/demo/ollamb/).)
|
||||
- [Writeopia](https://github.com/Writeopia/Writeopia) (Text editor with integration with Ollama)
|
||||
- [AppFlowy](https://github.com/AppFlowy-IO/AppFlowy) (AI collaborative workspace with Ollama, cross-platform and self-hostable)
|
||||
- [Lumina](https://github.com/cushydigit/lumina.git) (A lightweight, minimal React.js frontend for interacting with Ollama servers)
|
||||
|
||||
### Cloud
|
||||
|
||||
@ -440,7 +445,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
- [PowershAI](https://github.com/rrg92/powershai) PowerShell module that brings AI to terminal on Windows, including support for Ollama
|
||||
- [DeepShell](https://github.com/Abyss-c0re/deepshell) Your self-hosted AI assistant. Interactive Shell, Files and Folders analysis.
|
||||
- [orbiton](https://github.com/xyproto/orbiton) Configuration-free text editor and IDE with support for tab completion with Ollama.
|
||||
- [orca-cli](https://github.com/molbal/orca-cli) Ollama Registry CLI Application - Browse, pull and download models from Ollama Registry in your terminal.
|
||||
- [orca-cli](https://github.com/molbal/orca-cli) Ollama Registry CLI Application - Browse, pull, and download models from Ollama Registry in your terminal.
|
||||
- [GGUF-to-Ollama](https://github.com/jonathanhecl/gguf-to-ollama) - Importing GGUF to Ollama made easy (multiplatform)
|
||||
|
||||
### Apple Vision Pro
|
||||
@ -468,7 +473,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
|
||||
### Libraries
|
||||
|
||||
- [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/integrations/chat/ollama/) with [example](https://js.langchain.com/docs/tutorials/local_rag/)
|
||||
- [LangChain](https://python.langchain.com/docs/integrations/chat/ollama/) and [LangChain.js](https://js.langchain.com/docs/integrations/chat/ollama/) with [example](https://js.langchain.com/docs/tutorials/local_rag/)
|
||||
- [Firebase Genkit](https://firebase.google.com/docs/genkit/plugins/ollama)
|
||||
- [crewAI](https://github.com/crewAIInc/crewAI)
|
||||
- [Yacana](https://remembersoftwares.github.io/yacana/) (User-friendly multi-agent framework for brainstorming and executing predetermined flows with built-in tool integration)
|
||||
@ -515,7 +520,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
- [Swollama for Swift](https://github.com/marcusziade/Swollama) with [DocC](https://marcusziade.github.io/Swollama/documentation/swollama/)
|
||||
- [GoLamify](https://github.com/prasad89/golamify)
|
||||
- [Ollama for Haskell](https://github.com/tusharad/ollama-haskell)
|
||||
- [multi-llm-ts](https://github.com/nbonamy/multi-llm-ts) (A Typescript/JavaScript library allowing access to different LLM in unified API)
|
||||
- [multi-llm-ts](https://github.com/nbonamy/multi-llm-ts) (A Typescript/JavaScript library allowing access to different LLM in a unified API)
|
||||
- [LlmTornado](https://github.com/lofcz/llmtornado) (C# library providing a unified interface for major FOSS & Commercial inference APIs)
|
||||
- [Ollama for Zig](https://github.com/dravenk/ollama-zig)
|
||||
- [Abso](https://github.com/lunary-ai/abso) (OpenAI-compatible TypeScript SDK for any LLM provider)
|
||||
@ -524,11 +529,11 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
|
||||
### Mobile
|
||||
|
||||
- [SwiftChat](https://github.com/aws-samples/swift-chat) (Lightning-fast Cross-platform AI chat app with native UI for Android, iOS and iPad)
|
||||
- [SwiftChat](https://github.com/aws-samples/swift-chat) (Lightning-fast Cross-platform AI chat app with native UI for Android, iOS, and iPad)
|
||||
- [Enchanted](https://github.com/AugustDev/enchanted)
|
||||
- [Maid](https://github.com/Mobile-Artificial-Intelligence/maid)
|
||||
- [Ollama App](https://github.com/JHubi1/ollama-app) (Modern and easy-to-use multi-platform client for Ollama)
|
||||
- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption)
|
||||
- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy-focused LLM chat interface with optional encryption)
|
||||
- [Ollama Android Chat](https://github.com/sunshine0523/OllamaServer) (No need for Termux, start the Ollama service with one click on an Android device)
|
||||
- [Reins](https://github.com/ibrahimcetin/reins) (Easily tweak parameters, customize system prompts per chat, and enhance your AI experiments with reasoning model support.)
|
||||
|
||||
@ -552,7 +557,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
- [Obsidian Local GPT plugin](https://github.com/pfrankov/obsidian-local-gpt)
|
||||
- [Open Interpreter](https://docs.openinterpreter.com/language-model-setup/local-models/ollama)
|
||||
- [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama)
|
||||
- [Ollama Copilot](https://github.com/bernardo-bruning/ollama-copilot) (Proxy that allows you to use ollama as a copilot like Github copilot)
|
||||
- [Ollama Copilot](https://github.com/bernardo-bruning/ollama-copilot) (Proxy that allows you to use Ollama as a copilot like GitHub Copilot)
|
||||
- [twinny](https://github.com/rjmacarthy/twinny) (Copilot and Copilot chat alternative using Ollama)
|
||||
- [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and Hugging Face)
|
||||
- [Page Assist](https://github.com/n4ze3m/page-assist) (Chrome Extension)
|
||||
@ -562,8 +567,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
- [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation)
|
||||
- [ChatGPTBox: All in one browser extension](https://github.com/josStorer/chatGPTBox) with [Integrating Tutorial](https://github.com/josStorer/chatGPTBox/issues/616#issuecomment-1975186467)
|
||||
- [Discord AI chat/moderation bot](https://github.com/rapmd73/Companion) Chat/moderation bot written in python. Uses Ollama to create personalities.
|
||||
- [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depends on ollama server)
|
||||
- [Terraform AWS Ollama & Open WebUI](https://github.com/xuyangbocn/terraform-aws-self-host-llm) (A Terraform module to deploy on AWS a ready-to-use Ollama service, together with its front end Open WebUI service.)
|
||||
- [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depend on ollama server)
|
||||
- [Terraform AWS Ollama & Open WebUI](https://github.com/xuyangbocn/terraform-aws-self-host-llm) (A Terraform module to deploy on AWS a ready-to-use Ollama service, together with its front-end Open WebUI service.)
|
||||
- [node-red-contrib-ollama](https://github.com/jakubburkiewicz/node-red-contrib-ollama)
|
||||
- [Local AI Helper](https://github.com/ivostoykov/localAI) (Chrome and Firefox extensions that enable interactions with the active tab and customisable API endpoints. Includes secure storage for user prompts.)
|
||||
- [vnc-lm](https://github.com/jake83741/vnc-lm) (Discord bot for messaging with LLMs through Ollama and LiteLLM. Seamlessly move between local and flagship models.)
|
||||
|
@@ -1,7 +1,6 @@
package api

import (
"context"
"encoding/json"
"fmt"
"net/http"
@@ -137,7 +136,7 @@ func TestClientStream(t *testing.T) {
client := NewClient(&url.URL{Scheme: "http", Host: ts.Listener.Addr().String()}, http.DefaultClient)

var receivedChunks []ChatResponse
err := client.stream(context.Background(), http.MethodPost, "/v1/chat", nil, func(chunk []byte) error {
err := client.stream(t.Context(), http.MethodPost, "/v1/chat", nil, func(chunk []byte) error {
var resp ChatResponse
if err := json.Unmarshal(chunk, &resp); err != nil {
return fmt.Errorf("failed to unmarshal chunk: %w", err)
@@ -223,7 +222,7 @@ func TestClientDo(t *testing.T) {
ID string `json:"id"`
Success bool `json:"success"`
}
err := client.do(context.Background(), http.MethodPost, "/v1/messages", nil, &resp)
err := client.do(t.Context(), http.MethodPost, "/v1/messages", nil, &resp)

if tc.wantErr != "" {
if err == nil {
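These test changes swap `context.Background()` for `t.Context()`, the per-test context introduced in Go 1.24 that is cancelled when the test finishes; later hunks make the same move to `t.TempDir()` for scratch directories. A standalone sketch of both helpers (not part of this commit, just an illustration):

```go
package example

import (
	"net/http"
	"net/http/httptest"
	"os"
	"path/filepath"
	"testing"
)

// t.Context() is cancelled just before the test ends, and t.TempDir() is
// removed automatically, so the manual context.Background() /
// os.MkdirTemp + defer os.RemoveAll pairs in the old tests go away.
func TestHelpersSketch(t *testing.T) {
	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusOK)
	}))
	t.Cleanup(ts.Close)

	req, err := http.NewRequestWithContext(t.Context(), http.MethodGet, ts.URL, nil)
	if err != nil {
		t.Fatal(err)
	}
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		t.Fatal(err)
	}
	resp.Body.Close()

	// Files created under t.TempDir() are cleaned up for us.
	if err := os.WriteFile(filepath.Join(t.TempDir(), "modelfile"), []byte("FROM test"), 0o644); err != nil {
		t.Fatal(err)
	}
}
```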
20 api/types.go
@ -271,9 +271,6 @@ type Options struct {
|
||||
RepeatPenalty float32 `json:"repeat_penalty,omitempty"`
|
||||
PresencePenalty float32 `json:"presence_penalty,omitempty"`
|
||||
FrequencyPenalty float32 `json:"frequency_penalty,omitempty"`
|
||||
Mirostat int `json:"mirostat,omitempty"`
|
||||
MirostatTau float32 `json:"mirostat_tau,omitempty"`
|
||||
MirostatEta float32 `json:"mirostat_eta,omitempty"`
|
||||
Stop []string `json:"stop,omitempty"`
|
||||
}
|
||||
|
||||
@ -283,12 +280,7 @@ type Runner struct {
|
||||
NumBatch int `json:"num_batch,omitempty"`
|
||||
NumGPU int `json:"num_gpu,omitempty"`
|
||||
MainGPU int `json:"main_gpu,omitempty"`
|
||||
LowVRAM bool `json:"low_vram,omitempty"`
|
||||
F16KV bool `json:"f16_kv,omitempty"` // Deprecated: This option is ignored
|
||||
LogitsAll bool `json:"logits_all,omitempty"`
|
||||
VocabOnly bool `json:"vocab_only,omitempty"`
|
||||
UseMMap *bool `json:"use_mmap,omitempty"`
|
||||
UseMLock bool `json:"use_mlock,omitempty"`
|
||||
NumThread int `json:"num_thread,omitempty"`
|
||||
}
|
||||
|
||||
@ -471,13 +463,6 @@ type ProcessModelResponse struct {
|
||||
SizeVRAM int64 `json:"size_vram"`
|
||||
}
|
||||
|
||||
type RetrieveModelResponse struct {
|
||||
Id string `json:"id"`
|
||||
Object string `json:"object"`
|
||||
Created int64 `json:"created"`
|
||||
OwnedBy string `json:"owned_by"`
|
||||
}
|
||||
|
||||
type TokenResponse struct {
|
||||
Token string `json:"token"`
|
||||
}
|
||||
@ -660,9 +645,6 @@ func DefaultOptions() Options {
|
||||
RepeatPenalty: 1.1,
|
||||
PresencePenalty: 0.0,
|
||||
FrequencyPenalty: 0.0,
|
||||
Mirostat: 0,
|
||||
MirostatTau: 5.0,
|
||||
MirostatEta: 0.1,
|
||||
Seed: -1,
|
||||
|
||||
Runner: Runner{
|
||||
@ -671,8 +653,6 @@ func DefaultOptions() Options {
|
||||
NumBatch: 512,
|
||||
NumGPU: -1, // -1 here indicates that NumGPU should be set dynamically
|
||||
NumThread: 0, // let the runtime decide
|
||||
LowVRAM: false,
|
||||
UseMLock: false,
|
||||
UseMMap: nil,
|
||||
},
|
||||
}
|
||||
|
@ -78,7 +78,7 @@ func BenchmarkColdStart(b *testing.B) {
|
||||
|
||||
for _, tt := range tests {
|
||||
b.Run(fmt.Sprintf("%s/cold/%s", m, tt.name), func(b *testing.B) {
|
||||
ctx := context.Background()
|
||||
ctx := b.Context()
|
||||
|
||||
// Set number of tokens as our throughput metric
|
||||
b.SetBytes(int64(tt.maxTokens))
|
||||
@ -113,7 +113,7 @@ func BenchmarkWarmStart(b *testing.B) {
|
||||
|
||||
for _, tt := range tests {
|
||||
b.Run(fmt.Sprintf("%s/warm/%s", m, tt.name), func(b *testing.B) {
|
||||
ctx := context.Background()
|
||||
ctx := b.Context()
|
||||
|
||||
// Pre-warm the model
|
||||
warmup(client, m, tt.prompt, b)
|
||||
@ -140,7 +140,7 @@ func setup(b *testing.B) *api.Client {
|
||||
if err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
if _, err := client.Show(context.Background(), &api.ShowRequest{Model: modelName(b)}); err != nil {
|
||||
if _, err := client.Show(b.Context(), &api.ShowRequest{Model: modelName(b)}); err != nil {
|
||||
b.Fatalf("Model unavailable: %v", err)
|
||||
}
|
||||
|
||||
|
53 cmd/cmd.go
@ -31,6 +31,7 @@ import (
|
||||
"github.com/olekukonko/tablewriter"
|
||||
"github.com/spf13/cobra"
|
||||
"golang.org/x/crypto/ssh"
|
||||
"golang.org/x/sync/errgroup"
|
||||
"golang.org/x/term"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
@ -41,6 +42,7 @@ import (
|
||||
"github.com/ollama/ollama/runner"
|
||||
"github.com/ollama/ollama/server"
|
||||
"github.com/ollama/ollama/types/model"
|
||||
"github.com/ollama/ollama/types/syncmap"
|
||||
"github.com/ollama/ollama/version"
|
||||
)
|
||||
|
||||
@ -106,7 +108,7 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
|
||||
}
|
||||
spinner.Stop()
|
||||
|
||||
req.Name = args[0]
|
||||
req.Model = args[0]
|
||||
quantize, _ := cmd.Flags().GetString("quantize")
|
||||
if quantize != "" {
|
||||
req.Quantize = quantize
|
||||
@ -117,34 +119,54 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
|
||||
return err
|
||||
}
|
||||
|
||||
if len(req.Files) > 0 {
|
||||
fileMap := map[string]string{}
|
||||
for f, digest := range req.Files {
|
||||
var g errgroup.Group
|
||||
g.SetLimit(max(runtime.GOMAXPROCS(0)-1, 1))
|
||||
|
||||
files := syncmap.NewSyncMap[string, string]()
|
||||
for f, digest := range req.Files {
|
||||
g.Go(func() error {
|
||||
if _, err := createBlob(cmd, client, f, digest, p); err != nil {
|
||||
return err
|
||||
}
|
||||
fileMap[filepath.Base(f)] = digest
|
||||
}
|
||||
req.Files = fileMap
|
||||
|
||||
// TODO: this is incorrect since the file might be in a subdirectory
|
||||
// instead this should take the path relative to the model directory
|
||||
// but the current implementation does not allow this
|
||||
files.Store(filepath.Base(f), digest)
|
||||
return nil
|
||||
})
|
||||
}
|
||||
|
||||
if len(req.Adapters) > 0 {
|
||||
fileMap := map[string]string{}
|
||||
for f, digest := range req.Adapters {
|
||||
adapters := syncmap.NewSyncMap[string, string]()
|
||||
for f, digest := range req.Adapters {
|
||||
g.Go(func() error {
|
||||
if _, err := createBlob(cmd, client, f, digest, p); err != nil {
|
||||
return err
|
||||
}
|
||||
fileMap[filepath.Base(f)] = digest
|
||||
}
|
||||
req.Adapters = fileMap
|
||||
|
||||
// TODO: same here
|
||||
adapters.Store(filepath.Base(f), digest)
|
||||
return nil
|
||||
})
|
||||
}
|
||||
|
||||
if err := g.Wait(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
req.Files = files.Items()
|
||||
req.Adapters = adapters.Items()
|
||||
|
||||
bars := make(map[string]*progress.Bar)
|
||||
fn := func(resp api.ProgressResponse) error {
|
||||
if resp.Digest != "" {
|
||||
bar, ok := bars[resp.Digest]
|
||||
if !ok {
|
||||
bar = progress.NewBar(fmt.Sprintf("pulling %s...", resp.Digest[7:19]), resp.Total, resp.Completed)
|
||||
msg := resp.Status
|
||||
if msg == "" {
|
||||
msg = fmt.Sprintf("pulling %s...", resp.Digest[7:19])
|
||||
}
|
||||
bar = progress.NewBar(msg, resp.Total, resp.Completed)
|
||||
bars[resp.Digest] = bar
|
||||
p.Add(resp.Digest, bar)
|
||||
}
|
||||
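The `CreateHandler` hunk above moves blob creation into a bounded `errgroup` and collects digests in the repo's `syncmap` package so file and adapter uploads run in parallel. A self-contained sketch of the same pattern, with a mutex-guarded map standing in for `syncmap` (function and variable names here are illustrative only):

```go
package main

import (
	"fmt"
	"runtime"
	"sync"

	"golang.org/x/sync/errgroup"
)

// uploadAll processes each file in its own goroutine, caps the number of
// in-flight goroutines, and records results in a map guarded by a mutex.
// Loop variables are per-iteration in Go 1.22+, so closing over f and digest
// inside g.Go is safe, as the diff itself relies on.
func uploadAll(files map[string]string, upload func(path, digest string) error) (map[string]string, error) {
	var g errgroup.Group
	g.SetLimit(max(runtime.GOMAXPROCS(0)-1, 1))

	var mu sync.Mutex
	results := make(map[string]string, len(files))

	for f, digest := range files {
		g.Go(func() error {
			if err := upload(f, digest); err != nil {
				return err
			}
			mu.Lock()
			results[f] = digest
			mu.Unlock()
			return nil
		})
	}
	if err := g.Wait(); err != nil {
		return nil, err
	}
	return results, nil
}

func main() {
	out, err := uploadAll(map[string]string{"model.gguf": "sha256:abc"}, func(path, digest string) error {
		fmt.Println("uploading", path, digest)
		return nil
	})
	fmt.Println(out, err)
}
```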
@ -213,7 +235,7 @@ func createBlob(cmd *cobra.Command, client *api.Client, path string, digest stri
|
||||
}
|
||||
}()
|
||||
|
||||
if err = client.CreateBlob(cmd.Context(), digest, io.TeeReader(bin, &pw)); err != nil {
|
||||
if err := client.CreateBlob(cmd.Context(), digest, io.TeeReader(bin, &pw)); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return digest, nil
|
||||
@ -1407,7 +1429,6 @@ func NewCLI() *cobra.Command {
|
||||
envVars["OLLAMA_LLM_LIBRARY"],
|
||||
envVars["OLLAMA_GPU_OVERHEAD"],
|
||||
envVars["OLLAMA_LOAD_TIMEOUT"],
|
||||
envVars["OLLAMA_CONTEXT_LENGTH"],
|
||||
})
|
||||
default:
|
||||
appendEnvDocs(cmd, envs)
|
||||
|
@ -2,7 +2,6 @@ package cmd
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"io"
|
||||
"net/http"
|
||||
@ -337,7 +336,7 @@ func TestDeleteHandler(t *testing.T) {
|
||||
t.Cleanup(mockServer.Close)
|
||||
|
||||
cmd := &cobra.Command{}
|
||||
cmd.SetContext(context.TODO())
|
||||
cmd.SetContext(t.Context())
|
||||
if err := DeleteHandler(cmd, []string{"test-model"}); err != nil {
|
||||
t.Fatalf("DeleteHandler failed: %v", err)
|
||||
}
|
||||
@ -399,11 +398,6 @@ func TestGetModelfileName(t *testing.T) {
|
||||
var expectedFilename string
|
||||
|
||||
if tt.fileExists {
|
||||
tempDir, err := os.MkdirTemp("", "modelfiledir")
|
||||
defer os.RemoveAll(tempDir)
|
||||
if err != nil {
|
||||
t.Fatalf("temp modelfile dir creation failed: %v", err)
|
||||
}
|
||||
var fn string
|
||||
if tt.modelfileName != "" {
|
||||
fn = tt.modelfileName
|
||||
@ -411,7 +405,7 @@ func TestGetModelfileName(t *testing.T) {
|
||||
fn = "Modelfile"
|
||||
}
|
||||
|
||||
tempFile, err := os.CreateTemp(tempDir, fn)
|
||||
tempFile, err := os.CreateTemp(t.TempDir(), fn)
|
||||
if err != nil {
|
||||
t.Fatalf("temp modelfile creation failed: %v", err)
|
||||
}
|
||||
@ -530,7 +524,7 @@ func TestPushHandler(t *testing.T) {
|
||||
|
||||
cmd := &cobra.Command{}
|
||||
cmd.Flags().Bool("insecure", false, "")
|
||||
cmd.SetContext(context.TODO())
|
||||
cmd.SetContext(t.Context())
|
||||
|
||||
// Redirect stderr to capture progress output
|
||||
oldStderr := os.Stderr
|
||||
@ -635,7 +629,7 @@ func TestListHandler(t *testing.T) {
|
||||
t.Setenv("OLLAMA_HOST", mockServer.URL)
|
||||
|
||||
cmd := &cobra.Command{}
|
||||
cmd.SetContext(context.TODO())
|
||||
cmd.SetContext(t.Context())
|
||||
|
||||
// Capture stdout
|
||||
oldStdout := os.Stdout
|
||||
@ -690,7 +684,7 @@ func TestCreateHandler(t *testing.T) {
|
||||
return
|
||||
}
|
||||
|
||||
if req.Name != "test-model" {
|
||||
if req.Model != "test-model" {
|
||||
t.Errorf("expected model name 'test-model', got %s", req.Name)
|
||||
}
|
||||
|
||||
@ -730,7 +724,7 @@ func TestCreateHandler(t *testing.T) {
|
||||
}))
|
||||
t.Setenv("OLLAMA_HOST", mockServer.URL)
|
||||
t.Cleanup(mockServer.Close)
|
||||
tempFile, err := os.CreateTemp("", "modelfile")
|
||||
tempFile, err := os.CreateTemp(t.TempDir(), "modelfile")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@ -750,7 +744,7 @@ func TestCreateHandler(t *testing.T) {
|
||||
}
|
||||
|
||||
cmd.Flags().Bool("insecure", false, "")
|
||||
cmd.SetContext(context.TODO())
|
||||
cmd.SetContext(t.Context())
|
||||
|
||||
// Redirect stderr to capture progress output
|
||||
oldStderr := os.Stderr
|
||||
|
@ -4,9 +4,9 @@ import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"io/fs"
|
||||
"log/slog"
|
||||
"os"
|
||||
"slices"
|
||||
"strings"
|
||||
|
||||
@ -89,7 +89,7 @@ type ModelConverter interface {
|
||||
// KV maps parameters to LLM key-values
|
||||
KV(*Tokenizer) ggml.KV
|
||||
// Tensors maps input tensors to LLM tensors. Model specific modifications can be done here.
|
||||
Tensors([]Tensor) []ggml.Tensor
|
||||
Tensors([]Tensor) []*ggml.Tensor
|
||||
// Replacements returns a list of string pairs to replace in tensor names.
|
||||
// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
|
||||
Replacements() []string
|
||||
@ -106,13 +106,13 @@ type AdapterConverter interface {
|
||||
// KV maps parameters to LLM key-values
|
||||
KV(ggml.KV) ggml.KV
|
||||
// Tensors maps input tensors to LLM tensors. Adapter specific modifications can be done here.
|
||||
Tensors([]Tensor) []ggml.Tensor
|
||||
Tensors([]Tensor) []*ggml.Tensor
|
||||
// Replacements returns a list of string pairs to replace in tensor names.
|
||||
// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
|
||||
Replacements() []string
|
||||
}
|
||||
|
||||
func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV ggml.KV) error {
|
||||
func ConvertAdapter(fsys fs.FS, f *os.File, baseKV ggml.KV) error {
|
||||
bts, err := fs.ReadFile(fsys, "adapter_config.json")
|
||||
if err != nil {
|
||||
return err
|
||||
@ -147,14 +147,14 @@ func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV ggml.KV) error {
|
||||
return err
|
||||
}
|
||||
|
||||
return writeFile(ws, conv.KV(baseKV), conv.Tensors(ts))
|
||||
return writeFile(f, conv.KV(baseKV), conv.Tensors(ts))
|
||||
}
|
||||
|
||||
// Convert writes an Ollama compatible model to the provided io.WriteSeeker based on configurations
|
||||
// and files it finds in the input path.
|
||||
// Supported input model formats include safetensors.
|
||||
// Supported input tokenizers files include tokenizer.json (preferred) and tokenizer.model.
|
||||
func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
|
||||
func ConvertModel(fsys fs.FS, f *os.File) error {
|
||||
bts, err := fs.ReadFile(fsys, "config.json")
|
||||
if err != nil {
|
||||
return err
|
||||
@ -239,13 +239,13 @@ func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
|
||||
return err
|
||||
}
|
||||
|
||||
return writeFile(ws, conv.KV(t), conv.Tensors(ts))
|
||||
return writeFile(f, conv.KV(t), conv.Tensors(ts))
|
||||
}
|
||||
|
||||
func writeFile(ws io.WriteSeeker, kv ggml.KV, ts []ggml.Tensor) error {
|
||||
func writeFile(f *os.File, kv ggml.KV, ts []*ggml.Tensor) error {
|
||||
for i := range ts {
|
||||
ts[i].Shape = slices.Clone(ts[i].Shape)
|
||||
slices.Reverse(ts[i].Shape)
|
||||
}
|
||||
return ggml.WriteGGUF(ws, kv, ts)
|
||||
return ggml.WriteGGUF(f, kv, ts)
|
||||
}
|
||||
|
@ -132,8 +132,8 @@ func (p *bertModel) KV(t *Tokenizer) ggml.KV {
|
||||
return kv
|
||||
}
|
||||
|
||||
func (p *bertModel) Tensors(ts []Tensor) []ggml.Tensor {
|
||||
var out []ggml.Tensor
|
||||
func (p *bertModel) Tensors(ts []Tensor) []*ggml.Tensor {
|
||||
var out []*ggml.Tensor
|
||||
for _, t := range ts {
|
||||
if slices.Contains([]string{
|
||||
"embeddings.position_ids",
|
||||
@ -143,7 +143,7 @@ func (p *bertModel) Tensors(ts []Tensor) []ggml.Tensor {
|
||||
continue
|
||||
}
|
||||
|
||||
out = append(out, ggml.Tensor{
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: t.Name(),
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
|
@ -43,10 +43,10 @@ func (p *commandrModel) KV(t *Tokenizer) ggml.KV {
|
||||
return kv
|
||||
}
|
||||
|
||||
func (p *commandrModel) Tensors(ts []Tensor) []ggml.Tensor {
|
||||
var out []ggml.Tensor
|
||||
func (p *commandrModel) Tensors(ts []Tensor) []*ggml.Tensor {
|
||||
var out []*ggml.Tensor
|
||||
for _, t := range ts {
|
||||
out = append(out, ggml.Tensor{
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: t.Name(),
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
|
@ -42,14 +42,14 @@ func (p *gemmaModel) KV(t *Tokenizer) ggml.KV {
|
||||
return kv
|
||||
}
|
||||
|
||||
func (p *gemmaModel) Tensors(ts []Tensor) []ggml.Tensor {
|
||||
var out []ggml.Tensor
|
||||
func (p *gemmaModel) Tensors(ts []Tensor) []*ggml.Tensor {
|
||||
var out []*ggml.Tensor
|
||||
for _, t := range ts {
|
||||
if !strings.HasPrefix(t.Name(), "v.") && strings.HasSuffix(t.Name(), "_norm.weight") {
|
||||
t.SetRepacker(p.addOne)
|
||||
}
|
||||
|
||||
out = append(out, ggml.Tensor{
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: t.Name(),
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
|
@ -21,8 +21,8 @@ func (p *gemma2Adapter) KV(baseKV ggml.KV) ggml.KV {
|
||||
return kv
|
||||
}
|
||||
|
||||
func (p *gemma2Adapter) Tensors(ts []Tensor) []ggml.Tensor {
|
||||
var out []ggml.Tensor
|
||||
func (p *gemma2Adapter) Tensors(ts []Tensor) []*ggml.Tensor {
|
||||
var out []*ggml.Tensor
|
||||
for _, t := range ts {
|
||||
shape := t.Shape()
|
||||
if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
|
||||
@ -31,7 +31,7 @@ func (p *gemma2Adapter) Tensors(ts []Tensor) []ggml.Tensor {
|
||||
t.SetRepacker(p.repack)
|
||||
}
|
||||
|
||||
out = append(out, ggml.Tensor{
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: t.Name(),
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
|
@ -126,11 +126,11 @@ func (p *llamaModel) KV(t *Tokenizer) ggml.KV {
|
||||
return kv
|
||||
}
|
||||
|
||||
func (p *llamaModel) Tensors(ts []Tensor) []ggml.Tensor {
|
||||
var out []ggml.Tensor
|
||||
func (p *llamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
|
||||
var out []*ggml.Tensor
|
||||
|
||||
if p.RopeScaling.factors != nil {
|
||||
out = append(out, ggml.Tensor{
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: "rope_freqs.weight",
|
||||
Kind: 0,
|
||||
Shape: []uint64{uint64(len(p.RopeScaling.factors))},
|
||||
@ -145,7 +145,7 @@ func (p *llamaModel) Tensors(ts []Tensor) []ggml.Tensor {
|
||||
}
|
||||
}
|
||||
|
||||
out = append(out, ggml.Tensor{
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: t.Name(),
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
|
@ -88,13 +88,13 @@ func (p *llama4Model) Replacements() []string {
|
||||
}
|
||||
|
||||
// Tensors implements ModelConverter.
|
||||
func (p *llama4Model) Tensors(ts []Tensor) []ggml.Tensor {
|
||||
var out []ggml.Tensor
|
||||
func (p *llama4Model) Tensors(ts []Tensor) []*ggml.Tensor {
|
||||
var out []*ggml.Tensor
|
||||
|
||||
var textTensors []Tensor
|
||||
for _, t := range ts {
|
||||
if strings.HasPrefix(t.Name(), "v.") || strings.HasPrefix(t.Name(), "mm.") {
|
||||
out = append(out, ggml.Tensor{
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: t.Name(),
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
@ -112,7 +112,7 @@ func (p *llama4Model) Tensors(ts []Tensor) []ggml.Tensor {
|
||||
// clone tensor since we need separate repackers
|
||||
tt := t.Clone()
|
||||
tt.SetRepacker(p.repack(nil, nil, tensor.S(i*halfDim, (i+1)*halfDim)))
|
||||
out = append(out, ggml.Tensor{
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: strings.ReplaceAll(tt.Name(), "ffn_gate_up_exps", name),
|
||||
Kind: tt.Kind(),
|
||||
Shape: newShape,
|
||||
@ -125,7 +125,7 @@ func (p *llama4Model) Tensors(ts []Tensor) []ggml.Tensor {
|
||||
t.SetRepacker(p.repack())
|
||||
newShape := slices.Clone(t.Shape())
|
||||
newShape[1], newShape[2] = newShape[2], newShape[1]
|
||||
out = append(out, ggml.Tensor{
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: t.Name(),
|
||||
Kind: t.Kind(),
|
||||
Shape: newShape,
|
||||
|
@ -29,8 +29,8 @@ func (p *llamaAdapter) KV(baseKV ggml.KV) ggml.KV {
|
||||
return kv
|
||||
}
|
||||
|
||||
func (p *llamaAdapter) Tensors(ts []Tensor) []ggml.Tensor {
|
||||
var out []ggml.Tensor
|
||||
func (p *llamaAdapter) Tensors(ts []Tensor) []*ggml.Tensor {
|
||||
var out []*ggml.Tensor
|
||||
for _, t := range ts {
|
||||
shape := t.Shape()
|
||||
if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
|
||||
@ -41,7 +41,7 @@ func (p *llamaAdapter) Tensors(ts []Tensor) []ggml.Tensor {
|
||||
t.SetRepacker(p.repack)
|
||||
}
|
||||
|
||||
out = append(out, ggml.Tensor{
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: t.Name(),
|
||||
Kind: t.Kind(),
|
||||
Shape: shape,
|
||||
|
@ -89,8 +89,8 @@ func (p *mistral3Model) KV(t *Tokenizer) ggml.KV {
|
||||
return kv
|
||||
}
|
||||
|
||||
func (p *mistral3Model) Tensors(ts []Tensor) []ggml.Tensor {
|
||||
var out []ggml.Tensor
|
||||
func (p *mistral3Model) Tensors(ts []Tensor) []*ggml.Tensor {
|
||||
var out []*ggml.Tensor
|
||||
|
||||
for _, t := range ts {
|
||||
if !strings.HasPrefix(t.Name(), "v.") {
|
||||
@ -100,7 +100,7 @@ func (p *mistral3Model) Tensors(ts []Tensor) []ggml.Tensor {
|
||||
}
|
||||
}
|
||||
|
||||
out = append(out, ggml.Tensor{
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: t.Name(),
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
|
@ -29,7 +29,7 @@ func (p *mixtralModel) KV(t *Tokenizer) ggml.KV {
|
||||
return kv
|
||||
}
|
||||
|
||||
func (p *mixtralModel) Tensors(ts []Tensor) []ggml.Tensor {
|
||||
func (p *mixtralModel) Tensors(ts []Tensor) []*ggml.Tensor {
|
||||
oldnew := []string{
|
||||
"model.layers", "blk",
|
||||
"w1", "ffn_gate_exps",
|
||||
@ -56,10 +56,10 @@ func (p *mixtralModel) Tensors(ts []Tensor) []ggml.Tensor {
|
||||
return true
|
||||
})
|
||||
|
||||
var out []ggml.Tensor
|
||||
var out []*ggml.Tensor
|
||||
for n, e := range experts {
|
||||
// TODO(mxyng): sanity check experts
|
||||
out = append(out, ggml.Tensor{
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: n,
|
||||
Kind: e[0].Kind(),
|
||||
Shape: append([]uint64{uint64(len(e))}, e[0].Shape()...),
|
||||
|
@ -68,19 +68,19 @@ func (p *phi3Model) KV(t *Tokenizer) ggml.KV {
|
||||
return kv
|
||||
}
|
||||
|
||||
func (p *phi3Model) Tensors(ts []Tensor) []ggml.Tensor {
|
||||
func (p *phi3Model) Tensors(ts []Tensor) []*ggml.Tensor {
|
||||
var addRopeFactors sync.Once
|
||||
|
||||
out := make([]ggml.Tensor, 0, len(ts)+2)
|
||||
out := make([]*ggml.Tensor, 0, len(ts)+2)
|
||||
for _, t := range ts {
|
||||
if strings.HasPrefix(t.Name(), "blk.0.") {
|
||||
addRopeFactors.Do(func() {
|
||||
out = append(out, ggml.Tensor{
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: "rope_factors_long.weight",
|
||||
Kind: 0,
|
||||
Shape: []uint64{uint64(len(p.RopeScaling.LongFactor))},
|
||||
WriterTo: p.RopeScaling.LongFactor,
|
||||
}, ggml.Tensor{
|
||||
}, &ggml.Tensor{
|
||||
Name: "rope_factors_short.weight",
|
||||
Kind: 0,
|
||||
Shape: []uint64{uint64(len(p.RopeScaling.ShortFactor))},
|
||||
@ -89,7 +89,7 @@ func (p *phi3Model) Tensors(ts []Tensor) []ggml.Tensor {
|
||||
})
|
||||
}
|
||||
|
||||
out = append(out, ggml.Tensor{
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: t.Name(),
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
|
@ -45,10 +45,10 @@ func (q *qwen2Model) KV(t *Tokenizer) ggml.KV {
|
||||
return kv
|
||||
}
|
||||
|
||||
func (q *qwen2Model) Tensors(ts []Tensor) []ggml.Tensor {
|
||||
var out []ggml.Tensor
|
||||
func (q *qwen2Model) Tensors(ts []Tensor) []*ggml.Tensor {
|
||||
var out []*ggml.Tensor
|
||||
for _, t := range ts {
|
||||
out = append(out, ggml.Tensor{
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: t.Name(),
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
|
@ -130,6 +130,7 @@ func TestConvertModel(t *testing.T) {
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer expectFile.Close()
|
||||
|
||||
var expect map[string]string
|
||||
if err := json.NewDecoder(expectFile).Decode(&expect); err != nil {
|
||||
|
@ -1,58 +0,0 @@
|
||||
package convert
|
||||
|
||||
import (
|
||||
"archive/zip"
|
||||
"errors"
|
||||
"io"
|
||||
"io/fs"
|
||||
"os"
|
||||
"path/filepath"
|
||||
)
|
||||
|
||||
type ZipReader struct {
|
||||
r *zip.Reader
|
||||
p string
|
||||
|
||||
// limit is the maximum size of a file that can be read directly
|
||||
// from the zip archive. Files larger than this size will be extracted
|
||||
limit int64
|
||||
}
|
||||
|
||||
func NewZipReader(r *zip.Reader, p string, limit int64) fs.FS {
|
||||
return &ZipReader{r, p, limit}
|
||||
}
|
||||
|
||||
func (z *ZipReader) Open(name string) (fs.File, error) {
|
||||
r, err := z.r.Open(name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer r.Close()
|
||||
|
||||
if fi, err := r.Stat(); err != nil {
|
||||
return nil, err
|
||||
} else if fi.Size() < z.limit {
|
||||
return r, nil
|
||||
}
|
||||
|
||||
if !filepath.IsLocal(name) {
|
||||
return nil, zip.ErrInsecurePath
|
||||
}
|
||||
|
||||
n := filepath.Join(z.p, name)
|
||||
if _, err := os.Stat(n); errors.Is(err, os.ErrNotExist) {
|
||||
w, err := os.Create(n)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer w.Close()
|
||||
|
||||
if _, err := io.Copy(w, r); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
} else if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return os.Open(n)
|
||||
}
|
@@ -3,6 +3,7 @@
package discover

import (
"fmt"
"log/slog"
"os"
"regexp"
@@ -59,6 +60,8 @@ func cudaVariant(gpuInfo CudaGPUInfo) string {

// driver 12.0 has problems with the cuda v12 library, so run v11 on those older drivers
if gpuInfo.DriverMajor < 12 || (gpuInfo.DriverMajor == 12 && gpuInfo.DriverMinor == 0) {
// The detected driver is older than Feb 2023
slog.Warn("old CUDA driver detected - please upgrade to a newer driver", "version", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor))
return "v11"
}
return "v12"
@@ -27,12 +27,14 @@

#endif

#ifndef LOG
#define LOG(verbose, ...) \
do { \
if (verbose) { \
fprintf(stderr, __VA_ARGS__); \
} \
} while (0)
#endif

#ifdef __cplusplus
extern "C" {
@@ -1,6 +1,7 @@
#ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?

#include <string.h>
#include <inttypes.h>
#include "gpu_info_cudart.h"

void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
@@ -58,7 +59,7 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
LOG(resp->ch.verbose, "cudaSetDevice err: %d\n", ret);
UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL;
if (ret == CUDA_ERROR_INSUFFICIENT_DRIVER) {
if (ret == CUDART_ERROR_INSUFFICIENT_DRIVER) {
resp->err = strdup("your nvidia driver is too old or missing. If you have a CUDA GPU please upgrade to run ollama");
return;
}
@@ -168,9 +169,9 @@ void cudart_bootstrap(cudart_handle_t h, int i, mem_info_t *resp) {
resp->free = memInfo.free;
resp->used = memInfo.used;

LOG(h.verbose, "[%s] CUDA totalMem %lu\n", resp->gpu_id, resp->total);
LOG(h.verbose, "[%s] CUDA freeMem %lu\n", resp->gpu_id, resp->free);
LOG(h.verbose, "[%s] CUDA usedMem %lu\n", resp->gpu_id, resp->used);
LOG(h.verbose, "[%s] CUDA totalMem %" PRId64 "\n", resp->gpu_id, resp->total);
LOG(h.verbose, "[%s] CUDA freeMem %" PRId64 "\n", resp->gpu_id, resp->free);
LOG(h.verbose, "[%s] CUDA usedMem %" PRId64 "\n", resp->gpu_id, resp->used);
LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);
}

@@ -180,4 +181,4 @@ void cudart_release(cudart_handle_t h) {
h.handle = NULL;
}

#endif // __APPLE__
#endif // __APPLE__
@@ -1,6 +1,7 @@
#ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?

#include <string.h>
#include <inttypes.h>
#include "gpu_info_nvcuda.h"

void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
@@ -193,8 +194,8 @@ void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) {
resp->total = memInfo.total;
resp->free = memInfo.free;

LOG(h.verbose, "[%s] CUDA totalMem %lu mb\n", resp->gpu_id, resp->total / 1024 / 1024);
LOG(h.verbose, "[%s] CUDA freeMem %lu mb\n", resp->gpu_id, resp->free / 1024 / 1024);
LOG(h.verbose, "[%s] CUDA totalMem %" PRId64 "mb\n", resp->gpu_id, resp->total / 1024 / 1024);
LOG(h.verbose, "[%s] CUDA freeMem %" PRId64 "mb\n", resp->gpu_id, resp->free / 1024 / 1024);
LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);

@@ -247,4 +248,4 @@ void nvcuda_release(nvcuda_handle_t h) {
h.handle = NULL;
}

#endif // __APPLE__
#endif // __APPLE__
@@ -12,7 +12,7 @@ import (
// '../lib/ollama' on Linux and the executable's directory on macOS
// note: distribution builds, additional GPU-specific libraries are
// found in subdirectories of the returned path, such as
// 'cuda_v11', 'cuda_v12', 'rocm', etc.
// 'cuda_v12', 'rocm', etc.
var LibOllamaPath string = func() string {
exe, err := os.Executable()
if err != nil {
@@ -394,9 +394,6 @@ curl http://localhost:11434/api/generate -d '{
"repeat_penalty": 1.2,
"presence_penalty": 1.5,
"frequency_penalty": 1.0,
"mirostat": 1,
"mirostat_tau": 0.8,
"mirostat_eta": 0.6,
"penalize_newline": true,
"stop": ["\n", "user:"],
"numa": false,
@@ -404,10 +401,7 @@ curl http://localhost:11434/api/generate -d '{
"num_batch": 2,
"num_gpu": 1,
"main_gpu": 0,
"low_vram": false,
"vocab_only": false,
"use_mmap": true,
"use_mlock": false,
"num_thread": 8
}
}'
@@ -20,7 +20,7 @@ Please refer to the [GPU docs](./gpu.md).

## How can I specify the context window size?

By default, Ollama uses a context window size of 4096 tokens, unless you have a single GPU with <= 4 GB of VRAM, in which case it will default to 2048 tokens.
By default, Ollama uses a context window size of 4096 tokens.

This can be overridden with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use:

@@ -31,7 +31,7 @@ OLLAMA_CONTEXT_LENGTH=8192 ollama serve
To change this when using `ollama run`, use `/set parameter`:

```shell
/set parameter num_ctx 8192
/set parameter num_ctx 4096
```

When using the API, specify the `num_ctx` parameter:
@@ -41,7 +41,7 @@ curl http://localhost:11434/api/generate -d '{
"model": "llama3.2",
"prompt": "Why is the sky blue?",
"options": {
"num_ctx": 8192
"num_ctx": 4096
}
}'
```
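The FAQ's curl example above sets `num_ctx` per request. If you are calling Ollama from Go, the same override can be passed through the `api` package's `Options` map; a minimal sketch (the model name and prompt are just examples):

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	req := &api.GenerateRequest{
		Model:  "llama3.2",
		Prompt: "Why is the sky blue?",
		// Equivalent to "options": {"num_ctx": 4096} in the curl example.
		Options: map[string]any{
			"num_ctx": 4096,
		},
	}

	err = client.Generate(context.Background(), req, func(resp api.GenerateResponse) error {
		fmt.Print(resp.Response)
		return nil
	})
	if err != nil {
		log.Fatal(err)
	}
}
```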
@@ -1,6 +1,6 @@
# GPU
## Nvidia
Ollama supports Nvidia GPUs with compute capability 5.0+.
Ollama supports Nvidia GPUs with compute capability 5.0+ and driver version 531 and newer.

Check your compute compatibility to see if your card is supported:
[https://developer.nvidia.com/cuda-gpus](https://developer.nvidia.com/cuda-gpus)
@@ -150,9 +150,6 @@ PARAMETER <parameter> <parametervalue>

| Parameter | Description | Value Type | Example Usage |
| -------------- | ----------- | ---------- | -------------------- |
| mirostat | Enable Mirostat sampling for controlling perplexity. (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) | int | mirostat 0 |
| mirostat_eta | Influences how quickly the algorithm responds to feedback from the generated text. A lower learning rate will result in slower adjustments, while a higher learning rate will make the algorithm more responsive. (Default: 0.1) | float | mirostat_eta 0.1 |
| mirostat_tau | Controls the balance between coherence and diversity of the output. A lower value will result in more focused and coherent text. (Default: 5.0) | float | mirostat_tau 5.0 |
| num_ctx | Sets the size of the context window used to generate the next token. (Default: 2048) | int | num_ctx 4096 |
| repeat_last_n | Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx) | int | repeat_last_n 64 |
| repeat_penalty | Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1) | float | repeat_penalty 1.1 |
@ -43,7 +43,7 @@ Ollama includes multiple LLM libraries compiled for different GPUs and CPU vecto
In the server log, you will see a message that looks something like this (varies from release to release):

```
Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v11 rocm_v5]
Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v12 rocm_v5]
```

**Experimental LLM Library Override**

@ -169,7 +169,7 @@ var (
// Enable the new Ollama engine
NewEngine = Bool("OLLAMA_NEW_ENGINE")
// ContextLength sets the default context length
ContextLength = Int64("OLLAMA_CONTEXT_LENGTH", -1)
ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 4096)
)

func String(s string) func() string {
@ -227,20 +227,6 @@ func Uint64(key string, defaultValue uint64) func() uint64 {
}
}

func Int64(key string, defaultValue int64) func() int64 {
return func() int64 {
if s := Var(key); s != "" {
if n, err := strconv.ParseInt(s, 10, 64); err != nil {
slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
} else {
return n
}
}

return defaultValue
}
}

// Set aside VRAM per GPU
var GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)
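The removed `Int64` helper above was only needed for the old `-1` sentinel; with the switch to `Uint("OLLAMA_CONTEXT_LENGTH", 4096)`, unset or invalid values simply fall back to 4096. A standalone sketch of that behaviour (illustrative only, not the package's actual helper):

```go
package main

import (
	"fmt"
	"os"
	"strconv"
)

// contextLength mimics the new Uint-based default: unset or invalid values
// fall back to 4096 rather than the old -1 sentinel.
func contextLength() uint {
	if s := os.Getenv("OLLAMA_CONTEXT_LENGTH"); s != "" {
		if n, err := strconv.ParseUint(s, 10, 64); err == nil {
			return uint(n)
		}
	}
	return 4096
}

func main() {
	os.Setenv("OLLAMA_CONTEXT_LENGTH", "8192")
	fmt.Println(contextLength()) // 8192
	os.Unsetenv("OLLAMA_CONTEXT_LENGTH")
	fmt.Println(contextLength()) // 4096
}
```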
@ -269,7 +255,7 @@ func AsMap() map[string]EnvVar {
"OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", AllowedOrigins(), "A comma separated list of allowed origins"},
"OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
"OLLAMA_MULTIUSER_CACHE": {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
"OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default 4096 or 2048 with low VRAM)"},
"OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 4096)"},
"OLLAMA_NEW_ENGINE": {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"},

// Informational

@ -278,9 +278,9 @@ func TestVar(t *testing.T) {
}

func TestContextLength(t *testing.T) {
cases := map[string]int64{
"": -1,
"4096": 4096,
cases := map[string]uint{
"": 4096,
"2048": 2048,
}

for k, v := range cases {

@ -37,12 +37,12 @@ func (kv KV) ParameterCount() uint64 {
return val
}

func (kv KV) FileType() fileType {
func (kv KV) FileType() FileType {
if t := kv.Uint("general.file_type"); t > 0 {
return fileType(t)
return FileType(t)
}

return fileTypeUnknown
return FileTypeUnknown
}

func (kv KV) BlockCount() uint64 {
@ -194,7 +194,7 @@ func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ..
return val, true
}

slog.Warn("key with type not found", "key", key, "default", defaultValue[0])
slog.Debug("key with type not found", "key", key, "default", defaultValue[0])
return defaultValue[0], false
}

@ -271,7 +271,11 @@ func (t Tensor) block() (n int) {
|
||||
}
|
||||
|
||||
func (t Tensor) blockSize() uint64 {
|
||||
switch t.Kind {
|
||||
return (TensorType)(t.Kind).BlockSize()
|
||||
}
|
||||
|
||||
func (t TensorType) BlockSize() uint64 {
|
||||
switch t {
|
||||
case
|
||||
0, // F32
|
||||
1, // F16
|
||||
@ -297,73 +301,77 @@ func (t Tensor) blockSize() uint64 {
|
||||
}
|
||||
|
||||
func (t Tensor) typeSize() uint64 {
|
||||
blockSize := t.blockSize()
|
||||
return TensorType(t.Kind).TypeSize()
|
||||
}
|
||||
|
||||
switch t.Kind {
|
||||
case 0: // FP32
|
||||
func (t TensorType) TypeSize() uint64 {
|
||||
blockSize := t.BlockSize()
|
||||
|
||||
switch t {
|
||||
case TensorTypeF32:
|
||||
return 4
|
||||
case 1: // FP16
|
||||
case TensorTypeF16:
|
||||
return 2
|
||||
case 2: // Q4_0
|
||||
case TensorTypeQ4_0:
|
||||
return 2 + blockSize/2
|
||||
case 3: // Q4_1
|
||||
case TensorTypeQ4_1:
|
||||
return 2 + 2 + blockSize/2
|
||||
case 6: // Q5_0
|
||||
case TensorTypeQ5_0:
|
||||
return 2 + 4 + blockSize/2
|
||||
case 7: // Q5_1
|
||||
case TensorTypeQ5_1:
|
||||
return 2 + 2 + 4 + blockSize/2
|
||||
case 8: // Q8_0
|
||||
case TensorTypeQ8_0:
|
||||
return 2 + blockSize
|
||||
case 9: // Q8_1
|
||||
case TensorTypeQ8_1:
|
||||
return 2 + 2 + blockSize
|
||||
case 10: // Q2_K
|
||||
case TensorTypeQ2_K:
|
||||
return blockSize/16 + blockSize/4 + 2 + 2
|
||||
case 11: // Q3_K
|
||||
case TensorTypeQ3_K:
|
||||
return blockSize/8 + blockSize/4 + 12 + 2
|
||||
case 12: // Q4_K
|
||||
case TensorTypeQ4_K:
|
||||
return 2 + 2 + 12 + blockSize/2
|
||||
case 13: // Q5_K
|
||||
case TensorTypeQ5_K:
|
||||
return 2 + 2 + 12 + blockSize/8 + blockSize/2
|
||||
case 14: // Q6_K
|
||||
case TensorTypeQ6_K:
|
||||
return blockSize/2 + blockSize/4 + blockSize/16 + 2
|
||||
case 15: // Q8_K
|
||||
case TensorTypeQ8_K:
|
||||
return 4 + blockSize + 2*blockSize/16
|
||||
case 16: // IQ2_XXS
|
||||
case tensorTypeIQ2_XXS:
|
||||
return 2 + 2*blockSize/8
|
||||
case 17: // IQ2_XS
|
||||
case tensorTypeIQ2_XS:
|
||||
return 2 + 2*blockSize/8 + blockSize/32
|
||||
case 18: // IQ3_XXS
|
||||
case tensorTypeIQ3_XXS:
|
||||
return 2 + blockSize/4 + blockSize/8
|
||||
case 19: // IQ1_S
|
||||
case tensorTypeIQ1_S:
|
||||
return 2 + blockSize/8 + blockSize/16
|
||||
case 20: // IQ4_NL
|
||||
case tensorTypeIQ4_NL:
|
||||
return 2 + blockSize/2
|
||||
case 21: // IQ3_S
|
||||
case tensorTypeIQ3_S:
|
||||
return 2 + blockSize/4 + blockSize/8 + blockSize/32 + 4
|
||||
case 22: // IQ2_S
|
||||
case tensorTypeIQ2_S:
|
||||
return 2 + blockSize/4 + blockSize/16
|
||||
case 23: // IQ4_XS
|
||||
case tensorTypeIQ4_XS:
|
||||
return 2 + 2 + blockSize/2 + blockSize/64
|
||||
case 24: // I8
|
||||
case TensorTypeI8:
|
||||
return 1
|
||||
case 25: // I16
|
||||
case TensorTypeI16:
|
||||
return 2
|
||||
case 26: // I32
|
||||
case TensorTypeI32:
|
||||
return 4
|
||||
case 27: // I64
|
||||
case TensorTypeI64:
|
||||
return 8
|
||||
case 28: // F64
|
||||
case TensorTypeF64:
|
||||
return 8
|
||||
case 29: // IQ1_M
|
||||
case tensorTypeIQ1_M:
|
||||
return blockSize/8 + blockSize/16 + blockSize/32
|
||||
case 30: // BF16
|
||||
case TensorTypeBF16:
|
||||
return 2
|
||||
default:
|
||||
return 0
|
||||
}
|
||||
}
|
||||
|
||||
func (t Tensor) parameters() uint64 {
|
||||
func (t Tensor) Elements() uint64 {
|
||||
var count uint64 = 1
|
||||
for _, n := range t.Shape {
|
||||
count *= n
|
||||
@ -372,11 +380,11 @@ func (t Tensor) parameters() uint64 {
|
||||
}
|
||||
|
||||
func (t Tensor) Size() uint64 {
|
||||
return t.parameters() * t.typeSize() / t.blockSize()
|
||||
return t.Elements() * t.typeSize() / t.blockSize()
|
||||
}
|
||||
|
||||
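As a worked check of the arithmetic in `BlockSize`, `TypeSize`, and `Size` above (a standalone sketch; the constants restate the Q4_0 case rather than importing this package):

```go
package main

import "fmt"

func main() {
	// Q4_0: 32 elements per block, each block stored as a 2-byte scale
	// plus 32 4-bit values (16 bytes) => 18 bytes per block.
	const blockSize = 32
	const typeSize = 2 + blockSize/2

	// A 4096 x 4096 tensor, as in Tensor.Size(): elements * typeSize / blockSize.
	elements := uint64(4096 * 4096)
	bytes := elements * typeSize / blockSize

	fmt.Println(bytes) // 9437184, i.e. exactly 9 MiB of quantized weights
}
```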
func (t Tensor) Type() string {
|
||||
return fileType(t.Kind).String()
|
||||
return TensorType(t.Kind).String()
|
||||
}
|
||||
|
||||
type container interface {
|
||||
@ -525,7 +533,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
|
||||
var ropeFreqsCount uint64
|
||||
if ropeFreqs, ok := f.Tensors().GroupLayers()["rope_freqs"]; ok {
|
||||
if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
|
||||
ropeFreqsCount = ropeFreqsWeights.parameters()
|
||||
ropeFreqsCount = ropeFreqsWeights.Elements()
|
||||
}
|
||||
}
|
||||
|
||||
|
110
fs/ggml/gguf.go
@ -9,8 +9,12 @@ import (
|
||||
"io"
|
||||
"log/slog"
|
||||
"maps"
|
||||
"os"
|
||||
"runtime"
|
||||
"slices"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/sync/errgroup"
|
||||
)
|
||||
|
||||
type containerGGUF struct {
|
||||
@ -225,7 +229,7 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
|
||||
}
|
||||
|
||||
llm.tensors = append(llm.tensors, &tensor)
|
||||
llm.parameters += tensor.parameters()
|
||||
llm.parameters += tensor.Elements()
|
||||
}
|
||||
|
||||
// patch KV with parameter count
|
||||
@ -488,25 +492,38 @@ func writeGGUFArray[S ~[]E, E any](w io.Writer, t uint32, s S) error {
|
||||
return err
|
||||
}
|
||||
|
||||
if t == ggufTypeString {
|
||||
for _, e := range any(s).([]string) {
|
||||
if err := binary.Write(w, binary.LittleEndian, uint64(len(e))); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := binary.Write(w, binary.LittleEndian, []byte(e)); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
return binary.Write(w, binary.LittleEndian, s)
|
||||
}
|
||||
|
||||
func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error {
|
||||
func WriteGGUF(f *os.File, kv KV, ts []*Tensor) error {
|
||||
alignment := kv.Uint("general.alignment", 32)
|
||||
|
||||
if err := binary.Write(ws, binary.LittleEndian, []byte("GGUF")); err != nil {
|
||||
if err := binary.Write(f, binary.LittleEndian, []byte("GGUF")); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := binary.Write(ws, binary.LittleEndian, uint32(3)); err != nil {
|
||||
if err := binary.Write(f, binary.LittleEndian, uint32(3)); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := binary.Write(ws, binary.LittleEndian, uint64(len(ts))); err != nil {
|
||||
if err := binary.Write(f, binary.LittleEndian, uint64(len(ts))); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := binary.Write(ws, binary.LittleEndian, uint64(len(kv))); err != nil {
|
||||
if err := binary.Write(f, binary.LittleEndian, uint64(len(kv))); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
@ -514,12 +531,12 @@ func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error {
|
||||
slices.Sort(keys)
|
||||
|
||||
for _, key := range keys {
|
||||
if err := ggufWriteKV(ws, key, kv[key]); err != nil {
|
||||
if err := ggufWriteKV(f, key, kv[key]); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
slices.SortStableFunc(ts, func(a, b Tensor) int {
|
||||
slices.SortStableFunc(ts, func(a, b *Tensor) int {
|
||||
if i, j := a.block(), b.block(); i < 0 && j > 0 {
|
||||
return 1
|
||||
} else if i > 0 && j < 0 {
|
||||
@ -530,21 +547,34 @@ func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error {
|
||||
})
|
||||
|
||||
var s uint64
|
||||
for _, t := range ts {
|
||||
t.Offset = s + uint64(ggufPadding(int64(s), int64(alignment)))
|
||||
if err := ggufWriteTensorInfo(ws, t); err != nil {
|
||||
for i := range ts {
|
||||
ts[i].Offset = s
|
||||
if err := ggufWriteTensorInfo(f, ts[i]); err != nil {
|
||||
return err
|
||||
}
|
||||
s += t.Size()
|
||||
s += ts[i].Size()
|
||||
s += uint64(ggufPadding(int64(s), int64(alignment)))
|
||||
}
|
||||
|
||||
offset, err := f.Seek(0, io.SeekCurrent)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
offset += ggufPadding(offset, int64(alignment))
|
||||
|
||||
var g errgroup.Group
|
||||
g.SetLimit(runtime.GOMAXPROCS(0))
|
||||
// TODO consider reducing if tensors size * gomaxprocs is larger than free memory
|
||||
for _, t := range ts {
|
||||
if err := ggufWriteTensor(ws, t, int64(alignment)); err != nil {
|
||||
t := t
|
||||
w := io.NewOffsetWriter(f, offset+int64(t.Offset))
|
||||
g.Go(func() error {
|
||||
_, err := t.WriteTo(w)
|
||||
return err
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
return nil
|
||||
return g.Wait()
|
||||
}
|
||||
|
||||
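The rewritten `WriteGGUF` above precomputes each tensor's offset and then writes the tensors concurrently through `io.NewOffsetWriter` inside an `errgroup`. A minimal standalone sketch of that pattern, using hypothetical byte payloads rather than real tensor data:

```go
package main

import (
	"fmt"
	"io"
	"os"
	"runtime"

	"golang.org/x/sync/errgroup"
)

func main() {
	f, err := os.CreateTemp("", "offsets-*.bin")
	if err != nil {
		panic(err)
	}
	defer os.Remove(f.Name())
	defer f.Close()

	// Each "tensor" knows its own offset, so the writes can run concurrently.
	payloads := []struct {
		offset int64
		data   []byte
	}{
		{0, []byte("aaaa")},
		{4, []byte("bbbb")},
		{8, []byte("cccc")},
	}

	var g errgroup.Group
	g.SetLimit(runtime.GOMAXPROCS(0))
	for _, p := range payloads {
		w := io.NewOffsetWriter(f, p.offset)
		data := p.data
		g.Go(func() error {
			_, err := w.Write(data)
			return err
		})
	}
	if err := g.Wait(); err != nil {
		panic(err)
	}

	out, _ := os.ReadFile(f.Name())
	fmt.Printf("%s\n", out) // aaaabbbbcccc
}
```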
func ggufWriteKV(ws io.WriteSeeker, k string, v any) error {
|
||||
@ -559,8 +589,10 @@ func ggufWriteKV(ws io.WriteSeeker, k string, v any) error {
|
||||
|
||||
var err error
|
||||
switch v := v.(type) {
|
||||
case uint32:
|
||||
case uint32, FileType:
|
||||
err = writeGGUF(ws, ggufTypeUint32, v)
|
||||
case uint64:
|
||||
err = writeGGUF(ws, ggufTypeUint64, v)
|
||||
case float32:
|
||||
err = writeGGUF(ws, ggufTypeFloat32, v)
|
||||
case bool:
|
||||
@ -569,32 +601,20 @@ func ggufWriteKV(ws io.WriteSeeker, k string, v any) error {
|
||||
err = writeGGUFString(ws, v)
|
||||
case []int32:
|
||||
err = writeGGUFArray(ws, ggufTypeInt32, v)
|
||||
case *array[int32]:
|
||||
err = writeGGUFArray(ws, ggufTypeInt32, v.values)
|
||||
case []uint32:
|
||||
err = writeGGUFArray(ws, ggufTypeUint32, v)
|
||||
case *array[uint32]:
|
||||
err = writeGGUFArray(ws, ggufTypeUint32, v.values)
|
||||
case []float32:
|
||||
err = writeGGUFArray(ws, ggufTypeFloat32, v)
|
||||
case *array[float32]:
|
||||
err = writeGGUFArray(ws, ggufTypeFloat32, v.values)
|
||||
case []string:
|
||||
if err := binary.Write(ws, binary.LittleEndian, ggufTypeArray); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := binary.Write(ws, binary.LittleEndian, ggufTypeString); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := binary.Write(ws, binary.LittleEndian, uint64(len(v))); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, e := range v {
|
||||
if err := binary.Write(ws, binary.LittleEndian, uint64(len(e))); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := binary.Write(ws, binary.LittleEndian, []byte(e)); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
err = writeGGUFArray(ws, ggufTypeString, v)
|
||||
case *array[string]:
|
||||
err = writeGGUFArray(ws, ggufTypeString, v.values)
|
||||
default:
|
||||
return fmt.Errorf("improper type for '%s'", k)
|
||||
}
|
||||
@ -602,7 +622,7 @@ func ggufWriteKV(ws io.WriteSeeker, k string, v any) error {
|
||||
return err
|
||||
}
|
||||
|
||||
func ggufWriteTensorInfo(ws io.WriteSeeker, t Tensor) error {
|
||||
func ggufWriteTensorInfo(ws io.WriteSeeker, t *Tensor) error {
|
||||
slog.Debug(t.Name, "kind", t.Kind, "shape", t.Shape, "offset", t.Offset)
|
||||
if err := binary.Write(ws, binary.LittleEndian, uint64(len(t.Name))); err != nil {
|
||||
return err
|
||||
@ -629,20 +649,6 @@ func ggufWriteTensorInfo(ws io.WriteSeeker, t Tensor) error {
|
||||
return binary.Write(ws, binary.LittleEndian, t.Offset)
|
||||
}
|
||||
|
||||
func ggufWriteTensor(ws io.WriteSeeker, t Tensor, alignment int64) error {
|
||||
offset, err := ws.Seek(0, io.SeekCurrent)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := binary.Write(ws, binary.LittleEndian, bytes.Repeat([]byte{0}, int(ggufPadding(offset, alignment)))); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
_, err = t.WriteTo(ws)
|
||||
return err
|
||||
}
|
||||
|
||||
func ggufPadding(offset, align int64) int64 {
|
||||
return (align - offset%align) % align
|
||||
}
|
||||
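A quick standalone check of the `ggufPadding` formula above, which rounds an offset up to the next alignment boundary:

```go
package main

import "fmt"

func ggufPadding(offset, align int64) int64 {
	return (align - offset%align) % align
}

func main() {
	fmt.Println(ggufPadding(100, 32)) // 28: 100 is padded up to 128
	fmt.Println(ggufPadding(128, 32)) // 0: already aligned
}
```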
|
63
fs/ggml/gguf_test.go
Normal file
@ -0,0 +1,63 @@
|
||||
package ggml
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"os"
|
||||
"slices"
|
||||
"testing"
|
||||
|
||||
"github.com/google/go-cmp/cmp"
|
||||
)
|
||||
|
||||
func TestWriteGGUF(t *testing.T) {
|
||||
w, err := os.CreateTemp(t.TempDir(), "*.bin")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer w.Close()
|
||||
|
||||
if err := WriteGGUF(w, KV{
|
||||
"general.alignment": uint32(16),
|
||||
}, []*Tensor{
|
||||
{Name: "test.0", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
|
||||
{Name: "test.1", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
|
||||
{Name: "test.2", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
|
||||
{Name: "test.3", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
|
||||
{Name: "test.4", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
|
||||
{Name: "test.5", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
|
||||
}); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
r, err := os.Open(w.Name())
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer r.Close()
|
||||
|
||||
ff, _, err := Decode(r, 0)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if diff := cmp.Diff(ff.KV(), KV{
|
||||
"general.alignment": uint32(16),
|
||||
"general.parameter_count": uint64(36),
|
||||
}); diff != "" {
|
||||
t.Errorf("Mismatch (-want +got):\n%s", diff)
|
||||
}
|
||||
|
||||
if diff := cmp.Diff(ff.Tensors(), Tensors{
|
||||
Offset: 336,
|
||||
items: []*Tensor{
|
||||
{Name: "test.0", Offset: 0, Shape: []uint64{2, 3}},
|
||||
{Name: "test.1", Offset: 32, Shape: []uint64{2, 3}},
|
||||
{Name: "test.2", Offset: 64, Shape: []uint64{2, 3}},
|
||||
{Name: "test.3", Offset: 96, Shape: []uint64{2, 3}},
|
||||
{Name: "test.4", Offset: 128, Shape: []uint64{2, 3}},
|
||||
{Name: "test.5", Offset: 160, Shape: []uint64{2, 3}},
|
||||
},
|
||||
}, cmp.AllowUnexported(Tensors{})); diff != "" {
|
||||
t.Errorf("Mismatch (-want +got):\n%s", diff)
|
||||
}
|
||||
}
|
424
fs/ggml/type.go
@ -1,185 +1,341 @@
|
||||
package ggml
|
||||
|
||||
import "fmt"
|
||||
|
||||
type fileType uint32
|
||||
|
||||
const (
|
||||
fileTypeF32 fileType = iota
|
||||
fileTypeF16
|
||||
fileTypeQ4_0
|
||||
fileTypeQ4_1
|
||||
fileTypeQ4_1_F16
|
||||
fileTypeQ4_2 // unused
|
||||
fileTypeQ4_3 // unused
|
||||
fileTypeQ8_0
|
||||
fileTypeQ5_0
|
||||
fileTypeQ5_1
|
||||
fileTypeQ2_K
|
||||
fileTypeQ3_K_S
|
||||
fileTypeQ3_K_M
|
||||
fileTypeQ3_K_L
|
||||
fileTypeQ4_K_S
|
||||
fileTypeQ4_K_M
|
||||
fileTypeQ5_K_S
|
||||
fileTypeQ5_K_M
|
||||
fileTypeQ6_K
|
||||
fileTypeIQ2_XXS
|
||||
fileTypeIQ2_XS
|
||||
fileTypeQ2_K_S
|
||||
fileTypeIQ3_XS
|
||||
fileTypeIQ3_XXS
|
||||
fileTypeIQ1_S
|
||||
fileTypeIQ4_NL
|
||||
fileTypeIQ3_S
|
||||
fileTypeIQ3_M
|
||||
fileTypeIQ2_S
|
||||
fileTypeIQ2_M
|
||||
fileTypeIQ4_XS
|
||||
fileTypeIQ1_M
|
||||
fileTypeBF16
|
||||
|
||||
fileTypeUnknown
|
||||
import (
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func ParseFileType(s string) (fileType, error) {
|
||||
// FileType is the Go equivalent to llama_ftype used for gguf file typing
|
||||
type FileType uint32
|
||||
|
||||
const (
|
||||
FileTypeF32 FileType = iota
|
||||
FileTypeF16
|
||||
FileTypeQ4_0
|
||||
FileTypeQ4_1
|
||||
fileTypeQ4_1_F16 // unused by GGML
|
||||
fileTypeQ4_2 // unused by GGML
|
||||
fileTypeQ4_3 // unused by GGML
|
||||
FileTypeQ8_0
|
||||
FileTypeQ5_0
|
||||
FileTypeQ5_1
|
||||
FileTypeQ2_K
|
||||
FileTypeQ3_K_S
|
||||
FileTypeQ3_K_M
|
||||
FileTypeQ3_K_L
|
||||
FileTypeQ4_K_S
|
||||
FileTypeQ4_K_M
|
||||
FileTypeQ5_K_S
|
||||
FileTypeQ5_K_M
|
||||
FileTypeQ6_K
|
||||
fileTypeIQ2_XXS // not supported by ollama
|
||||
fileTypeIQ2_XS // not supported by ollama
|
||||
FileTypeQ2_K_S
|
||||
fileTypeIQ3_XS // not supported by ollama
|
||||
fileTypeIQ3_XXS // not supported by ollama
|
||||
fileTypeIQ1_S // not supported by ollama
|
||||
fileTypeIQ4_NL // not supported by ollama
|
||||
fileTypeIQ3_S // not supported by ollama
|
||||
fileTypeIQ3_M // not supported by ollama
|
||||
fileTypeIQ2_S // not supported by ollama
|
||||
fileTypeIQ2_M // not supported by ollama
|
||||
fileTypeIQ4_XS // not supported by ollama
|
||||
fileTypeIQ1_M // not supported by ollama
|
||||
FileTypeBF16
|
||||
fileTypeQ4_0_4_4 // unused by GGML
|
||||
fileTypeQ4_0_4_8 // unused by GGML
|
||||
fileTypeQ4_0_8_8 // unused by GGML
|
||||
fileTypeTQ1_0 // not supported by ollama
|
||||
fileTypeTQ2_0 // not supported by ollama
|
||||
|
||||
FileTypeUnknown = 1024
|
||||
)
|
||||
|
||||
// ParseFileType parses the provided GGUF file type
|
||||
// Only Ollama supported types are considered valid
|
||||
func ParseFileType(s string) (FileType, error) {
|
||||
switch s {
|
||||
case "F32":
|
||||
return fileTypeF32, nil
|
||||
return FileTypeF32, nil
|
||||
case "F16":
|
||||
return fileTypeF16, nil
|
||||
return FileTypeF16, nil
|
||||
case "Q4_0":
|
||||
return fileTypeQ4_0, nil
|
||||
return FileTypeQ4_0, nil
|
||||
case "Q4_1":
|
||||
return fileTypeQ4_1, nil
|
||||
case "Q4_1_F16":
|
||||
return fileTypeQ4_1_F16, nil
|
||||
return FileTypeQ4_1, nil
|
||||
case "Q8_0":
|
||||
return fileTypeQ8_0, nil
|
||||
return FileTypeQ8_0, nil
|
||||
case "Q5_0":
|
||||
return fileTypeQ5_0, nil
|
||||
return FileTypeQ5_0, nil
|
||||
case "Q5_1":
|
||||
return fileTypeQ5_1, nil
|
||||
return FileTypeQ5_1, nil
|
||||
case "Q2_K":
|
||||
return fileTypeQ2_K, nil
|
||||
return FileTypeQ2_K, nil
|
||||
case "Q3_K_S":
|
||||
return fileTypeQ3_K_S, nil
|
||||
return FileTypeQ3_K_S, nil
|
||||
case "Q3_K_M":
|
||||
return fileTypeQ3_K_M, nil
|
||||
return FileTypeQ3_K_M, nil
|
||||
case "Q3_K_L":
|
||||
return fileTypeQ3_K_L, nil
|
||||
return FileTypeQ3_K_L, nil
|
||||
case "Q4_K_S":
|
||||
return fileTypeQ4_K_S, nil
|
||||
case "Q4_K_M":
|
||||
return fileTypeQ4_K_M, nil
|
||||
return FileTypeQ4_K_S, nil
|
||||
case "Q4_K_M", "Q4_K":
|
||||
return FileTypeQ4_K_M, nil
|
||||
case "Q5_K_S":
|
||||
return fileTypeQ5_K_S, nil
|
||||
case "Q5_K_M":
|
||||
return fileTypeQ5_K_M, nil
|
||||
return FileTypeQ5_K_S, nil
|
||||
case "Q5_K_M", "Q5_K":
|
||||
return FileTypeQ5_K_M, nil
|
||||
case "Q6_K":
|
||||
return fileTypeQ6_K, nil
|
||||
case "IQ2_XXS":
|
||||
return fileTypeIQ2_XXS, nil
|
||||
case "IQ2_XS":
|
||||
return fileTypeIQ2_XS, nil
|
||||
return FileTypeQ6_K, nil
|
||||
case "Q2_K_S":
|
||||
return fileTypeQ2_K_S, nil
|
||||
case "IQ3_XS":
|
||||
return fileTypeIQ3_XS, nil
|
||||
case "IQ3_XXS":
|
||||
return fileTypeIQ3_XXS, nil
|
||||
case "IQ1_S":
|
||||
return fileTypeIQ1_S, nil
|
||||
case "IQ4_NL":
|
||||
return fileTypeIQ4_NL, nil
|
||||
case "IQ3_S":
|
||||
return fileTypeIQ3_S, nil
|
||||
case "IQ3_M":
|
||||
return fileTypeIQ3_M, nil
|
||||
case "IQ2_S":
|
||||
return fileTypeIQ2_S, nil
|
||||
case "IQ2_M":
|
||||
return fileTypeIQ2_M, nil
|
||||
case "IQ4_XS":
|
||||
return fileTypeIQ4_XS, nil
|
||||
case "IQ1_M":
|
||||
return fileTypeIQ1_M, nil
|
||||
return FileTypeQ2_K_S, nil
|
||||
case "BF16":
|
||||
return fileTypeBF16, nil
|
||||
return FileTypeBF16, nil
|
||||
default:
|
||||
return fileTypeUnknown, fmt.Errorf("unknown fileType: %s", s)
|
||||
supportedFileTypes := []FileType{
|
||||
FileTypeF32,
|
||||
FileTypeF16,
|
||||
FileTypeQ4_K_S,
|
||||
FileTypeQ4_K_M,
|
||||
FileTypeQ8_0,
|
||||
// fsggml.FileTypeBF16, // TODO
|
||||
}
|
||||
strs := make([]string, len(supportedFileTypes))
|
||||
for i := range supportedFileTypes {
|
||||
strs[i] = supportedFileTypes[i].String()
|
||||
}
|
||||
|
||||
return FileTypeUnknown, fmt.Errorf("unsupported quantization type %s - supported types are %s", s, strings.Join(strs, ", "))
|
||||
}
|
||||
}
|
||||
|
||||
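A short usage sketch of the reworked `ParseFileType` above, including the new `Q4_K` alias (the import path is assumed from the repository layout):

```go
package main

import (
	"fmt"

	fsggml "github.com/ollama/ollama/fs/ggml"
)

func main() {
	// "Q4_K" is now accepted as an alias for Q4_K_M.
	ft, err := fsggml.ParseFileType("Q4_K")
	if err != nil {
		panic(err)
	}
	fmt.Println(ft) // Q4_K_M

	// Unsupported types return FileTypeUnknown plus an error listing what is supported.
	if _, err := fsggml.ParseFileType("IQ2_XXS"); err != nil {
		fmt.Println(err)
	}
}
```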
func (t fileType) String() string {
|
||||
func (t FileType) String() string {
|
||||
switch t {
|
||||
case fileTypeF32:
|
||||
case FileTypeF32:
|
||||
return "F32"
|
||||
case fileTypeF16:
|
||||
case FileTypeF16:
|
||||
return "F16"
|
||||
case fileTypeQ4_0:
|
||||
case FileTypeQ4_0:
|
||||
return "Q4_0"
|
||||
case fileTypeQ4_1:
|
||||
case FileTypeQ4_1:
|
||||
return "Q4_1"
|
||||
case fileTypeQ4_1_F16:
|
||||
return "Q4_1_F16"
|
||||
case fileTypeQ8_0:
|
||||
case FileTypeQ8_0:
|
||||
return "Q8_0"
|
||||
case fileTypeQ5_0:
|
||||
case FileTypeQ5_0:
|
||||
return "Q5_0"
|
||||
case fileTypeQ5_1:
|
||||
case FileTypeQ5_1:
|
||||
return "Q5_1"
|
||||
case fileTypeQ2_K:
|
||||
case FileTypeQ2_K:
|
||||
return "Q2_K"
|
||||
case fileTypeQ3_K_S:
|
||||
case FileTypeQ3_K_S:
|
||||
return "Q3_K_S"
|
||||
case fileTypeQ3_K_M:
|
||||
case FileTypeQ3_K_M:
|
||||
return "Q3_K_M"
|
||||
case fileTypeQ3_K_L:
|
||||
case FileTypeQ3_K_L:
|
||||
return "Q3_K_L"
|
||||
case fileTypeQ4_K_S:
|
||||
case FileTypeQ4_K_S:
|
||||
return "Q4_K_S"
|
||||
case fileTypeQ4_K_M:
|
||||
case FileTypeQ4_K_M:
|
||||
return "Q4_K_M"
|
||||
case fileTypeQ5_K_S:
|
||||
case FileTypeQ5_K_S:
|
||||
return "Q5_K_S"
|
||||
case fileTypeQ5_K_M:
|
||||
case FileTypeQ5_K_M:
|
||||
return "Q5_K_M"
|
||||
case fileTypeQ6_K:
|
||||
case FileTypeQ6_K:
|
||||
return "Q6_K"
|
||||
case fileTypeIQ2_XXS:
|
||||
return "IQ2_XXS"
|
||||
case fileTypeIQ2_XS:
|
||||
return "IQ2_XS"
|
||||
case fileTypeQ2_K_S:
|
||||
case FileTypeQ2_K_S:
|
||||
return "Q2_K_S"
|
||||
case fileTypeIQ3_XS:
|
||||
return "IQ3_XS"
|
||||
case fileTypeIQ3_XXS:
|
||||
return "IQ3_XXS"
|
||||
case fileTypeIQ1_S:
|
||||
return "IQ1_S"
|
||||
case fileTypeIQ4_NL:
|
||||
return "IQ4_NL"
|
||||
case fileTypeIQ3_S:
|
||||
return "IQ3_S"
|
||||
case fileTypeIQ3_M:
|
||||
return "IQ3_M"
|
||||
case fileTypeIQ2_S:
|
||||
return "IQ2_S"
|
||||
case fileTypeIQ4_XS:
|
||||
return "IQ4_XS"
|
||||
case fileTypeIQ2_M:
|
||||
return "IQ2_M"
|
||||
case fileTypeIQ1_M:
|
||||
return "IQ1_M"
|
||||
case fileTypeBF16:
|
||||
case FileTypeBF16:
|
||||
return "BF16"
|
||||
default:
|
||||
return "unknown"
|
||||
}
|
||||
}
|
||||
|
||||
func (t fileType) Value() uint32 {
|
||||
func (t FileType) Value() uint32 {
|
||||
return uint32(t)
|
||||
}
|
||||
|
||||
func (ftype FileType) ToTensorType() TensorType {
|
||||
switch ftype {
|
||||
case FileTypeF32:
|
||||
return TensorTypeF32
|
||||
case FileTypeF16:
|
||||
return TensorTypeF16
|
||||
case FileTypeQ4_0:
|
||||
return TensorTypeQ4_0
|
||||
case FileTypeQ4_1:
|
||||
return TensorTypeQ4_1
|
||||
case FileTypeQ8_0:
|
||||
return TensorTypeQ8_0
|
||||
case FileTypeQ5_0:
|
||||
return TensorTypeQ5_0
|
||||
case FileTypeQ5_1:
|
||||
return TensorTypeQ5_1
|
||||
case FileTypeQ2_K:
|
||||
return TensorTypeQ2_K
|
||||
case FileTypeQ3_K_S:
|
||||
return TensorTypeQ3_K
|
||||
case FileTypeQ3_K_M:
|
||||
return TensorTypeQ3_K
|
||||
case FileTypeQ3_K_L:
|
||||
return TensorTypeQ3_K
|
||||
case FileTypeQ4_K_S:
|
||||
return TensorTypeQ4_K
|
||||
case FileTypeQ4_K_M:
|
||||
return TensorTypeQ4_K
|
||||
case FileTypeQ5_K_S:
|
||||
return TensorTypeQ5_K
|
||||
case FileTypeQ5_K_M:
|
||||
return TensorTypeQ5_K
|
||||
case FileTypeQ6_K:
|
||||
return TensorTypeQ6_K
|
||||
case FileTypeQ2_K_S:
|
||||
return TensorTypeQ2_K
|
||||
case FileTypeBF16:
|
||||
return TensorTypeBF16
|
||||
default:
|
||||
slog.Warn("unsupported file type", "type", ftype)
|
||||
return 0 // F32
|
||||
}
|
||||
}
|
||||
|
||||
// TensorType is equivalent to ggml_type for individual tensor types
|
||||
// Note: these are not the same as FileType
|
||||
type TensorType uint32
|
||||
|
||||
const (
|
||||
TensorTypeF32 TensorType = iota
|
||||
TensorTypeF16
|
||||
TensorTypeQ4_0
|
||||
TensorTypeQ4_1
|
||||
tensorTypeQ4_2 // unused by GGML
|
||||
tensorTypeQ4_3 // unused by GGML
|
||||
TensorTypeQ5_0
|
||||
TensorTypeQ5_1
|
||||
TensorTypeQ8_0
|
||||
TensorTypeQ8_1
|
||||
TensorTypeQ2_K
|
||||
TensorTypeQ3_K
|
||||
TensorTypeQ4_K
|
||||
TensorTypeQ5_K
|
||||
TensorTypeQ6_K
|
||||
TensorTypeQ8_K
|
||||
tensorTypeIQ2_XXS // not supported by ollama
|
||||
tensorTypeIQ2_XS // not supported by ollama
|
||||
tensorTypeIQ3_XXS // not supported by ollama
|
||||
tensorTypeIQ1_S // not supported by ollama
|
||||
tensorTypeIQ4_NL // not supported by ollama
|
||||
tensorTypeIQ3_S // not supported by ollama
|
||||
tensorTypeIQ2_S // not supported by ollama
|
||||
tensorTypeIQ4_XS // not supported by ollama
|
||||
TensorTypeI8
|
||||
TensorTypeI16
|
||||
TensorTypeI32
|
||||
TensorTypeI64
|
||||
TensorTypeF64
|
||||
tensorTypeIQ1_M // not supported by ollama
|
||||
TensorTypeBF16
|
||||
tensorTypeQ4_0_4_4 // unused by GGML
|
||||
tensorTypeQ4_0_4_8 // unused by GGML
|
||||
tensorTypeQ4_0_8_8 // unused by GGML
|
||||
tensorTypeTQ1_0 // not supported by ollama
|
||||
tensorTypeTQ2_0 // not supported by ollama
|
||||
tensorTypeIQ4_NL_4_4 // unused by GGML
|
||||
tensorTypeIQ4_NL_4_8 // unused by GGML
|
||||
tensorTypeIQ4_NL_8_8 // unused by GGML
|
||||
)
|
||||
|
||||
// ParseFileType parses the provided GGUF file type
|
||||
// Only Ollama supported types are considered valid
|
||||
func ParseTensorType(s string) (TensorType, error) {
|
||||
switch s {
|
||||
case "F32":
|
||||
return TensorTypeF32, nil
|
||||
case "F16":
|
||||
return TensorTypeF16, nil
|
||||
case "Q4_0":
|
||||
return TensorTypeQ4_0, nil
|
||||
case "Q4_1":
|
||||
return TensorTypeQ4_1, nil
|
||||
case "Q5_0":
|
||||
return TensorTypeQ5_0, nil
|
||||
case "Q5_1":
|
||||
return TensorTypeQ5_1, nil
|
||||
case "Q8_0":
|
||||
return TensorTypeQ8_0, nil
|
||||
case "Q8_1":
|
||||
return TensorTypeQ8_1, nil
|
||||
case "Q2_K":
|
||||
return TensorTypeQ2_K, nil
|
||||
case "Q3_K":
|
||||
return TensorTypeQ3_K, nil
|
||||
case "Q4_K":
|
||||
return TensorTypeQ4_K, nil
|
||||
case "Q5_K":
|
||||
return TensorTypeQ5_K, nil
|
||||
case "Q6_K":
|
||||
return TensorTypeQ6_K, nil
|
||||
case "Q8_K":
|
||||
return TensorTypeQ8_K, nil
|
||||
case "F64":
|
||||
return TensorTypeF64, nil
|
||||
case "BF16":
|
||||
return TensorTypeBF16, nil
|
||||
default:
|
||||
return 0, fmt.Errorf("unsupported quantization type %s", s)
|
||||
}
|
||||
}
|
||||
|
||||
func (t TensorType) IsQuantized() bool {
|
||||
switch t {
|
||||
case TensorTypeF32, TensorTypeF16, TensorTypeBF16:
|
||||
return false
|
||||
default:
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
func (t TensorType) RowSize(ne uint64) uint64 {
|
||||
return t.TypeSize() * ne / t.BlockSize()
|
||||
}
|
||||
|
||||
func (t TensorType) String() string {
|
||||
switch t {
|
||||
case TensorTypeF32:
|
||||
return "F32"
|
||||
case TensorTypeF16:
|
||||
return "F16"
|
||||
case TensorTypeQ4_0:
|
||||
return "Q4_0"
|
||||
case TensorTypeQ4_1:
|
||||
return "Q4_1"
|
||||
case TensorTypeQ5_0:
|
||||
return "Q5_0"
|
||||
case TensorTypeQ5_1:
|
||||
return "Q5_1"
|
||||
case TensorTypeQ8_0:
|
||||
return "Q8_0"
|
||||
case TensorTypeQ8_1:
|
||||
return "Q8_1"
|
||||
case TensorTypeQ2_K:
|
||||
return "Q2_K"
|
||||
case TensorTypeQ3_K:
|
||||
return "Q3_K"
|
||||
case TensorTypeQ4_K:
|
||||
return "Q4_K"
|
||||
case TensorTypeQ5_K:
|
||||
return "Q5_K"
|
||||
case TensorTypeQ6_K:
|
||||
return "Q6_K"
|
||||
case TensorTypeQ8_K:
|
||||
return "Q8_K"
|
||||
case TensorTypeF64:
|
||||
return "F64"
|
||||
case TensorTypeBF16:
|
||||
return "BF16"
|
||||
default:
|
||||
return "unknown"
|
||||
}
|
||||
}
|
||||
|
12
go.mod
@ -11,7 +11,7 @@ require (
|
||||
github.com/spf13/cobra v1.7.0
|
||||
github.com/stretchr/testify v1.9.0
|
||||
github.com/x448/float16 v0.8.4
|
||||
golang.org/x/sync v0.11.0
|
||||
golang.org/x/sync v0.12.0
|
||||
)
|
||||
|
||||
require (
|
||||
@ -70,12 +70,12 @@ require (
|
||||
github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
|
||||
github.com/ugorji/go/codec v1.2.12 // indirect
|
||||
golang.org/x/arch v0.8.0 // indirect
|
||||
golang.org/x/crypto v0.33.0
|
||||
golang.org/x/crypto v0.36.0
|
||||
golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa
|
||||
golang.org/x/net v0.35.0 // indirect
|
||||
golang.org/x/sys v0.30.0
|
||||
golang.org/x/term v0.29.0
|
||||
golang.org/x/text v0.22.0
|
||||
golang.org/x/net v0.38.0 // indirect
|
||||
golang.org/x/sys v0.31.0
|
||||
golang.org/x/term v0.30.0
|
||||
golang.org/x/text v0.23.0
|
||||
google.golang.org/protobuf v1.34.1
|
||||
gopkg.in/yaml.v3 v3.0.1 // indirect
|
||||
)
|
||||
|
24
go.sum
@ -214,8 +214,8 @@ golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACk
|
||||
golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
|
||||
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
|
||||
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
|
||||
golang.org/x/crypto v0.33.0 h1:IOBPskki6Lysi0lo9qQvbxiQ+FvsCC/YWOecCHAixus=
|
||||
golang.org/x/crypto v0.33.0/go.mod h1:bVdXmD7IV/4GdElGPozy6U7lWdRXA4qyRVGJV57uQ5M=
|
||||
golang.org/x/crypto v0.36.0 h1:AnAEvhDddvBdpY+uR+MyHmuZzzNqXSe/GvuDeob5L34=
|
||||
golang.org/x/crypto v0.36.0/go.mod h1:Y4J0ReaxCR1IMaabaSMugxJES1EpwhBHhv2bDHklZvc=
|
||||
golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
|
||||
golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
|
||||
golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
|
||||
@ -257,8 +257,8 @@ golang.org/x/net v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81R
|
||||
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
|
||||
golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM=
|
||||
golang.org/x/net v0.0.0-20210614182718-04defd469f4e/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
|
||||
golang.org/x/net v0.35.0 h1:T5GQRQb2y08kTAByq9L4/bz8cipCdA8FbRTXewonqY8=
|
||||
golang.org/x/net v0.35.0/go.mod h1:EglIi67kWsHKlRzzVMUD93VMSWGFOMSZgxFjparz1Qk=
|
||||
golang.org/x/net v0.38.0 h1:vRMAPTMaeGqVhG5QyLJHqNDwecKTomGeqbnfZyKlBI8=
|
||||
golang.org/x/net v0.38.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8=
|
||||
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
|
||||
golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
|
||||
golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
@ -268,8 +268,8 @@ golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJ
|
||||
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.11.0 h1:GGz8+XQP4FvTTrjZPzNKTMFtSXH80RAzG+5ghFPgK9w=
|
||||
golang.org/x/sync v0.11.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
|
||||
golang.org/x/sync v0.12.0 h1:MHc5BpPuC30uJk597Ri8TV3CNZcTLu6B6z4lJy+g6Jw=
|
||||
golang.org/x/sync v0.12.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
|
||||
golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||
golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
@ -285,17 +285,17 @@ golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBc
|
||||
golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc=
|
||||
golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
golang.org/x/sys v0.31.0 h1:ioabZlmFYtWhL+TRYpcnNlLwhyxaM9kWTDEmfnprqik=
|
||||
golang.org/x/sys v0.31.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
|
||||
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
|
||||
golang.org/x/term v0.29.0 h1:L6pJp37ocefwRRtYPKSWOWzOtWSxVajvz2ldH/xi3iU=
|
||||
golang.org/x/term v0.29.0/go.mod h1:6bl4lRlvVuDgSf3179VpIxBF0o10JUpXWOnI7nErv7s=
|
||||
golang.org/x/term v0.30.0 h1:PQ39fJZ+mfadBm0y5WlL4vlM7Sx1Hgf13sMIY2+QS9Y=
|
||||
golang.org/x/term v0.30.0/go.mod h1:NYYFdzHoI5wRh/h5tDMdMqCqPJZEuNqVR5xJLd/n67g=
|
||||
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
|
||||
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
|
||||
golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
|
||||
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
|
||||
golang.org/x/text v0.22.0 h1:bofq7m3/HAFvbF51jz3Q9wLg3jkvSPuiZu/pD1XwgtM=
|
||||
golang.org/x/text v0.22.0/go.mod h1:YRoo4H8PVmsu+E3Ou7cqLVH8oXWIHVoX0jqUWALQhfY=
|
||||
golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY=
|
||||
golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4=
|
||||
golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
|
||||
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
|
||||
golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
|
||||
|
@ -34,13 +34,15 @@ func cosineSimilarity[V float32 | float64](v1, v2 []V) V {
|
||||
func TestAllMiniLMEmbeddings(t *testing.T) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
|
||||
defer cancel()
|
||||
client, _, cleanup := InitServerConnection(ctx, t)
|
||||
defer cleanup()
|
||||
|
||||
req := api.EmbeddingRequest{
|
||||
Model: "all-minilm",
|
||||
Prompt: "why is the sky blue?",
|
||||
}
|
||||
|
||||
res, err := embeddingTestHelper(ctx, t, req)
|
||||
res, err := embeddingTestHelper(ctx, client, t, req)
|
||||
|
||||
if err != nil {
|
||||
t.Fatalf("error: %v", err)
|
||||
@ -62,13 +64,15 @@ func TestAllMiniLMEmbeddings(t *testing.T) {
|
||||
func TestAllMiniLMEmbed(t *testing.T) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
|
||||
defer cancel()
|
||||
client, _, cleanup := InitServerConnection(ctx, t)
|
||||
defer cleanup()
|
||||
|
||||
req := api.EmbedRequest{
|
||||
Model: "all-minilm",
|
||||
Input: "why is the sky blue?",
|
||||
}
|
||||
|
||||
res, err := embedTestHelper(ctx, t, req)
|
||||
res, err := embedTestHelper(ctx, client, t, req)
|
||||
|
||||
if err != nil {
|
||||
t.Fatalf("error: %v", err)
|
||||
@ -98,13 +102,15 @@ func TestAllMiniLMEmbed(t *testing.T) {
|
||||
func TestAllMiniLMBatchEmbed(t *testing.T) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
|
||||
defer cancel()
|
||||
client, _, cleanup := InitServerConnection(ctx, t)
|
||||
defer cleanup()
|
||||
|
||||
req := api.EmbedRequest{
|
||||
Model: "all-minilm",
|
||||
Input: []string{"why is the sky blue?", "why is the grass green?"},
|
||||
}
|
||||
|
||||
res, err := embedTestHelper(ctx, t, req)
|
||||
res, err := embedTestHelper(ctx, client, t, req)
|
||||
|
||||
if err != nil {
|
||||
t.Fatalf("error: %v", err)
|
||||
@ -144,6 +150,8 @@ func TestAllMiniLMBatchEmbed(t *testing.T) {
|
||||
func TestAllMiniLMEmbedTruncate(t *testing.T) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
|
||||
defer cancel()
|
||||
client, _, cleanup := InitServerConnection(ctx, t)
|
||||
defer cleanup()
|
||||
|
||||
truncTrue, truncFalse := true, false
|
||||
|
||||
@ -182,7 +190,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
|
||||
res := make(map[string]*api.EmbedResponse)
|
||||
|
||||
for _, req := range reqs {
|
||||
response, err := embedTestHelper(ctx, t, req.Request)
|
||||
response, err := embedTestHelper(ctx, client, t, req.Request)
|
||||
if err != nil {
|
||||
t.Fatalf("error: %v", err)
|
||||
}
|
||||
@ -198,7 +206,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
|
||||
}
|
||||
|
||||
// check that truncate set to false returns an error if context length is exceeded
|
||||
_, err := embedTestHelper(ctx, t, api.EmbedRequest{
|
||||
_, err := embedTestHelper(ctx, client, t, api.EmbedRequest{
|
||||
Model: "all-minilm",
|
||||
Input: "why is the sky blue?",
|
||||
Truncate: &truncFalse,
|
||||
@ -210,9 +218,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func embeddingTestHelper(ctx context.Context, t *testing.T, req api.EmbeddingRequest) (*api.EmbeddingResponse, error) {
|
||||
client, _, cleanup := InitServerConnection(ctx, t)
|
||||
defer cleanup()
|
||||
func embeddingTestHelper(ctx context.Context, client *api.Client, t *testing.T, req api.EmbeddingRequest) (*api.EmbeddingResponse, error) {
|
||||
if err := PullIfMissing(ctx, client, req.Model); err != nil {
|
||||
t.Fatalf("failed to pull model %s: %v", req.Model, err)
|
||||
}
|
||||
@ -226,9 +232,7 @@ func embeddingTestHelper(ctx context.Context, t *testing.T, req api.EmbeddingReq
|
||||
return response, nil
|
||||
}
|
||||
|
||||
func embedTestHelper(ctx context.Context, t *testing.T, req api.EmbedRequest) (*api.EmbedResponse, error) {
|
||||
client, _, cleanup := InitServerConnection(ctx, t)
|
||||
defer cleanup()
|
||||
func embedTestHelper(ctx context.Context, client *api.Client, t *testing.T, req api.EmbedRequest) (*api.EmbedResponse, error) {
|
||||
if err := PullIfMissing(ctx, client, req.Model); err != nil {
|
||||
t.Fatalf("failed to pull model %s: %v", req.Model, err)
|
||||
}
|
||||
|
@ -48,17 +48,6 @@ var (
|
||||
}
|
||||
)
|
||||
|
||||
func getTimeouts(t *testing.T) (soft time.Duration, hard time.Duration) {
|
||||
deadline, hasDeadline := t.Deadline()
|
||||
if !hasDeadline {
|
||||
return 8 * time.Minute, 10 * time.Minute
|
||||
} else if deadline.Compare(time.Now().Add(2*time.Minute)) <= 0 {
|
||||
t.Skip("too little time")
|
||||
return time.Duration(0), time.Duration(0)
|
||||
}
|
||||
return -time.Since(deadline.Add(-2 * time.Minute)), -time.Since(deadline.Add(-20 * time.Second))
|
||||
}
|
||||
|
||||
func TestModelsGenerate(t *testing.T) {
|
||||
softTimeout, hardTimeout := getTimeouts(t)
|
||||
slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout)
|
||||
|
130
integration/quantization_test.go
Normal file
@ -0,0 +1,130 @@
|
||||
//go:build integration && models
|
||||
|
||||
package integration
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
)
|
||||
|
||||
func TestQuantization(t *testing.T) {
|
||||
sourceModels := []string{
|
||||
"qwen2.5:0.5b-instruct-fp16",
|
||||
}
|
||||
quantizations := []string{
|
||||
"Q8_0",
|
||||
"Q4_K_S",
|
||||
"Q4_K_M",
|
||||
"Q4_K",
|
||||
}
|
||||
softTimeout, hardTimeout := getTimeouts(t)
|
||||
started := time.Now()
|
||||
slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
|
||||
defer cancel()
|
||||
client, _, cleanup := InitServerConnection(ctx, t)
|
||||
defer cleanup()
|
||||
|
||||
for _, base := range sourceModels {
|
||||
if err := PullIfMissing(ctx, client, base); err != nil {
|
||||
t.Fatalf("pull failed %s", err)
|
||||
}
|
||||
for _, quant := range quantizations {
|
||||
newName := fmt.Sprintf("%s__%s", base, quant)
|
||||
t.Run(newName, func(t *testing.T) {
|
||||
if time.Now().Sub(started) > softTimeout {
|
||||
t.Skip("skipping remaining tests to avoid excessive runtime")
|
||||
}
|
||||
req := &api.CreateRequest{
|
||||
Model: newName,
|
||||
Quantization: quant,
|
||||
From: base,
|
||||
}
|
||||
fn := func(resp api.ProgressResponse) error {
|
||||
// fmt.Print(".")
|
||||
return nil
|
||||
}
|
||||
t.Logf("quantizing: %s -> %s", base, quant)
|
||||
if err := client.Create(ctx, req, fn); err != nil {
|
||||
t.Fatalf("create failed %s", err)
|
||||
}
|
||||
defer func() {
|
||||
req := &api.DeleteRequest{
|
||||
Model: newName,
|
||||
}
|
||||
t.Logf("deleting: %s -> %s", base, quant)
|
||||
if err := client.Delete(ctx, req); err != nil {
|
||||
t.Logf("failed to clean up %s: %s", req.Model, err)
|
||||
}
|
||||
}()
|
||||
// Check metadata on the model
|
||||
resp, err := client.Show(ctx, &api.ShowRequest{Name: newName})
|
||||
if err != nil {
|
||||
t.Fatalf("unable to show model: %s", err)
|
||||
}
|
||||
if !strings.Contains(resp.Details.QuantizationLevel, quant) {
|
||||
t.Fatalf("unexpected quantization for %s:\ngot: %s", newName, resp.Details.QuantizationLevel)
|
||||
}
|
||||
|
||||
stream := true
|
||||
genReq := api.GenerateRequest{
|
||||
Model: newName,
|
||||
Prompt: "why is the sky blue?",
|
||||
KeepAlive: &api.Duration{Duration: 3 * time.Second},
|
||||
Options: map[string]any{
|
||||
"seed": 42,
|
||||
"temperature": 0.0,
|
||||
},
|
||||
Stream: &stream,
|
||||
}
|
||||
t.Logf("verifying: %s -> %s", base, quant)
|
||||
|
||||
// Some smaller quantizations can cause models to have poor quality
|
||||
// or get stuck in repetition loops, so we stop as soon as we have any matches
|
||||
anyResp := []string{"rayleigh", "scattering", "day", "sun", "moon", "color", "nitrogen", "oxygen"}
|
||||
reqCtx, reqCancel := context.WithCancel(ctx)
|
||||
atLeastOne := false
|
||||
var buf bytes.Buffer
|
||||
genfn := func(response api.GenerateResponse) error {
|
||||
buf.Write([]byte(response.Response))
|
||||
fullResp := strings.ToLower(buf.String())
|
||||
for _, resp := range anyResp {
|
||||
if strings.Contains(fullResp, resp) {
|
||||
atLeastOne = true
|
||||
t.Log(fullResp)
|
||||
reqCancel()
|
||||
break
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
done := make(chan int)
|
||||
var genErr error
|
||||
go func() {
|
||||
genErr = client.Generate(reqCtx, &genReq, genfn)
|
||||
done <- 0
|
||||
}()
|
||||
|
||||
select {
|
||||
case <-done:
|
||||
if genErr != nil && !atLeastOne {
|
||||
t.Fatalf("failed with %s request prompt %s ", genReq.Model, genReq.Prompt)
|
||||
}
|
||||
case <-ctx.Done():
|
||||
t.Error("outer test context done while waiting for generate")
|
||||
}
|
||||
|
||||
t.Logf("passed")
|
||||
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
@ -217,6 +217,7 @@ func InitServerConnection(ctx context.Context, t *testing.T) (*api.Client, strin
|
||||
slog.Error("failed to open server log", "logfile", lifecycle.ServerLogFile, "error", err)
|
||||
return
|
||||
}
|
||||
defer fp.Close()
|
||||
data, err := io.ReadAll(fp)
|
||||
if err != nil {
|
||||
slog.Error("failed to read server log", "logfile", lifecycle.ServerLogFile, "error", err)
|
||||
@ -358,3 +359,14 @@ func skipUnderMinVRAM(t *testing.T, gb uint64) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func getTimeouts(t *testing.T) (soft time.Duration, hard time.Duration) {
|
||||
deadline, hasDeadline := t.Deadline()
|
||||
if !hasDeadline {
|
||||
return 8 * time.Minute, 10 * time.Minute
|
||||
} else if deadline.Compare(time.Now().Add(2*time.Minute)) <= 0 {
|
||||
t.Skip("too little time")
|
||||
return time.Duration(0), time.Duration(0)
|
||||
}
|
||||
return -time.Since(deadline.Add(-2 * time.Minute)), -time.Since(deadline.Add(-20 * time.Second))
|
||||
}
|
||||
|
@ -239,7 +239,7 @@ func (c *Causal) findStartLoc() (int, error) {
}
}

return 0, fmt.Errorf("%w (length: %v)", ErrKvCacheFull, len(c.cells))
return 0, fmt.Errorf("%w (cache: %v batch: %v)", ErrKvCacheFull, len(c.cells), c.curBatchSize)
}

func (c *Causal) updateSlidingWindow() {

2
llama/build-info.cpp
generated
vendored
@ -1,4 +1,4 @@
int LLAMA_BUILD_NUMBER = 0;
char const *LLAMA_COMMIT = "2016f07bd106c73699ecbaace80f55db5ed95dac";
char const *LLAMA_COMMIT = "e1e8e0991ffd9e99a445c6812bb519d5bac9f4b5";
char const *LLAMA_COMPILER = "";
char const *LLAMA_BUILD_TARGET = "";

2
llama/llama.cpp/common/common.h
vendored
@ -342,6 +342,8 @@ struct common_params {

// multimodal models (see examples/llava)
struct common_params_model mmproj;
bool mmproj_use_gpu = true; // use GPU for multimodal model
bool no_mmproj = false; // explicitly disable multimodal model
std::vector<std::string> image; // path to image file(s)

// embedding

@ -16,6 +16,9 @@ using json = nlohmann::ordered_json;
static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
auto has_max = max_items != std::numeric_limits<int>::max();

if (max_items == 0) {
return "";
}
if (min_items == 0 && max_items == 1) {
return item_rule + "?";
}

50
llama/llama.cpp/examples/llava/clip-impl.h
vendored
@ -2,8 +2,6 @@
|
||||
#include "gguf.h"
|
||||
#include "clip.h"
|
||||
|
||||
#include "clip.h"
|
||||
|
||||
#include <climits>
|
||||
#include <cstdarg>
|
||||
#include <string>
|
||||
@ -17,33 +15,31 @@
|
||||
#define KEY_FTYPE "general.file_type"
|
||||
#define KEY_NAME "general.name"
|
||||
#define KEY_DESCRIPTION "general.description"
|
||||
#define KEY_HAS_TEXT_ENC "clip.has_text_encoder"
|
||||
#define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
|
||||
#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
|
||||
#define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector"
|
||||
#define KEY_HAS_GLM_PROJ "clip.has_glm_projector"
|
||||
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
|
||||
#define KEY_HAS_QWEN2VL_MERGER "clip.has_qwen2vl_merger"
|
||||
#define KEY_USE_GELU "clip.use_gelu"
|
||||
#define KEY_USE_SILU "clip.use_silu"
|
||||
#define KEY_N_EMBD "clip.%s.embedding_length"
|
||||
#define KEY_N_FF "clip.%s.feed_forward_length"
|
||||
#define KEY_N_BLOCK "clip.%s.block_count"
|
||||
#define KEY_N_HEAD "clip.%s.attention.head_count"
|
||||
#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
|
||||
#define KEY_PROJ_DIM "clip.%s.projection_dim"
|
||||
#define KEY_TOKENS "tokenizer.ggml.tokens"
|
||||
#define KEY_N_POSITIONS "clip.text.context_length"
|
||||
#define KEY_N_EMBD "clip.vision.embedding_length"
|
||||
#define KEY_N_FF "clip.vision.feed_forward_length"
|
||||
#define KEY_N_BLOCK "clip.vision.block_count"
|
||||
#define KEY_N_HEAD "clip.vision.attention.head_count"
|
||||
#define KEY_LAYER_NORM_EPS "clip.vision.attention.layer_norm_epsilon"
|
||||
#define KEY_PROJ_DIM "clip.vision.projection_dim"
|
||||
#define KEY_IMAGE_SIZE "clip.vision.image_size"
|
||||
#define KEY_PATCH_SIZE "clip.vision.patch_size"
|
||||
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
|
||||
#define KEY_IMAGE_STD "clip.vision.image_std"
|
||||
#define KEY_PROJ_TYPE "clip.projector_type"
|
||||
#define KEY_FEATURE_LAYER "clip.vision.feature_layer"
|
||||
#define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor"
|
||||
#define KEY_PROJ_TYPE "clip.projector_type"
|
||||
|
||||
#define KEY_USE_GLU_MLP "clip.use_glu_mlp" // for qwen2.5vl
|
||||
#define KEY_USE_RMS_NORM "clip.use_rms_norm" // for qwen2.5vl
|
||||
|
||||
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
|
||||
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
|
||||
#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
|
||||
#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern"
|
||||
#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size"
|
||||
|
||||
|
||||
//
|
||||
@ -60,7 +56,9 @@
|
||||
#define TN_ATTN_V "%s.blk.%d.attn_v.%s"
|
||||
#define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s"
|
||||
#define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s"
|
||||
#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s"
|
||||
#define TN_FFN_UP "%s.blk.%d.ffn_up.%s"
|
||||
#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s"
|
||||
#define TN_LN_1 "%s.blk.%d.ln1.%s"
|
||||
#define TN_LN_2 "%s.blk.%d.ln2.%s"
|
||||
#define TN_LN_PRE "%s.pre_ln.%s"
|
||||
@ -72,6 +70,8 @@
|
||||
#define TN_IMAGE_NEWLINE "model.image_newline"
|
||||
#define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3
|
||||
#define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3
|
||||
#define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3
|
||||
#define TN_TOK_IMG_BREAK "v.token_embd.img_break" // pixtral
|
||||
|
||||
// mimicpmv
|
||||
#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
|
||||
@ -87,18 +87,19 @@
|
||||
#define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s"
|
||||
#define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s"
|
||||
#define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"
|
||||
#define TN_GLM_BOI_W "adapter.boi"
|
||||
#define TN_GLM_EOI_W "adapter.eoi"
|
||||
|
||||
enum projector_type {
|
||||
PROJECTOR_TYPE_MLP,
|
||||
PROJECTOR_TYPE_MLP_NORM,
|
||||
PROJECTOR_TYPE_LDP,
|
||||
PROJECTOR_TYPE_LDPV2,
|
||||
PROJECTOR_TYPE_RESAMPLER,
|
||||
PROJECTOR_TYPE_MINICPMV,
|
||||
PROJECTOR_TYPE_GLM_EDGE,
|
||||
PROJECTOR_TYPE_MERGER,
|
||||
PROJECTOR_TYPE_QWEN2VL,
|
||||
PROJECTOR_TYPE_GEMMA3,
|
||||
PROJECTOR_TYPE_IDEFICS3,
|
||||
PROJECTOR_TYPE_PIXTRAL,
|
||||
PROJECTOR_TYPE_QWEN25VL,
|
||||
PROJECTOR_TYPE_UNKNOWN,
|
||||
};
|
||||
|
||||
@ -106,10 +107,13 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
||||
{ PROJECTOR_TYPE_MLP, "mlp" },
|
||||
{ PROJECTOR_TYPE_LDP, "ldp" },
|
||||
{ PROJECTOR_TYPE_LDPV2, "ldpv2"},
|
||||
{ PROJECTOR_TYPE_RESAMPLER, "resampler"},
|
||||
{ PROJECTOR_TYPE_MINICPMV, "resampler"},
|
||||
{ PROJECTOR_TYPE_GLM_EDGE, "adapter"},
|
||||
{ PROJECTOR_TYPE_MERGER, "qwen2vl_merger"},
|
||||
{ PROJECTOR_TYPE_QWEN2VL, "qwen2vl_merger"},
|
||||
{ PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"},
|
||||
{ PROJECTOR_TYPE_GEMMA3, "gemma3"},
|
||||
{ PROJECTOR_TYPE_IDEFICS3, "idefics3"},
|
||||
{ PROJECTOR_TYPE_PIXTRAL, "pixtral"},
|
||||
};
|
||||
|
||||
static projector_type clip_projector_type_from_string(const std::string & str) {
|
||||
|
1535
llama/llama.cpp/examples/llava/clip.cpp
vendored
File diff suppressed because it is too large
21
llama/llama.cpp/examples/llava/clip.h
vendored
@ -47,7 +47,7 @@ CLIP_API struct clip_ctx * clip_init(const char * fname, struct clip_context_par
|
||||
CLIP_API void clip_free(struct clip_ctx * ctx);
|
||||
|
||||
CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
|
||||
CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w);
|
||||
CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h);
|
||||
|
||||
CLIP_API int32_t clip_get_image_size (const struct clip_ctx * ctx);
|
||||
CLIP_API int32_t clip_get_patch_size (const struct clip_ctx * ctx);
|
||||
@ -59,9 +59,20 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
|
||||
CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
|
||||
CLIP_API size_t get_clip_image_grid_size(const struct clip_ctx * ctx);
|
||||
|
||||
CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
|
||||
CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img);
|
||||
CLIP_API int clip_n_mmproj_embd (const struct clip_ctx * ctx);
|
||||
GGML_DEPRECATED(CLIP_API int clip_n_patches(const struct clip_ctx * ctx),
|
||||
"use clip_n_output_tokens instead");
|
||||
GGML_DEPRECATED(CLIP_API int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img),
|
||||
"use clip_n_output_tokens instead");
|
||||
|
||||
CLIP_API int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img);
|
||||
|
||||
// for M-RoPE, this will be the number of token positions in X and Y directions
|
||||
// for other models, X will be the total number of tokens and Y will be 1
|
||||
CLIP_API int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img);
|
||||
CLIP_API int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img);
|
||||
|
||||
// this should be equal to the embedding dimension of the text model
|
||||
CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
|
||||
|
||||
CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
|
||||
CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
|
||||
@ -114,8 +125,6 @@ CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
|
||||
CLIP_API bool clip_is_llava(const struct clip_ctx * ctx);
|
||||
CLIP_API bool clip_is_gemma3(const struct clip_ctx * ctx);
|
||||
|
||||
CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx);
|
||||
|
||||
CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
|
||||
|
||||
|
||||
|
15
llama/llama.cpp/examples/llava/llava.cpp
vendored
@ -112,7 +112,7 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<
|
||||
}
|
||||
|
||||
// Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
|
||||
static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
|
||||
static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out, clip_image_f32 * img_input) {
|
||||
struct {
|
||||
struct ggml_context * ctx;
|
||||
} model;
|
||||
@ -175,7 +175,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
|
||||
|
||||
model.ctx = ggml_init(params);
|
||||
|
||||
struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
|
||||
struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_output_tokens(ctx_clip, img_input), num_images - 1); // example: 4096 x 576 x 4
|
||||
// ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
|
||||
// fill it with the image embeddings, ignoring the base
|
||||
for (size_t i = 1; i < num_images; i++) {
|
||||
@ -214,8 +214,8 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
|
||||
|
||||
memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
|
||||
// append without newline tokens (default behavior in llava_arch when not using unpad ):
|
||||
memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
|
||||
*n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_patches(ctx_clip));
|
||||
memcpy(image_embd_out + clip_n_output_tokens(ctx_clip, img_input) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
|
||||
*n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_output_tokens(ctx_clip, img_input));
|
||||
|
||||
// Debug: Test single segments
|
||||
// Current findings: sending base image, sending a segment embedding all works similar to python
|
||||
@ -313,7 +313,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
||||
image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
|
||||
image_embd_v[i],
|
||||
clip_embd_nbytes_by_img(ctx_clip, nx, ny));
|
||||
n_img_pos_out += clip_n_patches_by_img(ctx_clip, img_res);
|
||||
n_img_pos_out += clip_n_output_tokens(ctx_clip, img_res);
|
||||
}
|
||||
*n_img_pos = n_img_pos_out;
|
||||
for (size_t i = 0; i < image_embd_v.size(); i++) {
|
||||
@ -342,8 +342,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
||||
}
|
||||
else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
|
||||
// flat / default llava-1.5 type embedding
|
||||
*n_img_pos = clip_n_patches(ctx_clip);
|
||||
clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
|
||||
*n_img_pos = clip_n_output_tokens(ctx_clip, img_res);
|
||||
bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096
|
||||
if (!encoded) {
|
||||
LOG_ERR("Unable to encode image\n");
|
||||
@ -381,7 +381,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
||||
struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size);
|
||||
|
||||
int n_img_pos_out;
|
||||
clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out);
|
||||
clip_image_f32 * img_input = clip_image_f32_get_img(img_res_v.get(), 0);
|
||||
clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out, img_input);
|
||||
*n_img_pos = n_img_pos_out;
|
||||
|
||||
for (size_t i = 0; i < image_embd_v.size(); i++) {
|
||||
|
llama/llama.cpp/include/llama.h (vendored, 2 changes)

@ -111,6 +111,7 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
};

enum llama_rope_type {
@ -1237,6 +1238,7 @@ extern "C" {
"will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");

/// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
/// Setting k <= 0 makes this a noop
LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);

/// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
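For reference, a hypothetical usage sketch of the top-k sampler declared above, assuming the sampler-chain helpers from the same header and an existing llama_context * ctx; none of this is part of the commit:

    llama_sampler_chain_params sparams = llama_sampler_chain_default_params();
    llama_sampler * chain = llama_sampler_chain_init(sparams);
    llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));                // keep the 40 most likely tokens
    llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED)); // then sample from the truncated distribution
    llama_token tok = llama_sampler_sample(chain, ctx, -1);                      // -1 = last set of logits
    llama_sampler_free(chain);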
llama/llama.cpp/src/llama-arch.cpp (vendored, 37 changes)
@ -20,6 +20,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_REFACT, "refact" },
|
||||
{ LLM_ARCH_BERT, "bert" },
|
||||
{ LLM_ARCH_NOMIC_BERT, "nomic-bert" },
|
||||
{ LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" },
|
||||
{ LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
|
||||
{ LLM_ARCH_BLOOM, "bloom" },
|
||||
{ LLM_ARCH_STABLELM, "stablelm" },
|
||||
@ -73,7 +74,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
|
||||
{ LLM_ARCH_PLM, "plm" },
|
||||
{ LLM_ARCH_BAILINGMOE, "bailingmoe" },
|
||||
{ LLM_ARCH_MISTRAL3, "mistral3" },
|
||||
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
||||
};
|
||||
|
||||
@ -109,6 +109,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
|
||||
{ LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" },
|
||||
{ LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" },
|
||||
{ LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" },
|
||||
{ LLM_KV_POOLING_TYPE, "%s.pooling_type" },
|
||||
{ LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
|
||||
{ LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
|
||||
@ -511,6 +512,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_NOMIC_BERT_MOE,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
|
||||
{ LLM_TENSOR_TOKEN_TYPES, "token_types" },
|
||||
{ LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
|
||||
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
||||
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_JINA_BERT_V2,
|
||||
{
|
||||
@ -1587,22 +1606,6 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_MISTRAL3,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
}
|
||||
},
|
||||
{
|
||||
LLM_ARCH_UNKNOWN,
|
||||
{
|
||||
|
llama/llama.cpp/src/llama-arch.h (vendored, 3 changes)

@ -24,6 +24,7 @@ enum llm_arch {
LLM_ARCH_REFACT,
LLM_ARCH_BERT,
LLM_ARCH_NOMIC_BERT,
LLM_ARCH_NOMIC_BERT_MOE,
LLM_ARCH_JINA_BERT_V2,
LLM_ARCH_BLOOM,
LLM_ARCH_STABLELM,
@ -75,7 +76,6 @@ enum llm_arch {
LLM_ARCH_CHAMELEON,
LLM_ARCH_SOLAR,
LLM_ARCH_WAVTOKENIZER_DEC,
LLM_ARCH_MISTRAL3,
LLM_ARCH_PLM,
LLM_ARCH_BAILINGMOE,
LLM_ARCH_UNKNOWN,
@ -113,6 +113,7 @@ enum llm_kv {
LLM_KV_EXPERT_WEIGHTS_SCALE,
LLM_KV_EXPERT_WEIGHTS_NORM,
LLM_KV_EXPERT_GATING_FUNC,
LLM_KV_MOE_EVERY_N_LAYERS,
LLM_KV_POOLING_TYPE,
LLM_KV_LOGIT_SCALE,
LLM_KV_DECODER_START_TOKEN_ID,
llama/llama.cpp/src/llama-chat.cpp (vendored, 47 changes)
@ -50,8 +50,8 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
|
||||
{ "deepseek3", LLM_CHAT_TEMPLATE_DEEPSEEK_3 },
|
||||
{ "command-r", LLM_CHAT_TEMPLATE_COMMAND_R },
|
||||
{ "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 },
|
||||
{ "chatglm3", LLM_CHAT_TEMPLATE_CHATGML_3 },
|
||||
{ "chatglm4", LLM_CHAT_TEMPLATE_CHATGML_4 },
|
||||
{ "chatglm3", LLM_CHAT_TEMPLATE_CHATGLM_3 },
|
||||
{ "chatglm4", LLM_CHAT_TEMPLATE_CHATGLM_4 },
|
||||
{ "glmedge", LLM_CHAT_TEMPLATE_GLMEDGE },
|
||||
{ "minicpm", LLM_CHAT_TEMPLATE_MINICPM },
|
||||
{ "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
|
||||
@ -62,6 +62,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
|
||||
{ "yandex", LLM_CHAT_TEMPLATE_YANDEX },
|
||||
{ "bailing", LLM_CHAT_TEMPLATE_BAILING },
|
||||
{ "llama4", LLM_CHAT_TEMPLATE_LLAMA4 },
|
||||
{ "smolvlm", LLM_CHAT_TEMPLATE_SMOLVLM },
|
||||
};
|
||||
|
||||
llm_chat_template llm_chat_template_from_str(const std::string & name) {
|
||||
@ -81,7 +82,9 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
|
||||
if (tmpl_contains("<|im_start|>")) {
|
||||
return tmpl_contains("<|im_sep|>")
|
||||
? LLM_CHAT_TEMPLATE_PHI_4
|
||||
: LLM_CHAT_TEMPLATE_CHATML;
|
||||
: tmpl_contains("<end_of_utterance>")
|
||||
? LLM_CHAT_TEMPLATE_SMOLVLM // SmolVLM uses <|im_start|> as BOS, but it is NOT chatml
|
||||
: LLM_CHAT_TEMPLATE_CHATML;
|
||||
} else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) {
|
||||
if (tmpl_contains("[SYSTEM_PROMPT]")) {
|
||||
return LLM_CHAT_TEMPLATE_MISTRAL_V7;
|
||||
@ -119,8 +122,12 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
|
||||
}
|
||||
} else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
|
||||
return LLM_CHAT_TEMPLATE_PHI_3;
|
||||
} else if (tmpl_contains("[gMASK]<sop>")) {
|
||||
return LLM_CHAT_TEMPLATE_CHATGLM_4;
|
||||
} else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
|
||||
return tmpl_contains("</s>") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE;
|
||||
} else if (tmpl_contains("<|{{ item['role'] }}|>") && tmpl_contains("<|begin_of_image|>")) {
|
||||
return LLM_CHAT_TEMPLATE_GLMEDGE;
|
||||
} else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
|
||||
return LLM_CHAT_TEMPLATE_ZEPHYR;
|
||||
} else if (tmpl_contains("bos_token + message['role']")) {
|
||||
@ -149,9 +156,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
|
||||
return LLM_CHAT_TEMPLATE_LLAMA_3;
|
||||
} else if (tmpl_contains("[gMASK]sop")) {
|
||||
// chatglm3-6b
|
||||
return LLM_CHAT_TEMPLATE_CHATGML_3;
|
||||
} else if (tmpl_contains("[gMASK]<sop>")) {
|
||||
return LLM_CHAT_TEMPLATE_CHATGML_4;
|
||||
return LLM_CHAT_TEMPLATE_CHATGLM_3;
|
||||
} else if (tmpl_contains(LU8("<用户>"))) {
|
||||
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF
|
||||
return LLM_CHAT_TEMPLATE_MINICPM;
|
||||
@ -432,7 +437,7 @@ int32_t llm_chat_apply_template(
|
||||
if (add_ass) {
|
||||
ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) {
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_3) {
|
||||
// chatglm3-6b
|
||||
ss << "[gMASK]" << "sop";
|
||||
for (auto message : chat) {
|
||||
@ -442,7 +447,7 @@ int32_t llm_chat_apply_template(
|
||||
if (add_ass) {
|
||||
ss << "<|assistant|>";
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) {
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4 || tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
|
||||
ss << "[gMASK]" << "<sop>";
|
||||
for (auto message : chat) {
|
||||
std::string role(message->role);
|
||||
@ -451,14 +456,6 @@ int32_t llm_chat_apply_template(
|
||||
if (add_ass) {
|
||||
ss << "<|assistant|>";
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
|
||||
for (auto message : chat) {
|
||||
std::string role(message->role);
|
||||
ss << "<|" << role << "|>" << "\n" << message->content;
|
||||
}
|
||||
if (add_ass) {
|
||||
ss << "<|assistant|>";
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) {
|
||||
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF
|
||||
for (auto message : chat) {
|
||||
@ -620,7 +617,23 @@ int32_t llm_chat_apply_template(
|
||||
if (add_ass) {
|
||||
ss << "<|header_start|>assistant<|header_end|>\n\n";
|
||||
}
|
||||
} else {
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_SMOLVLM) {
|
||||
// SmolVLM
|
||||
ss << "<|im_start|>"; // uses <|im_start|> as BOS, but the actual content is NOT chatml
|
||||
for (auto message : chat) {
|
||||
std::string role(message->role);
|
||||
if (role == "system") {
|
||||
ss << message->content << "\n\n";
|
||||
} else if (role == "user") {
|
||||
ss << "User: " << message->content << "<end_of_utterance>\n";
|
||||
} else {
|
||||
ss << "Assistant: " << message->content << "<end_of_utterance>\n";
|
||||
}
|
||||
}
|
||||
if (add_ass) {
|
||||
ss << "Assistant:";
|
||||
}
|
||||
} else {
|
||||
// template not supported
|
||||
return -1;
|
||||
}
|
||||
|
llama/llama.cpp/src/llama-chat.h (vendored, 5 changes)

@ -29,8 +29,8 @@ enum llm_chat_template {
LLM_CHAT_TEMPLATE_DEEPSEEK_3,
LLM_CHAT_TEMPLATE_COMMAND_R,
LLM_CHAT_TEMPLATE_LLAMA_3,
LLM_CHAT_TEMPLATE_CHATGML_3,
LLM_CHAT_TEMPLATE_CHATGML_4,
LLM_CHAT_TEMPLATE_CHATGLM_3,
LLM_CHAT_TEMPLATE_CHATGLM_4,
LLM_CHAT_TEMPLATE_GLMEDGE,
LLM_CHAT_TEMPLATE_MINICPM,
LLM_CHAT_TEMPLATE_EXAONE_3,
@ -41,6 +41,7 @@ enum llm_chat_template {
LLM_CHAT_TEMPLATE_YANDEX,
LLM_CHAT_TEMPLATE_BAILING,
LLM_CHAT_TEMPLATE_LLAMA4,
LLM_CHAT_TEMPLATE_SMOLVLM,
LLM_CHAT_TEMPLATE_UNKNOWN,
};
llama/llama.cpp/src/llama-context.cpp (vendored, 21 changes)

@ -114,7 +114,7 @@ llama_context::llama_context(
}

if (n_ctx_per_seq > hparams.n_ctx_train) {
LLAMA_LOG_WARN("%s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
__func__, n_ctx_per_seq, hparams.n_ctx_train);
}

@ -469,8 +469,7 @@ ggml_tensor * llama_context::build_rope_shift(
ggml_tensor * shift,
ggml_tensor * factors,
float freq_base,
float freq_scale,
ggml_backend_buffer * bbuf) const {
float freq_scale) const {
const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;

const auto & yarn_ext_factor = cparams.yarn_ext_factor;
@ -492,17 +491,7 @@ ggml_tensor * llama_context::build_rope_shift(
// dequantize to f32 -> RoPE -> quantize back
tmp = ggml_cast(ctx0, cur, GGML_TYPE_F32);

if (bbuf) {
for (const auto & backend : backends) {
// Figure out which backend KV cache belongs to
if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(bbuf))) {
ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get());
break;
}
}
}

tmp = ggml_rope_ext_inplace(ctx0, tmp,
tmp = ggml_rope_ext(ctx0, tmp,
shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);

@ -582,7 +571,7 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
0);

ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l, kv_self->k_l[il]->buffer);
ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);

ggml_build_forward_expand(gf, cur);
}
@ -1510,8 +1499,6 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
// set all ids as invalid (negative)
std::fill(output_ids.begin(), output_ids.end(), -1);

ggml_backend_buffer_clear(buf_output.get(), 0);

this->n_outputs = 0;
this->n_outputs_max = n_outputs_max;
llama/llama.cpp/src/llama-context.h (vendored, 3 changes)

@ -172,8 +172,7 @@ private:
ggml_tensor * shift,
ggml_tensor * factors,
float freq_base,
float freq_scale,
ggml_backend_buffer * bbuf) const;
float freq_scale) const;

llm_graph_result_ptr build_kv_self_shift(
ggml_context * ctx0,
llama/llama.cpp/src/llama-graph.cpp (vendored, 58 changes)
@ -55,7 +55,21 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
|
||||
if (ubatch->pos && pos) {
|
||||
const int64_t n_tokens = ubatch->n_tokens;
|
||||
|
||||
ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*ggml_element_size(pos));
|
||||
if (ubatch->token && n_pos_per_embd == 4) {
|
||||
// in case we're using M-RoPE with text tokens, convert the 1D positions to 4D
|
||||
// the 3 first dims are the same, and 4th dim is all 0
|
||||
std::vector<llama_pos> pos_data(n_tokens*n_pos_per_embd);
|
||||
// copy the first dimension
|
||||
for (int i = 0; i < n_tokens; ++i) {
|
||||
pos_data[ i] = ubatch->pos[i];
|
||||
pos_data[ n_tokens + i] = ubatch->pos[i];
|
||||
pos_data[2 * n_tokens + i] = ubatch->pos[i];
|
||||
pos_data[3 * n_tokens + i] = 0; // 4th dim is 0
|
||||
}
|
||||
ggml_backend_tensor_set(pos, pos_data.data(), 0, pos_data.size()*ggml_element_size(pos));
|
||||
} else {
|
||||
ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_embd*ggml_element_size(pos));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -71,7 +85,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
|
||||
) * f_attn_temp_scale + 1.0;
|
||||
}
|
||||
|
||||
ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*n_pos_per_token*ggml_element_size(attn_scale));
|
||||
ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*ggml_element_size(attn_scale));
|
||||
}
|
||||
}
|
||||
|
||||
@ -598,7 +612,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
|
||||
res (std::make_unique<llm_graph_result>()) {
|
||||
}
|
||||
|
||||
int64_t llm_graph_context::n_pos_per_token() const {
|
||||
int64_t llm_graph_context::n_pos_per_embd() const {
|
||||
return arch == LLM_ARCH_QWEN2VL ? 4 : 1;
|
||||
}
|
||||
|
||||
@ -809,6 +823,10 @@ ggml_tensor * llm_graph_context::build_ffn(
|
||||
|
||||
if (down) {
|
||||
cur = build_lora_mm(down, cur);
|
||||
if (arch == LLM_ARCH_GLM4) {
|
||||
// GLM4 seems to have numerical issues with half-precision accumulators
|
||||
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
|
||||
}
|
||||
}
|
||||
|
||||
if (down_b) {
|
||||
@ -916,28 +934,35 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
||||
ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
|
||||
cb(up, "ffn_moe_up", il);
|
||||
|
||||
ggml_tensor * gate = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
|
||||
cb(gate, "ffn_moe_gate", il);
|
||||
ggml_tensor * experts = nullptr;
|
||||
if (gate_exps) {
|
||||
cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
|
||||
cb(cur, "ffn_moe_gate", il);
|
||||
} else {
|
||||
cur = up;
|
||||
}
|
||||
|
||||
switch (type_op) {
|
||||
case LLM_FFN_SILU:
|
||||
{
|
||||
gate = ggml_silu(ctx0, gate);
|
||||
cb(gate, "ffn_moe_silu", il);
|
||||
cur = ggml_silu(ctx0, cur);
|
||||
cb(cur, "ffn_moe_silu", il);
|
||||
} break;
|
||||
case LLM_FFN_GELU:
|
||||
{
|
||||
gate = ggml_gelu(ctx0, gate);
|
||||
cb(gate, "ffn_moe_gelu", il);
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
cb(cur, "ffn_moe_gelu", il);
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
ggml_tensor * par = ggml_mul(ctx0, up, gate); // [n_ff, n_expert_used, n_tokens]
|
||||
cb(par, "ffn_moe_gate_par", il);
|
||||
if (gate_exps) {
|
||||
cur = ggml_mul(ctx0, cur, up); // [n_ff, n_expert_used, n_tokens]
|
||||
cb(cur, "ffn_moe_gate_par", il);
|
||||
}
|
||||
|
||||
ggml_tensor * experts = build_lora_mm_id(down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
|
||||
experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
|
||||
cb(experts, "ffn_moe_down", il);
|
||||
|
||||
if (!weight_before_ffn) {
|
||||
@ -1020,11 +1045,11 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
|
||||
}
|
||||
|
||||
ggml_tensor * llm_graph_context::build_inp_pos() const {
|
||||
auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_token());
|
||||
auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_embd());
|
||||
|
||||
auto & cur = inp->pos;
|
||||
|
||||
cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token());
|
||||
cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_embd());
|
||||
ggml_set_input(cur);
|
||||
|
||||
res->add_input(std::move(inp));
|
||||
@ -1033,11 +1058,12 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
|
||||
}
|
||||
|
||||
ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
|
||||
auto inp = std::make_unique<llm_graph_input_attn_temp>(n_pos_per_token(), hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
|
||||
auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
|
||||
|
||||
auto & cur = inp->attn_scale;
|
||||
|
||||
cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens*n_pos_per_token());
|
||||
// this need to be 1x1xN for broadcasting
|
||||
cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens);
|
||||
ggml_set_input(cur);
|
||||
|
||||
res->add_input(std::move(inp));
|
||||
|
llama/llama.cpp/src/llama-graph.h (vendored, 12 changes)

@ -91,29 +91,27 @@ public:

class llm_graph_input_pos : public llm_graph_input_i {
public:
llm_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {}
llm_graph_input_pos(int64_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {}
virtual ~llm_graph_input_pos() = default;

void set_input(const llama_ubatch * ubatch) override;

ggml_tensor * pos = nullptr; // I32 [n_batch]

const int64_t n_pos_per_token = 1;
const int64_t n_pos_per_embd = 1;
};

// temperature tuning, used by llama4
class llm_graph_input_attn_temp : public llm_graph_input_i {
public:
llm_graph_input_attn_temp(int64_t n_pos_per_token, uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
: n_pos_per_token(n_pos_per_token), n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
: n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
virtual ~llm_graph_input_attn_temp() = default;

void set_input(const llama_ubatch * ubatch) override;

ggml_tensor * attn_scale = nullptr; // F32 [n_batch]

const int64_t n_pos_per_token = 1;

const uint32_t n_attn_temp_floor_scale;
const float f_attn_temp_scale;
};
@ -430,7 +428,7 @@ struct llm_graph_context {

llm_graph_context(const llm_graph_params & params);

int64_t n_pos_per_token() const;
int64_t n_pos_per_embd() const;

void cb(ggml_tensor * cur, const char * name, int il) const;
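To make the n_pos_per_embd rename above easier to follow, here is an illustrative sketch of the position layout it controls, mirroring the matching llama-graph.cpp hunk earlier in this commit: when n_pos_per_embd == 4 (M-RoPE, e.g. Qwen2-VL) and the batch contains plain text tokens, the 1D positions are expanded into four sections, where the first three are copies of each other and the fourth is all zero. Buffer and variable names are for illustration only:

    std::vector<llama_pos> pos_data(n_tokens * 4);
    for (int i = 0; i < n_tokens; ++i) {
        pos_data[0 * n_tokens + i] = ubatch->pos[i];
        pos_data[1 * n_tokens + i] = ubatch->pos[i];
        pos_data[2 * n_tokens + i] = ubatch->pos[i];
        pos_data[3 * n_tokens + i] = 0; // 4th section is unused for text-only tokens
    }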
llama/llama.cpp/src/llama-hparams.h (vendored, 1 change)

@ -72,6 +72,7 @@ struct llama_hparams {
float expert_weights_scale = 0.0;
bool expert_weights_norm = false;
uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
uint32_t moe_every_n_layers = 0;

float f_norm_eps;
float f_norm_rms_eps;
llama/llama.cpp/src/llama-model.cpp (vendored, 72 changes)
@ -43,11 +43,13 @@ const char * llm_type_name(llm_type type) {
|
||||
case LLM_TYPE_770M: return "770M";
|
||||
case LLM_TYPE_780M: return "780M";
|
||||
case LLM_TYPE_0_5B: return "0.5B";
|
||||
case LLM_TYPE_0_6B: return "0.6B";
|
||||
case LLM_TYPE_1B: return "1B";
|
||||
case LLM_TYPE_1_3B: return "1.3B";
|
||||
case LLM_TYPE_1_4B: return "1.4B";
|
||||
case LLM_TYPE_1_5B: return "1.5B";
|
||||
case LLM_TYPE_1_6B: return "1.6B";
|
||||
case LLM_TYPE_1_7B: return "1.7B";
|
||||
case LLM_TYPE_1_8B: return "1.8B";
|
||||
case LLM_TYPE_2B: return "2B";
|
||||
case LLM_TYPE_2_8B: return "2.8B";
|
||||
@ -66,6 +68,7 @@ const char * llm_type_name(llm_type type) {
|
||||
case LLM_TYPE_15B: return "15B";
|
||||
case LLM_TYPE_16B: return "16B";
|
||||
case LLM_TYPE_20B: return "20B";
|
||||
case LLM_TYPE_27B: return "27B";
|
||||
case LLM_TYPE_30B: return "30B";
|
||||
case LLM_TYPE_32B: return "32B";
|
||||
case LLM_TYPE_34B: return "34B";
|
||||
@ -74,6 +77,7 @@ const char * llm_type_name(llm_type type) {
|
||||
case LLM_TYPE_65B: return "65B";
|
||||
case LLM_TYPE_70B: return "70B";
|
||||
case LLM_TYPE_236B: return "236B";
|
||||
case LLM_TYPE_290B: return "290B";
|
||||
case LLM_TYPE_314B: return "314B";
|
||||
case LLM_TYPE_671B: return "671B";
|
||||
case LLM_TYPE_SMALL: return "0.1B";
|
||||
@ -88,10 +92,10 @@ const char * llm_type_name(llm_type type) {
|
||||
case LLM_TYPE_16x3_8B: return "16x3.8B";
|
||||
case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B";
|
||||
case LLM_TYPE_57B_A14B: return "57B.A14B";
|
||||
case LLM_TYPE_27B: return "27B";
|
||||
case LLM_TYPE_290B: return "290B";
|
||||
case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
|
||||
case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
|
||||
case LLM_TYPE_30B_A3B: return "30B.A3B";
|
||||
case LLM_TYPE_235B_A22B: return "235B.A22B";
|
||||
default: return "?B";
|
||||
}
|
||||
}
|
||||
@ -709,10 +713,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_NOMIC_BERT:
|
||||
case LLM_ARCH_NOMIC_BERT_MOE:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
||||
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
|
||||
ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0);
|
||||
|
||||
if (hparams.n_layer == 12 && hparams.n_embd == 768) {
|
||||
type = LLM_TYPE_137M;
|
||||
@ -805,6 +811,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
switch (hparams.n_layer) {
|
||||
case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
|
||||
case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
|
||||
case 40: type = LLM_TYPE_14B; break;
|
||||
case 64: type = LLM_TYPE_32B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
@ -814,6 +824,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
switch (hparams.n_layer) {
|
||||
case 48: type = LLM_TYPE_30B_A3B; break;
|
||||
case 94: type = LLM_TYPE_235B_A22B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
@ -1425,7 +1437,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_MISTRAL3: break;
|
||||
default: throw std::runtime_error("unsupported model architecture");
|
||||
}
|
||||
|
||||
@ -2133,6 +2144,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
} break;
|
||||
case LLM_ARCH_BERT:
|
||||
case LLM_ARCH_NOMIC_BERT:
|
||||
case LLM_ARCH_NOMIC_BERT_MOE:
|
||||
{
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0);
|
||||
@ -2166,20 +2178,31 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
|
||||
}
|
||||
|
||||
if (arch == LLM_ARCH_NOMIC_BERT_MOE) {
|
||||
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
|
||||
}
|
||||
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
||||
|
||||
layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
|
||||
layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
|
||||
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
||||
|
||||
if (arch == LLM_ARCH_BERT) {
|
||||
if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
|
||||
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
||||
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
|
||||
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
|
||||
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff, n_expert}, 0);
|
||||
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
|
||||
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
|
||||
} else {
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
||||
|
||||
if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE) {
|
||||
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
||||
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
|
||||
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
|
||||
} else {
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
}
|
||||
}
|
||||
|
||||
layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
|
||||
@ -6074,6 +6097,11 @@ struct llm_build_bert : public llm_graph_context {
|
||||
cur = build_lora_mm(model.layers[il].wqkv, cur);
|
||||
cb(cur, "wqkv", il);
|
||||
|
||||
if (model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
|
||||
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
||||
cb(cur, "bqkv", il);
|
||||
}
|
||||
|
||||
Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
||||
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
||||
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
||||
@ -6126,13 +6154,29 @@ struct llm_build_bert : public llm_graph_context {
|
||||
cb(ffn_inp, "ffn_inp", il);
|
||||
|
||||
// feed-forward network
|
||||
if (model.arch == LLM_ARCH_BERT) {
|
||||
if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) {
|
||||
// MoE branch
|
||||
cur = build_moe_ffn(cur,
|
||||
model.layers[il].ffn_gate_inp,
|
||||
model.layers[il].ffn_up_exps,
|
||||
nullptr,
|
||||
model.layers[il].ffn_down_exps,
|
||||
nullptr,
|
||||
hparams.n_expert,
|
||||
hparams.n_expert_used,
|
||||
LLM_FFN_GELU,
|
||||
false, false,
|
||||
0.0f,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
|
||||
cb(cur, "ffn_moe_out", il);
|
||||
} else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
|
||||
cur = build_ffn(cur,
|
||||
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
||||
NULL, NULL, NULL,
|
||||
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
||||
NULL,
|
||||
LLM_FFN_GELU, LLM_FFN_SEQ, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
} else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
|
||||
cur = build_ffn(cur,
|
||||
model.layers[il].ffn_up, NULL, NULL,
|
||||
@ -6140,6 +6184,7 @@ struct llm_build_bert : public llm_graph_context {
|
||||
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
||||
NULL,
|
||||
LLM_FFN_GELU, LLM_FFN_PAR, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
} else {
|
||||
cur = build_ffn(cur,
|
||||
model.layers[il].ffn_up, NULL, NULL,
|
||||
@ -6147,8 +6192,8 @@ struct llm_build_bert : public llm_graph_context {
|
||||
model.layers[il].ffn_down, NULL, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
// attentions bypass the intermediate layer
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
@ -13349,6 +13394,7 @@ llm_graph_result_ptr llama_model::build_graph(
|
||||
case LLM_ARCH_BERT:
|
||||
case LLM_ARCH_JINA_BERT_V2:
|
||||
case LLM_ARCH_NOMIC_BERT:
|
||||
case LLM_ARCH_NOMIC_BERT_MOE:
|
||||
{
|
||||
llm = std::make_unique<llm_build_bert>(*this, params, gf);
|
||||
} break;
|
||||
@ -13705,7 +13751,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
case LLM_ARCH_CHAMELEON:
|
||||
case LLM_ARCH_SOLAR:
|
||||
case LLM_ARCH_BAILINGMOE:
|
||||
case LLM_ARCH_MISTRAL3:
|
||||
return LLAMA_ROPE_TYPE_NORM;
|
||||
|
||||
// the pairs of head values are offset by n_rot/2
|
||||
@ -13714,6 +13759,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
case LLM_ARCH_DBRX:
|
||||
case LLM_ARCH_BERT:
|
||||
case LLM_ARCH_NOMIC_BERT:
|
||||
case LLM_ARCH_NOMIC_BERT_MOE:
|
||||
case LLM_ARCH_STABLELM:
|
||||
case LLM_ARCH_BITNET:
|
||||
case LLM_ARCH_QWEN:
|
||||
|
llama/llama.cpp/src/llama-model.h (vendored, 8 changes)

@ -40,11 +40,13 @@ enum llm_type {
LLM_TYPE_770M,
LLM_TYPE_780M,
LLM_TYPE_0_5B,
LLM_TYPE_0_6B,
LLM_TYPE_1B,
LLM_TYPE_1_3B,
LLM_TYPE_1_4B,
LLM_TYPE_1_5B,
LLM_TYPE_1_6B,
LLM_TYPE_1_7B,
LLM_TYPE_1_8B,
LLM_TYPE_2B,
LLM_TYPE_2_8B,
@ -64,6 +66,7 @@ enum llm_type {
LLM_TYPE_16B,
LLM_TYPE_20B,
LLM_TYPE_22B,
LLM_TYPE_27B,
LLM_TYPE_30B,
LLM_TYPE_32B,
LLM_TYPE_34B,
@ -73,6 +76,7 @@ enum llm_type {
LLM_TYPE_70B,
LLM_TYPE_90B,
LLM_TYPE_236B,
LLM_TYPE_290B,
LLM_TYPE_314B,
LLM_TYPE_671B,
LLM_TYPE_SMALL,
@ -87,10 +91,10 @@ enum llm_type {
LLM_TYPE_16x3_8B,
LLM_TYPE_10B_128x3_66B,
LLM_TYPE_57B_A14B,
LLM_TYPE_27B,
LLM_TYPE_290B,
LLM_TYPE_17B_16E, // llama4 Scout
LLM_TYPE_17B_128E, // llama4 Maverick
LLM_TYPE_30B_A3B,
LLM_TYPE_235B_A22B,
};

struct llama_layer_posnet {
llama/llama.cpp/src/llama-quant.cpp (vendored, 4 changes)

@ -744,10 +744,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
// This used to be a regex, but <regex> has an extreme cost to compile times.
bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?

// don't quantize vision stuff
quantize &= name.find("v.") == std::string::npos;
quantize &= name.find("mm.") == std::string::npos;

// quantize only 2D and 3D tensors (experts)
quantize &= (ggml_n_dims(tensor) >= 2);
llama/llama.cpp/src/llama-sampling.cpp (vendored, 3 changes)

@ -232,7 +232,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
// }

if (k <= 0) {
k = cur_p->size;
return;
}

k = std::min(k, (int) cur_p->size);
@ -298,6 +298,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
}
cur_p->sorted = true;
}

cur_p->size = k;
}
llama/llama.cpp/src/llama-vocab.cpp (vendored, 3 changes)

@ -1497,7 +1497,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "llama3" ||
tokenizer_pre == "llama-v3" ||
tokenizer_pre == "llama-bpe"||
tokenizer_pre == "falcon3") {
tokenizer_pre == "falcon3" ||
tokenizer_pre == "pixtral") {
pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
ignore_merges = true;
add_bos = true;
|
||||
|
@ -2,6 +2,7 @@ package llama
|
||||
|
||||
/*
|
||||
#cgo CFLAGS: -std=c11
|
||||
#cgo windows CFLAGS: -Wno-dll-attribute-on-redeclaration
|
||||
#cgo CXXFLAGS: -std=c++17
|
||||
#cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/include
|
||||
#cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/common
|
||||
@ -198,7 +199,6 @@ type ModelParams struct {
|
||||
NumGpuLayers int
|
||||
MainGpu int
|
||||
UseMmap bool
|
||||
UseMlock bool
|
||||
TensorSplit []float32
|
||||
Progress func(float32)
|
||||
VocabOnly bool
|
||||
@ -217,7 +217,6 @@ func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) {
|
||||
cparams.n_gpu_layers = C.int(params.NumGpuLayers)
|
||||
cparams.main_gpu = C.int32_t(params.MainGpu)
|
||||
cparams.use_mmap = C.bool(params.UseMmap)
|
||||
cparams.use_mlock = C.bool(params.UseMlock)
|
||||
cparams.vocab_only = C.bool(params.VocabOnly)
|
||||
|
||||
if len(params.TensorSplit) > 0 {
|
||||
@ -461,24 +460,6 @@ func (m *Model) NEmbd() int {
|
||||
return int(C.llama_model_n_embd(m.c))
|
||||
}
|
||||
|
||||
func Quantize(infile, outfile string, ftype uint32) error {
|
||||
cinfile := C.CString(infile)
|
||||
defer C.free(unsafe.Pointer(cinfile))
|
||||
|
||||
coutfile := C.CString(outfile)
|
||||
defer C.free(unsafe.Pointer(coutfile))
|
||||
|
||||
params := C.llama_model_quantize_default_params()
|
||||
params.nthread = -1
|
||||
params.ftype = ftype
|
||||
|
||||
if rc := C.llama_model_quantize(cinfile, coutfile, ¶ms); rc != 0 {
|
||||
return fmt.Errorf("llama_model_quantize: %d", rc)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// vision processing
|
||||
type ClipContext struct {
|
||||
c *C.struct_clip_ctx
|
||||
@ -606,9 +587,6 @@ type SamplingParams struct {
|
||||
PenaltyRepeat float32
|
||||
PenaltyFreq float32
|
||||
PenaltyPresent float32
|
||||
Mirostat int
|
||||
MirostatTau float32
|
||||
MirostatEta float32
|
||||
PenalizeNl bool
|
||||
Seed uint32
|
||||
Grammar string
|
||||
@ -625,9 +603,6 @@ func NewSamplingContext(model *Model, params SamplingParams) (*SamplingContext,
|
||||
cparams.penalty_repeat = C.float(params.PenaltyRepeat)
|
||||
cparams.penalty_freq = C.float(params.PenaltyFreq)
|
||||
cparams.penalty_present = C.float(params.PenaltyFreq)
|
||||
cparams.mirostat = C.int32_t(params.Mirostat)
|
||||
cparams.mirostat_tau = C.float(params.MirostatTau)
|
||||
cparams.mirostat_eta = C.float(params.MirostatEta)
|
||||
cparams.seed = C.uint32_t(params.Seed)
|
||||
|
||||
grammar := C.CString(params.Grammar)
|
||||
|
@ -85,7 +85,7 @@ index e2617b06..242e50a7 100644
|
||||
|
||||
/**
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index a7febef7..31750b6f 100644
|
||||
index 9fb2134f..04ce764e 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -534,6 +534,7 @@ struct ggml_backend_cuda_buffer_context {
|
||||
@ -125,10 +125,10 @@ index 50579227..2799a0a5 100644
|
||||
|
||||
static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
|
||||
index 266d8af4..12886cd3 100644
|
||||
index d92392ed..425524d0 100644
|
||||
--- a/ggml/src/ggml-metal/ggml-metal.m
|
||||
+++ b/ggml/src/ggml-metal/ggml-metal.m
|
||||
@@ -4759,6 +4759,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
|
||||
@@ -5077,6 +5077,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
|
||||
}
|
||||
|
||||
free(ctx);
|
||||
@ -149,10 +149,10 @@ index 05a2f4e6..392cc18d 100644
|
||||
|
||||
static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
|
||||
index a0667b7d..bd83adc5 100644
|
||||
index 140a775f..e33c4ba0 100644
|
||||
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp
|
||||
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
|
||||
@@ -468,6 +468,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
@@ -477,6 +477,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
|
||||
GGML_ASSERT(status);
|
||||
delete ctx;
|
||||
@ -161,10 +161,10 @@ index a0667b7d..bd83adc5 100644
|
||||
|
||||
static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
|
||||
index 1de34c96..4600f61e 100644
|
||||
index 66b6f2cc..e3e6deae 100644
|
||||
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
|
||||
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
|
||||
@@ -316,6 +316,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
|
||||
@@ -317,6 +317,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
|
||||
ggml_sycl_set_device(ctx->device);
|
||||
|
||||
delete ctx;
|
||||
@ -172,7 +172,7 @@ index 1de34c96..4600f61e 100644
|
||||
}
|
||||
catch (sycl::exception const &exc) {
|
||||
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
||||
@@ -761,6 +762,7 @@ struct ggml_backend_sycl_split_buffer_context {
|
||||
@@ -762,6 +763,7 @@ struct ggml_backend_sycl_split_buffer_context {
|
||||
static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
|
||||
delete ctx;
|
||||
@ -180,7 +180,7 @@ index 1de34c96..4600f61e 100644
|
||||
}
|
||||
|
||||
static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
@@ -1095,6 +1097,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
|
||||
@@ -1096,6 +1098,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
|
||||
|
||||
static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_sycl_host_free(buffer->context);
|
||||
@ -189,10 +189,10 @@ index 1de34c96..4600f61e 100644
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
index 39f3cd34..c569a8a5 100644
|
||||
index c0bdb9e1..03d03064 100644
|
||||
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
@@ -8653,6 +8653,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
@@ -8660,6 +8660,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
|
||||
ggml_vk_destroy_buffer(ctx->dev_buffer);
|
||||
delete ctx;
|
||||
@ -200,7 +200,7 @@ index 39f3cd34..c569a8a5 100644
|
||||
}
|
||||
|
||||
static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
@@ -8796,6 +8797,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
|
||||
@@ -8803,6 +8804,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
|
||||
static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
|
||||
ggml_vk_host_free(vk_instance.devices[0], buffer->context);
|
||||
|
@ -10,7 +10,7 @@ logs instead of throwing an error
|
||||
1 file changed, 3 insertions(+), 11 deletions(-)
|
||||
|
||||
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
|
||||
index 48060517..a35b498c 100644
|
||||
index 50ded286..a9ee9f03 100644
|
||||
--- a/src/llama-vocab.cpp
|
||||
+++ b/src/llama-vocab.cpp
|
||||
@@ -1491,16 +1491,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
@ -31,7 +31,7 @@ index 48060517..a35b498c 100644
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||
} else if (
|
||||
tokenizer_pre == "llama3" ||
|
||||
@@ -1634,7 +1625,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
@@ -1635,7 +1626,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
|
||||
clean_spaces = false;
|
||||
} else {
|
||||
|
@ -11,10 +11,10 @@ instead of forcing one or the error
|
||||
1 file changed, 3 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
|
||||
index 983385f8..32f59819 100644
|
||||
index 5a2eef9b..9c1fe93f 100644
|
||||
--- a/src/llama-context.cpp
|
||||
+++ b/src/llama-context.cpp
|
||||
@@ -1236,7 +1236,7 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
@@ -1225,7 +1225,7 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
int64_t n_outputs_all = 0;
|
||||
|
||||
// count outputs
|
||||
@ -23,7 +23,7 @@ index 983385f8..32f59819 100644
|
||||
for (uint32_t i = 0; i < n_tokens_all; ++i) {
|
||||
n_outputs_all += batch.logits[i] != 0;
|
||||
}
|
||||
@@ -1348,7 +1348,7 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
@@ -1337,7 +1337,7 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
// ggml_graph_dump_dot(gf, NULL, "llama.dot");
|
||||
//}
|
||||
|
||||
@ -32,7 +32,7 @@ index 983385f8..32f59819 100644
|
||||
auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr;
|
||||
|
||||
if (t_embd && res->get_embd_pooled()) {
|
||||
@@ -1492,7 +1492,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
@@ -1481,7 +1481,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
const auto n_embd = hparams.n_embd;
|
||||
|
||||
// TODO: use a per-batch flag for logits presence instead
|
||||
|
@ -10,12 +10,12 @@ filesystems for paths that include wide characters
|
||||
1 file changed, 39 insertions(+)
|
||||
|
||||
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
|
||||
index 75970615..d57b4bd6 100644
|
||||
index ad3e7df1..b3218c78 100644
|
||||
--- a/examples/llava/clip.cpp
|
||||
+++ b/examples/llava/clip.cpp
|
||||
@@ -29,6 +29,19 @@
|
||||
#include <limits>
|
||||
@@ -30,6 +30,19 @@
|
||||
#include <array>
|
||||
#include <numeric>
|
||||
|
||||
+#if defined(_WIN32)
|
||||
+#define WIN32_LEAN_AND_MEAN
|
||||
@ -33,7 +33,7 @@ index 75970615..d57b4bd6 100644
|
||||
struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
|
||||
|
||||
//#define CLIP_DEBUG_FUNCTIONS
|
||||
@@ -1430,7 +1443,29 @@ struct clip_model_loader {
|
||||
@@ -1971,7 +1984,29 @@ struct clip_model_loader {
|
||||
{
|
||||
std::vector<uint8_t> read_buf;
|
||||
|
||||
@ -63,7 +63,7 @@ index 75970615..d57b4bd6 100644
|
||||
if (!fin) {
|
||||
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
|
||||
}
|
||||
@@ -1457,7 +1492,11 @@ struct clip_model_loader {
|
||||
@@ -1998,7 +2033,11 @@ struct clip_model_loader {
|
||||
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
|
||||
}
|
||||
}
|
||||
|
@ -15,10 +15,10 @@ adds support for the Solar Pro architecture
|
||||
7 files changed, 248 insertions(+)
|
||||
|
||||
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
|
||||
index 62e1480b..f754bc8f 100644
|
||||
index f2bc8ca7..5ab3f572 100644
|
||||
--- a/src/llama-arch.cpp
|
||||
+++ b/src/llama-arch.cpp
|
||||
@@ -68,6 +68,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
@@ -69,6 +69,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_GRANITE, "granite" },
|
||||
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
|
||||
{ LLM_ARCH_CHAMELEON, "chameleon" },
|
||||
@ -26,7 +26,7 @@ index 62e1480b..f754bc8f 100644
|
||||
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
|
||||
{ LLM_ARCH_PLM, "plm" },
|
||||
{ LLM_ARCH_BAILINGMOE, "bailingmoe" },
|
||||
@@ -140,6 +141,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
@@ -142,6 +143,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
|
||||
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
|
||||
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
|
||||
@ -34,7 +34,7 @@ index 62e1480b..f754bc8f 100644
|
||||
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
|
||||
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
|
||||
|
||||
@@ -1482,6 +1484,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
@@ -1502,6 +1504,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
||||
},
|
||||
},
|
||||
@ -59,7 +59,7 @@ index 62e1480b..f754bc8f 100644
|
||||
{
|
||||
LLM_ARCH_WAVTOKENIZER_DEC,
|
||||
{
|
||||
@@ -1660,6 +1680,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
||||
@@ -1680,6 +1700,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
||||
{LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
|
||||
// this tensor is loaded for T5, but never used
|
||||
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
|
||||
@ -68,10 +68,10 @@ index 62e1480b..f754bc8f 100644
|
||||
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
diff --git a/src/llama-arch.h b/src/llama-arch.h
|
||||
index 98ca00a1..439aaeab 100644
|
||||
index 41a023da..525c1b7d 100644
|
||||
--- a/src/llama-arch.h
|
||||
+++ b/src/llama-arch.h
|
||||
@@ -72,6 +72,7 @@ enum llm_arch {
|
||||
@@ -73,6 +73,7 @@ enum llm_arch {
|
||||
LLM_ARCH_GRANITE,
|
||||
LLM_ARCH_GRANITE_MOE,
|
||||
LLM_ARCH_CHAMELEON,
|
||||
@ -79,7 +79,7 @@ index 98ca00a1..439aaeab 100644
|
||||
LLM_ARCH_WAVTOKENIZER_DEC,
|
||||
LLM_ARCH_PLM,
|
||||
LLM_ARCH_BAILINGMOE,
|
||||
@@ -144,6 +145,7 @@ enum llm_kv {
|
||||
@@ -146,6 +147,7 @@ enum llm_kv {
|
||||
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
|
||||
LLM_KV_ATTENTION_SLIDING_WINDOW,
|
||||
LLM_KV_ATTENTION_SCALE,
|
||||
@ -87,7 +87,7 @@ index 98ca00a1..439aaeab 100644
|
||||
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
|
||||
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
|
||||
|
||||
@@ -344,6 +346,7 @@ enum llm_tensor {
|
||||
@@ -346,6 +348,7 @@ enum llm_tensor {
|
||||
LLM_TENSOR_ENC_OUTPUT_NORM,
|
||||
LLM_TENSOR_CLS,
|
||||
LLM_TENSOR_CLS_OUT,
|
||||
@ -115,7 +115,7 @@ index 90dfe7a7..8a667960 100644
|
||||
if (il < n_layer) {
|
||||
return n_swa > 0 && n_swa_pattern > 0 && il % n_swa_pattern < (n_swa_pattern - 1);
|
||||
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
|
||||
index 80fcd65d..6e278945 100644
|
||||
index 7ee6a5b7..48dce407 100644
|
||||
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -55,6 +55,8 @@ struct llama_hparams {
@ -127,7 +127,7 @@ index 80fcd65d..6e278945 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0;
@@ -153,6 +155,9 @@ struct llama_hparams {
@@ -154,6 +156,9 @@ struct llama_hparams {
// dimension of the recurrent state embeddings
uint32_t n_embd_v_s() const;

@ -150,10 +150,10 @@ index ea73a8a7..a012aeae 100644
llama_model_loader::llama_model_loader(
const std::string & fname,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 6b7bfecf..aba42819 100644
index 822e2bb2..572378c9 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1374,6 +1374,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -1386,6 +1386,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
@ -175,7 +175,7 @@ index 6b7bfecf..aba42819 100644
case LLM_ARCH_WAVTOKENIZER_DEC:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3717,6 +3732,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
@@ -3741,6 +3756,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

@ -210,7 +210,7 @@ index 6b7bfecf..aba42819 100644
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -12296,6 +12339,165 @@ struct llm_build_chameleon : public llm_graph_context {
@@ -12342,6 +12385,165 @@ struct llm_build_chameleon : public llm_graph_context {
}
};

@ -376,7 +376,7 @@ index 6b7bfecf..aba42819 100644
struct llm_build_wavtokenizer_dec : public llm_graph_context {
llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
ggml_tensor * cur;
@@ -13045,6 +13247,10 @@ llm_graph_result_ptr llama_model::build_graph(
@@ -13092,6 +13294,10 @@ llm_graph_result_ptr llama_model::build_graph(
{
llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
} break;
@ -387,7 +387,7 @@ index 6b7bfecf..aba42819 100644
case LLM_ARCH_WAVTOKENIZER_DEC:
{
llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf);
@@ -13191,6 +13397,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
@@ -13238,6 +13444,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON:
@ -396,18 +396,18 @@ index 6b7bfecf..aba42819 100644
return LLAMA_ROPE_TYPE_NORM;

diff --git a/src/llama-model.h b/src/llama-model.h
index fd82d106..5865d5e9 100644
index 95eca002..856e6042 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -62,6 +62,7 @@ enum llm_type {
@@ -64,6 +64,7 @@ enum llm_type {
LLM_TYPE_15B,
LLM_TYPE_16B,
LLM_TYPE_20B,
+ LLM_TYPE_22B,
LLM_TYPE_27B,
LLM_TYPE_30B,
LLM_TYPE_32B,
LLM_TYPE_34B,
@@ -307,6 +308,8 @@ struct llama_layer {
@@ -311,6 +312,8 @@ struct llama_layer {
struct ggml_tensor * ffn_up_scale = nullptr;
struct ggml_tensor * ffn_down_scale = nullptr;

@ -5,7 +5,6 @@ Subject: [PATCH] add mllama support

adds support for the llama 3.2 vision architecture
---
examples/llava/gemma3-cli.cpp | 3 +-
examples/llava/llava.cpp | 5 +-
examples/llava/mtmd.cpp | 6 +-
ggml/src/ggml-backend-reg.cpp | 6 +-
@ -25,34 +24,13 @@ adds support for the llama 3.2 vision architecture
src/llama-model.cpp | 309 +++++++++++++++++++++++++++++++++-
src/llama-model.h | 12 ++
src/llama-quant.cpp | 4 +-
20 files changed, 475 insertions(+), 22 deletions(-)
19 files changed, 473 insertions(+), 21 deletions(-)

diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp
index 3d566475..654d1358 100644
--- a/examples/llava/gemma3-cli.cpp
+++ b/examples/llava/gemma3-cli.cpp
@@ -106,7 +106,7 @@ struct decode_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
- decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -118,6 +118,7 @@ struct decode_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
+ /*n_embd =*/ n_embd,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index 03a22cbb..5eb40bcd 100644
index c00d16ae..bab027b5 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -456,7 +456,7 @@ struct llava_embd_batch {
@@ -457,7 +457,7 @@ struct llava_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
@ -61,7 +39,7 @@ index 03a22cbb..5eb40bcd 100644
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -468,6 +468,7 @@ struct llava_embd_batch {
@@ -469,6 +469,7 @@ struct llava_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
@ -69,7 +47,7 @@ index 03a22cbb..5eb40bcd 100644
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
@@ -491,7 +492,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
@@ -492,7 +493,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
n_eval = n_batch;
}
float * embd = image_embed->embed+i*n_embd;
@ -79,19 +57,19 @@ index 03a22cbb..5eb40bcd 100644
LOG_ERR("%s : failed to eval\n", __func__);
return false;
diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
index 3fd5bebc..f0cec596 100644
index 7081fd73..c14ac501 100644
--- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
@@ -233,7 +233,7 @@ struct decode_embd_batch {
@@ -476,7 +476,7 @@ struct decode_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
- decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
pos .resize(n_tokens);
- decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
+ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
pos .resize(n_tokens * n_pos_per_embd);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -245,6 +245,7 @@ struct decode_embd_batch {
@@ -487,6 +487,7 @@ struct decode_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
@ -99,16 +77,16 @@ index 3fd5bebc..f0cec596 100644
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
@@ -311,7 +312,8 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,

int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get());
@@ -610,7 +611,8 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
int32_t i_batch = 0;
int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
float * embd = mtmd_get_output_embd(ctx);
- decode_embd_batch batch_img(embd, n_tokens, n_past, 0);
- decode_embd_batch batch_embd(embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
+ int n_embd = llama_model_n_embd(llama_get_model(lctx));
+ decode_embd_batch batch_img(embd, n_embd, n_tokens, n_past, 0);
int64_t t1 = ggml_time_ms();
ret = llama_decode(lctx, batch_img.batch);
if (ret != 0) {
+ decode_embd_batch batch_embd(embd, n_embd, n_tokens, n_past, 0);

const int nx = mtmd_image_tokens_get_nx(chunk.tokens_image.get());
const int ny = mtmd_image_tokens_get_ny(chunk.tokens_image.get());
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
|
||||
index 405d8e31..82ae1b5b 100644
|
||||
--- a/ggml/src/ggml-backend-reg.cpp
|
||||
@ -127,10 +105,10 @@ index 405d8e31..82ae1b5b 100644
|
||||
register_backend(ggml_backend_rpc_reg());
|
||||
#endif
|
||||
diff --git a/include/llama.h b/include/llama.h
|
||||
index 5657fbf0..f91896e4 100644
|
||||
index 06c56395..f1628e88 100644
|
||||
--- a/include/llama.h
|
||||
+++ b/include/llama.h
|
||||
@@ -255,6 +255,7 @@ extern "C" {
|
||||
@@ -256,6 +256,7 @@ extern "C" {
|
||||
|
||||
llama_token * token;
|
||||
float * embd;
|
||||
@ -138,7 +116,7 @@ index 5657fbf0..f91896e4 100644
|
||||
llama_pos * pos;
|
||||
int32_t * n_seq_id;
|
||||
llama_seq_id ** seq_id;
|
||||
@@ -357,6 +358,7 @@ extern "C" {
|
||||
@@ -358,6 +359,7 @@ extern "C" {
|
||||
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
|
||||
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
|
||||
bool no_perf; // whether to measure performance timings
|
||||
@ -146,7 +124,7 @@ index 5657fbf0..f91896e4 100644
|
||||
|
||||
// Abort callback
|
||||
// if it returns true, execution of llama_decode() will be aborted
|
||||
@@ -458,6 +460,10 @@ extern "C" {
|
||||
@@ -459,6 +461,10 @@ extern "C" {
|
||||
struct llama_context_params params),
|
||||
"use llama_init_from_model instead");
|
||||
|
||||
@ -158,7 +136,7 @@ index 5657fbf0..f91896e4 100644
|
||||
LLAMA_API void llama_free(struct llama_context * ctx);
|
||||
|
||||
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
|
||||
index f754bc8f..0568565f 100644
|
||||
index 5ab3f572..eb7b5325 100644
|
||||
--- a/src/llama-arch.cpp
|
||||
+++ b/src/llama-arch.cpp
|
||||
@@ -6,6 +6,7 @@
|
||||
@ -169,7 +147,7 @@ index f754bc8f..0568565f 100644
|
||||
{ LLM_ARCH_LLAMA4, "llama4" },
|
||||
{ LLM_ARCH_DECI, "deci" },
|
||||
{ LLM_ARCH_FALCON, "falcon" },
|
||||
@@ -142,6 +143,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
@@ -144,6 +145,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
|
||||
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
|
||||
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
|
||||
@ -177,7 +155,7 @@ index f754bc8f..0568565f 100644
|
||||
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
|
||||
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
|
||||
|
||||
@@ -271,6 +273,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
@@ -273,6 +275,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
||||
},
|
||||
},
|
||||
@ -218,7 +196,7 @@ index f754bc8f..0568565f 100644
|
||||
{
|
||||
LLM_ARCH_DECI,
|
||||
{
|
||||
@@ -1681,6 +1717,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
||||
@@ -1701,6 +1737,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
||||
// this tensor is loaded for T5, but never used
|
||||
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
|
||||
{LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
@ -234,7 +212,7 @@ index f754bc8f..0568565f 100644
|
||||
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
diff --git a/src/llama-arch.h b/src/llama-arch.h
|
||||
index 439aaeab..6a989034 100644
|
||||
index 525c1b7d..bc8a4f0b 100644
|
||||
--- a/src/llama-arch.h
|
||||
+++ b/src/llama-arch.h
|
||||
@@ -11,6 +11,7 @@
|
||||
@ -245,7 +223,7 @@ index 439aaeab..6a989034 100644
|
||||
LLM_ARCH_DECI,
|
||||
LLM_ARCH_FALCON,
|
||||
LLM_ARCH_BAICHUAN,
|
||||
@@ -146,6 +147,7 @@ enum llm_kv {
|
||||
@@ -148,6 +149,7 @@ enum llm_kv {
|
||||
LLM_KV_ATTENTION_SLIDING_WINDOW,
|
||||
LLM_KV_ATTENTION_SCALE,
|
||||
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
|
||||
@ -253,7 +231,7 @@ index 439aaeab..6a989034 100644
|
||||
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
|
||||
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
|
||||
|
||||
@@ -347,6 +349,14 @@ enum llm_tensor {
|
||||
@@ -349,6 +351,14 @@ enum llm_tensor {
|
||||
LLM_TENSOR_CLS,
|
||||
LLM_TENSOR_CLS_OUT,
|
||||
LLM_TENSOR_BSKCN_TV,
|
||||
@ -297,10 +275,10 @@ index 01d5ca57..8682b0e6 100644
|
||||
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
|
||||
}
|
||||
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
|
||||
index 32f59819..0343ba8a 100644
|
||||
index 9c1fe93f..cd06ad91 100644
|
||||
--- a/src/llama-context.cpp
|
||||
+++ b/src/llama-context.cpp
|
||||
@@ -862,7 +862,7 @@ float * llama_context::get_logits_ith(int32_t i) {
|
||||
@@ -851,7 +851,7 @@ float * llama_context::get_logits_ith(int32_t i) {
|
||||
throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs));
|
||||
}
|
||||
|
||||
@ -309,7 +287,7 @@ index 32f59819..0343ba8a 100644
|
||||
} catch (const std::exception & err) {
|
||||
LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
|
||||
#ifndef NDEBUG
|
||||
@@ -983,6 +983,10 @@ void llama_context::set_warmup(bool value) {
|
||||
@@ -972,6 +972,10 @@ void llama_context::set_warmup(bool value) {
|
||||
cparams.warmup = value;
|
||||
}
|
||||
|
||||
@ -320,7 +298,7 @@ index 32f59819..0343ba8a 100644
|
||||
void llama_context::set_adapter_lora(
|
||||
llama_adapter_lora * adapter,
|
||||
float scale) {
|
||||
@@ -1058,7 +1062,7 @@ int llama_context::encode(llama_batch & inp_batch) {
|
||||
@@ -1047,7 +1051,7 @@ int llama_context::encode(llama_batch & inp_batch) {
|
||||
|
||||
const int64_t n_embd = hparams.n_embd;
|
||||
|
||||
@ -329,7 +307,7 @@ index 32f59819..0343ba8a 100644
|
||||
|
||||
const llama_ubatch ubatch = sbatch.split_simple(n_tokens);
|
||||
|
||||
@@ -1198,10 +1202,9 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
@@ -1187,10 +1191,9 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
|
||||
const llama_batch & batch = batch_allocr.batch;
|
||||
|
||||
@ -341,7 +319,7 @@ index 32f59819..0343ba8a 100644
|
||||
|
||||
const int64_t n_tokens_all = batch.n_tokens;
|
||||
const int64_t n_embd = hparams.n_embd;
|
||||
@@ -1249,7 +1252,7 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
@@ -1238,7 +1241,7 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
|
||||
const bool logits_all = n_outputs_all == n_tokens_all;
|
||||
|
||||
@ -350,7 +328,7 @@ index 32f59819..0343ba8a 100644
|
||||
/* simple_split */ !kv_self->recurrent,
|
||||
/* logits_all */ logits_all);
|
||||
|
||||
@@ -1483,12 +1486,11 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
@@ -1472,12 +1475,11 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
|
||||
int32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
const auto & hparams = model.hparams;
|
||||
@ -364,7 +342,7 @@ index 32f59819..0343ba8a 100644
|
||||
const auto n_embd = hparams.n_embd;
|
||||
|
||||
// TODO: use a per-batch flag for logits presence instead
|
||||
@@ -1558,7 +1560,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
@@ -1545,7 +1547,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
void llama_context::output_reorder() {
|
||||
auto & out_ids = sbatch.out_ids;
|
||||
if (!out_ids.empty()) {
|
||||
@ -373,7 +351,7 @@ index 32f59819..0343ba8a 100644
|
||||
const uint32_t n_embd = model.hparams.n_embd;
|
||||
|
||||
GGML_ASSERT((size_t) n_outputs == out_ids.size());
|
||||
@@ -2065,7 +2067,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
|
||||
@@ -2052,7 +2054,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
|
||||
{
|
||||
LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__);
|
||||
|
||||
@ -382,7 +360,7 @@ index 32f59819..0343ba8a 100644
|
||||
|
||||
io.write(&logits_size, sizeof(logits_size));
|
||||
|
||||
@@ -2248,6 +2250,7 @@ llama_context_params llama_context_default_params() {
|
||||
@@ -2235,6 +2237,7 @@ llama_context_params llama_context_default_params() {
|
||||
/*.offload_kqv =*/ true,
|
||||
/*.flash_attn =*/ false,
|
||||
/*.no_perf =*/ true,
|
||||
@ -390,7 +368,7 @@ index 32f59819..0343ba8a 100644
|
||||
/*.abort_callback =*/ nullptr,
|
||||
/*.abort_callback_data =*/ nullptr,
|
||||
};
|
||||
@@ -2375,6 +2378,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) {
|
||||
@@ -2362,6 +2365,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) {
|
||||
ctx->set_warmup(warmup);
|
||||
}
|
||||
|
||||
@ -402,7 +380,7 @@ index 32f59819..0343ba8a 100644
|
||||
ctx->synchronize();
|
||||
}
|
||||
diff --git a/src/llama-context.h b/src/llama-context.h
|
||||
index 04facb54..baa03276 100644
|
||||
index 5457f077..a50c4afa 100644
|
||||
--- a/src/llama-context.h
|
||||
+++ b/src/llama-context.h
|
||||
@@ -65,6 +65,7 @@ struct llama_context {
|
||||
@ -426,10 +404,10 @@ index 30e550f0..85ad91b9 100644
|
||||
|
||||
enum llama_pooling_type pooling_type;
|
||||
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
|
||||
index a85e9728..d740c120 100644
|
||||
index fabb9ca2..b67216a4 100644
|
||||
--- a/src/llama-graph.cpp
|
||||
+++ b/src/llama-graph.cpp
|
||||
@@ -546,6 +546,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
|
||||
@@ -560,6 +560,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
|
||||
}
|
||||
}
|
||||
|
||||
@ -442,7 +420,7 @@ index a85e9728..d740c120 100644
|
||||
//
|
||||
// llm_graph_context
|
||||
//
|
||||
@@ -1506,6 +1512,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
|
||||
@@ -1532,6 +1538,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
|
||||
return (llm_graph_input_attn_cross *) res->add_input(std::move(inp));
|
||||
}
|
||||
|
||||
@ -469,7 +447,7 @@ index a85e9728..d740c120 100644
|
||||
llm_graph_input_attn_cross * inp,
|
||||
ggml_cgraph * gf,
|
||||
diff --git a/src/llama-graph.h b/src/llama-graph.h
|
||||
index d192dc14..260a2af2 100644
|
||||
index d0c8d321..0fe18150 100644
|
||||
--- a/src/llama-graph.h
|
||||
+++ b/src/llama-graph.h
|
||||
@@ -86,6 +86,7 @@ public:
|
||||
@ -480,7 +458,7 @@ index d192dc14..260a2af2 100644
|
||||
};
|
||||
|
||||
class llm_graph_input_pos : public llm_graph_input_i {
|
||||
@@ -285,6 +286,16 @@ public:
|
||||
@@ -283,6 +284,16 @@ public:
|
||||
const llama_cross * cross = nullptr;
|
||||
};
|
||||
|
||||
@ -497,7 +475,7 @@ index d192dc14..260a2af2 100644
|
||||
//
|
||||
// llm_graph_result
|
||||
//
|
||||
@@ -493,6 +504,7 @@ struct llm_graph_context {
|
||||
@@ -491,6 +502,7 @@ struct llm_graph_context {
|
||||
ggml_tensor * build_inp_cls() const;
|
||||
ggml_tensor * build_inp_s_copy() const;
|
||||
ggml_tensor * build_inp_s_mask() const;
|
||||
@ -518,7 +496,7 @@ index 8a667960..6a02de03 100644
|
||||
+ return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
|
||||
+}
|
||||
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
|
||||
index 6e278945..c8a34d52 100644
|
||||
index 48dce407..b6fc7e6d 100644
|
||||
--- a/src/llama-hparams.h
|
||||
+++ b/src/llama-hparams.h
|
||||
@@ -2,6 +2,8 @@
|
||||
@ -546,7 +524,7 @@ index 6e278945..c8a34d52 100644
|
||||
|
||||
uint32_t n_layer_dense_lead = 0;
|
||||
uint32_t n_lora_q = 0;
|
||||
@@ -158,6 +162,9 @@ struct llama_hparams {
|
||||
@@ -159,6 +163,9 @@ struct llama_hparams {
|
||||
// Block skip connection
|
||||
bool n_bskcn(uint32_t n, uint32_t il) const;
|
||||
|
||||
@ -593,10 +571,10 @@ index a012aeae..2e11507d 100644
|
||||
bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
|
||||
const int kid = gguf_find_key(meta.get(), key.c_str());
|
||||
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
|
||||
index aba42819..d051696c 100644
|
||||
index 572378c9..9d099f11 100644
|
||||
--- a/src/llama-model.cpp
|
||||
+++ b/src/llama-model.cpp
|
||||
@@ -419,6 +419,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
@@ -423,6 +423,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
|
||||
// get general kv
|
||||
ml.get_key(LLM_KV_GENERAL_NAME, name, false);
|
||||
@ -604,7 +582,7 @@ index aba42819..d051696c 100644
|
||||
|
||||
// everything past this point is not vocab-related
|
||||
if (hparams.vocab_only) {
|
||||
@@ -430,6 +431,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
@@ -434,6 +435,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
|
||||
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
|
||||
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
|
||||
@ -612,7 +590,7 @@ index aba42819..d051696c 100644
|
||||
|
||||
if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
|
||||
ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
|
||||
@@ -453,9 +455,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
@@ -457,9 +459,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
|
||||
std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
|
||||
std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
|
||||
@ -624,7 +602,7 @@ index aba42819..d051696c 100644
|
||||
|
||||
// n_head_kv is optional, default to n_head
|
||||
hparams.n_head_kv_arr = hparams.n_head_arr;
|
||||
@@ -508,7 +512,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
@@ -512,7 +516,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
|
||||
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
|
||||
|
||||
@ -633,7 +611,7 @@ index aba42819..d051696c 100644
|
||||
if (hparams.n_rot != hparams.n_embd_head_k) {
|
||||
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
|
||||
}
|
||||
@@ -571,6 +575,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
@@ -575,6 +579,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
hparams.use_kq_norm = false;
|
||||
}
|
||||
} break;
|
||||
@ -650,7 +628,7 @@ index aba42819..d051696c 100644
|
||||
case LLM_ARCH_DECI:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
@@ -1550,7 +1564,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
@@ -1562,7 +1576,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
const int64_t n_embd_head_v = hparams.n_embd_head_v;
|
||||
const int64_t n_ff = hparams.n_ff();
|
||||
const int64_t n_embd_gqa = n_embd_v_gqa;
|
||||
@ -659,7 +637,7 @@ index aba42819..d051696c 100644
|
||||
const int64_t n_token_types = vocab.n_token_types();
|
||||
const int64_t n_rot = hparams.n_rot;
|
||||
const int64_t n_expert = hparams.n_expert;
|
||||
@@ -1803,6 +1817,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
@@ -1815,6 +1829,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
}
|
||||
}
|
||||
} break;
|
||||
@ -712,7 +690,7 @@ index aba42819..d051696c 100644
|
||||
case LLM_ARCH_DECI:
|
||||
{
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
@@ -4683,6 +4743,246 @@ struct llm_build_llama : public llm_graph_context {
|
||||
@@ -4707,6 +4767,246 @@ struct llm_build_llama : public llm_graph_context {
|
||||
}
|
||||
};
|
||||
|
||||
@ -959,7 +937,7 @@ index aba42819..d051696c 100644
|
||||
struct llm_build_deci : public llm_graph_context {
|
||||
llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
@@ -13017,6 +13317,10 @@ llm_graph_result_ptr llama_model::build_graph(
|
||||
@@ -13063,6 +13363,10 @@ llm_graph_result_ptr llama_model::build_graph(
|
||||
{
|
||||
llm = std::make_unique<llm_build_llama>(*this, params, gf);
|
||||
} break;
|
||||
@ -970,7 +948,7 @@ index aba42819..d051696c 100644
|
||||
case LLM_ARCH_DECI:
|
||||
{
|
||||
llm = std::make_unique<llm_build_deci>(*this, params, gf);
|
||||
@@ -13377,6 +13681,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
@@ -13424,6 +13728,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
// use what we call a normal RoPE, operating on pairs of consecutive head values
|
||||
case LLM_ARCH_LLAMA:
|
||||
case LLM_ARCH_LLAMA4:
|
||||
@ -979,7 +957,7 @@ index aba42819..d051696c 100644
|
||||
case LLM_ARCH_BAICHUAN:
|
||||
case LLM_ARCH_STARCODER:
|
||||
diff --git a/src/llama-model.h b/src/llama-model.h
|
||||
index 5865d5e9..72bab5be 100644
|
||||
index 856e6042..6be91282 100644
|
||||
--- a/src/llama-model.h
|
||||
+++ b/src/llama-model.h
|
||||
@@ -11,6 +11,7 @@
|
||||
@ -990,15 +968,15 @@ index 5865d5e9..72bab5be 100644
|
||||
|
||||
struct llama_cparams;
|
||||
struct llama_ubatch;
|
||||
@@ -70,6 +71,7 @@ enum llm_type {
|
||||
@@ -73,6 +74,7 @@ enum llm_type {
|
||||
LLM_TYPE_40B,
|
||||
LLM_TYPE_65B,
|
||||
LLM_TYPE_70B,
|
||||
+ LLM_TYPE_90B,
|
||||
LLM_TYPE_236B,
|
||||
LLM_TYPE_290B,
|
||||
LLM_TYPE_314B,
|
||||
LLM_TYPE_671B,
|
||||
@@ -310,6 +312,16 @@ struct llama_layer {
|
||||
@@ -314,6 +316,16 @@ struct llama_layer {
|
||||
|
||||
struct ggml_tensor * bskcn_tv = nullptr;
|
||||
|
||||
|
@ -18,10 +18,10 @@ adds the unpad operator to GGML
|
||||
10 files changed, 223 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
|
||||
index 8fcc16df..d19fc167 100644
|
||||
index 1b8603e7..53ef31b2 100644
|
||||
--- a/ggml/include/ggml.h
|
||||
+++ b/ggml/include/ggml.h
|
||||
@@ -488,6 +488,7 @@ extern "C" {
|
||||
@@ -489,6 +489,7 @@ extern "C" {
|
||||
GGML_OP_UPSCALE, // nearest interpolate
|
||||
GGML_OP_PAD,
|
||||
GGML_OP_PAD_REFLECT_1D,
|
||||
@ -29,7 +29,7 @@ index 8fcc16df..d19fc167 100644
|
||||
GGML_OP_ARANGE,
|
||||
GGML_OP_TIMESTEP_EMBEDDING,
|
||||
GGML_OP_ARGSORT,
|
||||
@@ -1757,6 +1758,15 @@ extern "C" {
|
||||
@@ -1777,6 +1778,15 @@ extern "C" {
|
||||
int p0,
|
||||
int p1);
|
||||
|
||||
@ -46,10 +46,10 @@ index 8fcc16df..d19fc167 100644
|
||||
// timesteps: [N,]
|
||||
// return: [N, dim]
|
||||
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
index 50400328..432942bf 100644
|
||||
index 64405449..34624cca 100644
|
||||
--- a/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
@@ -1960,6 +1960,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
||||
@@ -1964,6 +1964,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
||||
{
|
||||
ggml_compute_forward_pad_reflect_1d(params, tensor);
|
||||
} break;
|
||||
@ -60,7 +60,7 @@ index 50400328..432942bf 100644
|
||||
case GGML_OP_ARANGE:
|
||||
{
|
||||
ggml_compute_forward_arange(params, tensor);
|
||||
@@ -2282,6 +2286,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
||||
@@ -2287,6 +2291,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
||||
case GGML_OP_UPSCALE:
|
||||
case GGML_OP_PAD:
|
||||
case GGML_OP_PAD_REFLECT_1D:
|
||||
@ -69,10 +69,10 @@ index 50400328..432942bf 100644
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
case GGML_OP_ARGSORT:
|
||||
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
|
||||
index 6050147b..66b8da68 100644
|
||||
index 7413192b..becdae07 100644
|
||||
--- a/ggml/src/ggml-cpu/ops.cpp
|
||||
+++ b/ggml/src/ggml-cpu/ops.cpp
|
||||
@@ -6531,6 +6531,61 @@ void ggml_compute_forward_pad_reflect_1d(
|
||||
@@ -6703,6 +6703,61 @@ void ggml_compute_forward_pad_reflect_1d(
|
||||
}
|
||||
}
|
||||
|
||||
@ -135,10 +135,10 @@ index 6050147b..66b8da68 100644
|
||||
|
||||
static void ggml_compute_forward_arange_f32(
|
||||
diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h
|
||||
index 410a3720..3eca1cf8 100644
|
||||
index dc081b9e..a7125555 100644
|
||||
--- a/ggml/src/ggml-cpu/ops.h
|
||||
+++ b/ggml/src/ggml-cpu/ops.h
|
||||
@@ -71,6 +71,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params
|
||||
@@ -72,6 +72,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params
|
||||
void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
@ -147,10 +147,10 @@ index 410a3720..3eca1cf8 100644
|
||||
void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index 31750b6f..0fef9522 100644
|
||||
index 04ce764e..491acccb 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -2246,6 +2246,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||
@@ -2223,6 +2223,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||
case GGML_OP_PAD:
|
||||
ggml_cuda_op_pad(ctx, dst);
|
||||
break;
|
||||
@ -160,7 +160,7 @@ index 31750b6f..0fef9522 100644
|
||||
case GGML_OP_ARANGE:
|
||||
ggml_cuda_op_arange(ctx, dst);
|
||||
break;
|
||||
@@ -3222,6 +3225,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
@@ -3197,6 +3200,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_OP_UPSCALE:
|
||||
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
|
||||
case GGML_OP_PAD:
|
||||
@ -233,7 +233,7 @@ index 8fd386b0..e2ededc3 100644
|
||||
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
|
||||
index 12886cd3..b2e95a66 100644
|
||||
index 425524d0..112abef6 100644
|
||||
--- a/ggml/src/ggml-metal/ggml-metal.m
|
||||
+++ b/ggml/src/ggml-metal/ggml-metal.m
|
||||
@@ -341,6 +341,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
|
||||
@ -244,7 +244,7 @@ index 12886cd3..b2e95a66 100644
|
||||
GGML_METAL_KERNEL_TYPE_ARANGE_F32,
|
||||
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
|
||||
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
|
||||
@@ -1020,6 +1021,7 @@ @implementation GGMLMetalClass
|
||||
@@ -1277,6 +1278,7 @@ @implementation GGMLMetalClass
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
|
||||
@ -252,7 +252,7 @@ index 12886cd3..b2e95a66 100644
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
|
||||
@@ -1384,6 +1386,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
|
||||
@@ -1647,6 +1649,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
|
||||
case GGML_OP_POOL_2D:
|
||||
case GGML_OP_PAD:
|
||||
case GGML_OP_PAD_REFLECT_1D:
|
||||
@ -260,7 +260,7 @@ index 12886cd3..b2e95a66 100644
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
case GGML_OP_ARGSORT:
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
@@ -3731,6 +3734,36 @@ static void ggml_metal_encode_node(
|
||||
@@ -4047,6 +4050,36 @@ static bool ggml_metal_encode_node(
|
||||
|
||||
const int nth = MIN(1024, ne0);
|
||||
|
||||
@ -298,7 +298,7 @@ index 12886cd3..b2e95a66 100644
|
||||
} break;
|
||||
case GGML_OP_ARANGE:
|
||||
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
|
||||
index 8d6e99e6..71f0f97f 100644
|
||||
index 9f4147e9..6ceb3cef 100644
|
||||
--- a/ggml/src/ggml-metal/ggml-metal.metal
|
||||
+++ b/ggml/src/ggml-metal/ggml-metal.metal
|
||||
@@ -2975,6 +2975,51 @@ kernel void kernel_pad_reflect_1d_f32(
|
||||
@ -354,10 +354,10 @@ index 8d6e99e6..71f0f97f 100644
|
||||
device char * dst,
|
||||
constant ggml_metal_kargs_arange & args,
|
||||
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
|
||||
index 950772c7..2276b631 100644
|
||||
index 7654ae17..3c57aff8 100644
|
||||
--- a/ggml/src/ggml.c
|
||||
+++ b/ggml/src/ggml.c
|
||||
@@ -963,6 +963,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
||||
@@ -923,6 +923,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
||||
"UPSCALE",
|
||||
"PAD",
|
||||
"PAD_REFLECT_1D",
|
||||
@ -365,16 +365,16 @@ index 950772c7..2276b631 100644
|
||||
"ARANGE",
|
||||
"TIMESTEP_EMBEDDING",
|
||||
"ARGSORT",
|
||||
@@ -993,7 +994,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
||||
@@ -953,7 +954,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
||||
"OPT_STEP_ADAMW",
|
||||
};
|
||||
|
||||
-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
|
||||
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
|
||||
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
|
||||
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
|
||||
|
||||
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||
"none",
|
||||
@@ -1057,6 +1058,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||
@@ -1018,6 +1019,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||
"upscale(x)",
|
||||
"pad(x)",
|
||||
"pad_reflect_1d(x)",
|
||||
@ -382,16 +382,16 @@ index 950772c7..2276b631 100644
|
||||
"arange(start, stop, step)",
|
||||
"timestep_embedding(timesteps, dim, max_period)",
|
||||
"argsort(x)",
|
||||
@@ -1087,7 +1089,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||
@@ -1048,7 +1050,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||
"adamw(x)",
|
||||
};
|
||||
|
||||
-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
|
||||
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
|
||||
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
|
||||
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
|
||||
|
||||
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
|
||||
|
||||
@@ -4262,6 +4264,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
|
||||
@@ -4270,6 +4272,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -12,7 +12,7 @@ regex
|
||||
2 files changed, 22 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
|
||||
index a35b498c..032019c9 100644
|
||||
index a9ee9f03..1306864e 100644
|
||||
--- a/src/llama-vocab.cpp
|
||||
+++ b/src/llama-vocab.cpp
|
||||
@@ -296,7 +296,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
||||
|
@ -8,10 +8,10 @@ Subject: [PATCH] maintain ordering for rules for grammar
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
|
||||
index 90679822..56043678 100644
|
||||
index 5b3059c2..656b3eca 100644
|
||||
--- a/common/json-schema-to-grammar.cpp
|
||||
+++ b/common/json-schema-to-grammar.cpp
|
||||
@@ -346,7 +346,7 @@ private:
|
||||
@@ -349,7 +349,7 @@ private:
|
||||
friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
|
||||
std::function<json(const std::string &)> _fetch_json;
|
||||
bool _dotall;
|
||||
|
@ -22,10 +22,10 @@ multiple batches of processing until everything is complete.
|
||||
4 files changed, 51 insertions(+), 106 deletions(-)
|
||||
|
||||
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
|
||||
index 0343ba8a..4b3e6a83 100644
|
||||
index cd06ad91..77177c5e 100644
|
||||
--- a/src/llama-context.cpp
|
||||
+++ b/src/llama-context.cpp
|
||||
@@ -594,13 +594,12 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
|
||||
@@ -583,13 +583,12 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
|
||||
|
||||
llm_graph_result_ptr llama_context::build_kv_self_defrag(
|
||||
ggml_context * ctx0,
|
||||
@ -41,7 +41,7 @@ index 0343ba8a..4b3e6a83 100644
|
||||
#if 0
|
||||
// CPU defrag
|
||||
//
|
||||
@@ -672,32 +671,20 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
|
||||
@@ -661,32 +660,20 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
|
||||
ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
|
||||
}
|
||||
#else
|
||||
@ -79,7 +79,7 @@ index 0343ba8a..4b3e6a83 100644
|
||||
|
||||
ggml_tensor * view_v_src;
|
||||
ggml_tensor * view_v_dst;
|
||||
@@ -705,34 +692,30 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
|
||||
@@ -694,34 +681,30 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
|
||||
if (cparams.flash_attn) {
|
||||
// NOTE: the V cache is not transposed when using flash attention
|
||||
view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il],
|
||||
@ -122,7 +122,7 @@ index 0343ba8a..4b3e6a83 100644
|
||||
#endif
|
||||
|
||||
return res;
|
||||
@@ -741,8 +724,6 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
|
||||
@@ -730,8 +713,6 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
|
||||
void llama_context::kv_self_update() {
|
||||
auto & kv = kv_self;
|
||||
|
||||
@ -131,7 +131,7 @@ index 0343ba8a..4b3e6a83 100644
|
||||
if (kv->has_shift) {
|
||||
if (!kv->get_can_shift()) {
|
||||
GGML_ABORT("The current context does not support K-shift");
|
||||
@@ -763,8 +744,6 @@ void llama_context::kv_self_update() {
|
||||
@@ -752,8 +733,6 @@ void llama_context::kv_self_update() {
|
||||
res->set_inputs(nullptr);
|
||||
|
||||
graph_compute(gf, false);
|
||||
@ -140,7 +140,7 @@ index 0343ba8a..4b3e6a83 100644
|
||||
}
|
||||
|
||||
{
|
||||
@@ -779,49 +758,28 @@ void llama_context::kv_self_update() {
|
||||
@@ -768,49 +747,28 @@ void llama_context::kv_self_update() {
|
||||
// defragment the KV cache if needed
|
||||
if (kv->do_defrag) {
|
||||
LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
|
||||
@ -202,7 +202,7 @@ index 0343ba8a..4b3e6a83 100644
|
||||
}
|
||||
|
||||
enum llama_pooling_type llama_context::pooling_type() const {
|
||||
@@ -1305,9 +1263,12 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
@@ -1294,9 +1252,12 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
// find KV slot
|
||||
{
|
||||
if (!kv_self->find_slot(ubatch)) {
|
||||
@ -219,7 +219,7 @@ index 0343ba8a..4b3e6a83 100644
|
||||
|
||||
if (!kv_self->recurrent) {
|
||||
diff --git a/src/llama-context.h b/src/llama-context.h
|
||||
index baa03276..a59ff8fd 100644
|
||||
index a50c4afa..30f84bfd 100644
|
||||
--- a/src/llama-context.h
|
||||
+++ b/src/llama-context.h
|
||||
@@ -5,6 +5,7 @@
|
||||
@ -230,7 +230,7 @@ index baa03276..a59ff8fd 100644
|
||||
|
||||
#include "ggml-cpp.h"
|
||||
|
||||
@@ -180,7 +181,8 @@ private:
|
||||
@@ -179,7 +180,8 @@ private:
|
||||
|
||||
llm_graph_result_ptr build_kv_self_defrag(
|
||||
ggml_context * ctx0,
|
||||
|
@ -8,10 +8,10 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants
|
||||
1 file changed, 2 insertions(+)
|
||||
|
||||
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
|
||||
index f00700da..91d6a7d5 100644
|
||||
index 43d9fc4f..4c0d3824 100644
|
||||
--- a/ggml/src/CMakeLists.txt
|
||||
+++ b/ggml/src/CMakeLists.txt
|
||||
@@ -278,6 +278,7 @@ function(ggml_add_cpu_backend_variant tag_name)
|
||||
@@ -279,6 +279,7 @@ function(ggml_add_cpu_backend_variant tag_name)
|
||||
endforeach()
|
||||
|
||||
ggml_add_cpu_backend_variant_impl(${tag_name})
|
||||
@ -19,11 +19,11 @@ index f00700da..91d6a7d5 100644
|
||||
endfunction()
|
||||
|
||||
ggml_add_backend(CPU)
|
||||
@@ -286,6 +287,7 @@ if (GGML_CPU_ALL_VARIANTS)
|
||||
@@ -287,6 +288,7 @@ if (GGML_CPU_ALL_VARIANTS)
|
||||
if (NOT GGML_BACKEND_DL)
|
||||
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
|
||||
endif()
|
||||
+ add_custom_target(ggml-cpu)
|
||||
ggml_add_cpu_backend_variant(sandybridge AVX)
|
||||
ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 BMI2 FMA)
|
||||
ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 BMI2 FMA AVX512)
|
||||
ggml_add_cpu_backend_variant(x64)
|
||||
ggml_add_cpu_backend_variant(sse42 SSE42)
|
||||
ggml_add_cpu_backend_variant(sandybridge SSE42 AVX)
|
||||
|
@ -1,6 +1,6 @@
|
||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: jmorganca <jmorganca@gmail.com>
|
||||
Date: Tue, 8 Apr 2025 20:33:01 -0700
|
||||
Date: Thu, 1 May 2025 15:05:08 -0700
|
||||
Subject: [PATCH] remove amx
|
||||
|
||||
disable amx as it reduces performance on some systems
|
||||
@ -9,16 +9,16 @@ disable amx as it reduces performance on some systems
|
||||
1 file changed, 4 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
|
||||
index 91d6a7d5..d6b393a2 100644
|
||||
index 4c0d3824..79c26312 100644
|
||||
--- a/ggml/src/CMakeLists.txt
|
||||
+++ b/ggml/src/CMakeLists.txt
|
||||
@@ -293,10 +293,6 @@ if (GGML_CPU_ALL_VARIANTS)
|
||||
ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 BMI2 FMA AVX512)
|
||||
ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
|
||||
ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 BMI2 FMA AVX_VNNI)
|
||||
@@ -296,10 +296,6 @@ if (GGML_CPU_ALL_VARIANTS)
|
||||
ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
|
||||
ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
|
||||
ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
|
||||
- if (NOT MSVC)
|
||||
- # MSVC doesn't support AMX
|
||||
- ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
|
||||
- ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
|
||||
- endif()
|
||||
elseif (GGML_CPU)
|
||||
ggml_add_cpu_backend_variant_impl("")
|
||||
|
@ -53,7 +53,7 @@ index 381a9c7d..e45b453d 100644
|
||||
}
|
||||
|
||||
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
|
||||
index 032019c9..ba37df35 100644
|
||||
index 1306864e..d6515ff6 100644
|
||||
--- a/src/llama-vocab.cpp
|
||||
+++ b/src/llama-vocab.cpp
|
||||
@@ -1459,7 +1459,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
|
@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor
|
||||
1 file changed, 6 insertions(+)
|
||||
|
||||
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
index 432942bf..6d4abe4c 100644
|
||||
index 34624cca..59bd3c62 100644
|
||||
--- a/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
@@ -15,6 +15,8 @@
|
||||
@ -20,7 +20,7 @@ index 432942bf..6d4abe4c 100644
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
#include <malloc.h> // using malloc.h with MSC/MINGW
|
||||
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
|
||||
@@ -2854,6 +2856,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||
@@ -2859,6 +2861,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||
|
||||
ggml_compute_forward(¶ms, node);
|
||||
|
||||
|
@ -1,96 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 8 Apr 2025 20:39:32 -0700
Subject: [PATCH] add model quantizations

a temporary patch to add model quantization for
models not supported in llama.cpp
---
src/llama-arch.cpp | 17 +++++++++++++++++
src/llama-arch.h | 1 +
src/llama-model.cpp | 2 ++
src/llama-quant.cpp | 4 ++++
4 files changed, 24 insertions(+)

diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
|
||||
index 0568565f..dd01df60 100644
|
||||
--- a/src/llama-arch.cpp
|
||||
+++ b/src/llama-arch.cpp
|
||||
@@ -73,6 +73,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
|
||||
{ LLM_ARCH_PLM, "plm" },
|
||||
{ LLM_ARCH_BAILINGMOE, "bailingmoe" },
|
||||
+ { LLM_ARCH_MISTRAL3, "mistral3" },
|
||||
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
||||
};
|
||||
|
||||
@@ -1586,6 +1587,22 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
||||
},
|
||||
},
|
||||
+ {
|
||||
+ LLM_ARCH_MISTRAL3,
|
||||
+ {
|
||||
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
+ }
|
||||
+ },
|
||||
{
|
||||
LLM_ARCH_UNKNOWN,
|
||||
{
|
||||
diff --git a/src/llama-arch.h b/src/llama-arch.h
|
||||
index 6a989034..b6227eeb 100644
|
||||
--- a/src/llama-arch.h
|
||||
+++ b/src/llama-arch.h
|
||||
@@ -75,6 +75,7 @@ enum llm_arch {
|
||||
LLM_ARCH_CHAMELEON,
|
||||
LLM_ARCH_SOLAR,
|
||||
LLM_ARCH_WAVTOKENIZER_DEC,
|
||||
+ LLM_ARCH_MISTRAL3,
|
||||
LLM_ARCH_PLM,
|
||||
LLM_ARCH_BAILINGMOE,
|
||||
LLM_ARCH_UNKNOWN,
|
||||
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
|
||||
index d051696c..c8374159 100644
|
||||
--- a/src/llama-model.cpp
|
||||
+++ b/src/llama-model.cpp
|
||||
@@ -1425,6 +1425,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
+ case LLM_ARCH_MISTRAL3: break;
|
||||
default: throw std::runtime_error("unsupported model architecture");
|
||||
}
|
||||
|
||||
@@ -13704,6 +13705,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
case LLM_ARCH_CHAMELEON:
|
||||
case LLM_ARCH_SOLAR:
|
||||
case LLM_ARCH_BAILINGMOE:
|
||||
+ case LLM_ARCH_MISTRAL3:
|
||||
return LLAMA_ROPE_TYPE_NORM;
|
||||
|
||||
// the pairs of head values are offset by n_rot/2
|
||||
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
|
||||
index 223e1f3f..8ae6dde8 100644
|
||||
--- a/src/llama-quant.cpp
|
||||
+++ b/src/llama-quant.cpp
|
||||
@@ -744,6 +744,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||
// This used to be a regex, but <regex> has an extreme cost to compile times.
|
||||
bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
|
||||
|
||||
+ // don't quantize vision stuff
|
||||
+ quantize &= name.find("v.") == std::string::npos;
|
||||
+ quantize &= name.find("mm.") == std::string::npos;
|
||||
+
|
||||
// quantize only 2D and 3D tensors (experts)
|
||||
quantize &= (ggml_n_dims(tensor) >= 2);
|
||||
|
@ -184,10 +184,10 @@ index f8c291de..2a3a62db 100644
|
||||
const char * grammar_root,
|
||||
bool lazy,
|
||||
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
|
||||
index d1497985..b1a9dca3 100644
|
||||
index c0a5f934..75731053 100644
|
||||
--- a/src/llama-sampling.cpp
|
||||
+++ b/src/llama-sampling.cpp
|
||||
@@ -1465,7 +1465,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
|
||||
@@ -1466,7 +1466,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
|
||||
trigger_patterns_c.push_back(trigger_pattern.pattern.c_str());
|
||||
}
|
||||
|
||||
@ -196,7 +196,7 @@ index d1497985..b1a9dca3 100644
|
||||
ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(),
|
||||
ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size());
|
||||
|
||||
@@ -1547,7 +1547,7 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
|
||||
@@ -1548,7 +1548,7 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
|
||||
/* .vocab = */ vocab,
|
||||
/* .grammar_str = */ grammar_str,
|
||||
/* .grammar_root = */ grammar_root,
|
@ -0,0 +1,38 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@kernel.org>
Date: Thu, 1 May 2025 13:46:10 -0700
Subject: [PATCH] ggml: Don't assert fail when tensor data changes (#13222)

The following scenario will cause an assertion failure in the graph
allocator:
- Build and allocate a graph containing a tensor with a non-NULL data
pointer
- Build and allocate a new graph where that data is NULL

Result:
ggml-alloc.c:819: GGML_ASSERT(talloc->buffer_id >= 0) failed

This happens during revalidation because we think that memory should
have been previously allocated based on the current graph but in
reality the previous graph was different. In this situation, we
should do a full reallocation pass.
---
ggml/src/ggml-alloc.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index a3d3f690..5fd379f6 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -816,7 +816,10 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
size_t node_size = 0;
if (!node->data && !node->view_src) {
- GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API
+ // If we previously had data but don't now then reallocate
+ if (talloc->buffer_id < 0) {
+ return false;
+ }
node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
}
return talloc->size_max >= node_size;
3
llama/sampling_ext.cpp
vendored
@ -19,9 +19,6 @@ struct common_sampler *common_sampler_cinit(const struct llama_model *model, str
sparams.penalty_repeat = params->penalty_repeat;
sparams.penalty_freq = params->penalty_freq;
sparams.penalty_present = params->penalty_present;
sparams.mirostat = params->mirostat;
sparams.mirostat_tau = params->mirostat_tau;
sparams.mirostat_eta = params->mirostat_eta;
sparams.seed = params->seed;
sparams.grammar = params->grammar;
sparams.xtc_probability = 0.0;
||||
|
3
llama/sampling_ext.h
vendored
@ -20,9 +20,6 @@ extern "C"
float penalty_repeat;
float penalty_freq;
float penalty_present;
int32_t mirostat;
float mirostat_tau;
float mirostat_eta;
uint32_t seed;
char *grammar;
};
||||
|
@ -7,6 +7,7 @@ import (
const (
CREATE_DEFAULT_ERROR_MODE = 0x04000000
ABOVE_NORMAL_PRIORITY_CLASS = 0x00008000
CREATE_NO_WINDOW = 0x08000000
)

var LlamaServerSysProcAttr = &syscall.SysProcAttr{
@ -18,5 +19,5 @@ var LlamaServerSysProcAttr = &syscall.SysProcAttr{
//
// Setting Above Normal priority class ensures when running as a "background service"
// with "programs" given best priority, we aren't starved of cpu cycles
CreationFlags: CREATE_DEFAULT_ERROR_MODE | ABOVE_NORMAL_PRIORITY_CLASS,
CreationFlags: CREATE_DEFAULT_ERROR_MODE | ABOVE_NORMAL_PRIORITY_CLASS | CREATE_NO_WINDOW,
}
||||
|
@ -25,7 +25,7 @@ func TestEstimateGPULayers(t *testing.T) {
defer f.Close()
inputLayerCount := 5

tensors := []ggml.Tensor{
tensors := []*ggml.Tensor{
{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
||||
|
@ -44,6 +44,7 @@ type LlamaServer interface {
EstimatedVRAM() uint64 // Total VRAM across all GPUs
EstimatedTotal() uint64
EstimatedVRAMByGPU(gpuID string) uint64
Pid() int
}

// llmServer is an instance of the llama.cpp server
@ -216,10 +217,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
params = append(params, "--no-mmap")
}

if opts.UseMLock {
params = append(params, "--mlock")
}

// TODO - NUMA support currently doesn't work properly

params = append(params, "--parallel", strconv.Itoa(numParallel))
@ -289,7 +286,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
params = append(params, "--mmproj", projectors[0])
}

// iterate through compatible GPU libraries such as 'cuda_v12', 'cuda_v11', 'rocm', etc.
// iterate through compatible GPU libraries such as 'cuda_v12', 'rocm', etc.
// adding each library's respective path to the LD_LIBRARY_PATH, until finally running
// without any LD_LIBRARY_PATH flags
for {
@ -324,21 +321,23 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
pathEnv = "LD_LIBRARY_PATH"
}

var libraryPaths []string
// Note: we always put our dependency paths first
// since these are the exact version we compiled/linked against
libraryPaths := []string{discover.LibOllamaPath}
if libraryPath, ok := os.LookupEnv(pathEnv); ok {
libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
}

ggmlPaths := []string{discover.LibOllamaPath}
if len(compatible) > 0 {
c := compatible[0]
if libpath, ok := libs[c]; ok {
slog.Debug("adding gpu library", "path", libpath)
libraryPaths = append(libraryPaths, libpath)
libraryPaths = append([]string{libpath}, libraryPaths...)
ggmlPaths = append(ggmlPaths, libpath)
}
}

// Note: we always put the dependency path first
// since this was the exact version we compiled/linked against
if gpus[0].DependencyPath != nil {
slog.Debug("adding gpu dependency paths", "paths", gpus[0].DependencyPath)
// assume gpus from the same library have the same dependency path
@ -369,6 +368,8 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
s.cmd.Stderr = s.status
s.cmd.SysProcAttr = LlamaServerSysProcAttr

s.cmd.Env = append(s.cmd.Env, "OLLAMA_LIBRARY_PATH="+strings.Join(ggmlPaths, string(filepath.ListSeparator)))

envWorkarounds := [][2]string{}
for _, gpu := range gpus {
envWorkarounds = append(envWorkarounds, gpu.EnvWorkarounds...)
@ -406,7 +407,8 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
if envconfig.Debug() {
filteredEnv := []string{}
for _, ev := range s.cmd.Env {
if strings.HasPrefix(ev, "CUDA_") ||
if strings.HasPrefix(ev, "OLLAMA_") ||
strings.HasPrefix(ev, "CUDA_") ||
strings.HasPrefix(ev, "ROCR_") ||
strings.HasPrefix(ev, "ROCM_") ||
strings.HasPrefix(ev, "HIP_") ||
@ -515,6 +517,9 @@ func (s *llmServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
if errors.Is(err, context.DeadlineExceeded) {
return ServerStatusNotResponding, errors.New("server not responding")
}
if strings.Contains(err.Error(), "connection refused") {
return ServerStatusNotResponding, errors.New("connection refused")
}
return ServerStatusError, fmt.Errorf("health resp: %w", err)
}
defer resp.Body.Close()
@ -635,6 +640,13 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
}
}

func (s *llmServer) Pid() int {
if s.cmd != nil && s.cmd.Process != nil {
return s.cmd.Process.Pid
}
return -1
}

var grammarJSON = `
root ::= object
value ::= object | array | string | number | ("true" | "false" | "null") ws
@ -998,17 +1010,17 @@ func (s *llmServer) Close() error {
s.llamaModelLock.Unlock()

if s.cmd != nil {
slog.Debug("stopping llama server")
slog.Debug("stopping llama server", "pid", s.Pid())
if err := s.cmd.Process.Kill(); err != nil {
return err
}
// if ProcessState is already populated, Wait already completed, no need to wait again
if s.cmd.ProcessState == nil {
slog.Debug("waiting for llama server to exit")
slog.Debug("waiting for llama server to exit", "pid", s.Pid())
<-s.done
}

slog.Debug("llama server stopped")
slog.Debug("llama server stopped", "pid", s.Pid())
}

return nil
||||
|
@ -16,7 +16,7 @@ func TestLLMServerCompletionFormat(t *testing.T) {
|
||||
// of a mess, and but it's good enough, until we can refactoring the
|
||||
// Completion method to be more testable.
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
ctx, cancel := context.WithCancel(t.Context())
|
||||
s := &llmServer{
|
||||
sem: semaphore.NewWeighted(1), // required to prevent nil panic
|
||||
}
|
||||
@ -312,6 +312,7 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
g, ctx := errgroup.WithContext(ctx)
g.SetLimit(runtime.GOMAXPROCS(0))
for _, t := range meta.Tensors().Items() {
t := t
g.Go(func() error {
tts := make([]*C.struct_ggml_tensor, max(1, len(targets[t.Name])))
for i := range tts {
@ -341,6 +342,11 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,

var s uint64
for s < t.Size() {
// Stop if either the parent context has been canceled or if any of the other tensors returned an error
if err := ctx.Err(); err != nil {
return err
}

n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Size()-s))])
if err != nil {
slog.Warn("file read error", "file", r.Name(), "error", err)
@ -363,14 +369,6 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
})
}

// start a goroutine to cancel the errgroup if the parent context is done
go func() {
<-ctx.Done()
g.Go(func() error {
return ctx.Err()
})
}()

if err := g.Wait(); err != nil {
return nil, err
}
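The goroutine removed above only existed to surface parent-context cancellation to the errgroup; with the ctx.Err() check inside the read loop (previous hunk) it is redundant, since errgroup.WithContext already cancels the derived context when any worker fails. A standalone sketch of that pattern, with the simulated failure purely illustrative:

package main

import (
	"context"
	"errors"
	"fmt"
	"runtime"

	"golang.org/x/sync/errgroup"
)

func main() {
	g, ctx := errgroup.WithContext(context.Background())
	g.SetLimit(runtime.GOMAXPROCS(0))

	for i := 0; i < 8; i++ {
		i := i
		g.Go(func() error {
			for chunk := 0; chunk < 100; chunk++ {
				// Stop early if the parent context was canceled or another
				// worker already returned an error.
				if err := ctx.Err(); err != nil {
					return err
				}
				if i == 3 && chunk == 10 {
					return errors.New("simulated read failure")
				}
			}
			return nil
		})
	}

	// Wait returns the first non-nil error; the derived ctx is canceled as
	// soon as that error occurs, so other workers bail out at their next check.
	if err := g.Wait(); err != nil {
		fmt.Println("load failed:", err)
	}
}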
5
ml/backend/ggml/ggml/include/ggml-cpu.h
vendored
@ -133,6 +133,11 @@ extern "C" {

GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);

GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);
GGML_BACKEND_API void ggml_cpu_bf16_to_fp32(const ggml_bf16_t *, float *, int64_t);

#ifdef __cplusplus
}
#endif
2
ml/backend/ggml/ggml/include/ggml-rpc.h
vendored
@ -7,7 +7,7 @@
extern "C" {
#endif

#define RPC_PROTO_MAJOR_VERSION 1
#define RPC_PROTO_MAJOR_VERSION 2
#define RPC_PROTO_MINOR_VERSION 0
#define RPC_PROTO_PATCH_VERSION 0
#define GGML_RPC_MAX_SERVERS 16
26
ml/backend/ggml/ggml/include/ggml.h
vendored
@ -393,8 +393,8 @@ extern "C" {

// precision
enum ggml_prec {
GGML_PREC_DEFAULT,
GGML_PREC_F32,
GGML_PREC_DEFAULT = 0, // stored as ggml_tensor.op_params, 0 by default
GGML_PREC_F32 = 10,
};

// model file types
@ -481,6 +481,7 @@ extern "C" {
GGML_OP_CONV_TRANSPOSE_1D,
GGML_OP_IM2COL,
GGML_OP_IM2COL_BACK,
GGML_OP_CONV_2D_DW,
GGML_OP_CONV_TRANSPOSE_2D,
GGML_OP_POOL_1D,
GGML_OP_POOL_2D,
@ -678,6 +679,9 @@ extern "C" {
GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2

// true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);

GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
@ -1661,7 +1665,7 @@ extern "C" {
struct ggml_tensor * a,
struct ggml_tensor * b);

// depthwise
// depthwise (via im2col and mul_mat)
GGML_API struct ggml_tensor * ggml_conv_2d_dw(
struct ggml_context * ctx,
struct ggml_tensor * a, // convolution kernel
@ -1673,6 +1677,22 @@ extern "C" {
int d0, // dilation dimension 0
int d1); // dilation dimension 1

// Depthwise 2D convolution
// may be faster than ggml_conv_2d_dw, but not available in all backends
// a: KW KH 1 C convolution kernel
// b: W H C N input data
// res: W_out H_out C N
GGML_API struct ggml_tensor * ggml_conv_2d_dw_direct(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int stride0,
int stride1,
int pad0,
int pad1,
int dilation0,
int dilation1);

GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
struct ggml_context * ctx,
struct ggml_tensor * a,
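The comment block for ggml_conv_2d_dw_direct gives the tensor shapes but not how W_out and H_out are derived; assuming it follows the standard convolution output-size arithmetic (an assumption, not stated in the header), they would be computed as in this sketch:

package main

import "fmt"

// convOutSize computes the standard convolution output length for one spatial
// dimension:
//   out = (in + 2*pad - dilation*(kernel-1) - 1) / stride + 1
// This illustrates the W_out/H_out mentioned in the header comment; it is not
// code taken from ggml itself.
func convOutSize(in, kernel, stride, pad, dilation int) int {
	return (in+2*pad-dilation*(kernel-1)-1)/stride + 1
}

func main() {
	// Hypothetical example: 64x64 input, 3x3 depthwise kernel, stride 1,
	// padding 1, dilation 1 -> the output stays 64x64.
	wOut := convOutSize(64, 3, 1, 1, 1)
	hOut := convOutSize(64, 3, 1, 1, 1)
	fmt.Println(wOut, hOut) // 64 64
}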
13
ml/backend/ggml/ggml/src/CMakeLists.txt
vendored
@ -267,6 +267,7 @@ function(ggml_add_cpu_backend_variant tag_name)
set(GGML_CPU_TAG_NAME ${tag_name})
# other: OPENMP LLAMAFILE CPU_HBM
foreach (feat NATIVE
SSE42
AVX AVX2 BMI2 AVX_VNNI FMA F16C
AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
AMX_TILE AMX_INT8 AMX_BF16)
@ -288,11 +289,13 @@ if (GGML_CPU_ALL_VARIANTS)
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
endif()
add_custom_target(ggml-cpu)
ggml_add_cpu_backend_variant(sandybridge AVX)
ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 BMI2 FMA)
ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 BMI2 FMA AVX512)
ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 BMI2 FMA AVX_VNNI)
ggml_add_cpu_backend_variant(x64)
ggml_add_cpu_backend_variant(sse42 SSE42)
ggml_add_cpu_backend_variant(sandybridge SSE42 AVX)
ggml_add_cpu_backend_variant(haswell SSE42 AVX F16C AVX2 BMI2 FMA)
ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
elseif (GGML_CPU)
ggml_add_cpu_backend_variant_impl("")
endif()
Some files were not shown because too many files have changed in this diff.