diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 12f361408..b2e122469 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -103,11 +103,6 @@ jobs: arch: [amd64] preset: ['CPU'] include: - - os: windows - arch: amd64 - preset: 'CUDA 11' - install: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe - cuda-version: '11.3' - os: windows arch: amd64 preset: 'CUDA 12' @@ -324,7 +319,6 @@ jobs: case "$COMPONENT" in bin/ollama) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;; lib/ollama/*.so) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;; - lib/ollama/cuda_v11) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;; lib/ollama/cuda_v12) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;; lib/ollama/cuda_jetpack5) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;; lib/ollama/cuda_jetpack6) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;; @@ -432,6 +426,22 @@ jobs: docker buildx imagetools inspect ollama/ollama:${{ steps.metadata.outputs.version }} working-directory: ${{ runner.temp }} + # Trigger downstream release process + trigger: + runs-on: ubuntu-latest + environment: release + needs: [darwin-build, windows-build, windows-depends] + steps: + - name: Trigger downstream release process + run: | + curl -L \ + -X POST \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer ${{ secrets.RELEASE_TOKEN }}" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + https://api.github.com/repos/ollama/${{ vars.RELEASE_REPO }}/dispatches \ + -d "{\"event_type\": \"trigger-workflow\", \"client_payload\": {\"run_id\": \"${GITHUB_RUN_ID}\", \"version\": \"${GITHUB_REF_NAME#v}\"}}" + # Aggregate all the assets and ship a release release: needs: [darwin-sign, windows-sign, linux-build] diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 27e229fcf..2e7093391 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -46,7 +46,7 @@ jobs: include: - preset: CPU - preset: CUDA - container: nvidia/cuda:11.8.0-devel-ubuntu22.04 + container: nvidia/cuda:12.8.1-devel-ubuntu22.04 flags: '-DCMAKE_CUDA_ARCHITECTURES=87' - preset: ROCm container: rocm/dev-ubuntu-22.04:6.1.2 @@ -78,7 +78,7 @@ jobs: include: - preset: CPU - preset: CUDA - install: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe + install: https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_571.96_windows.exe flags: '-DCMAKE_CUDA_ARCHITECTURES=80' - preset: ROCm install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe @@ -102,7 +102,7 @@ jobs: $ErrorActionPreference = "Stop" if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') { Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe" - Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_11.3", "nvcc_11.3", "cublas_11.3", "cublas_dev_11.3")) -NoNewWindow -Wait + Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_12.8", "nvcc_12.8", "cublas_12.8", "cublas_dev_12.8")) -NoNewWindow -Wait } $cudaPath = (Resolve-Path "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*").path diff --git a/.golangci.yaml b/.golangci.yaml index 9bb9786a8..9d6705bd3 100644 --- a/.golangci.yaml +++ b/.golangci.yaml @@ -19,8 +19,8 
@@ linters: - nolintlint - nosprintfhostport - staticcheck - - tenv - unconvert + - usetesting - wastedassign - whitespace disable: diff --git a/CMakePresets.json b/CMakePresets.json index 0b70d8ba3..2f29e041e 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -17,14 +17,6 @@ "name": "CUDA", "inherits": [ "Default" ] }, - { - "name": "CUDA 11", - "inherits": [ "CUDA" ], - "cacheVariables": { - "CMAKE_CUDA_ARCHITECTURES": "50;52;53;60;61;70;75;80;86", - "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets" - } - }, { "name": "CUDA 12", "inherits": [ "CUDA" ], @@ -78,11 +70,6 @@ "configurePreset": "CUDA", "targets": [ "ggml-cuda" ] }, - { - "name": "CUDA 11", - "inherits": [ "CUDA" ], - "configurePreset": "CUDA 11" - }, { "name": "CUDA 12", "inherits": [ "CUDA" ], diff --git a/Dockerfile b/Dockerfile index 4c6619e77..1196dc535 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,14 +7,10 @@ ARG JETPACK5VERSION=r35.4.1 ARG JETPACK6VERSION=r36.4.0 ARG CMAKEVERSION=3.31.2 -# CUDA v11 requires gcc v10. v10.3 has regressions, so the rockylinux 8.5 AppStream has the latest compatible version FROM --platform=linux/amd64 rocm/dev-almalinux-8:${ROCMVERSION}-complete AS base-amd64 RUN yum install -y yum-utils \ - && yum-config-manager --add-repo https://dl.rockylinux.org/vault/rocky/8.5/AppStream/\$basearch/os/ \ - && rpm --import https://dl.rockylinux.org/pub/rocky/RPM-GPG-KEY-Rocky-8 \ - && dnf install -y yum-utils ccache gcc-toolset-10-gcc-10.2.1-8.2.el8 gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 gcc-toolset-10-binutils-2.35-11.el8 \ + && dnf install -y ccache \ && yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo -ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH FROM --platform=linux/arm64 almalinux:8 AS base-arm64 # install epel-release for ccache @@ -38,15 +34,6 @@ RUN --mount=type=cache,target=/root/.ccache \ && cmake --build --parallel --preset 'CPU' \ && cmake --install build --component CPU --strip --parallel 8 -FROM base AS cuda-11 -ARG CUDA11VERSION=11.3 -RUN dnf install -y cuda-toolkit-${CUDA11VERSION//./-} -ENV PATH=/usr/local/cuda-11/bin:$PATH -RUN --mount=type=cache,target=/root/.ccache \ - cmake --preset 'CUDA 11' \ - && cmake --build --parallel --preset 'CUDA 11' \ - && cmake --install build --component CUDA --strip --parallel 8 - FROM base AS cuda-12 ARG CUDA12VERSION=12.8 RUN dnf install -y cuda-toolkit-${CUDA12VERSION//./-} @@ -98,11 +85,9 @@ RUN --mount=type=cache,target=/root/.cache/go-build \ go build -trimpath -buildmode=pie -o /bin/ollama . 
FROM --platform=linux/amd64 scratch AS amd64 -COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11 COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12 FROM --platform=linux/arm64 scratch AS arm64 -COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11 COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12 COPY --from=jetpack-5 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_jetpack5 COPY --from=jetpack-6 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_jetpack6 diff --git a/Makefile.sync b/Makefile.sync index 949ade809..238d76279 100644 --- a/Makefile.sync +++ b/Makefile.sync @@ -1,6 +1,6 @@ UPSTREAM=https://github.com/ggerganov/llama.cpp.git WORKDIR=llama/vendor -FETCH_HEAD=2016f07bd106c73699ecbaace80f55db5ed95dac +FETCH_HEAD=e1e8e0991ffd9e99a445c6812bb519d5bac9f4b5 .PHONY: help help: diff --git a/README.md b/README.md index 30019aeb4..e99226ae7 100644 --- a/README.md +++ b/README.md @@ -61,6 +61,8 @@ Here are some example models that can be downloaded: | QwQ | 32B | 20GB | `ollama run qwq` | | DeepSeek-R1 | 7B | 4.7GB | `ollama run deepseek-r1` | | DeepSeek-R1 | 671B | 404GB | `ollama run deepseek-r1:671b` | +| Llama 4 | 109B | 67GB | `ollama run llama4:scout` | +| Llama 4 | 400B | 245GB | `ollama run llama4:maverick` | | Llama 3.3 | 70B | 43GB | `ollama run llama3.3` | | Llama 3.2 | 3B | 2.0GB | `ollama run llama3.2` | | Llama 3.2 | 1B | 1.3GB | `ollama run llama3.2:1b` | @@ -77,7 +79,7 @@ Here are some example models that can be downloaded: | Code Llama | 7B | 3.8GB | `ollama run codellama` | | Llama 2 Uncensored | 7B | 3.8GB | `ollama run llama2-uncensored` | | LLaVA | 7B | 4.5GB | `ollama run llava` | -| Granite-3.2 | 8B | 4.9GB | `ollama run granite3.2` | +| Granite-3.3 | 8B | 4.9GB | `ollama run granite3.3` | > [!NOTE] > You should have at least 8 GB of RAM available to run the 7B models, 16 GB to run the 13B models, and 32 GB to run the 33B models. @@ -285,7 +287,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Bionic GPT](https://github.com/bionic-gpt/bionic-gpt) - [HTML UI](https://github.com/rtcfirefly/ollama-ui) - [Saddle](https://github.com/jikkuatwork/saddle) -- [TagSpaces](https://www.tagspaces.org) (A platform for file based apps, [utilizing Ollama](https://docs.tagspaces.org/ai/) for the generation of tags and descriptions) +- [TagSpaces](https://www.tagspaces.org) (A platform for file-based apps, [utilizing Ollama](https://docs.tagspaces.org/ai/) for the generation of tags and descriptions) - [Chatbot UI](https://github.com/ivanfioravanti/chatbot-ollama) - [Chatbot UI v2](https://github.com/mckaywrigley/chatbot-ui) - [Typescript UI](https://github.com/ollama-interface/Ollama-Gui?tab=readme-ov-file) @@ -312,6 +314,7 @@ See the [API documentation](./docs/api.md) for all endpoints. 
- [Ollama Basic Chat: Uses HyperDiv Reactive UI](https://github.com/rapidarchitect/ollama_basic_chat) - [Ollama-chats RPG](https://github.com/drazdra/ollama-chats) - [IntelliBar](https://intellibar.app/) (AI-powered assistant for macOS) +- [Jirapt](https://github.com/AliAhmedNada/jirapt) (Jira Integration to generate issues, tasks, epics) - [QA-Pilot](https://github.com/reid41/QA-Pilot) (Interactive chat tool that can leverage Ollama models for rapid understanding and navigation of GitHub code repositories) - [ChatOllama](https://github.com/sugarforever/chat-ollama) (Open Source Chatbot based on Ollama with Knowledge Bases) - [CRAG Ollama Chat](https://github.com/Nagi-ovo/CRAG-Ollama-Chat) (Simple Web Search with Corrective RAG) @@ -325,14 +328,14 @@ See the [API documentation](./docs/api.md) for all endpoints. - [RWKV-Runner](https://github.com/josStorer/RWKV-Runner) (RWKV offline LLM deployment tool, also usable as a client for ChatGPT and Ollama) - [Ollama Grid Search](https://github.com/dezoito/ollama-grid-search) (app to evaluate and compare models) - [Olpaka](https://github.com/Otacon/olpaka) (User-friendly Flutter Web App for Ollama) -- [Casibase](https://casibase.org) (An open source AI knowledge base and dialogue system combining the latest RAG, SSO, ollama support and multiple large language models.) +- [Casibase](https://casibase.org) (An open source AI knowledge base and dialogue system combining the latest RAG, SSO, ollama support, and multiple large language models.) - [OllamaSpring](https://github.com/CrazyNeil/OllamaSpring) (Ollama Client for macOS) - [LLocal.in](https://github.com/kartikm7/llocal) (Easy to use Electron Desktop Client for Ollama) - [Shinkai Desktop](https://github.com/dcSpark/shinkai-apps) (Two click install Local AI using Ollama + Files + RAG) -- [AiLama](https://github.com/zeyoyt/ailama) (A Discord User App that allows you to interact with Ollama anywhere in discord ) +- [AiLama](https://github.com/zeyoyt/ailama) (A Discord User App that allows you to interact with Ollama anywhere in Discord) - [Ollama with Google Mesop](https://github.com/rapidarchitect/ollama_mesop/) (Mesop Chat Client implementation with Ollama) - [R2R](https://github.com/SciPhi-AI/R2R) (Open-source RAG engine) -- [Ollama-Kis](https://github.com/elearningshow/ollama-kis) (A simple easy to use GUI with sample custom LLM for Drivers Education) +- [Ollama-Kis](https://github.com/elearningshow/ollama-kis) (A simple easy-to-use GUI with sample custom LLM for Drivers Education) - [OpenGPA](https://opengpa.org) (Open-source offline-first Enterprise Agentic Application) - [Painting Droid](https://github.com/mateuszmigas/painting-droid) (Painting app with AI integrations) - [Kerlig AI](https://www.kerlig.com/) (AI writing assistant for macOS) @@ -341,16 +344,16 @@ See the [API documentation](./docs/api.md) for all endpoints. 
- [LLMStack](https://github.com/trypromptly/LLMStack) (No-code multi-agent framework to build LLM agents and workflows) - [BoltAI for Mac](https://boltai.com) (AI Chat Client for Mac) - [Harbor](https://github.com/av/harbor) (Containerized LLM Toolkit with Ollama as default backend) -- [PyGPT](https://github.com/szczyglis-dev/py-gpt) (AI desktop assistant for Linux, Windows and Mac) -- [Alpaca](https://github.com/Jeffser/Alpaca) (An Ollama client application for linux and macos made with GTK4 and Adwaita) +- [PyGPT](https://github.com/szczyglis-dev/py-gpt) (AI desktop assistant for Linux, Windows, and Mac) +- [Alpaca](https://github.com/Jeffser/Alpaca) (An Ollama client application for Linux and macOS made with GTK4 and Adwaita) - [AutoGPT](https://github.com/Significant-Gravitas/AutoGPT/blob/master/docs/content/platform/ollama.md) (AutoGPT Ollama integration) - [Go-CREW](https://www.jonathanhecl.com/go-crew/) (Powerful Offline RAG in Golang) - [PartCAD](https://github.com/openvmp/partcad/) (CAD model generation with OpenSCAD and CadQuery) -- [Ollama4j Web UI](https://github.com/ollama4j/ollama4j-web-ui) - Java-based Web UI for Ollama built with Vaadin, Spring Boot and Ollama4j +- [Ollama4j Web UI](https://github.com/ollama4j/ollama4j-web-ui) - Java-based Web UI for Ollama built with Vaadin, Spring Boot, and Ollama4j - [PyOllaMx](https://github.com/kspviswa/pyOllaMx) - macOS application capable of chatting with both Ollama and Apple MLX models. - [Cline](https://github.com/cline/cline) - Formerly known as Claude Dev is a VSCode extension for multi-file/whole-repo coding - [Cherry Studio](https://github.com/kangfenmao/cherry-studio) (Desktop client with Ollama support) -- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption) +- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy-focused LLM chat interface with optional encryption) - [Archyve](https://github.com/nickthecook/archyve) (RAG-enabling document library) - [crewAI with Mesop](https://github.com/rapidarchitect/ollama-crew-mesop) (Mesop Web Interface to run crewAI with Ollama) - [Tkinter-based client](https://github.com/chyok/ollama-gui) (Python tkinter-based Client for Ollama) @@ -368,7 +371,7 @@ See the [API documentation](./docs/api.md) for all endpoints. 
- [DualMind](https://github.com/tcsenpai/dualmind) (Experimental app allowing two models to talk to each other in the terminal or in a web interface) - [ollamarama-matrix](https://github.com/h1ddenpr0cess20/ollamarama-matrix) (Ollama chatbot for the Matrix chat protocol) - [ollama-chat-app](https://github.com/anan1213095357/ollama-chat-app) (Flutter-based chat app) -- [Perfect Memory AI](https://www.perfectmemory.ai/) (Productivity AI assists personalized by what you have seen on your screen, heard and said in the meetings) +- [Perfect Memory AI](https://www.perfectmemory.ai/) (Productivity AI assists personalized by what you have seen on your screen, heard, and said in the meetings) - [Hexabot](https://github.com/hexastack/hexabot) (A conversational AI builder) - [Reddit Rate](https://github.com/rapidarchitect/reddit_analyzer) (Search and Rate Reddit topics with a weighted summation) - [OpenTalkGpt](https://github.com/adarshM84/OpenTalkGpt) (Chrome Extension to manage open-source models supported by Ollama, create custom models, and chat with models from a user-friendly UI) @@ -386,7 +389,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [ChibiChat](https://github.com/CosmicEventHorizon/ChibiChat) (Kotlin-based Android app to chat with Ollama and Koboldcpp API endpoints) - [LocalLLM](https://github.com/qusaismael/localllm) (Minimal Web-App to run ollama models on it with a GUI) - [Ollamazing](https://github.com/buiducnhat/ollamazing) (Web extension to run Ollama models) -- [OpenDeepResearcher-via-searxng](https://github.com/benhaotang/OpenDeepResearcher-via-searxng) (A Deep Research equivent endpoint with Ollama support for running locally) +- [OpenDeepResearcher-via-searxng](https://github.com/benhaotang/OpenDeepResearcher-via-searxng) (A Deep Research equivalent endpoint with Ollama support for running locally) - [AntSK](https://github.com/AIDotNet/AntSK) (Out-of-the-box & Adaptable RAG Chatbot) - [MaxKB](https://github.com/1Panel-dev/MaxKB/) (Ready-to-use & flexible RAG Chatbot) - [yla](https://github.com/danielekp/yla) (Web interface to freely interact with your customized models) @@ -394,11 +397,13 @@ See the [API documentation](./docs/api.md) for all endpoints. - [1Panel](https://github.com/1Panel-dev/1Panel/) (Web-based Linux Server Management Tool) - [AstrBot](https://github.com/Soulter/AstrBot/) (User-friendly LLM-based multi-platform chatbot with a WebUI, supporting RAG, LLM agents, and plugins integration) - [Reins](https://github.com/ibrahimcetin/reins) (Easily tweak parameters, customize system prompts per chat, and enhance your AI experiments with reasoning model support.) +- [Flufy](https://github.com/Aharon-Bensadoun/Flufy) (A beautiful chat interface for interacting with Ollama's API. Built with React, TypeScript, and Material-UI.) - [Ellama](https://github.com/zeozeozeo/ellama) (Friendly native app to chat with an Ollama instance) - [screenpipe](https://github.com/mediar-ai/screenpipe) Build agents powered by your screen history - [Ollamb](https://github.com/hengkysteen/ollamb) (Simple yet rich in features, cross-platform built with Flutter and designed for Ollama. Try the [web demo](https://hengkysteen.github.io/demo/ollamb/).) 
- [Writeopia](https://github.com/Writeopia/Writeopia) (Text editor with integration with Ollama) - [AppFlowy](https://github.com/AppFlowy-IO/AppFlowy) (AI collaborative workspace with Ollama, cross-platform and self-hostable) +- [Lumina](https://github.com/cushydigit/lumina.git) (A lightweight, minimal React.js frontend for interacting with Ollama servers) ### Cloud @@ -440,7 +445,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [PowershAI](https://github.com/rrg92/powershai) PowerShell module that brings AI to terminal on Windows, including support for Ollama - [DeepShell](https://github.com/Abyss-c0re/deepshell) Your self-hosted AI assistant. Interactive Shell, Files and Folders analysis. - [orbiton](https://github.com/xyproto/orbiton) Configuration-free text editor and IDE with support for tab completion with Ollama. -- [orca-cli](https://github.com/molbal/orca-cli) Ollama Registry CLI Application - Browse, pull and download models from Ollama Registry in your terminal. +- [orca-cli](https://github.com/molbal/orca-cli) Ollama Registry CLI Application - Browse, pull, and download models from Ollama Registry in your terminal. - [GGUF-to-Ollama](https://github.com/jonathanhecl/gguf-to-ollama) - Importing GGUF to Ollama made easy (multiplatform) ### Apple Vision Pro @@ -468,7 +473,7 @@ See the [API documentation](./docs/api.md) for all endpoints. ### Libraries -- [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/integrations/chat/ollama/) with [example](https://js.langchain.com/docs/tutorials/local_rag/) +- [LangChain](https://python.langchain.com/docs/integrations/chat/ollama/) and [LangChain.js](https://js.langchain.com/docs/integrations/chat/ollama/) with [example](https://js.langchain.com/docs/tutorials/local_rag/) - [Firebase Genkit](https://firebase.google.com/docs/genkit/plugins/ollama) - [crewAI](https://github.com/crewAIInc/crewAI) - [Yacana](https://remembersoftwares.github.io/yacana/) (User-friendly multi-agent framework for brainstorming and executing predetermined flows with built-in tool integration) @@ -515,7 +520,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Swollama for Swift](https://github.com/marcusziade/Swollama) with [DocC](https://marcusziade.github.io/Swollama/documentation/swollama/) - [GoLamify](https://github.com/prasad89/golamify) - [Ollama for Haskell](https://github.com/tusharad/ollama-haskell) -- [multi-llm-ts](https://github.com/nbonamy/multi-llm-ts) (A Typescript/JavaScript library allowing access to different LLM in unified API) +- [multi-llm-ts](https://github.com/nbonamy/multi-llm-ts) (A Typescript/JavaScript library allowing access to different LLM in a unified API) - [LlmTornado](https://github.com/lofcz/llmtornado) (C# library providing a unified interface for major FOSS & Commercial inference APIs) - [Ollama for Zig](https://github.com/dravenk/ollama-zig) - [Abso](https://github.com/lunary-ai/abso) (OpenAI-compatible TypeScript SDK for any LLM provider) @@ -524,11 +529,11 @@ See the [API documentation](./docs/api.md) for all endpoints. 
### Mobile -- [SwiftChat](https://github.com/aws-samples/swift-chat) (Lightning-fast Cross-platform AI chat app with native UI for Android, iOS and iPad) +- [SwiftChat](https://github.com/aws-samples/swift-chat) (Lightning-fast Cross-platform AI chat app with native UI for Android, iOS, and iPad) - [Enchanted](https://github.com/AugustDev/enchanted) - [Maid](https://github.com/Mobile-Artificial-Intelligence/maid) - [Ollama App](https://github.com/JHubi1/ollama-app) (Modern and easy-to-use multi-platform client for Ollama) -- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption) +- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy-focused LLM chat interface with optional encryption) - [Ollama Android Chat](https://github.com/sunshine0523/OllamaServer) (No need for Termux, start the Ollama service with one click on an Android device) - [Reins](https://github.com/ibrahimcetin/reins) (Easily tweak parameters, customize system prompts per chat, and enhance your AI experiments with reasoning model support.) @@ -552,7 +557,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Obsidian Local GPT plugin](https://github.com/pfrankov/obsidian-local-gpt) - [Open Interpreter](https://docs.openinterpreter.com/language-model-setup/local-models/ollama) - [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama) -- [Ollama Copilot](https://github.com/bernardo-bruning/ollama-copilot) (Proxy that allows you to use ollama as a copilot like Github copilot) +- [Ollama Copilot](https://github.com/bernardo-bruning/ollama-copilot) (Proxy that allows you to use Ollama as a copilot like GitHub Copilot) - [twinny](https://github.com/rjmacarthy/twinny) (Copilot and Copilot chat alternative using Ollama) - [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and Hugging Face) - [Page Assist](https://github.com/n4ze3m/page-assist) (Chrome Extension) @@ -562,8 +567,8 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation) - [ChatGPTBox: All in one browser extension](https://github.com/josStorer/chatGPTBox) with [Integrating Tutorial](https://github.com/josStorer/chatGPTBox/issues/616#issuecomment-1975186467) - [Discord AI chat/moderation bot](https://github.com/rapmd73/Companion) Chat/moderation bot written in python. Uses Ollama to create personalities. -- [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depends on ollama server) -- [Terraform AWS Ollama & Open WebUI](https://github.com/xuyangbocn/terraform-aws-self-host-llm) (A Terraform module to deploy on AWS a ready-to-use Ollama service, together with its front end Open WebUI service.) +- [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depend on ollama server) +- [Terraform AWS Ollama & Open WebUI](https://github.com/xuyangbocn/terraform-aws-self-host-llm) (A Terraform module to deploy on AWS a ready-to-use Ollama service, together with its front-end Open WebUI service.) 
- [node-red-contrib-ollama](https://github.com/jakubburkiewicz/node-red-contrib-ollama) - [Local AI Helper](https://github.com/ivostoykov/localAI) (Chrome and Firefox extensions that enable interactions with the active tab and customisable API endpoints. Includes secure storage for user prompts.) - [vnc-lm](https://github.com/jake83741/vnc-lm) (Discord bot for messaging with LLMs through Ollama and LiteLLM. Seamlessly move between local and flagship models.) diff --git a/api/client_test.go b/api/client_test.go index fe9a15899..2ceeec9cf 100644 --- a/api/client_test.go +++ b/api/client_test.go @@ -1,7 +1,6 @@ package api import ( - "context" "encoding/json" "fmt" "net/http" @@ -137,7 +136,7 @@ func TestClientStream(t *testing.T) { client := NewClient(&url.URL{Scheme: "http", Host: ts.Listener.Addr().String()}, http.DefaultClient) var receivedChunks []ChatResponse - err := client.stream(context.Background(), http.MethodPost, "/v1/chat", nil, func(chunk []byte) error { + err := client.stream(t.Context(), http.MethodPost, "/v1/chat", nil, func(chunk []byte) error { var resp ChatResponse if err := json.Unmarshal(chunk, &resp); err != nil { return fmt.Errorf("failed to unmarshal chunk: %w", err) @@ -223,7 +222,7 @@ func TestClientDo(t *testing.T) { ID string `json:"id"` Success bool `json:"success"` } - err := client.do(context.Background(), http.MethodPost, "/v1/messages", nil, &resp) + err := client.do(t.Context(), http.MethodPost, "/v1/messages", nil, &resp) if tc.wantErr != "" { if err == nil { diff --git a/api/types.go b/api/types.go index 7d8b6e532..602f93da8 100644 --- a/api/types.go +++ b/api/types.go @@ -271,9 +271,6 @@ type Options struct { RepeatPenalty float32 `json:"repeat_penalty,omitempty"` PresencePenalty float32 `json:"presence_penalty,omitempty"` FrequencyPenalty float32 `json:"frequency_penalty,omitempty"` - Mirostat int `json:"mirostat,omitempty"` - MirostatTau float32 `json:"mirostat_tau,omitempty"` - MirostatEta float32 `json:"mirostat_eta,omitempty"` Stop []string `json:"stop,omitempty"` } @@ -283,12 +280,7 @@ type Runner struct { NumBatch int `json:"num_batch,omitempty"` NumGPU int `json:"num_gpu,omitempty"` MainGPU int `json:"main_gpu,omitempty"` - LowVRAM bool `json:"low_vram,omitempty"` - F16KV bool `json:"f16_kv,omitempty"` // Deprecated: This option is ignored - LogitsAll bool `json:"logits_all,omitempty"` - VocabOnly bool `json:"vocab_only,omitempty"` UseMMap *bool `json:"use_mmap,omitempty"` - UseMLock bool `json:"use_mlock,omitempty"` NumThread int `json:"num_thread,omitempty"` } @@ -471,13 +463,6 @@ type ProcessModelResponse struct { SizeVRAM int64 `json:"size_vram"` } -type RetrieveModelResponse struct { - Id string `json:"id"` - Object string `json:"object"` - Created int64 `json:"created"` - OwnedBy string `json:"owned_by"` -} - type TokenResponse struct { Token string `json:"token"` } @@ -660,9 +645,6 @@ func DefaultOptions() Options { RepeatPenalty: 1.1, PresencePenalty: 0.0, FrequencyPenalty: 0.0, - Mirostat: 0, - MirostatTau: 5.0, - MirostatEta: 0.1, Seed: -1, Runner: Runner{ @@ -671,8 +653,6 @@ func DefaultOptions() Options { NumBatch: 512, NumGPU: -1, // -1 here indicates that NumGPU should be set dynamically NumThread: 0, // let the runtime decide - LowVRAM: false, - UseMLock: false, UseMMap: nil, }, } diff --git a/benchmark/server_benchmark_test.go b/benchmark/server_benchmark_test.go index 672b8b173..4a3c46cda 100644 --- a/benchmark/server_benchmark_test.go +++ b/benchmark/server_benchmark_test.go @@ -78,7 +78,7 @@ func BenchmarkColdStart(b 
*testing.B) { for _, tt := range tests { b.Run(fmt.Sprintf("%s/cold/%s", m, tt.name), func(b *testing.B) { - ctx := context.Background() + ctx := b.Context() // Set number of tokens as our throughput metric b.SetBytes(int64(tt.maxTokens)) @@ -113,7 +113,7 @@ func BenchmarkWarmStart(b *testing.B) { for _, tt := range tests { b.Run(fmt.Sprintf("%s/warm/%s", m, tt.name), func(b *testing.B) { - ctx := context.Background() + ctx := b.Context() // Pre-warm the model warmup(client, m, tt.prompt, b) @@ -140,7 +140,7 @@ func setup(b *testing.B) *api.Client { if err != nil { b.Fatal(err) } - if _, err := client.Show(context.Background(), &api.ShowRequest{Model: modelName(b)}); err != nil { + if _, err := client.Show(b.Context(), &api.ShowRequest{Model: modelName(b)}); err != nil { b.Fatalf("Model unavailable: %v", err) } diff --git a/cmd/cmd.go b/cmd/cmd.go index befe578d6..0f8072f06 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -31,6 +31,7 @@ import ( "github.com/olekukonko/tablewriter" "github.com/spf13/cobra" "golang.org/x/crypto/ssh" + "golang.org/x/sync/errgroup" "golang.org/x/term" "github.com/ollama/ollama/api" @@ -41,6 +42,7 @@ import ( "github.com/ollama/ollama/runner" "github.com/ollama/ollama/server" "github.com/ollama/ollama/types/model" + "github.com/ollama/ollama/types/syncmap" "github.com/ollama/ollama/version" ) @@ -106,7 +108,7 @@ func CreateHandler(cmd *cobra.Command, args []string) error { } spinner.Stop() - req.Name = args[0] + req.Model = args[0] quantize, _ := cmd.Flags().GetString("quantize") if quantize != "" { req.Quantize = quantize @@ -117,34 +119,54 @@ func CreateHandler(cmd *cobra.Command, args []string) error { return err } - if len(req.Files) > 0 { - fileMap := map[string]string{} - for f, digest := range req.Files { + var g errgroup.Group + g.SetLimit(max(runtime.GOMAXPROCS(0)-1, 1)) + + files := syncmap.NewSyncMap[string, string]() + for f, digest := range req.Files { + g.Go(func() error { if _, err := createBlob(cmd, client, f, digest, p); err != nil { return err } - fileMap[filepath.Base(f)] = digest - } - req.Files = fileMap + + // TODO: this is incorrect since the file might be in a subdirectory + // instead this should take the path relative to the model directory + // but the current implementation does not allow this + files.Store(filepath.Base(f), digest) + return nil + }) } - if len(req.Adapters) > 0 { - fileMap := map[string]string{} - for f, digest := range req.Adapters { + adapters := syncmap.NewSyncMap[string, string]() + for f, digest := range req.Adapters { + g.Go(func() error { if _, err := createBlob(cmd, client, f, digest, p); err != nil { return err } - fileMap[filepath.Base(f)] = digest - } - req.Adapters = fileMap + + // TODO: same here + adapters.Store(filepath.Base(f), digest) + return nil + }) } + if err := g.Wait(); err != nil { + return err + } + + req.Files = files.Items() + req.Adapters = adapters.Items() + bars := make(map[string]*progress.Bar) fn := func(resp api.ProgressResponse) error { if resp.Digest != "" { bar, ok := bars[resp.Digest] if !ok { - bar = progress.NewBar(fmt.Sprintf("pulling %s...", resp.Digest[7:19]), resp.Total, resp.Completed) + msg := resp.Status + if msg == "" { + msg = fmt.Sprintf("pulling %s...", resp.Digest[7:19]) + } + bar = progress.NewBar(msg, resp.Total, resp.Completed) bars[resp.Digest] = bar p.Add(resp.Digest, bar) } @@ -213,7 +235,7 @@ func createBlob(cmd *cobra.Command, client *api.Client, path string, digest stri } }() - if err = client.CreateBlob(cmd.Context(), digest, io.TeeReader(bin, &pw)); err != nil 
{ + if err := client.CreateBlob(cmd.Context(), digest, io.TeeReader(bin, &pw)); err != nil { return "", err } return digest, nil @@ -1407,7 +1429,6 @@ func NewCLI() *cobra.Command { envVars["OLLAMA_LLM_LIBRARY"], envVars["OLLAMA_GPU_OVERHEAD"], envVars["OLLAMA_LOAD_TIMEOUT"], - envVars["OLLAMA_CONTEXT_LENGTH"], }) default: appendEnvDocs(cmd, envs) diff --git a/cmd/cmd_test.go b/cmd/cmd_test.go index 367a35b6b..eb2fb124e 100644 --- a/cmd/cmd_test.go +++ b/cmd/cmd_test.go @@ -2,7 +2,6 @@ package cmd import ( "bytes" - "context" "encoding/json" "io" "net/http" @@ -337,7 +336,7 @@ func TestDeleteHandler(t *testing.T) { t.Cleanup(mockServer.Close) cmd := &cobra.Command{} - cmd.SetContext(context.TODO()) + cmd.SetContext(t.Context()) if err := DeleteHandler(cmd, []string{"test-model"}); err != nil { t.Fatalf("DeleteHandler failed: %v", err) } @@ -399,11 +398,6 @@ func TestGetModelfileName(t *testing.T) { var expectedFilename string if tt.fileExists { - tempDir, err := os.MkdirTemp("", "modelfiledir") - defer os.RemoveAll(tempDir) - if err != nil { - t.Fatalf("temp modelfile dir creation failed: %v", err) - } var fn string if tt.modelfileName != "" { fn = tt.modelfileName @@ -411,7 +405,7 @@ func TestGetModelfileName(t *testing.T) { fn = "Modelfile" } - tempFile, err := os.CreateTemp(tempDir, fn) + tempFile, err := os.CreateTemp(t.TempDir(), fn) if err != nil { t.Fatalf("temp modelfile creation failed: %v", err) } @@ -530,7 +524,7 @@ func TestPushHandler(t *testing.T) { cmd := &cobra.Command{} cmd.Flags().Bool("insecure", false, "") - cmd.SetContext(context.TODO()) + cmd.SetContext(t.Context()) // Redirect stderr to capture progress output oldStderr := os.Stderr @@ -635,7 +629,7 @@ func TestListHandler(t *testing.T) { t.Setenv("OLLAMA_HOST", mockServer.URL) cmd := &cobra.Command{} - cmd.SetContext(context.TODO()) + cmd.SetContext(t.Context()) // Capture stdout oldStdout := os.Stdout @@ -690,7 +684,7 @@ func TestCreateHandler(t *testing.T) { return } - if req.Name != "test-model" { + if req.Model != "test-model" { t.Errorf("expected model name 'test-model', got %s", req.Name) } @@ -730,7 +724,7 @@ func TestCreateHandler(t *testing.T) { })) t.Setenv("OLLAMA_HOST", mockServer.URL) t.Cleanup(mockServer.Close) - tempFile, err := os.CreateTemp("", "modelfile") + tempFile, err := os.CreateTemp(t.TempDir(), "modelfile") if err != nil { t.Fatal(err) } @@ -750,7 +744,7 @@ func TestCreateHandler(t *testing.T) { } cmd.Flags().Bool("insecure", false, "") - cmd.SetContext(context.TODO()) + cmd.SetContext(t.Context()) // Redirect stderr to capture progress output oldStderr := os.Stderr diff --git a/convert/convert.go b/convert/convert.go index ffcc2b8ab..249ec8077 100644 --- a/convert/convert.go +++ b/convert/convert.go @@ -4,9 +4,9 @@ import ( "encoding/json" "errors" "fmt" - "io" "io/fs" "log/slog" + "os" "slices" "strings" @@ -89,7 +89,7 @@ type ModelConverter interface { // KV maps parameters to LLM key-values KV(*Tokenizer) ggml.KV // Tensors maps input tensors to LLM tensors. Model specific modifications can be done here. - Tensors([]Tensor) []ggml.Tensor + Tensors([]Tensor) []*ggml.Tensor // Replacements returns a list of string pairs to replace in tensor names. // See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details Replacements() []string @@ -106,13 +106,13 @@ type AdapterConverter interface { // KV maps parameters to LLM key-values KV(ggml.KV) ggml.KV // Tensors maps input tensors to LLM tensors. Adapter specific modifications can be done here. 
- Tensors([]Tensor) []ggml.Tensor + Tensors([]Tensor) []*ggml.Tensor // Replacements returns a list of string pairs to replace in tensor names. // See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details Replacements() []string } -func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV ggml.KV) error { +func ConvertAdapter(fsys fs.FS, f *os.File, baseKV ggml.KV) error { bts, err := fs.ReadFile(fsys, "adapter_config.json") if err != nil { return err @@ -147,14 +147,14 @@ func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV ggml.KV) error { return err } - return writeFile(ws, conv.KV(baseKV), conv.Tensors(ts)) + return writeFile(f, conv.KV(baseKV), conv.Tensors(ts)) } // Convert writes an Ollama compatible model to the provided io.WriteSeeker based on configurations // and files it finds in the input path. // Supported input model formats include safetensors. // Supported input tokenizers files include tokenizer.json (preferred) and tokenizer.model. -func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error { +func ConvertModel(fsys fs.FS, f *os.File) error { bts, err := fs.ReadFile(fsys, "config.json") if err != nil { return err @@ -239,13 +239,13 @@ func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error { return err } - return writeFile(ws, conv.KV(t), conv.Tensors(ts)) + return writeFile(f, conv.KV(t), conv.Tensors(ts)) } -func writeFile(ws io.WriteSeeker, kv ggml.KV, ts []ggml.Tensor) error { +func writeFile(f *os.File, kv ggml.KV, ts []*ggml.Tensor) error { for i := range ts { ts[i].Shape = slices.Clone(ts[i].Shape) slices.Reverse(ts[i].Shape) } - return ggml.WriteGGUF(ws, kv, ts) + return ggml.WriteGGUF(f, kv, ts) } diff --git a/convert/convert_bert.go b/convert/convert_bert.go index 8575652aa..a9f4b8a77 100644 --- a/convert/convert_bert.go +++ b/convert/convert_bert.go @@ -132,8 +132,8 @@ func (p *bertModel) KV(t *Tokenizer) ggml.KV { return kv } -func (p *bertModel) Tensors(ts []Tensor) []ggml.Tensor { - var out []ggml.Tensor +func (p *bertModel) Tensors(ts []Tensor) []*ggml.Tensor { + var out []*ggml.Tensor for _, t := range ts { if slices.Contains([]string{ "embeddings.position_ids", @@ -143,7 +143,7 @@ func (p *bertModel) Tensors(ts []Tensor) []ggml.Tensor { continue } - out = append(out, ggml.Tensor{ + out = append(out, &ggml.Tensor{ Name: t.Name(), Kind: t.Kind(), Shape: t.Shape(), diff --git a/convert/convert_commandr.go b/convert/convert_commandr.go index 738a2cf3b..a909515bd 100644 --- a/convert/convert_commandr.go +++ b/convert/convert_commandr.go @@ -43,10 +43,10 @@ func (p *commandrModel) KV(t *Tokenizer) ggml.KV { return kv } -func (p *commandrModel) Tensors(ts []Tensor) []ggml.Tensor { - var out []ggml.Tensor +func (p *commandrModel) Tensors(ts []Tensor) []*ggml.Tensor { + var out []*ggml.Tensor for _, t := range ts { - out = append(out, ggml.Tensor{ + out = append(out, &ggml.Tensor{ Name: t.Name(), Kind: t.Kind(), Shape: t.Shape(), diff --git a/convert/convert_gemma.go b/convert/convert_gemma.go index 2f329943e..26698d6a6 100644 --- a/convert/convert_gemma.go +++ b/convert/convert_gemma.go @@ -42,14 +42,14 @@ func (p *gemmaModel) KV(t *Tokenizer) ggml.KV { return kv } -func (p *gemmaModel) Tensors(ts []Tensor) []ggml.Tensor { - var out []ggml.Tensor +func (p *gemmaModel) Tensors(ts []Tensor) []*ggml.Tensor { + var out []*ggml.Tensor for _, t := range ts { if !strings.HasPrefix(t.Name(), "v.") && strings.HasSuffix(t.Name(), "_norm.weight") { t.SetRepacker(p.addOne) } - out = append(out, ggml.Tensor{ + out = append(out, &ggml.Tensor{ Name: t.Name(), 
Kind: t.Kind(), Shape: t.Shape(), diff --git a/convert/convert_gemma2_adapter.go b/convert/convert_gemma2_adapter.go index 3494aa3f9..6299cd9e0 100644 --- a/convert/convert_gemma2_adapter.go +++ b/convert/convert_gemma2_adapter.go @@ -21,8 +21,8 @@ func (p *gemma2Adapter) KV(baseKV ggml.KV) ggml.KV { return kv } -func (p *gemma2Adapter) Tensors(ts []Tensor) []ggml.Tensor { - var out []ggml.Tensor +func (p *gemma2Adapter) Tensors(ts []Tensor) []*ggml.Tensor { + var out []*ggml.Tensor for _, t := range ts { shape := t.Shape() if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) || @@ -31,7 +31,7 @@ func (p *gemma2Adapter) Tensors(ts []Tensor) []ggml.Tensor { t.SetRepacker(p.repack) } - out = append(out, ggml.Tensor{ + out = append(out, &ggml.Tensor{ Name: t.Name(), Kind: t.Kind(), Shape: t.Shape(), diff --git a/convert/convert_llama.go b/convert/convert_llama.go index 0caaa1949..e491a9d8d 100644 --- a/convert/convert_llama.go +++ b/convert/convert_llama.go @@ -126,11 +126,11 @@ func (p *llamaModel) KV(t *Tokenizer) ggml.KV { return kv } -func (p *llamaModel) Tensors(ts []Tensor) []ggml.Tensor { - var out []ggml.Tensor +func (p *llamaModel) Tensors(ts []Tensor) []*ggml.Tensor { + var out []*ggml.Tensor if p.RopeScaling.factors != nil { - out = append(out, ggml.Tensor{ + out = append(out, &ggml.Tensor{ Name: "rope_freqs.weight", Kind: 0, Shape: []uint64{uint64(len(p.RopeScaling.factors))}, @@ -145,7 +145,7 @@ func (p *llamaModel) Tensors(ts []Tensor) []ggml.Tensor { } } - out = append(out, ggml.Tensor{ + out = append(out, &ggml.Tensor{ Name: t.Name(), Kind: t.Kind(), Shape: t.Shape(), diff --git a/convert/convert_llama4.go b/convert/convert_llama4.go index 26a230b33..3e3792339 100644 --- a/convert/convert_llama4.go +++ b/convert/convert_llama4.go @@ -88,13 +88,13 @@ func (p *llama4Model) Replacements() []string { } // Tensors implements ModelConverter. 
-func (p *llama4Model) Tensors(ts []Tensor) []ggml.Tensor { - var out []ggml.Tensor +func (p *llama4Model) Tensors(ts []Tensor) []*ggml.Tensor { + var out []*ggml.Tensor var textTensors []Tensor for _, t := range ts { if strings.HasPrefix(t.Name(), "v.") || strings.HasPrefix(t.Name(), "mm.") { - out = append(out, ggml.Tensor{ + out = append(out, &ggml.Tensor{ Name: t.Name(), Kind: t.Kind(), Shape: t.Shape(), @@ -112,7 +112,7 @@ func (p *llama4Model) Tensors(ts []Tensor) []ggml.Tensor { // clone tensor since we need separate repackers tt := t.Clone() tt.SetRepacker(p.repack(nil, nil, tensor.S(i*halfDim, (i+1)*halfDim))) - out = append(out, ggml.Tensor{ + out = append(out, &ggml.Tensor{ Name: strings.ReplaceAll(tt.Name(), "ffn_gate_up_exps", name), Kind: tt.Kind(), Shape: newShape, @@ -125,7 +125,7 @@ func (p *llama4Model) Tensors(ts []Tensor) []ggml.Tensor { t.SetRepacker(p.repack()) newShape := slices.Clone(t.Shape()) newShape[1], newShape[2] = newShape[2], newShape[1] - out = append(out, ggml.Tensor{ + out = append(out, &ggml.Tensor{ Name: t.Name(), Kind: t.Kind(), Shape: newShape, diff --git a/convert/convert_llama_adapter.go b/convert/convert_llama_adapter.go index 718ef047e..4cc451153 100644 --- a/convert/convert_llama_adapter.go +++ b/convert/convert_llama_adapter.go @@ -29,8 +29,8 @@ func (p *llamaAdapter) KV(baseKV ggml.KV) ggml.KV { return kv } -func (p *llamaAdapter) Tensors(ts []Tensor) []ggml.Tensor { - var out []ggml.Tensor +func (p *llamaAdapter) Tensors(ts []Tensor) []*ggml.Tensor { + var out []*ggml.Tensor for _, t := range ts { shape := t.Shape() if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) || @@ -41,7 +41,7 @@ func (p *llamaAdapter) Tensors(ts []Tensor) []ggml.Tensor { t.SetRepacker(p.repack) } - out = append(out, ggml.Tensor{ + out = append(out, &ggml.Tensor{ Name: t.Name(), Kind: t.Kind(), Shape: shape, diff --git a/convert/convert_mistral.go b/convert/convert_mistral.go index 6c224ae4f..a6fd4c41a 100644 --- a/convert/convert_mistral.go +++ b/convert/convert_mistral.go @@ -89,8 +89,8 @@ func (p *mistral3Model) KV(t *Tokenizer) ggml.KV { return kv } -func (p *mistral3Model) Tensors(ts []Tensor) []ggml.Tensor { - var out []ggml.Tensor +func (p *mistral3Model) Tensors(ts []Tensor) []*ggml.Tensor { + var out []*ggml.Tensor for _, t := range ts { if !strings.HasPrefix(t.Name(), "v.") { @@ -100,7 +100,7 @@ func (p *mistral3Model) Tensors(ts []Tensor) []ggml.Tensor { } } - out = append(out, ggml.Tensor{ + out = append(out, &ggml.Tensor{ Name: t.Name(), Kind: t.Kind(), Shape: t.Shape(), diff --git a/convert/convert_mixtral.go b/convert/convert_mixtral.go index 95a289f76..17580ff8f 100644 --- a/convert/convert_mixtral.go +++ b/convert/convert_mixtral.go @@ -29,7 +29,7 @@ func (p *mixtralModel) KV(t *Tokenizer) ggml.KV { return kv } -func (p *mixtralModel) Tensors(ts []Tensor) []ggml.Tensor { +func (p *mixtralModel) Tensors(ts []Tensor) []*ggml.Tensor { oldnew := []string{ "model.layers", "blk", "w1", "ffn_gate_exps", @@ -56,10 +56,10 @@ func (p *mixtralModel) Tensors(ts []Tensor) []ggml.Tensor { return true }) - var out []ggml.Tensor + var out []*ggml.Tensor for n, e := range experts { // TODO(mxyng): sanity check experts - out = append(out, ggml.Tensor{ + out = append(out, &ggml.Tensor{ Name: n, Kind: e[0].Kind(), Shape: append([]uint64{uint64(len(e))}, e[0].Shape()...), diff --git a/convert/convert_phi3.go b/convert/convert_phi3.go index d1c13795a..5a6756053 100644 --- a/convert/convert_phi3.go +++ b/convert/convert_phi3.go @@ -68,19 +68,19 @@ func 
(p *phi3Model) KV(t *Tokenizer) ggml.KV { return kv } -func (p *phi3Model) Tensors(ts []Tensor) []ggml.Tensor { +func (p *phi3Model) Tensors(ts []Tensor) []*ggml.Tensor { var addRopeFactors sync.Once - out := make([]ggml.Tensor, 0, len(ts)+2) + out := make([]*ggml.Tensor, 0, len(ts)+2) for _, t := range ts { if strings.HasPrefix(t.Name(), "blk.0.") { addRopeFactors.Do(func() { - out = append(out, ggml.Tensor{ + out = append(out, &ggml.Tensor{ Name: "rope_factors_long.weight", Kind: 0, Shape: []uint64{uint64(len(p.RopeScaling.LongFactor))}, WriterTo: p.RopeScaling.LongFactor, - }, ggml.Tensor{ + }, &ggml.Tensor{ Name: "rope_factors_short.weight", Kind: 0, Shape: []uint64{uint64(len(p.RopeScaling.ShortFactor))}, @@ -89,7 +89,7 @@ func (p *phi3Model) Tensors(ts []Tensor) []ggml.Tensor { }) } - out = append(out, ggml.Tensor{ + out = append(out, &ggml.Tensor{ Name: t.Name(), Kind: t.Kind(), Shape: t.Shape(), diff --git a/convert/convert_qwen2.go b/convert/convert_qwen2.go index 18278802e..edcb82e29 100644 --- a/convert/convert_qwen2.go +++ b/convert/convert_qwen2.go @@ -45,10 +45,10 @@ func (q *qwen2Model) KV(t *Tokenizer) ggml.KV { return kv } -func (q *qwen2Model) Tensors(ts []Tensor) []ggml.Tensor { - var out []ggml.Tensor +func (q *qwen2Model) Tensors(ts []Tensor) []*ggml.Tensor { + var out []*ggml.Tensor for _, t := range ts { - out = append(out, ggml.Tensor{ + out = append(out, &ggml.Tensor{ Name: t.Name(), Kind: t.Kind(), Shape: t.Shape(), diff --git a/convert/convert_test.go b/convert/convert_test.go index 1cdc26c41..b9db6fa15 100644 --- a/convert/convert_test.go +++ b/convert/convert_test.go @@ -130,6 +130,7 @@ func TestConvertModel(t *testing.T) { if err != nil { t.Fatal(err) } + defer expectFile.Close() var expect map[string]string if err := json.NewDecoder(expectFile).Decode(&expect); err != nil { diff --git a/convert/fs.go b/convert/fs.go deleted file mode 100644 index 31132dbe7..000000000 --- a/convert/fs.go +++ /dev/null @@ -1,58 +0,0 @@ -package convert - -import ( - "archive/zip" - "errors" - "io" - "io/fs" - "os" - "path/filepath" -) - -type ZipReader struct { - r *zip.Reader - p string - - // limit is the maximum size of a file that can be read directly - // from the zip archive. 
Files larger than this size will be extracted - limit int64 -} - -func NewZipReader(r *zip.Reader, p string, limit int64) fs.FS { - return &ZipReader{r, p, limit} -} - -func (z *ZipReader) Open(name string) (fs.File, error) { - r, err := z.r.Open(name) - if err != nil { - return nil, err - } - defer r.Close() - - if fi, err := r.Stat(); err != nil { - return nil, err - } else if fi.Size() < z.limit { - return r, nil - } - - if !filepath.IsLocal(name) { - return nil, zip.ErrInsecurePath - } - - n := filepath.Join(z.p, name) - if _, err := os.Stat(n); errors.Is(err, os.ErrNotExist) { - w, err := os.Create(n) - if err != nil { - return nil, err - } - defer w.Close() - - if _, err := io.Copy(w, r); err != nil { - return nil, err - } - } else if err != nil { - return nil, err - } - - return os.Open(n) -} diff --git a/discover/cuda_common.go b/discover/cuda_common.go index 048295297..f46c7cfa5 100644 --- a/discover/cuda_common.go +++ b/discover/cuda_common.go @@ -3,6 +3,7 @@ package discover import ( + "fmt" "log/slog" "os" "regexp" @@ -59,6 +60,8 @@ func cudaVariant(gpuInfo CudaGPUInfo) string { // driver 12.0 has problems with the cuda v12 library, so run v11 on those older drivers if gpuInfo.DriverMajor < 12 || (gpuInfo.DriverMajor == 12 && gpuInfo.DriverMinor == 0) { + // The detected driver is older than Feb 2023 + slog.Warn("old CUDA driver detected - please upgrade to a newer driver", "version", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor)) return "v11" } return "v12" diff --git a/discover/gpu_info.h b/discover/gpu_info.h index 094b791a8..ee7ff4c33 100644 --- a/discover/gpu_info.h +++ b/discover/gpu_info.h @@ -27,12 +27,14 @@ #endif +#ifndef LOG #define LOG(verbose, ...) \ do { \ if (verbose) { \ fprintf(stderr, __VA_ARGS__); \ } \ } while (0) +#endif #ifdef __cplusplus extern "C" { diff --git a/discover/gpu_info_cudart.c b/discover/gpu_info_cudart.c index 03f15a2c3..bc5115bfd 100644 --- a/discover/gpu_info_cudart.c +++ b/discover/gpu_info_cudart.c @@ -1,6 +1,7 @@ #ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs? #include +#include #include "gpu_info_cudart.h" void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) { @@ -58,7 +59,7 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) { LOG(resp->ch.verbose, "cudaSetDevice err: %d\n", ret); UNLOAD_LIBRARY(resp->ch.handle); resp->ch.handle = NULL; - if (ret == CUDA_ERROR_INSUFFICIENT_DRIVER) { + if (ret == CUDART_ERROR_INSUFFICIENT_DRIVER) { resp->err = strdup("your nvidia driver is too old or missing. 
If you have a CUDA GPU please upgrade to run ollama"); return; } @@ -168,9 +169,9 @@ void cudart_bootstrap(cudart_handle_t h, int i, mem_info_t *resp) { resp->free = memInfo.free; resp->used = memInfo.used; - LOG(h.verbose, "[%s] CUDA totalMem %lu\n", resp->gpu_id, resp->total); - LOG(h.verbose, "[%s] CUDA freeMem %lu\n", resp->gpu_id, resp->free); - LOG(h.verbose, "[%s] CUDA usedMem %lu\n", resp->gpu_id, resp->used); + LOG(h.verbose, "[%s] CUDA totalMem %" PRId64 "\n", resp->gpu_id, resp->total); + LOG(h.verbose, "[%s] CUDA freeMem %" PRId64 "\n", resp->gpu_id, resp->free); + LOG(h.verbose, "[%s] CUDA usedMem %" PRId64 "\n", resp->gpu_id, resp->used); LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor); } @@ -180,4 +181,4 @@ void cudart_release(cudart_handle_t h) { h.handle = NULL; } -#endif // __APPLE__ \ No newline at end of file +#endif // __APPLE__ diff --git a/discover/gpu_info_nvcuda.c b/discover/gpu_info_nvcuda.c index 466e1ac24..d2d0b683b 100644 --- a/discover/gpu_info_nvcuda.c +++ b/discover/gpu_info_nvcuda.c @@ -1,6 +1,7 @@ #ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs? #include +#include #include "gpu_info_nvcuda.h" void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) { @@ -193,8 +194,8 @@ void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) { resp->total = memInfo.total; resp->free = memInfo.free; - LOG(h.verbose, "[%s] CUDA totalMem %lu mb\n", resp->gpu_id, resp->total / 1024 / 1024); - LOG(h.verbose, "[%s] CUDA freeMem %lu mb\n", resp->gpu_id, resp->free / 1024 / 1024); + LOG(h.verbose, "[%s] CUDA totalMem %" PRId64 "mb\n", resp->gpu_id, resp->total / 1024 / 1024); + LOG(h.verbose, "[%s] CUDA freeMem %" PRId64 "mb\n", resp->gpu_id, resp->free / 1024 / 1024); LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor); @@ -247,4 +248,4 @@ void nvcuda_release(nvcuda_handle_t h) { h.handle = NULL; } -#endif // __APPLE__ \ No newline at end of file +#endif // __APPLE__ diff --git a/discover/path.go b/discover/path.go index 8a20d8c21..68e63009a 100644 --- a/discover/path.go +++ b/discover/path.go @@ -12,7 +12,7 @@ import ( // '../lib/ollama' on Linux and the executable's directory on macOS // note: distribution builds, additional GPU-specific libraries are // found in subdirectories of the returned path, such as -// 'cuda_v11', 'cuda_v12', 'rocm', etc. +// 'cuda_v12', 'rocm', etc. var LibOllamaPath string = func() string { exe, err := os.Executable() if err != nil { diff --git a/docs/api.md b/docs/api.md index 7f3e5e2d2..b2b11573d 100644 --- a/docs/api.md +++ b/docs/api.md @@ -394,9 +394,6 @@ curl http://localhost:11434/api/generate -d '{ "repeat_penalty": 1.2, "presence_penalty": 1.5, "frequency_penalty": 1.0, - "mirostat": 1, - "mirostat_tau": 0.8, - "mirostat_eta": 0.6, "penalize_newline": true, "stop": ["\n", "user:"], "numa": false, @@ -404,10 +401,7 @@ curl http://localhost:11434/api/generate -d '{ "num_batch": 2, "num_gpu": 1, "main_gpu": 0, - "low_vram": false, - "vocab_only": false, "use_mmap": true, - "use_mlock": false, "num_thread": 8 } }' diff --git a/docs/faq.md b/docs/faq.md index 327afc6e5..6fe633414 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -20,7 +20,7 @@ Please refer to the [GPU docs](./gpu.md). ## How can I specify the context window size? -By default, Ollama uses a context window size of 4096 tokens, unless you have a single GPU with <= 4 GB of VRAM, in which case it will default to 2048 tokens. 
+By default, Ollama uses a context window size of 4096 tokens. This can be overridden with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use: @@ -31,7 +31,7 @@ OLLAMA_CONTEXT_LENGTH=8192 ollama serve To change this when using `ollama run`, use `/set parameter`: ```shell -/set parameter num_ctx 8192 +/set parameter num_ctx 4096 ``` When using the API, specify the `num_ctx` parameter: @@ -41,7 +41,7 @@ curl http://localhost:11434/api/generate -d '{ "model": "llama3.2", "prompt": "Why is the sky blue?", "options": { - "num_ctx": 8192 + "num_ctx": 4096 } }' ``` diff --git a/docs/gpu.md b/docs/gpu.md index b54c66ab6..61ff6e458 100644 --- a/docs/gpu.md +++ b/docs/gpu.md @@ -1,6 +1,6 @@ # GPU ## Nvidia -Ollama supports Nvidia GPUs with compute capability 5.0+. +Ollama supports Nvidia GPUs with compute capability 5.0+ and driver version 531 and newer. Check your compute compatibility to see if your card is supported: [https://developer.nvidia.com/cuda-gpus](https://developer.nvidia.com/cuda-gpus) diff --git a/docs/modelfile.md b/docs/modelfile.md index a71183f40..6513873ce 100644 --- a/docs/modelfile.md +++ b/docs/modelfile.md @@ -150,9 +150,6 @@ PARAMETER | Parameter | Description | Value Type | Example Usage | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------- | -------------------- | -| mirostat | Enable Mirostat sampling for controlling perplexity. (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) | int | mirostat 0 | -| mirostat_eta | Influences how quickly the algorithm responds to feedback from the generated text. A lower learning rate will result in slower adjustments, while a higher learning rate will make the algorithm more responsive. (Default: 0.1) | float | mirostat_eta 0.1 | -| mirostat_tau | Controls the balance between coherence and diversity of the output. A lower value will result in more focused and coherent text. (Default: 5.0) | float | mirostat_tau 5.0 | | num_ctx | Sets the size of the context window used to generate the next token. (Default: 2048) | int | num_ctx 4096 | | repeat_last_n | Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx) | int | repeat_last_n 64 | | repeat_penalty | Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. 
(Default: 1.1) | float | repeat_penalty 1.1 | diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index ba5487fef..995b33aca 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -43,7 +43,7 @@ Ollama includes multiple LLM libraries compiled for different GPUs and CPU vecto In the server log, you will see a message that looks something like this (varies from release to release): ``` -Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v11 rocm_v5] +Dynamic LLM libraries [rocm_v6 cpu cpu_avx cpu_avx2 cuda_v12 rocm_v5] ``` **Experimental LLM Library Override** diff --git a/envconfig/config.go b/envconfig/config.go index fcb0a6947..b18e93f89 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -169,7 +169,7 @@ var ( // Enable the new Ollama engine NewEngine = Bool("OLLAMA_NEW_ENGINE") // ContextLength sets the default context length - ContextLength = Int64("OLLAMA_CONTEXT_LENGTH", -1) + ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 4096) ) func String(s string) func() string { @@ -227,20 +227,6 @@ func Uint64(key string, defaultValue uint64) func() uint64 { } } -func Int64(key string, defaultValue int64) func() int64 { - return func() int64 { - if s := Var(key); s != "" { - if n, err := strconv.ParseInt(s, 10, 64); err != nil { - slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue) - } else { - return n - } - } - - return defaultValue - } -} - // Set aside VRAM per GPU var GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0) @@ -269,7 +255,7 @@ func AsMap() map[string]EnvVar { "OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", AllowedOrigins(), "A comma separated list of allowed origins"}, "OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"}, "OLLAMA_MULTIUSER_CACHE": {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"}, - "OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default 4096 or 2048 with low VRAM)"}, + "OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 4096)"}, "OLLAMA_NEW_ENGINE": {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"}, // Informational diff --git a/envconfig/config_test.go b/envconfig/config_test.go index 72bfb4df5..9e80645c7 100644 --- a/envconfig/config_test.go +++ b/envconfig/config_test.go @@ -278,9 +278,9 @@ func TestVar(t *testing.T) { } func TestContextLength(t *testing.T) { - cases := map[string]int64{ - "": -1, - "4096": 4096, + cases := map[string]uint{ + "": 4096, + "2048": 2048, } for k, v := range cases { diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go index 0d38f29e8..e128dfae4 100644 --- a/fs/ggml/ggml.go +++ b/fs/ggml/ggml.go @@ -37,12 +37,12 @@ func (kv KV) ParameterCount() uint64 { return val } -func (kv KV) FileType() fileType { +func (kv KV) FileType() FileType { if t := kv.Uint("general.file_type"); t > 0 { - return fileType(t) + return FileType(t) } - return fileTypeUnknown + return FileTypeUnknown } func (kv KV) BlockCount() uint64 { @@ -194,7 +194,7 @@ func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue .. 
return val, true } - slog.Warn("key with type not found", "key", key, "default", defaultValue[0]) + slog.Debug("key with type not found", "key", key, "default", defaultValue[0]) return defaultValue[0], false } @@ -271,7 +271,11 @@ func (t Tensor) block() (n int) { } func (t Tensor) blockSize() uint64 { - switch t.Kind { + return (TensorType)(t.Kind).BlockSize() +} + +func (t TensorType) BlockSize() uint64 { + switch t { case 0, // F32 1, // F16 @@ -297,73 +301,77 @@ func (t Tensor) blockSize() uint64 { } func (t Tensor) typeSize() uint64 { - blockSize := t.blockSize() + return TensorType(t.Kind).TypeSize() +} - switch t.Kind { - case 0: // FP32 +func (t TensorType) TypeSize() uint64 { + blockSize := t.BlockSize() + + switch t { + case TensorTypeF32: return 4 - case 1: // FP16 + case TensorTypeF16: return 2 - case 2: // Q4_0 + case TensorTypeQ4_0: return 2 + blockSize/2 - case 3: // Q4_1 + case TensorTypeQ4_1: return 2 + 2 + blockSize/2 - case 6: // Q5_0 + case TensorTypeQ5_0: return 2 + 4 + blockSize/2 - case 7: // Q5_1 + case TensorTypeQ5_1: return 2 + 2 + 4 + blockSize/2 - case 8: // Q8_0 + case TensorTypeQ8_0: return 2 + blockSize - case 9: // Q8_1 + case TensorTypeQ8_1: return 2 + 2 + blockSize - case 10: // Q2_K + case TensorTypeQ2_K: return blockSize/16 + blockSize/4 + 2 + 2 - case 11: // Q3_K + case TensorTypeQ3_K: return blockSize/8 + blockSize/4 + 12 + 2 - case 12: // Q4_K + case TensorTypeQ4_K: return 2 + 2 + 12 + blockSize/2 - case 13: // Q5_K + case TensorTypeQ5_K: return 2 + 2 + 12 + blockSize/8 + blockSize/2 - case 14: // Q6_K + case TensorTypeQ6_K: return blockSize/2 + blockSize/4 + blockSize/16 + 2 - case 15: // Q8_K + case TensorTypeQ8_K: return 4 + blockSize + 2*blockSize/16 - case 16: // IQ2_XXS + case tensorTypeIQ2_XXS: return 2 + 2*blockSize/8 - case 17: // IQ2_XS + case tensorTypeIQ2_XS: return 2 + 2*blockSize/8 + blockSize/32 - case 18: // IQ3_XXS + case tensorTypeIQ3_XXS: return 2 + blockSize/4 + blockSize/8 - case 19: // IQ1_S + case tensorTypeIQ1_S: return 2 + blockSize/8 + blockSize/16 - case 20: // IQ4_NL + case tensorTypeIQ4_NL: return 2 + blockSize/2 - case 21: // IQ3_S + case tensorTypeIQ3_S: return 2 + blockSize/4 + blockSize/8 + blockSize/32 + 4 - case 22: // IQ2_S + case tensorTypeIQ2_S: return 2 + blockSize/4 + blockSize/16 - case 23: // IQ4_XS + case tensorTypeIQ4_XS: return 2 + 2 + blockSize/2 + blockSize/64 - case 24: // I8 + case TensorTypeI8: return 1 - case 25: // I16 + case TensorTypeI16: return 2 - case 26: // I32 + case TensorTypeI32: return 4 - case 27: // I64 + case TensorTypeI64: return 8 - case 28: // F64 + case TensorTypeF64: return 8 - case 29: // IQ1_M + case tensorTypeIQ1_M: return blockSize/8 + blockSize/16 + blockSize/32 - case 30: // BF16 + case TensorTypeBF16: return 2 default: return 0 } } -func (t Tensor) parameters() uint64 { +func (t Tensor) Elements() uint64 { var count uint64 = 1 for _, n := range t.Shape { count *= n @@ -372,11 +380,11 @@ func (t Tensor) parameters() uint64 { } func (t Tensor) Size() uint64 { - return t.parameters() * t.typeSize() / t.blockSize() + return t.Elements() * t.typeSize() / t.blockSize() } func (t Tensor) Type() string { - return fileType(t.Kind).String() + return TensorType(t.Kind).String() } type container interface { @@ -525,7 +533,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri var ropeFreqsCount uint64 if ropeFreqs, ok := f.Tensors().GroupLayers()["rope_freqs"]; ok { if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok { - ropeFreqsCount = 
ropeFreqsWeights.parameters() + ropeFreqsCount = ropeFreqsWeights.Elements() } } diff --git a/fs/ggml/gguf.go b/fs/ggml/gguf.go index fb3421576..8e75625e0 100644 --- a/fs/ggml/gguf.go +++ b/fs/ggml/gguf.go @@ -9,8 +9,12 @@ import ( "io" "log/slog" "maps" + "os" + "runtime" "slices" "strings" + + "golang.org/x/sync/errgroup" ) type containerGGUF struct { @@ -225,7 +229,7 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error { } llm.tensors = append(llm.tensors, &tensor) - llm.parameters += tensor.parameters() + llm.parameters += tensor.Elements() } // patch KV with parameter count @@ -488,25 +492,38 @@ func writeGGUFArray[S ~[]E, E any](w io.Writer, t uint32, s S) error { return err } + if t == ggufTypeString { + for _, e := range any(s).([]string) { + if err := binary.Write(w, binary.LittleEndian, uint64(len(e))); err != nil { + return err + } + + if err := binary.Write(w, binary.LittleEndian, []byte(e)); err != nil { + return err + } + } + return nil + } + return binary.Write(w, binary.LittleEndian, s) } -func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error { +func WriteGGUF(f *os.File, kv KV, ts []*Tensor) error { alignment := kv.Uint("general.alignment", 32) - if err := binary.Write(ws, binary.LittleEndian, []byte("GGUF")); err != nil { + if err := binary.Write(f, binary.LittleEndian, []byte("GGUF")); err != nil { return err } - if err := binary.Write(ws, binary.LittleEndian, uint32(3)); err != nil { + if err := binary.Write(f, binary.LittleEndian, uint32(3)); err != nil { return err } - if err := binary.Write(ws, binary.LittleEndian, uint64(len(ts))); err != nil { + if err := binary.Write(f, binary.LittleEndian, uint64(len(ts))); err != nil { return err } - if err := binary.Write(ws, binary.LittleEndian, uint64(len(kv))); err != nil { + if err := binary.Write(f, binary.LittleEndian, uint64(len(kv))); err != nil { return err } @@ -514,12 +531,12 @@ func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error { slices.Sort(keys) for _, key := range keys { - if err := ggufWriteKV(ws, key, kv[key]); err != nil { + if err := ggufWriteKV(f, key, kv[key]); err != nil { return err } } - slices.SortStableFunc(ts, func(a, b Tensor) int { + slices.SortStableFunc(ts, func(a, b *Tensor) int { if i, j := a.block(), b.block(); i < 0 && j > 0 { return 1 } else if i > 0 && j < 0 { @@ -530,21 +547,34 @@ func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error { }) var s uint64 - for _, t := range ts { - t.Offset = s + uint64(ggufPadding(int64(s), int64(alignment))) - if err := ggufWriteTensorInfo(ws, t); err != nil { + for i := range ts { + ts[i].Offset = s + if err := ggufWriteTensorInfo(f, ts[i]); err != nil { return err } - s += t.Size() + s += ts[i].Size() + s += uint64(ggufPadding(int64(s), int64(alignment))) } + offset, err := f.Seek(0, io.SeekCurrent) + if err != nil { + return err + } + offset += ggufPadding(offset, int64(alignment)) + + var g errgroup.Group + g.SetLimit(runtime.GOMAXPROCS(0)) + // TODO consider reducing if tensors size * gomaxprocs is larger than free memory for _, t := range ts { - if err := ggufWriteTensor(ws, t, int64(alignment)); err != nil { + t := t + w := io.NewOffsetWriter(f, offset+int64(t.Offset)) + g.Go(func() error { + _, err := t.WriteTo(w) return err - } + }) } - return nil + return g.Wait() } func ggufWriteKV(ws io.WriteSeeker, k string, v any) error { @@ -559,8 +589,10 @@ func ggufWriteKV(ws io.WriteSeeker, k string, v any) error { var err error switch v := v.(type) { - case uint32: + case uint32, FileType: err = writeGGUF(ws, ggufTypeUint32, v) + case 
uint64: + err = writeGGUF(ws, ggufTypeUint64, v) case float32: err = writeGGUF(ws, ggufTypeFloat32, v) case bool: @@ -569,32 +601,20 @@ func ggufWriteKV(ws io.WriteSeeker, k string, v any) error { err = writeGGUFString(ws, v) case []int32: err = writeGGUFArray(ws, ggufTypeInt32, v) + case *array[int32]: + err = writeGGUFArray(ws, ggufTypeInt32, v.values) case []uint32: err = writeGGUFArray(ws, ggufTypeUint32, v) + case *array[uint32]: + err = writeGGUFArray(ws, ggufTypeUint32, v.values) case []float32: err = writeGGUFArray(ws, ggufTypeFloat32, v) + case *array[float32]: + err = writeGGUFArray(ws, ggufTypeFloat32, v.values) case []string: - if err := binary.Write(ws, binary.LittleEndian, ggufTypeArray); err != nil { - return err - } - - if err := binary.Write(ws, binary.LittleEndian, ggufTypeString); err != nil { - return err - } - - if err := binary.Write(ws, binary.LittleEndian, uint64(len(v))); err != nil { - return err - } - - for _, e := range v { - if err := binary.Write(ws, binary.LittleEndian, uint64(len(e))); err != nil { - return err - } - - if err := binary.Write(ws, binary.LittleEndian, []byte(e)); err != nil { - return err - } - } + err = writeGGUFArray(ws, ggufTypeString, v) + case *array[string]: + err = writeGGUFArray(ws, ggufTypeString, v.values) default: return fmt.Errorf("improper type for '%s'", k) } @@ -602,7 +622,7 @@ func ggufWriteKV(ws io.WriteSeeker, k string, v any) error { return err } -func ggufWriteTensorInfo(ws io.WriteSeeker, t Tensor) error { +func ggufWriteTensorInfo(ws io.WriteSeeker, t *Tensor) error { slog.Debug(t.Name, "kind", t.Kind, "shape", t.Shape, "offset", t.Offset) if err := binary.Write(ws, binary.LittleEndian, uint64(len(t.Name))); err != nil { return err @@ -629,20 +649,6 @@ func ggufWriteTensorInfo(ws io.WriteSeeker, t Tensor) error { return binary.Write(ws, binary.LittleEndian, t.Offset) } -func ggufWriteTensor(ws io.WriteSeeker, t Tensor, alignment int64) error { - offset, err := ws.Seek(0, io.SeekCurrent) - if err != nil { - return err - } - - if err := binary.Write(ws, binary.LittleEndian, bytes.Repeat([]byte{0}, int(ggufPadding(offset, alignment)))); err != nil { - return err - } - - _, err = t.WriteTo(ws) - return err -} - func ggufPadding(offset, align int64) int64 { return (align - offset%align) % align } diff --git a/fs/ggml/gguf_test.go b/fs/ggml/gguf_test.go new file mode 100644 index 000000000..10d3b6849 --- /dev/null +++ b/fs/ggml/gguf_test.go @@ -0,0 +1,63 @@ +package ggml + +import ( + "bytes" + "os" + "slices" + "testing" + + "github.com/google/go-cmp/cmp" +) + +func TestWriteGGUF(t *testing.T) { + w, err := os.CreateTemp(t.TempDir(), "*.bin") + if err != nil { + t.Fatal(err) + } + defer w.Close() + + if err := WriteGGUF(w, KV{ + "general.alignment": uint32(16), + }, []*Tensor{ + {Name: "test.0", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))}, + {Name: "test.1", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))}, + {Name: "test.2", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))}, + {Name: "test.3", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))}, + {Name: "test.4", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))}, + {Name: "test.5", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))}, + }); err != nil { + t.Fatal(err) + } + + r, err := os.Open(w.Name()) + if err != nil { + t.Fatal(err) + } + defer r.Close() + + ff, _, err := 
Decode(r, 0) + if err != nil { + t.Fatal(err) + } + + if diff := cmp.Diff(ff.KV(), KV{ + "general.alignment": uint32(16), + "general.parameter_count": uint64(36), + }); diff != "" { + t.Errorf("Mismatch (-want +got):\n%s", diff) + } + + if diff := cmp.Diff(ff.Tensors(), Tensors{ + Offset: 336, + items: []*Tensor{ + {Name: "test.0", Offset: 0, Shape: []uint64{2, 3}}, + {Name: "test.1", Offset: 32, Shape: []uint64{2, 3}}, + {Name: "test.2", Offset: 64, Shape: []uint64{2, 3}}, + {Name: "test.3", Offset: 96, Shape: []uint64{2, 3}}, + {Name: "test.4", Offset: 128, Shape: []uint64{2, 3}}, + {Name: "test.5", Offset: 160, Shape: []uint64{2, 3}}, + }, + }, cmp.AllowUnexported(Tensors{})); diff != "" { + t.Errorf("Mismatch (-want +got):\n%s", diff) + } +} diff --git a/fs/ggml/type.go b/fs/ggml/type.go index 7265afbcd..8172c46d9 100644 --- a/fs/ggml/type.go +++ b/fs/ggml/type.go @@ -1,185 +1,341 @@ package ggml -import "fmt" - -type fileType uint32 - -const ( - fileTypeF32 fileType = iota - fileTypeF16 - fileTypeQ4_0 - fileTypeQ4_1 - fileTypeQ4_1_F16 - fileTypeQ4_2 // unused - fileTypeQ4_3 // unused - fileTypeQ8_0 - fileTypeQ5_0 - fileTypeQ5_1 - fileTypeQ2_K - fileTypeQ3_K_S - fileTypeQ3_K_M - fileTypeQ3_K_L - fileTypeQ4_K_S - fileTypeQ4_K_M - fileTypeQ5_K_S - fileTypeQ5_K_M - fileTypeQ6_K - fileTypeIQ2_XXS - fileTypeIQ2_XS - fileTypeQ2_K_S - fileTypeIQ3_XS - fileTypeIQ3_XXS - fileTypeIQ1_S - fileTypeIQ4_NL - fileTypeIQ3_S - fileTypeIQ3_M - fileTypeIQ2_S - fileTypeIQ2_M - fileTypeIQ4_XS - fileTypeIQ1_M - fileTypeBF16 - - fileTypeUnknown +import ( + "fmt" + "log/slog" + "strings" ) -func ParseFileType(s string) (fileType, error) { +// FileType is the Go equivalent to llama_ftype used for gguf file typing +type FileType uint32 + +const ( + FileTypeF32 FileType = iota + FileTypeF16 + FileTypeQ4_0 + FileTypeQ4_1 + fileTypeQ4_1_F16 // unused by GGML + fileTypeQ4_2 // unused by GGML + fileTypeQ4_3 // unused by GGML + FileTypeQ8_0 + FileTypeQ5_0 + FileTypeQ5_1 + FileTypeQ2_K + FileTypeQ3_K_S + FileTypeQ3_K_M + FileTypeQ3_K_L + FileTypeQ4_K_S + FileTypeQ4_K_M + FileTypeQ5_K_S + FileTypeQ5_K_M + FileTypeQ6_K + fileTypeIQ2_XXS // not supported by ollama + fileTypeIQ2_XS // not supported by ollama + FileTypeQ2_K_S + fileTypeIQ3_XS // not supported by ollama + fileTypeIQ3_XXS // not supported by ollama + fileTypeIQ1_S // not supported by ollama + fileTypeIQ4_NL // not supported by ollama + fileTypeIQ3_S // not supported by ollama + fileTypeIQ3_M // not supported by ollama + fileTypeIQ2_S // not supported by ollama + fileTypeIQ2_M // not supported by ollama + fileTypeIQ4_XS // not supported by ollama + fileTypeIQ1_M // not supported by ollama + FileTypeBF16 + fileTypeQ4_0_4_4 // unused by GGML + fileTypeQ4_0_4_8 // unused by GGML + fileTypeQ4_0_8_8 // unused by GGML + fileTypeTQ1_0 // not supported by ollama + fileTypeTQ2_0 // not supported by ollama + + FileTypeUnknown = 1024 +) + +// ParseFileType parses the provided GGUF file type +// Only Ollama supported types are considered valid +func ParseFileType(s string) (FileType, error) { switch s { case "F32": - return fileTypeF32, nil + return FileTypeF32, nil case "F16": - return fileTypeF16, nil + return FileTypeF16, nil case "Q4_0": - return fileTypeQ4_0, nil + return FileTypeQ4_0, nil case "Q4_1": - return fileTypeQ4_1, nil - case "Q4_1_F16": - return fileTypeQ4_1_F16, nil + return FileTypeQ4_1, nil case "Q8_0": - return fileTypeQ8_0, nil + return FileTypeQ8_0, nil case "Q5_0": - return fileTypeQ5_0, nil + return FileTypeQ5_0, nil case "Q5_1": - return 
fileTypeQ5_1, nil + return FileTypeQ5_1, nil case "Q2_K": - return fileTypeQ2_K, nil + return FileTypeQ2_K, nil case "Q3_K_S": - return fileTypeQ3_K_S, nil + return FileTypeQ3_K_S, nil case "Q3_K_M": - return fileTypeQ3_K_M, nil + return FileTypeQ3_K_M, nil case "Q3_K_L": - return fileTypeQ3_K_L, nil + return FileTypeQ3_K_L, nil case "Q4_K_S": - return fileTypeQ4_K_S, nil - case "Q4_K_M": - return fileTypeQ4_K_M, nil + return FileTypeQ4_K_S, nil + case "Q4_K_M", "Q4_K": + return FileTypeQ4_K_M, nil case "Q5_K_S": - return fileTypeQ5_K_S, nil - case "Q5_K_M": - return fileTypeQ5_K_M, nil + return FileTypeQ5_K_S, nil + case "Q5_K_M", "Q5_K": + return FileTypeQ5_K_M, nil case "Q6_K": - return fileTypeQ6_K, nil - case "IQ2_XXS": - return fileTypeIQ2_XXS, nil - case "IQ2_XS": - return fileTypeIQ2_XS, nil + return FileTypeQ6_K, nil case "Q2_K_S": - return fileTypeQ2_K_S, nil - case "IQ3_XS": - return fileTypeIQ3_XS, nil - case "IQ3_XXS": - return fileTypeIQ3_XXS, nil - case "IQ1_S": - return fileTypeIQ1_S, nil - case "IQ4_NL": - return fileTypeIQ4_NL, nil - case "IQ3_S": - return fileTypeIQ3_S, nil - case "IQ3_M": - return fileTypeIQ3_M, nil - case "IQ2_S": - return fileTypeIQ2_S, nil - case "IQ2_M": - return fileTypeIQ2_M, nil - case "IQ4_XS": - return fileTypeIQ4_XS, nil - case "IQ1_M": - return fileTypeIQ1_M, nil + return FileTypeQ2_K_S, nil case "BF16": - return fileTypeBF16, nil + return FileTypeBF16, nil default: - return fileTypeUnknown, fmt.Errorf("unknown fileType: %s", s) + supportedFileTypes := []FileType{ + FileTypeF32, + FileTypeF16, + FileTypeQ4_K_S, + FileTypeQ4_K_M, + FileTypeQ8_0, + // fsggml.FileTypeBF16, // TODO + } + strs := make([]string, len(supportedFileTypes)) + for i := range supportedFileTypes { + strs[i] = supportedFileTypes[i].String() + } + + return FileTypeUnknown, fmt.Errorf("unsupported quantization type %s - supported types are %s", s, strings.Join(strs, ", ")) } } -func (t fileType) String() string { +func (t FileType) String() string { switch t { - case fileTypeF32: + case FileTypeF32: return "F32" - case fileTypeF16: + case FileTypeF16: return "F16" - case fileTypeQ4_0: + case FileTypeQ4_0: return "Q4_0" - case fileTypeQ4_1: + case FileTypeQ4_1: return "Q4_1" - case fileTypeQ4_1_F16: - return "Q4_1_F16" - case fileTypeQ8_0: + case FileTypeQ8_0: return "Q8_0" - case fileTypeQ5_0: + case FileTypeQ5_0: return "Q5_0" - case fileTypeQ5_1: + case FileTypeQ5_1: return "Q5_1" - case fileTypeQ2_K: + case FileTypeQ2_K: return "Q2_K" - case fileTypeQ3_K_S: + case FileTypeQ3_K_S: return "Q3_K_S" - case fileTypeQ3_K_M: + case FileTypeQ3_K_M: return "Q3_K_M" - case fileTypeQ3_K_L: + case FileTypeQ3_K_L: return "Q3_K_L" - case fileTypeQ4_K_S: + case FileTypeQ4_K_S: return "Q4_K_S" - case fileTypeQ4_K_M: + case FileTypeQ4_K_M: return "Q4_K_M" - case fileTypeQ5_K_S: + case FileTypeQ5_K_S: return "Q5_K_S" - case fileTypeQ5_K_M: + case FileTypeQ5_K_M: return "Q5_K_M" - case fileTypeQ6_K: + case FileTypeQ6_K: return "Q6_K" - case fileTypeIQ2_XXS: - return "IQ2_XXS" - case fileTypeIQ2_XS: - return "IQ2_XS" - case fileTypeQ2_K_S: + case FileTypeQ2_K_S: return "Q2_K_S" - case fileTypeIQ3_XS: - return "IQ3_XS" - case fileTypeIQ3_XXS: - return "IQ3_XXS" - case fileTypeIQ1_S: - return "IQ1_S" - case fileTypeIQ4_NL: - return "IQ4_NL" - case fileTypeIQ3_S: - return "IQ3_S" - case fileTypeIQ3_M: - return "IQ3_M" - case fileTypeIQ2_S: - return "IQ2_S" - case fileTypeIQ4_XS: - return "IQ4_XS" - case fileTypeIQ2_M: - return "IQ2_M" - case fileTypeIQ1_M: - return "IQ1_M" - case fileTypeBF16: 
+ case FileTypeBF16: return "BF16" default: return "unknown" } } -func (t fileType) Value() uint32 { +func (t FileType) Value() uint32 { return uint32(t) } + +func (ftype FileType) ToTensorType() TensorType { + switch ftype { + case FileTypeF32: + return TensorTypeF32 + case FileTypeF16: + return TensorTypeF16 + case FileTypeQ4_0: + return TensorTypeQ4_0 + case FileTypeQ4_1: + return TensorTypeQ4_1 + case FileTypeQ8_0: + return TensorTypeQ8_0 + case FileTypeQ5_0: + return TensorTypeQ5_0 + case FileTypeQ5_1: + return TensorTypeQ5_1 + case FileTypeQ2_K: + return TensorTypeQ2_K + case FileTypeQ3_K_S: + return TensorTypeQ3_K + case FileTypeQ3_K_M: + return TensorTypeQ3_K + case FileTypeQ3_K_L: + return TensorTypeQ3_K + case FileTypeQ4_K_S: + return TensorTypeQ4_K + case FileTypeQ4_K_M: + return TensorTypeQ4_K + case FileTypeQ5_K_S: + return TensorTypeQ5_K + case FileTypeQ5_K_M: + return TensorTypeQ5_K + case FileTypeQ6_K: + return TensorTypeQ6_K + case FileTypeQ2_K_S: + return TensorTypeQ2_K + case FileTypeBF16: + return TensorTypeBF16 + default: + slog.Warn("unsupported file type", "type", ftype) + return 0 // F32 + } +} + +// TensorType is equivalent to ggml_type for individual tensor types +// Note: these are not the same as FileType +type TensorType uint32 + +const ( + TensorTypeF32 TensorType = iota + TensorTypeF16 + TensorTypeQ4_0 + TensorTypeQ4_1 + tensorTypeQ4_2 // unused by GGML + tensorTypeQ4_3 // unused by GGML + TensorTypeQ5_0 + TensorTypeQ5_1 + TensorTypeQ8_0 + TensorTypeQ8_1 + TensorTypeQ2_K + TensorTypeQ3_K + TensorTypeQ4_K + TensorTypeQ5_K + TensorTypeQ6_K + TensorTypeQ8_K + tensorTypeIQ2_XXS // not supported by ollama + tensorTypeIQ2_XS // not supported by ollama + tensorTypeIQ3_XXS // not supported by ollama + tensorTypeIQ1_S // not supported by ollama + tensorTypeIQ4_NL // not supported by ollama + tensorTypeIQ3_S // not supported by ollama + tensorTypeIQ2_S // not supported by ollama + tensorTypeIQ4_XS // not supported by ollama + TensorTypeI8 + TensorTypeI16 + TensorTypeI32 + TensorTypeI64 + TensorTypeF64 + tensorTypeIQ1_M // not supported by ollama + TensorTypeBF16 + tensorTypeQ4_0_4_4 // unused by GGML + tensorTypeQ4_0_4_8 // unused by GGML + tensorTypeQ4_0_8_8 // unused by GGML + tensorTypeTQ1_0 // not supported by ollama + tensorTypeTQ2_0 // not supported by ollama + tensorTypeIQ4_NL_4_4 // unused by GGML + tensorTypeIQ4_NL_4_8 // unused by GGML + tensorTypeIQ4_NL_8_8 // unused by GGML +) + +// ParseFileType parses the provided GGUF file type +// Only Ollama supported types are considered valid +func ParseTensorType(s string) (TensorType, error) { + switch s { + case "F32": + return TensorTypeF32, nil + case "F16": + return TensorTypeF16, nil + case "Q4_0": + return TensorTypeQ4_0, nil + case "Q4_1": + return TensorTypeQ4_1, nil + case "Q5_0": + return TensorTypeQ5_0, nil + case "Q5_1": + return TensorTypeQ5_1, nil + case "Q8_0": + return TensorTypeQ8_0, nil + case "Q8_1": + return TensorTypeQ8_1, nil + case "Q2_K": + return TensorTypeQ2_K, nil + case "Q3_K": + return TensorTypeQ3_K, nil + case "Q4_K": + return TensorTypeQ4_K, nil + case "Q5_K": + return TensorTypeQ5_K, nil + case "Q6_K": + return TensorTypeQ6_K, nil + case "Q8_K": + return TensorTypeQ8_K, nil + case "F64": + return TensorTypeF64, nil + case "BF16": + return TensorTypeBF16, nil + default: + return 0, fmt.Errorf("unsupported quantization type %s", s) + } +} + +func (t TensorType) IsQuantized() bool { + switch t { + case TensorTypeF32, TensorTypeF16, TensorTypeBF16: + return false + default: + return 
true + } +} + +func (t TensorType) RowSize(ne uint64) uint64 { + return t.TypeSize() * ne / t.BlockSize() +} + +func (t TensorType) String() string { + switch t { + case TensorTypeF32: + return "F32" + case TensorTypeF16: + return "F16" + case TensorTypeQ4_0: + return "Q4_0" + case TensorTypeQ4_1: + return "Q4_1" + case TensorTypeQ5_0: + return "Q5_0" + case TensorTypeQ5_1: + return "Q5_1" + case TensorTypeQ8_0: + return "Q8_0" + case TensorTypeQ8_1: + return "Q8_1" + case TensorTypeQ2_K: + return "Q2_K" + case TensorTypeQ3_K: + return "Q3_K" + case TensorTypeQ4_K: + return "Q4_K" + case TensorTypeQ5_K: + return "Q5_K" + case TensorTypeQ6_K: + return "Q6_K" + case TensorTypeQ8_K: + return "Q8_K" + case TensorTypeF64: + return "F64" + case TensorTypeBF16: + return "BF16" + default: + return "unknown" + } +} diff --git a/go.mod b/go.mod index cc5789005..283286b7d 100644 --- a/go.mod +++ b/go.mod @@ -11,7 +11,7 @@ require ( github.com/spf13/cobra v1.7.0 github.com/stretchr/testify v1.9.0 github.com/x448/float16 v0.8.4 - golang.org/x/sync v0.11.0 + golang.org/x/sync v0.12.0 ) require ( @@ -70,12 +70,12 @@ require ( github.com/twitchyliquid64/golang-asm v0.15.1 // indirect github.com/ugorji/go/codec v1.2.12 // indirect golang.org/x/arch v0.8.0 // indirect - golang.org/x/crypto v0.33.0 + golang.org/x/crypto v0.36.0 golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa - golang.org/x/net v0.35.0 // indirect - golang.org/x/sys v0.30.0 - golang.org/x/term v0.29.0 - golang.org/x/text v0.22.0 + golang.org/x/net v0.38.0 // indirect + golang.org/x/sys v0.31.0 + golang.org/x/term v0.30.0 + golang.org/x/text v0.23.0 google.golang.org/protobuf v1.34.1 gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/go.sum b/go.sum index 0ab97b909..5755616f6 100644 --- a/go.sum +++ b/go.sum @@ -214,8 +214,8 @@ golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACk golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.33.0 h1:IOBPskki6Lysi0lo9qQvbxiQ+FvsCC/YWOecCHAixus= -golang.org/x/crypto v0.33.0/go.mod h1:bVdXmD7IV/4GdElGPozy6U7lWdRXA4qyRVGJV57uQ5M= +golang.org/x/crypto v0.36.0 h1:AnAEvhDddvBdpY+uR+MyHmuZzzNqXSe/GvuDeob5L34= +golang.org/x/crypto v0.36.0/go.mod h1:Y4J0ReaxCR1IMaabaSMugxJES1EpwhBHhv2bDHklZvc= golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= @@ -257,8 +257,8 @@ golang.org/x/net v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81R golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM= golang.org/x/net v0.0.0-20210614182718-04defd469f4e/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= -golang.org/x/net v0.35.0 h1:T5GQRQb2y08kTAByq9L4/bz8cipCdA8FbRTXewonqY8= -golang.org/x/net v0.35.0/go.mod h1:EglIi67kWsHKlRzzVMUD93VMSWGFOMSZgxFjparz1Qk= +golang.org/x/net v0.38.0 h1:vRMAPTMaeGqVhG5QyLJHqNDwecKTomGeqbnfZyKlBI8= +golang.org/x/net v0.38.0/go.mod 
h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -268,8 +268,8 @@ golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.11.0 h1:GGz8+XQP4FvTTrjZPzNKTMFtSXH80RAzG+5ghFPgK9w= -golang.org/x/sync v0.11.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.12.0 h1:MHc5BpPuC30uJk597Ri8TV3CNZcTLu6B6z4lJy+g6Jw= +golang.org/x/sync v0.12.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -285,17 +285,17 @@ golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc= -golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.31.0 h1:ioabZlmFYtWhL+TRYpcnNlLwhyxaM9kWTDEmfnprqik= +golang.org/x/sys v0.31.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= -golang.org/x/term v0.29.0 h1:L6pJp37ocefwRRtYPKSWOWzOtWSxVajvz2ldH/xi3iU= -golang.org/x/term v0.29.0/go.mod h1:6bl4lRlvVuDgSf3179VpIxBF0o10JUpXWOnI7nErv7s= +golang.org/x/term v0.30.0 h1:PQ39fJZ+mfadBm0y5WlL4vlM7Sx1Hgf13sMIY2+QS9Y= +golang.org/x/term v0.30.0/go.mod h1:NYYFdzHoI5wRh/h5tDMdMqCqPJZEuNqVR5xJLd/n67g= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.22.0 h1:bofq7m3/HAFvbF51jz3Q9wLg3jkvSPuiZu/pD1XwgtM= -golang.org/x/text v0.22.0/go.mod h1:YRoo4H8PVmsu+E3Ou7cqLVH8oXWIHVoX0jqUWALQhfY= +golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY= +golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4= golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= diff --git a/integration/embed_test.go 
b/integration/embed_test.go index 8a95816a5..09369dbb4 100644 --- a/integration/embed_test.go +++ b/integration/embed_test.go @@ -34,13 +34,15 @@ func cosineSimilarity[V float32 | float64](v1, v2 []V) V { func TestAllMiniLMEmbeddings(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) defer cancel() + client, _, cleanup := InitServerConnection(ctx, t) + defer cleanup() req := api.EmbeddingRequest{ Model: "all-minilm", Prompt: "why is the sky blue?", } - res, err := embeddingTestHelper(ctx, t, req) + res, err := embeddingTestHelper(ctx, client, t, req) if err != nil { t.Fatalf("error: %v", err) @@ -62,13 +64,15 @@ func TestAllMiniLMEmbeddings(t *testing.T) { func TestAllMiniLMEmbed(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) defer cancel() + client, _, cleanup := InitServerConnection(ctx, t) + defer cleanup() req := api.EmbedRequest{ Model: "all-minilm", Input: "why is the sky blue?", } - res, err := embedTestHelper(ctx, t, req) + res, err := embedTestHelper(ctx, client, t, req) if err != nil { t.Fatalf("error: %v", err) @@ -98,13 +102,15 @@ func TestAllMiniLMEmbed(t *testing.T) { func TestAllMiniLMBatchEmbed(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) defer cancel() + client, _, cleanup := InitServerConnection(ctx, t) + defer cleanup() req := api.EmbedRequest{ Model: "all-minilm", Input: []string{"why is the sky blue?", "why is the grass green?"}, } - res, err := embedTestHelper(ctx, t, req) + res, err := embedTestHelper(ctx, client, t, req) if err != nil { t.Fatalf("error: %v", err) @@ -144,6 +150,8 @@ func TestAllMiniLMBatchEmbed(t *testing.T) { func TestAllMiniLMEmbedTruncate(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) defer cancel() + client, _, cleanup := InitServerConnection(ctx, t) + defer cleanup() truncTrue, truncFalse := true, false @@ -182,7 +190,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) { res := make(map[string]*api.EmbedResponse) for _, req := range reqs { - response, err := embedTestHelper(ctx, t, req.Request) + response, err := embedTestHelper(ctx, client, t, req.Request) if err != nil { t.Fatalf("error: %v", err) } @@ -198,7 +206,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) { } // check that truncate set to false returns an error if context length is exceeded - _, err := embedTestHelper(ctx, t, api.EmbedRequest{ + _, err := embedTestHelper(ctx, client, t, api.EmbedRequest{ Model: "all-minilm", Input: "why is the sky blue?", Truncate: &truncFalse, @@ -210,9 +218,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) { } } -func embeddingTestHelper(ctx context.Context, t *testing.T, req api.EmbeddingRequest) (*api.EmbeddingResponse, error) { - client, _, cleanup := InitServerConnection(ctx, t) - defer cleanup() +func embeddingTestHelper(ctx context.Context, client *api.Client, t *testing.T, req api.EmbeddingRequest) (*api.EmbeddingResponse, error) { if err := PullIfMissing(ctx, client, req.Model); err != nil { t.Fatalf("failed to pull model %s: %v", req.Model, err) } @@ -226,9 +232,7 @@ func embeddingTestHelper(ctx context.Context, t *testing.T, req api.EmbeddingReq return response, nil } -func embedTestHelper(ctx context.Context, t *testing.T, req api.EmbedRequest) (*api.EmbedResponse, error) { - client, _, cleanup := InitServerConnection(ctx, t) - defer cleanup() +func embedTestHelper(ctx context.Context, client *api.Client, t *testing.T, req api.EmbedRequest) (*api.EmbedResponse, error) { if err 
:= PullIfMissing(ctx, client, req.Model); err != nil { t.Fatalf("failed to pull model %s: %v", req.Model, err) } diff --git a/integration/model_arch_test.go b/integration/model_arch_test.go index e094d3cea..6ce183d79 100644 --- a/integration/model_arch_test.go +++ b/integration/model_arch_test.go @@ -48,17 +48,6 @@ var ( } ) -func getTimeouts(t *testing.T) (soft time.Duration, hard time.Duration) { - deadline, hasDeadline := t.Deadline() - if !hasDeadline { - return 8 * time.Minute, 10 * time.Minute - } else if deadline.Compare(time.Now().Add(2*time.Minute)) <= 0 { - t.Skip("too little time") - return time.Duration(0), time.Duration(0) - } - return -time.Since(deadline.Add(-2 * time.Minute)), -time.Since(deadline.Add(-20 * time.Second)) -} - func TestModelsGenerate(t *testing.T) { softTimeout, hardTimeout := getTimeouts(t) slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout) diff --git a/integration/quantization_test.go b/integration/quantization_test.go new file mode 100644 index 000000000..af9da0b62 --- /dev/null +++ b/integration/quantization_test.go @@ -0,0 +1,130 @@ +//go:build integration && models + +package integration + +import ( + "bytes" + "context" + "fmt" + "log/slog" + "strings" + "testing" + "time" + + "github.com/ollama/ollama/api" +) + +func TestQuantization(t *testing.T) { + sourceModels := []string{ + "qwen2.5:0.5b-instruct-fp16", + } + quantizations := []string{ + "Q8_0", + "Q4_K_S", + "Q4_K_M", + "Q4_K", + } + softTimeout, hardTimeout := getTimeouts(t) + started := time.Now() + slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout) + ctx, cancel := context.WithTimeout(context.Background(), hardTimeout) + defer cancel() + client, _, cleanup := InitServerConnection(ctx, t) + defer cleanup() + + for _, base := range sourceModels { + if err := PullIfMissing(ctx, client, base); err != nil { + t.Fatalf("pull failed %s", err) + } + for _, quant := range quantizations { + newName := fmt.Sprintf("%s__%s", base, quant) + t.Run(newName, func(t *testing.T) { + if time.Now().Sub(started) > softTimeout { + t.Skip("skipping remaining tests to avoid excessive runtime") + } + req := &api.CreateRequest{ + Model: newName, + Quantization: quant, + From: base, + } + fn := func(resp api.ProgressResponse) error { + // fmt.Print(".") + return nil + } + t.Logf("quantizing: %s -> %s", base, quant) + if err := client.Create(ctx, req, fn); err != nil { + t.Fatalf("create failed %s", err) + } + defer func() { + req := &api.DeleteRequest{ + Model: newName, + } + t.Logf("deleting: %s -> %s", base, quant) + if err := client.Delete(ctx, req); err != nil { + t.Logf("failed to clean up %s: %s", req.Model, err) + } + }() + // Check metadata on the model + resp, err := client.Show(ctx, &api.ShowRequest{Name: newName}) + if err != nil { + t.Fatalf("unable to show model: %s", err) + } + if !strings.Contains(resp.Details.QuantizationLevel, quant) { + t.Fatalf("unexpected quantization for %s:\ngot: %s", newName, resp.Details.QuantizationLevel) + } + + stream := true + genReq := api.GenerateRequest{ + Model: newName, + Prompt: "why is the sky blue?", + KeepAlive: &api.Duration{Duration: 3 * time.Second}, + Options: map[string]any{ + "seed": 42, + "temperature": 0.0, + }, + Stream: &stream, + } + t.Logf("verifying: %s -> %s", base, quant) + + // Some smaller quantizations can cause models to have poor quality + // or get stuck in repetition loops, so we stop as soon as we have any matches + anyResp := []string{"rayleigh", "scattering", "day", "sun", "moon", "color", 
"nitrogen", "oxygen"} + reqCtx, reqCancel := context.WithCancel(ctx) + atLeastOne := false + var buf bytes.Buffer + genfn := func(response api.GenerateResponse) error { + buf.Write([]byte(response.Response)) + fullResp := strings.ToLower(buf.String()) + for _, resp := range anyResp { + if strings.Contains(fullResp, resp) { + atLeastOne = true + t.Log(fullResp) + reqCancel() + break + } + } + return nil + } + + done := make(chan int) + var genErr error + go func() { + genErr = client.Generate(reqCtx, &genReq, genfn) + done <- 0 + }() + + select { + case <-done: + if genErr != nil && !atLeastOne { + t.Fatalf("failed with %s request prompt %s ", genReq.Model, genReq.Prompt) + } + case <-ctx.Done(): + t.Error("outer test context done while waiting for generate") + } + + t.Logf("passed") + + }) + } + } +} diff --git a/integration/utils_test.go b/integration/utils_test.go index e08806fca..19f4d1bf8 100644 --- a/integration/utils_test.go +++ b/integration/utils_test.go @@ -217,6 +217,7 @@ func InitServerConnection(ctx context.Context, t *testing.T) (*api.Client, strin slog.Error("failed to open server log", "logfile", lifecycle.ServerLogFile, "error", err) return } + defer fp.Close() data, err := io.ReadAll(fp) if err != nil { slog.Error("failed to read server log", "logfile", lifecycle.ServerLogFile, "error", err) @@ -358,3 +359,14 @@ func skipUnderMinVRAM(t *testing.T, gb uint64) { } } } + +func getTimeouts(t *testing.T) (soft time.Duration, hard time.Duration) { + deadline, hasDeadline := t.Deadline() + if !hasDeadline { + return 8 * time.Minute, 10 * time.Minute + } else if deadline.Compare(time.Now().Add(2*time.Minute)) <= 0 { + t.Skip("too little time") + return time.Duration(0), time.Duration(0) + } + return -time.Since(deadline.Add(-2 * time.Minute)), -time.Since(deadline.Add(-20 * time.Second)) +} diff --git a/kvcache/causal.go b/kvcache/causal.go index ea07932cd..9bc1d5da2 100644 --- a/kvcache/causal.go +++ b/kvcache/causal.go @@ -239,7 +239,7 @@ func (c *Causal) findStartLoc() (int, error) { } } - return 0, fmt.Errorf("%w (length: %v)", ErrKvCacheFull, len(c.cells)) + return 0, fmt.Errorf("%w (cache: %v batch: %v)", ErrKvCacheFull, len(c.cells), c.curBatchSize) } func (c *Causal) updateSlidingWindow() { diff --git a/llama/build-info.cpp b/llama/build-info.cpp index be908c364..27ce8e701 100644 --- a/llama/build-info.cpp +++ b/llama/build-info.cpp @@ -1,4 +1,4 @@ int LLAMA_BUILD_NUMBER = 0; -char const *LLAMA_COMMIT = "2016f07bd106c73699ecbaace80f55db5ed95dac"; +char const *LLAMA_COMMIT = "e1e8e0991ffd9e99a445c6812bb519d5bac9f4b5"; char const *LLAMA_COMPILER = ""; char const *LLAMA_BUILD_TARGET = ""; diff --git a/llama/llama.cpp/common/common.h b/llama/llama.cpp/common/common.h index e6eaa8e80..0a9dc0599 100644 --- a/llama/llama.cpp/common/common.h +++ b/llama/llama.cpp/common/common.h @@ -342,6 +342,8 @@ struct common_params { // multimodal models (see examples/llava) struct common_params_model mmproj; + bool mmproj_use_gpu = true; // use GPU for multimodal model + bool no_mmproj = false; // explicitly disable multimodal model std::vector image; // path to image file(s) // embedding diff --git a/llama/llama.cpp/common/json-schema-to-grammar.cpp b/llama/llama.cpp/common/json-schema-to-grammar.cpp index 56043678c..656b3ecaa 100644 --- a/llama/llama.cpp/common/json-schema-to-grammar.cpp +++ b/llama/llama.cpp/common/json-schema-to-grammar.cpp @@ -16,6 +16,9 @@ using json = nlohmann::ordered_json; static std::string build_repetition(const std::string & item_rule, int min_items, int 
max_items, const std::string & separator_rule = "") { auto has_max = max_items != std::numeric_limits::max(); + if (max_items == 0) { + return ""; + } if (min_items == 0 && max_items == 1) { return item_rule + "?"; } diff --git a/llama/llama.cpp/examples/llava/clip-impl.h b/llama/llama.cpp/examples/llava/clip-impl.h index 180ae9880..66cb21ef1 100644 --- a/llama/llama.cpp/examples/llava/clip-impl.h +++ b/llama/llama.cpp/examples/llava/clip-impl.h @@ -2,8 +2,6 @@ #include "gguf.h" #include "clip.h" -#include "clip.h" - #include #include #include @@ -17,33 +15,31 @@ #define KEY_FTYPE "general.file_type" #define KEY_NAME "general.name" #define KEY_DESCRIPTION "general.description" -#define KEY_HAS_TEXT_ENC "clip.has_text_encoder" -#define KEY_HAS_VIS_ENC "clip.has_vision_encoder" -#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector" -#define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector" -#define KEY_HAS_GLM_PROJ "clip.has_glm_projector" #define KEY_MINICPMV_VERSION "clip.minicpmv_version" -#define KEY_HAS_QWEN2VL_MERGER "clip.has_qwen2vl_merger" #define KEY_USE_GELU "clip.use_gelu" #define KEY_USE_SILU "clip.use_silu" -#define KEY_N_EMBD "clip.%s.embedding_length" -#define KEY_N_FF "clip.%s.feed_forward_length" -#define KEY_N_BLOCK "clip.%s.block_count" -#define KEY_N_HEAD "clip.%s.attention.head_count" -#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon" -#define KEY_PROJ_DIM "clip.%s.projection_dim" -#define KEY_TOKENS "tokenizer.ggml.tokens" -#define KEY_N_POSITIONS "clip.text.context_length" +#define KEY_N_EMBD "clip.vision.embedding_length" +#define KEY_N_FF "clip.vision.feed_forward_length" +#define KEY_N_BLOCK "clip.vision.block_count" +#define KEY_N_HEAD "clip.vision.attention.head_count" +#define KEY_LAYER_NORM_EPS "clip.vision.attention.layer_norm_epsilon" +#define KEY_PROJ_DIM "clip.vision.projection_dim" #define KEY_IMAGE_SIZE "clip.vision.image_size" #define KEY_PATCH_SIZE "clip.vision.patch_size" #define KEY_IMAGE_MEAN "clip.vision.image_mean" #define KEY_IMAGE_STD "clip.vision.image_std" -#define KEY_PROJ_TYPE "clip.projector_type" #define KEY_FEATURE_LAYER "clip.vision.feature_layer" +#define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor" +#define KEY_PROJ_TYPE "clip.projector_type" + +#define KEY_USE_GLU_MLP "clip.use_glu_mlp" // for qwen2.5vl +#define KEY_USE_RMS_NORM "clip.use_rms_norm" // for qwen2.5vl #define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type" #define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints" #define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution" +#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern" +#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size" // @@ -60,7 +56,9 @@ #define TN_ATTN_V "%s.blk.%d.attn_v.%s" #define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s" #define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s" +#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s" #define TN_FFN_UP "%s.blk.%d.ffn_up.%s" +#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s" #define TN_LN_1 "%s.blk.%d.ln1.%s" #define TN_LN_2 "%s.blk.%d.ln2.%s" #define TN_LN_PRE "%s.pre_ln.%s" @@ -72,6 +70,8 @@ #define TN_IMAGE_NEWLINE "model.image_newline" #define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3 #define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3 +#define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3 +#define TN_TOK_IMG_BREAK "v.token_embd.img_break" // pixtral // mimicpmv #define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k" @@ -87,18 +87,19 @@ #define TN_GLM_ADAPTER_D_H_2_4H 
"adapter.linear.dense_h_to_4h.%s" #define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s" #define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s" -#define TN_GLM_BOI_W "adapter.boi" -#define TN_GLM_EOI_W "adapter.eoi" enum projector_type { PROJECTOR_TYPE_MLP, PROJECTOR_TYPE_MLP_NORM, PROJECTOR_TYPE_LDP, PROJECTOR_TYPE_LDPV2, - PROJECTOR_TYPE_RESAMPLER, + PROJECTOR_TYPE_MINICPMV, PROJECTOR_TYPE_GLM_EDGE, - PROJECTOR_TYPE_MERGER, + PROJECTOR_TYPE_QWEN2VL, PROJECTOR_TYPE_GEMMA3, + PROJECTOR_TYPE_IDEFICS3, + PROJECTOR_TYPE_PIXTRAL, + PROJECTOR_TYPE_QWEN25VL, PROJECTOR_TYPE_UNKNOWN, }; @@ -106,10 +107,13 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_MLP, "mlp" }, { PROJECTOR_TYPE_LDP, "ldp" }, { PROJECTOR_TYPE_LDPV2, "ldpv2"}, - { PROJECTOR_TYPE_RESAMPLER, "resampler"}, + { PROJECTOR_TYPE_MINICPMV, "resampler"}, { PROJECTOR_TYPE_GLM_EDGE, "adapter"}, - { PROJECTOR_TYPE_MERGER, "qwen2vl_merger"}, + { PROJECTOR_TYPE_QWEN2VL, "qwen2vl_merger"}, + { PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"}, { PROJECTOR_TYPE_GEMMA3, "gemma3"}, + { PROJECTOR_TYPE_IDEFICS3, "idefics3"}, + { PROJECTOR_TYPE_PIXTRAL, "pixtral"}, }; static projector_type clip_projector_type_from_string(const std::string & str) { diff --git a/llama/llama.cpp/examples/llava/clip.cpp b/llama/llama.cpp/examples/llava/clip.cpp index d57b4bd6e..b3218c789 100644 --- a/llama/llama.cpp/examples/llava/clip.cpp +++ b/llama/llama.cpp/examples/llava/clip.cpp @@ -28,6 +28,7 @@ #include #include #include +#include #if defined(_WIN32) #define WIN32_LEAN_AND_MEAN @@ -172,14 +173,18 @@ struct clip_hparams { int32_t projection_dim; int32_t n_head; int32_t n_layer; + int32_t proj_scale_factor = 0; // idefics3 patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT; - float eps; + float eps = 1e-6; + float rope_theta = 0.0; std::vector image_grid_pinpoints; int32_t image_crop_resolution; std::unordered_set vision_feature_layer; + int32_t attn_window_size = 0; + int32_t n_wa_pattern = 0; }; struct clip_layer { @@ -199,11 +204,20 @@ struct clip_layer { struct ggml_tensor * ln_1_b = nullptr; // ff - struct ggml_tensor * ff_i_w = nullptr; - struct ggml_tensor * ff_i_b = nullptr; + struct ggml_tensor * ff_i_w = nullptr; // legacy naming + struct ggml_tensor * ff_i_b = nullptr; // legacy naming + struct ggml_tensor * ff_o_w = nullptr; // legacy naming + struct ggml_tensor * ff_o_b = nullptr; // legacy naming - struct ggml_tensor * ff_o_w = nullptr; - struct ggml_tensor * ff_o_b = nullptr; + struct ggml_tensor * ff_up_w = nullptr; + struct ggml_tensor * ff_up_b = nullptr; + struct ggml_tensor * ff_gate_w = nullptr; + struct ggml_tensor * ff_gate_b = nullptr; + struct ggml_tensor * ff_down_w = nullptr; + struct ggml_tensor * ff_down_b = nullptr; + + struct ggml_tensor * ff_g_w = NULL; + struct ggml_tensor * ff_g_b = NULL; // layernorm 2 struct ggml_tensor * ln_2_w = nullptr; @@ -249,8 +263,6 @@ struct clip_vision_model { //GLMV-Edge projection struct ggml_tensor * mm_model_adapter_conv_w = nullptr; struct ggml_tensor * mm_model_adapter_conv_b = nullptr; - struct ggml_tensor * boi_w = nullptr; - struct ggml_tensor * eoi_w = nullptr; // MobileVLM projection struct ggml_tensor * mm_model_mlp_1_w = nullptr; @@ -309,16 +321,14 @@ struct clip_vision_model { // gemma3 struct ggml_tensor * mm_input_proj_w = nullptr; struct ggml_tensor * mm_soft_emb_norm_w = nullptr; + + // pixtral + struct ggml_tensor * token_embd_img_break = nullptr; }; struct clip_ctx { - bool has_text_encoder = false; - bool has_vision_encoder = false; bool has_llava_projector = 
false; - bool has_minicpmv_projector = false; - bool has_glm_projector = false; - bool has_qwen2vl_merger = false; - int minicpmv_version = 2; + int minicpmv_version = 0; struct clip_vision_model vision_model; projector_type proj_type = PROJECTOR_TYPE_MLP; @@ -341,6 +351,7 @@ struct clip_ctx { ggml_backend_t backend_cpu; ggml_backend_buffer_ptr buf; + int max_nodes = 8192; ggml_backend_sched_ptr sched; clip_image_size load_image_size; @@ -376,23 +387,20 @@ struct clip_ctx { } }; -static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_image_f32_batch & imgs) { +static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_image_f32 & img) { const auto & model = ctx->vision_model; const auto & hparams = model.hparams; - const int image_size = hparams.image_size; - int image_size_width = image_size; - int image_size_height = image_size; + int image_size_width = img.nx; + int image_size_height = img.ny; - const int patch_size = hparams.patch_size; - const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); - const int hidden_size = hparams.hidden_size; - const int n_head = hparams.n_head; - const int d_head = hidden_size / n_head; - const int n_layer = hparams.n_layer; - const float eps = hparams.eps; - - GGML_ASSERT(imgs.entries.size() == 1); // batch_size == 1 + const int patch_size = hparams.patch_size; + const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); + const int hidden_size = hparams.hidden_size; + const int n_head = hparams.n_head; + const int d_head = hidden_size / n_head; + const int n_layer = hparams.n_layer; + const float eps = hparams.eps; struct ggml_init_params params = { /*.mem_size =*/ ctx->buf_compute_meta.size(), @@ -519,6 +527,482 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im embeddings = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)), embeddings); + + } else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) { + // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578 + + ggml_tensor * cur = embeddings; + const int scale_factor = model.hparams.proj_scale_factor; + const int n_embd = cur->ne[0]; + const int seq = cur->ne[1]; + const int bsz = 1; // batch size, always 1 for now since we don't support batching + const int height = std::sqrt(seq); + const int width = std::sqrt(seq); + GGML_ASSERT(scale_factor != 0); + cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height, bsz); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur), + n_embd * scale_factor * scale_factor, + height / scale_factor, + width / scale_factor, + bsz); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + cur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, cur), + n_embd * scale_factor * scale_factor, + seq / (scale_factor * scale_factor), + bsz); + + cur = ggml_mul_mat(ctx0, model.projection, cur); + embeddings = cur; + } else { + GGML_ABORT("SigLIP: Unsupported projector type"); + } + + // build the graph + ggml_build_forward_expand(gf, embeddings); + + return gf; +} + +// implementation of the 2D RoPE without adding a new op in ggml +// this is not efficient (use double the memory), but works on all backends +// TODO: there was a more efficient which relies on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors ; 
we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065 +static ggml_tensor * build_rope_2d( + ggml_context * ctx0, + ggml_tensor * cur, + ggml_tensor * pos_h, + ggml_tensor * pos_w, + const float freq_base +) { + const int64_t n_dim = cur->ne[0]; + const int64_t n_head = cur->ne[1]; + const int64_t n_pos = cur->ne[2]; + + // for example, if we have cur tensor of shape (n_dim=8, n_head, n_pos) + // we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3 + // first half of cur will use 1e-0, 1e-2 (even) + // second half of cur will use 1e-1, 1e-3 (odd) + // the trick here is to rotate just half of n_dim, so inv_freq will automatically be even + // ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2) + // then for the second half, we use freq_scale to shift the inv_freq + // ^ why? replace (2i) with (2i+1) in the above equation + const float freq_scale_odd = std::pow(freq_base, (float)-2/n_dim); + + // first half + ggml_tensor * first; + { + first = ggml_view_3d(ctx0, cur, + n_dim/2, n_head, n_pos, + ggml_row_size(cur->type, n_dim), + ggml_row_size(cur->type, n_dim*n_head), + 0); + first = ggml_rope_ext( + ctx0, + first, + pos_h, // positions + nullptr, // freq factors + n_dim/2, // n_dims + 0, 0, freq_base, + 1.0f, 0.0f, 1.0f, 0.0f, 0.0f + ); + } + + // second half + ggml_tensor * second; + { + second = ggml_view_3d(ctx0, cur, + n_dim/2, n_head, n_pos, + ggml_row_size(cur->type, n_dim), + ggml_row_size(cur->type, n_dim*n_head), + n_dim/2 * ggml_element_size(cur)); + second = ggml_cont(ctx0, second); // copy, because ggml_rope don't play well with non-contiguous tensors + second = ggml_rope_ext( + ctx0, + second, + pos_w, // positions + nullptr, // freq factors + n_dim/2, // n_dims + 0, 0, freq_base, + freq_scale_odd, + 0.0f, 1.0f, 0.0f, 0.0f + ); + } + + cur = ggml_concat(ctx0, first, second, 0); + return cur; +} + +static ggml_cgraph * clip_image_build_graph_pixtral(clip_ctx * ctx, const clip_image_f32 & img) { + const auto & model = ctx->vision_model; + const auto & hparams = model.hparams; + + GGML_ASSERT(ctx->proj_type == PROJECTOR_TYPE_PIXTRAL); + + int image_size_width = img.nx; + int image_size_height = img.ny; + + const int patch_size = hparams.patch_size; + const int n_patches_x = image_size_width / patch_size; + const int n_patches_y = image_size_height / patch_size; + const int num_patches = n_patches_x * n_patches_y; + const int hidden_size = hparams.hidden_size; + const int n_head = hparams.n_head; + const int d_head = hidden_size / n_head; + const int n_layer = hparams.n_layer; + const float eps = hparams.eps; + + struct ggml_init_params params = { + /*.mem_size =*/ ctx->buf_compute_meta.size(), + /*.mem_buffer =*/ ctx->buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + + ggml_context_ptr ctx0_ptr(ggml_init(params)); + auto ctx0 = ctx0_ptr.get(); + + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + // input raw + struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3); + ggml_set_name(inp_raw, "inp_raw"); + ggml_set_input(inp_raw); + + // 2D input positions + struct ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches); + ggml_set_name(pos_h, "pos_h"); + ggml_set_input(pos_h); + struct ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches); + ggml_set_name(pos_w, "pos_w"); + ggml_set_input(pos_w); + + struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, 
patch_size, 0, 0, 1, 1); + inp = ggml_reshape_2d(ctx0, inp, num_patches, hidden_size); + inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp)); + + struct ggml_tensor * embeddings = inp; + + // pre-layer norm + embeddings = ggml_mul(ctx0, ggml_rms_norm(ctx0, embeddings, eps), model.pre_ln_w); + + // loop over layers + for (int il = 0; il < n_layer; il++) { + struct ggml_tensor * cur = embeddings; + + // pre-attention norm + cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.layers[il].ln_1_w); + + // self-attention + { + struct ggml_tensor * Q = ggml_mul_mat(ctx0, model.layers[il].q_w, cur); + + Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_patches); + Q = build_rope_2d(ctx0, Q, pos_h, pos_w, hparams.rope_theta); + Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); + + struct ggml_tensor * K = ggml_mul_mat(ctx0, model.layers[il].k_w, cur); + + K = ggml_reshape_3d(ctx0, K, d_head, n_head, num_patches); + K = build_rope_2d(ctx0, K, pos_h, pos_w, hparams.rope_theta); + K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); + + struct ggml_tensor * V = ggml_mul_mat(ctx0, model.layers[il].v_w, cur); + + V = ggml_reshape_3d(ctx0, V, d_head, n_head, num_patches); + V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); + + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f); + + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); + KQV = ggml_reshape_3d(ctx0, KQV, d_head, num_patches, n_head); + KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + + cur = ggml_cont_2d(ctx0, KQV, hidden_size, num_patches); + + cur = ggml_mul_mat(ctx0, model.layers[il].o_w, cur); + } + + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, embeddings); + + embeddings = cur; // embeddings = residual, cur = hidden_states + + // pre-ffn norm + cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.layers[il].ln_2_w); + + // feed-forward + { + ggml_tensor * gate_proj = ggml_mul_mat(ctx0, model.layers[il].ff_gate_w, cur); + ggml_tensor * up_proj = ggml_mul_mat(ctx0, model.layers[il].ff_up_w, cur); + gate_proj = ggml_silu(ctx0, gate_proj); // pixtral uses silu + cur = ggml_mul(ctx0, up_proj, gate_proj); + cur = ggml_mul_mat(ctx0, model.layers[il].ff_down_w, cur); + } + + // residual 2 + cur = ggml_add(ctx0, embeddings, cur); + + embeddings = cur; + } + + // LlavaMultiModalProjector (with GELU activation) + { + embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_1_b); + + embeddings = ggml_gelu(ctx0, embeddings); + embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); + } + + // arrangement of the [IMG_BREAK] token + { + // not efficient, but works + // the trick is to view the embeddings as a 3D tensor with shape [hidden_size, n_patches_per_row, n_rows] + // and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension + // after the concatenation, we have a tensor with shape [hidden_size, n_patches_per_row + 1, n_rows] + + const int n_embd_text = embeddings->ne[0]; + const int n_tokens_output = num_patches + n_patches_y - 1; // one [IMG_BREAK] per row, except the last row + + ggml_tensor * cur = ggml_reshape_3d(ctx0, embeddings, n_embd_text, n_patches_x, n_patches_y); + ggml_tensor * tok = ggml_new_tensor_3d(ctx0, embeddings->type, n_embd_text, 1, n_patches_y); + tok = ggml_scale(ctx0, tok, 0.0); // clear the tensor + tok = ggml_add(ctx0, tok, 
model.token_embd_img_break); + cur = ggml_concat(ctx0, cur, tok, 1); + embeddings = ggml_view_2d(ctx0, cur, + n_embd_text, n_tokens_output, + ggml_row_size(cur->type, n_embd_text), 0); + } + + // build the graph + ggml_build_forward_expand(gf, embeddings); + + return gf; +} + +static ggml_cgraph * clip_image_build_graph_qwen25vl(clip_ctx * ctx, const clip_image_f32_batch & imgs) { + const auto & model = ctx->vision_model; + const auto & hparams = model.hparams; + + const int image_size_width = imgs.entries[0]->nx; + const int image_size_height = imgs.entries[0]->ny; + + const bool use_window_attn = hparams.n_wa_pattern > 0; + + const int n_wa_pattern = hparams.n_wa_pattern; + const int patch_size = hparams.patch_size; + const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); + const int patches_w = image_size_width / patch_size; + const int patches_h = image_size_height / patch_size; + const int num_positions = num_patches + (model.class_embedding ? 1 : 0); + const int num_position_ids = num_positions * 4; // m-rope requires 4 dim per position + const int hidden_size = hparams.hidden_size; + const int n_head = hparams.n_head; + const int d_head = hidden_size / n_head; + const int n_layer = hparams.n_layer; + const float eps = hparams.eps; + + int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; + + const int batch_size = imgs.entries.size(); + GGML_ASSERT(batch_size == 1); + + struct ggml_init_params params = { + /*.mem_size =*/ ctx->buf_compute_meta.size(), + /*.mem_buffer =*/ ctx->buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + + ggml_context_ptr ctx0_ptr(ggml_init(params)); + auto ctx0 = ctx0_ptr.get(); + + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3, batch_size); + ggml_set_name(inp_raw, "inp_raw"); + ggml_set_input(inp_raw); + + struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + + GGML_ASSERT(image_size_width % (patch_size * 2) == 0); + GGML_ASSERT(image_size_height % (patch_size * 2) == 0); + + auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + inp = ggml_add(ctx0, inp, inp_1); + + inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b] + inp = ggml_reshape_4d( + ctx0, inp, + hidden_size * 2, patches_w / 2, patches_h, batch_size); + inp = ggml_reshape_4d( + ctx0, inp, + hidden_size * 2, patches_w / 2, 2, batch_size * (patches_h / 2)); + inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3)); + inp = ggml_reshape_3d( + ctx0, inp, + hidden_size, patches_w * patches_h, batch_size); + + if (model.patch_bias) { + // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp)); + inp = ggml_add(ctx0, inp, model.patch_bias); + } + struct ggml_tensor * embeddings = inp; + struct ggml_tensor * window_mask = nullptr; + struct ggml_tensor * window_idx = nullptr; + struct ggml_tensor * inv_window_idx = nullptr; + + struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + // pre-layernorm + if (model.pre_ln_w) { + embeddings = ggml_rms_norm(ctx0, embeddings, eps); + ggml_set_name(embeddings, "pre_ln"); + + embeddings = ggml_mul(ctx0, embeddings, model.pre_ln_w); + } + + if (use_window_attn) { + // handle window attention inputs + inv_window_idx = 
ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions / 4); + ggml_set_name(inv_window_idx, "inv_window_idx"); + ggml_set_input(inv_window_idx); + // mask for window attention + window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, num_positions, num_positions); + ggml_set_name(window_mask, "window_mask"); + ggml_set_input(window_mask); + + // embeddings shape: [hidden_size, patches_w * patches_h, batch_size] + GGML_ASSERT(batch_size == 1); + embeddings = ggml_reshape_2d(ctx0, embeddings, hidden_size * 4, patches_w * patches_h * batch_size / 4); + embeddings = ggml_get_rows(ctx0, embeddings, inv_window_idx); + embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, patches_w * patches_h, batch_size); + } + + // loop over layers + for (int il = 0; il < n_layer; il++) { + struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states + + // rmsnorm1 + cur = ggml_rms_norm(ctx0, cur, eps); + cur = ggml_mul(ctx0, cur, model.layers[il].ln_1_w); + + // self-attention + { + + struct ggml_tensor * Q = + ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b); + + Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size); + Q = ggml_rope_multi( + ctx0, Q, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); + Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); + Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size); + + struct ggml_tensor * K = + ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b); + + K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); + K = ggml_rope_multi( + ctx0, K, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); + K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); + K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); + + struct ggml_tensor * V = + ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].v_w, cur), model.layers[il].v_b); + + V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size); + V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); + V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size); + + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + const bool full_attn = use_window_attn ? 
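                // when window attention is enabled, only every n_wa_pattern-th layer runs full (global)
                // self-attention; the remaining layers pass window_mask to ggml_soft_max_ext so each token
                // attends only to tokens inside its own window (the mask is built from a 112-pixel attention
                // window when the inputs are set in clip_image_batch_encode).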
(il + 1) % n_wa_pattern == 0 : true; + if (full_attn) { + KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f); + } else { + KQ = ggml_soft_max_ext(ctx0, KQ, window_mask, 1.0f / sqrtf((float)d_head), 0.0f); + } + + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); + KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size); + KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + + cur = ggml_cont_3d(ctx0, KQV, hidden_size, num_positions, batch_size); + } + + // attention output + cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].o_w, cur), model.layers[il].o_b); + + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, embeddings); + + embeddings = cur; // embeddings = residual, cur = hidden_states + + // rms norm2 + cur = ggml_rms_norm(ctx0, cur, eps); + cur = ggml_mul(ctx0, cur, model.layers[il].ln_2_w); + + // mlp + // ffn_up + auto cur_up = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur); + cur_up = ggml_add(ctx0, cur_up, model.layers[il].ff_o_b); + + auto cur_gate = ggml_mul_mat(ctx0, model.layers[il].ff_g_w, cur); + cur_gate = ggml_add(ctx0, cur_gate, model.layers[il].ff_g_b); + // TODO : only 2 of these 3 are actually used, should we remove one of them? + if (ctx->use_gelu) { + cur_gate = ggml_gelu_inplace(ctx0, cur_gate); + } else if (ctx->use_silu) { + cur_gate = ggml_silu_inplace(ctx0, cur_gate); + } else { + cur_gate = ggml_gelu_quick_inplace(ctx0, cur_gate); + } + cur = ggml_mul(ctx0, cur_gate, cur_up); + + // ffn_down + cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur); + cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b); + + // residual 2 + cur = ggml_add(ctx0, embeddings, cur); + + embeddings = cur; + } + + // post-layernorm + if (model.post_ln_w) { + embeddings = ggml_rms_norm(ctx0, embeddings, eps); + ggml_set_name(embeddings, "post_ln"); + + embeddings = ggml_mul(ctx0, embeddings, model.post_ln_w); + } + + embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size); + + embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); + + // GELU activation + embeddings = ggml_gelu(ctx0, embeddings); + + // Second linear layer + embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_1_b); + + if (use_window_attn) { + window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions / 4); + ggml_set_name(window_idx, "window_idx"); + ggml_set_input(window_idx); + + // embeddings shape: [hidden_size, patches_w * patches_h, batch_size] + GGML_ASSERT(batch_size == 1); + embeddings = ggml_reshape_2d(ctx0, embeddings, hparams.projection_dim, patches_w * patches_h / 4); + embeddings = ggml_get_rows(ctx0, embeddings, window_idx); + embeddings = ggml_reshape_3d(ctx0, embeddings, hparams.projection_dim, patches_w * patches_h / 4, batch_size); } // build the graph @@ -528,18 +1012,14 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im } static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_image_f32_batch & imgs, struct clip_image_size load_image_size, bool is_inf = false) { - if (!ctx->has_vision_encoder) { - LOG_ERR("This gguf file seems to have no vision encoder\n"); - return nullptr; - } - const auto & model = ctx->vision_model; const auto & hparams = model.hparams; const int image_size = hparams.image_size; int image_size_width = image_size; int image_size_height = image_size; - if (ctx->has_minicpmv_projector) { + + if 
(ctx->proj_type == PROJECTOR_TYPE_MINICPMV) { LOG_DBG("%s: %d %d\n", __func__, load_image_size.width, load_image_size.height); image_size_width = load_image_size.width; image_size_height = load_image_size.height; @@ -548,7 +1028,8 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im image_size_height = imgs.entries[0]->ny; } } - else if (ctx->has_qwen2vl_merger) { + + else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) { // use the image's native resolution when image is avaible if (is_inf) { // if (imgs->data->nx && imgs->data->ny) { @@ -556,12 +1037,13 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im image_size_height = imgs.entries[0]->ny; } } + const int patch_size = hparams.patch_size; const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); const int patches_w = image_size_width / patch_size; const int patches_h = image_size_height / patch_size; const int num_positions = num_patches + (model.class_embedding ? 1 : 0); - const int num_position_ids = ctx->has_qwen2vl_merger ? num_positions * 4 : num_positions; + const int num_position_ids = ctx->proj_type == PROJECTOR_TYPE_QWEN2VL ? num_positions * 4 : num_positions; const int hidden_size = hparams.hidden_size; const int n_head = hparams.n_head; const int d_head = hidden_size / n_head; @@ -570,7 +1052,9 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im const int batch_size = imgs.entries.size(); - if (ctx->has_llava_projector || ctx->has_minicpmv_projector || ctx->has_glm_projector) { + if (ctx->has_llava_projector + || ctx->proj_type == PROJECTOR_TYPE_MINICPMV + || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) { GGML_ASSERT(batch_size == 1); } @@ -591,8 +1075,8 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - if (ctx->has_qwen2vl_merger) { - GGML_ASSERT(image_size_width % (patch_size * 2) == 0); + if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) { + GGML_ASSERT(image_size_width % (patch_size * 2) == 0); GGML_ASSERT(image_size_height % (patch_size * 2) == 0); auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); @@ -621,40 +1105,30 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im struct ggml_tensor * embeddings = inp; struct ggml_tensor * pos_embed = nullptr; - if (ctx->has_llava_projector) { - // concat class_embeddings and patch_embeddings - if (model.class_embedding) { - embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); - ggml_set_name(embeddings, "embeddings"); - ggml_set_input(embeddings); - embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, - embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); - embeddings = ggml_acc(ctx0, embeddings, inp, - embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]); - } + // concat class_embeddings and patch_embeddings + if (model.class_embedding) { + embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); + embeddings = ggml_scale(ctx0, embeddings, 0.0f); // set to all zeros + embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, + embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); + embeddings = ggml_acc(ctx0, embeddings, inp, + embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 
model.class_embedding->nb[1]); } struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids); ggml_set_name(positions, "positions"); ggml_set_input(positions); - if (!ctx->has_qwen2vl_merger) { // qwen2vl use rope position embedding + if (ctx->proj_type != PROJECTOR_TYPE_QWEN2VL) { // qwen2vl does NOT use learned position embeddings embeddings = ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions)); } - if (ctx->has_minicpmv_projector) { + if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) { int pos_w = image_size_width/patch_size; int pos_h = image_size_height/patch_size; - if (ctx->minicpmv_version == 2) { - pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1); - } - else if (ctx->minicpmv_version == 3) { - pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1); - } - else if (ctx->minicpmv_version == 4) { - pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1); - } + int n_output_dim = clip_n_mmproj_embd(ctx); + pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_output_dim, pos_w * pos_h, 1); ggml_set_name(pos_embed, "pos_embed"); ggml_set_input(pos_embed); } @@ -697,7 +1171,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b); Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size); - if (ctx->has_qwen2vl_merger) { + if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) { Q = ggml_rope_multi( ctx0, Q, positions, nullptr, d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); @@ -709,7 +1183,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b); K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); - if (ctx->has_qwen2vl_merger) { + if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) { K = ggml_rope_multi( ctx0, K, positions, nullptr, d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); @@ -974,106 +1448,92 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im } } // minicpmv projector - else if (ctx->has_minicpmv_projector) - { - if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { - struct ggml_tensor * q = model.mm_model_query; - { // layernorm - q = ggml_norm(ctx0, q, eps); - q = ggml_add(ctx0, ggml_mul(ctx0, q, model.mm_model_ln_q_w), model.mm_model_ln_q_b); - } - struct ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings); - { // layernorm - v = ggml_norm(ctx0, v, eps); - v = ggml_add(ctx0, ggml_mul(ctx0, v, model.mm_model_ln_kv_w), model.mm_model_ln_kv_b); - } - struct ggml_tensor * k; - { // position - // q = ggml_add(ctx0, q, model.mm_model_pos_embed); - k = ggml_add(ctx0, v, pos_embed); - } - - { // attention - int hidden_size = 4096; - const int d_head = 128; - int n_head = hidden_size/d_head; - int num_query = 96; - if (ctx->minicpmv_version == 2) { - hidden_size = 4096; - n_head = hidden_size/d_head; - num_query = 96; - } - else if (ctx->minicpmv_version == 3) { - hidden_size = 3584; - n_head = hidden_size/d_head; - num_query = 64; - } - else if (ctx->minicpmv_version == 4) { - hidden_size = 3584; - n_head = hidden_size/d_head; - num_query = 64; - } - - struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b); - struct ggml_tensor * K = 
ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b); - struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b); - // permute - Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size); - Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); - Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size); - K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); - K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); - K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); - V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size); - V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); - V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size); - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f); - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); - KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size); - KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size); - - embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b); - } - { // layernorm - embeddings = ggml_norm(ctx0, embeddings, eps); - embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_post_w), model.mm_model_ln_post_b); - } - embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings); + else if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) { + struct ggml_tensor * q = model.mm_model_query; + { // layernorm + q = ggml_norm(ctx0, q, eps); + q = ggml_add(ctx0, ggml_mul(ctx0, q, model.mm_model_ln_q_w), model.mm_model_ln_q_b); } - else { - GGML_ASSERT(false); + struct ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings); + { // layernorm + v = ggml_norm(ctx0, v, eps); + v = ggml_add(ctx0, ggml_mul(ctx0, v, model.mm_model_ln_kv_w), model.mm_model_ln_kv_b); } + struct ggml_tensor * k; + { // position + // q = ggml_add(ctx0, q, model.mm_model_pos_embed); + k = ggml_add(ctx0, v, pos_embed); + } + + { // attention + int hidden_size = clip_n_mmproj_embd(ctx); + const int d_head = 128; + int n_head = hidden_size/d_head; + int num_query = 96; + if (ctx->minicpmv_version == 2) { + num_query = 96; + } + else if (ctx->minicpmv_version == 3) { + num_query = 64; + } + else if (ctx->minicpmv_version == 4) { + num_query = 64; + } + + struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b); + struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b); + struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b); + // permute + Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size); + Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); + Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size); + K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); + K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); + K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); + V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size); + V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); + V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size); + struct ggml_tensor * KQ = 
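            // resampler cross-attention: a fixed set of learned queries (num_query = 96 for minicpmv
            // version 2, 64 for versions 3 and 4) attends over the image embeddings, so the projector
            // always emits the same number of output tokens regardless of image size; hidden_size here
            // is the text-model embedding width reported by clip_n_mmproj_embd().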
ggml_mul_mat(ctx0, K, Q); + KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f); + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); + KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size); + KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size); + + embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b); + } + { // layernorm + embeddings = ggml_norm(ctx0, embeddings, eps); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_post_w), model.mm_model_ln_post_b); + } + embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings); } + // glm projector - else if (ctx->has_glm_projector) { - if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) { - size_t gridsz = (size_t)sqrt(embeddings->ne[1]); - embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings,1,0,2,3)); - embeddings = ggml_reshape_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]); - embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1); - embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size); - embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3)); - embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b); - //GLU - { - embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); - embeddings = ggml_norm(ctx0, embeddings, eps); - embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b); - embeddings = ggml_gelu_inplace(ctx0, embeddings); - struct ggml_tensor * x = embeddings; - embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings); - x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x); - embeddings = ggml_silu_inplace(ctx0, embeddings); - embeddings = ggml_mul(ctx0, embeddings,x); - embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings); - } - } else { - GGML_ABORT("fatal error"); + else if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) { + size_t gridsz = (size_t)sqrt(embeddings->ne[1]); + embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings,1,0,2,3)); + embeddings = ggml_reshape_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]); + embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1); + embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size); + embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3)); + embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b); + // GLU + { + embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); + embeddings = ggml_norm(ctx0, embeddings, eps); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b); + embeddings = ggml_gelu_inplace(ctx0, embeddings); + struct ggml_tensor * x = embeddings; + embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings); + x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x); + embeddings = ggml_silu_inplace(ctx0, embeddings); + embeddings = ggml_mul(ctx0, embeddings,x); + embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings); } } - else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) { + + else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) { embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size); embeddings = ggml_mul_mat(ctx0, 
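        // the reshape above packs each group of four consecutive patch tokens (a 2x2 spatial block after the
        // earlier patch-embedding rearrangement) into a single token with hidden_size * 4 channels before the
        // mm_0 / mm_1 projector, which is why clip_n_output_tokens() halves the patch grid in each direction
        // for the Qwen2-VL family.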
model.mm_0_w, embeddings); @@ -1094,12 +1554,30 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im } static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs, struct clip_image_size load_image_size, bool is_inf = false) { - if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { - return clip_image_build_graph_siglip(ctx, imgs); - } else { - // TODO: we should have one build_* function per model - return clip_image_build_graph_legacy(ctx, imgs, load_image_size, is_inf); + ggml_cgraph * res; + switch (ctx->proj_type) { + case PROJECTOR_TYPE_GEMMA3: + case PROJECTOR_TYPE_IDEFICS3: + { + GGML_ASSERT(imgs.entries.size() == 1); + res = clip_image_build_graph_siglip(ctx, *imgs.entries[0]); + } break; + case PROJECTOR_TYPE_PIXTRAL: + { + GGML_ASSERT(imgs.entries.size() == 1); + res = clip_image_build_graph_pixtral(ctx, *imgs.entries[0]); + } break; + case PROJECTOR_TYPE_QWEN25VL: + { + res = clip_image_build_graph_qwen25vl(ctx, imgs); + } break; + default: + { + // TODO: we should have one build_* function per model + res = clip_image_build_graph_legacy(ctx, imgs, load_image_size, is_inf); + } break; } + return res; } struct clip_model_loader { @@ -1109,7 +1587,7 @@ struct clip_model_loader { clip_ctx & ctx_clip; std::string fname; - size_t model_size; // in bytes + size_t model_size = 0; // in bytes // TODO @ngxson : we should not pass clip_ctx here, it should be clip_vision_model clip_model_loader(const char * fname, clip_ctx & ctx_clip) : ctx_clip(ctx_clip), fname(fname) { @@ -1160,9 +1638,11 @@ struct clip_model_loader { } void load_hparams() { + auto & hparams = ctx_clip.vision_model.hparams; + // projector type + std::string proj_type; { - std::string proj_type; get_string(KEY_PROJ_TYPE, proj_type, false); if (!proj_type.empty()) { ctx_clip.proj_type = clip_projector_type_from_string(proj_type); @@ -1174,34 +1654,27 @@ struct clip_model_loader { // other hparams { - get_bool(KEY_HAS_TEXT_ENC, ctx_clip.has_text_encoder, false); - get_bool(KEY_HAS_VIS_ENC, ctx_clip.has_vision_encoder, false); - GGML_ASSERT(ctx_clip.has_vision_encoder); - GGML_ASSERT(!ctx_clip.has_text_encoder); - - // legacy keys, use KEY_PROJ_TYPE instead - get_bool(KEY_HAS_LLAVA_PROJ, ctx_clip.has_llava_projector, false); - get_bool(KEY_HAS_MINICPMV_PROJ, ctx_clip.has_minicpmv_projector, false); get_i32(KEY_MINICPMV_VERSION, ctx_clip.minicpmv_version, false); - get_bool(KEY_HAS_GLM_PROJ, ctx_clip.has_glm_projector, false); - get_bool(KEY_HAS_QWEN2VL_MERGER, ctx_clip.has_qwen2vl_merger, false); - // !!! 
do NOT extend the list above, use KEY_PROJ_TYPE instead get_bool(KEY_USE_GELU, ctx_clip.use_gelu, false); get_bool(KEY_USE_SILU, ctx_clip.use_silu, false); - auto & hparams = ctx_clip.vision_model.hparams; - get_u32(string_format(KEY_N_EMBD, "vision"), hparams.hidden_size); - get_u32(string_format(KEY_N_HEAD, "vision"), hparams.n_head); - get_u32(string_format(KEY_N_FF, "vision"), hparams.n_intermediate); - get_u32(string_format(KEY_N_BLOCK, "vision"), hparams.n_layer); - get_u32(string_format(KEY_PROJ_DIM, "vision"), hparams.projection_dim); - get_f32(string_format(KEY_LAYER_NORM_EPS, "vision"), hparams.eps); - get_u32(KEY_IMAGE_SIZE, hparams.image_size); - get_u32(KEY_PATCH_SIZE, hparams.patch_size); - get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false); + get_u32(KEY_N_EMBD, hparams.hidden_size); + get_u32(KEY_N_HEAD, hparams.n_head); + get_u32(KEY_N_FF, hparams.n_intermediate); + get_u32(KEY_N_BLOCK, hparams.n_layer); + get_u32(KEY_PROJ_DIM, hparams.projection_dim); + get_f32(KEY_LAYER_NORM_EPS, hparams.eps); + get_u32(KEY_IMAGE_SIZE, hparams.image_size); + get_u32(KEY_PATCH_SIZE, hparams.patch_size); + get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false); get_arr_int(KEY_IMAGE_GRID_PINPOINTS, hparams.image_grid_pinpoints, false); + ctx_clip.has_llava_projector = ctx_clip.proj_type == PROJECTOR_TYPE_MLP + || ctx_clip.proj_type == PROJECTOR_TYPE_MLP_NORM + || ctx_clip.proj_type == PROJECTOR_TYPE_LDP + || ctx_clip.proj_type == PROJECTOR_TYPE_LDPV2; + { std::string mm_patch_merge_type; get_string(KEY_MM_PATCH_MERGE_TYPE, mm_patch_merge_type, false); @@ -1234,15 +1707,62 @@ struct clip_model_loader { for (auto & layer : vision_feature_layer) { hparams.vision_feature_layer.insert(layer); } - // Calculate the deepest feature layer based on hparams and projector type - ctx_clip.max_feature_layer = get_deepest_feature_layer(&ctx_clip); - LOG_INF("%s: text_encoder: %d\n", __func__, ctx_clip.has_text_encoder); - LOG_INF("%s: vision_encoder: %d\n", __func__, ctx_clip.has_vision_encoder); - LOG_INF("%s: llava_projector: %d\n", __func__, ctx_clip.has_llava_projector); - LOG_INF("%s: minicpmv_projector: %d\n", __func__, ctx_clip.has_minicpmv_projector); + // Calculate the deepest feature layer based on hparams and projector type + // NOTE: This is only used by build_graph_legacy() + { + // Get the index of the second to last layer; this is the default for models that have a llava projector + int n_layer = hparams.n_layer - 1; + int deepest_feature_layer = -1; + + if (ctx_clip.proj_type == PROJECTOR_TYPE_MINICPMV + || ctx_clip.proj_type == PROJECTOR_TYPE_GLM_EDGE + || ctx_clip.proj_type == PROJECTOR_TYPE_QWEN2VL + || ctx_clip.proj_type == PROJECTOR_TYPE_QWEN25VL) { + n_layer += 1; + } + + // If we set explicit vision feature layers, only go up to the deepest one + // NOTE: only used by granite-vision models for now + for (const auto & feature_layer : hparams.vision_feature_layer) { + if (feature_layer > deepest_feature_layer) { + deepest_feature_layer = feature_layer; + } + } + ctx_clip.max_feature_layer = deepest_feature_layer < 0 ? 
n_layer : deepest_feature_layer; + } + + // model-specific params + switch (ctx_clip.proj_type) { + case PROJECTOR_TYPE_MINICPMV: + { + if (ctx_clip.minicpmv_version == 0) { + ctx_clip.minicpmv_version = 2; // default to 2 if not set + } + } break; + case PROJECTOR_TYPE_IDEFICS3: + { + get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false); + } break; + case PROJECTOR_TYPE_PIXTRAL: + { + hparams.rope_theta = 10000.0f; + } break; + case PROJECTOR_TYPE_QWEN25VL: + { + get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern); + } break; + default: + break; + } + + LOG_INF("%s: projector: %s\n", __func__, proj_type.c_str()); + LOG_INF("%s: has_llava_proj: %d\n", __func__, ctx_clip.has_llava_projector); LOG_INF("%s: minicpmv_version: %d\n", __func__, ctx_clip.minicpmv_version); - LOG_INF("%s: glm_projector: %d\n", __func__, ctx_clip.has_glm_projector); + LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor); + LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern); + LOG_INF("%s: use_silu: %d\n", __func__, ctx_clip.use_silu); + LOG_INF("%s: use_gelu: %d\n", __func__, ctx_clip.use_gelu); LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0); LOG_INF("%s: metadata size: %.2f MiB\n", __func__, ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0); } @@ -1298,9 +1818,6 @@ struct clip_model_loader { vision_model.patch_bias = get_tensor(TN_PATCH_BIAS, false); vision_model.patch_embeddings_0 = get_tensor(TN_PATCH_EMBD, false); vision_model.patch_embeddings_1 = get_tensor(TN_PATCH_EMBD_1, false); - if (vision_model.patch_embeddings_1 == nullptr) { - ctx_clip.has_qwen2vl_merger = false; - } vision_model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, "v"), false); @@ -1314,16 +1831,28 @@ struct clip_model_loader { layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, "v", il, "weight")); layer.ln_1_w = get_tensor(string_format(TN_LN_1, "v", il, "weight"), false); layer.ln_2_w = get_tensor(string_format(TN_LN_2, "v", il, "weight"), false); - layer.ff_i_w = get_tensor(string_format(TN_FFN_DOWN, "v", il, "weight")); - layer.ff_o_w = get_tensor(string_format(TN_FFN_UP, "v", il, "weight")); layer.k_b = get_tensor(string_format(TN_ATTN_K, "v", il, "bias"), false); layer.q_b = get_tensor(string_format(TN_ATTN_Q, "v", il, "bias"), false); layer.v_b = get_tensor(string_format(TN_ATTN_V, "v", il, "bias"), false); layer.o_b = get_tensor(string_format(TN_ATTN_OUTPUT, "v", il, "bias"), false); layer.ln_1_b = get_tensor(string_format(TN_LN_1, "v", il, "bias"), false); layer.ln_2_b = get_tensor(string_format(TN_LN_2, "v", il, "bias"), false); - layer.ff_i_b = get_tensor(string_format(TN_FFN_DOWN, "v", il, "bias"), false); - layer.ff_o_b = get_tensor(string_format(TN_FFN_UP, "v", il, "bias"), false); + + // new naming + layer.ff_up_w = get_tensor(string_format(TN_FFN_UP, "v", il, "weight")); + layer.ff_up_b = get_tensor(string_format(TN_FFN_UP, "v", il, "bias"), false); + layer.ff_gate_w = get_tensor(string_format(TN_FFN_GATE, "v", il, "weight"), false); + layer.ff_gate_b = get_tensor(string_format(TN_FFN_GATE, "v", il, "bias"), false); + layer.ff_down_w = get_tensor(string_format(TN_FFN_DOWN, "v", il, "weight")); + layer.ff_down_b = get_tensor(string_format(TN_FFN_DOWN, "v", il, "bias"), false); + + // legacy naming (the in and out is reversed! 
don't ask me why) + layer.ff_i_w = layer.ff_down_w; + layer.ff_o_w = layer.ff_up_w; + layer.ff_g_w = layer.ff_gate_w; + layer.ff_i_b = layer.ff_down_b; + layer.ff_o_b = layer.ff_up_b; + layer.ff_g_b = layer.ff_gate_b; } switch (ctx_clip.proj_type) { @@ -1388,7 +1917,7 @@ struct clip_model_loader { vision_model.mm_model_peg_0_w = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "weight")); vision_model.mm_model_peg_0_b = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "bias")); } break; - case PROJECTOR_TYPE_RESAMPLER: + case PROJECTOR_TYPE_MINICPMV: { // vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD); vision_model.mm_model_pos_embed_k = get_tensor(TN_MINICPMV_POS_EMBD_K); @@ -1420,10 +1949,9 @@ struct clip_model_loader { vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H,"weight")); vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE,"weight")); vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H,"weight")); - vision_model.boi_w = get_tensor(TN_GLM_BOI_W); - vision_model.eoi_w = get_tensor(TN_GLM_EOI_W); } break; - case PROJECTOR_TYPE_MERGER: + case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: { vision_model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); vision_model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias")); @@ -1435,6 +1963,19 @@ struct clip_model_loader { vision_model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ); vision_model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N); } break; + case PROJECTOR_TYPE_IDEFICS3: + { + vision_model.projection = get_tensor(TN_MM_PROJECTOR); + } break; + case PROJECTOR_TYPE_PIXTRAL: + { + vision_model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight")); + vision_model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias")); + vision_model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); + vision_model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias")); + // [IMG_BREAK] token embedding + vision_model.token_embd_img_break = get_tensor(TN_TOK_IMG_BREAK); + } break; default: GGML_ASSERT(false && "unknown projector type"); } @@ -1503,18 +2044,17 @@ struct clip_model_loader { } void alloc_compute_meta() { - ctx_clip.buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead()); + ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead()); // create a fake batch clip_image_f32_batch batch; clip_image_f32_ptr img(clip_image_f32_init()); clip_image_size image_size; - image_size.width = clip_get_image_size(&ctx_clip); - image_size.height = clip_get_image_size(&ctx_clip); - int n_patches = clip_get_image_size(&ctx_clip) / image_size.width; - img->nx = n_patches; - img->ny = n_patches; - img->buf.resize(n_patches * image_size.width * image_size.height * 3); + image_size.width = ctx_clip.vision_model.hparams.image_size; + image_size.height = ctx_clip.vision_model.hparams.image_size; + img->nx = image_size.width; + img->ny = image_size.height; + img->buf.resize(image_size.width * image_size.height * 3); batch.entries.push_back(std::move(img)); ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch, image_size, false); @@ -1902,6 +2442,26 @@ struct image_manipulation { } } + // calculate the size of the **resized** image, while preserving the aspect ratio + // the calculated size will be aligned to the nearest multiple of align_size + // if H or W size is larger than max_dimension, 
it will be resized to max_dimension + static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int max_dimension) { + if (inp_size.width <= 0 || inp_size.height <= 0 || align_size <= 0 || max_dimension <= 0) { + return {0, 0}; + } + + float scale = std::min(1.0f, std::min(static_cast(max_dimension) / inp_size.width, + static_cast(max_dimension) / inp_size.height)); + + float target_width_f = static_cast(inp_size.width) * scale; + float target_height_f = static_cast(inp_size.height) * scale; + + int aligned_width = GGML_PAD((int)target_width_f, align_size); + int aligned_height = GGML_PAD((int)target_height_f, align_size); + + return {aligned_width, aligned_height}; + } + private: static inline int clip(int x, int lower, int upper) { return std::max(lower, std::min(x, upper)); @@ -2194,11 +2754,6 @@ int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) { // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector // res_imgs memory is being allocated here, previous allocations will be freed if found bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) { - if (!ctx->has_vision_encoder) { - LOG_ERR("%s: This gguf file seems to have no vision encoder\n", __func__); - return false; - } - clip_image_size original_size{img->nx, img->ny}; bool pad_to_square = true; auto & params = ctx->vision_model.hparams; @@ -2219,7 +2774,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str } return true; } - else if (ctx->has_qwen2vl_merger) { + else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) { clip_image_u8 resized; auto patch_size = clip_get_patch_size(ctx) * 2; int nx = ceil((float)img->nx / patch_size) * patch_size; @@ -2233,17 +2788,27 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str res_imgs->entries.push_back(std::move(img_f32)); return true; } - - if (ctx->has_glm_projector || ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { + else if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE + || ctx->proj_type == PROJECTOR_TYPE_GEMMA3 + || ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) { clip_image_u8 resized_image; int sz = params.image_size; - image_manipulation::bicubic_resize(*img, resized_image, sz, sz); + image_manipulation::resize_and_pad_image(*img, resized_image, {sz, sz}); clip_image_f32_ptr img_f32(clip_image_f32_init()); //clip_image_save_to_bmp(resized_image, "resized.bmp"); normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std); res_imgs->entries.push_back(std::move(img_f32)); return true; } + else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) { + clip_image_u8 resized_image; + auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, params.patch_size, params.image_size); + image_manipulation::bilinear_resize(*img, resized_image, new_size.width, new_size.height); + clip_image_f32_ptr img_f32(clip_image_f32_init()); + normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std); + res_imgs->entries.push_back(std::move(img_f32)); + return true; + } // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104) // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 @@ -2299,16 +2864,18 @@ 
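// note: clip_embd_nbytes_by_img() takes the image width first and the height second (img_w, img_h),
// matching the updated declaration in clip.h; clip_n_patches() and clip_n_patches_by_img() are kept
// only as deprecated wrappers around the new clip_n_output_tokens() entry point.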
void clip_free(clip_ctx * ctx) { delete ctx; } +// deprecated size_t clip_embd_nbytes(const struct clip_ctx * ctx) { - int extra_tokens = ctx->has_glm_projector ? 2 : 0; - return (clip_n_patches(ctx) + extra_tokens) * clip_n_mmproj_embd(ctx) * sizeof(float); + const int32_t nx = ctx->vision_model.hparams.image_size; + const int32_t ny = ctx->vision_model.hparams.image_size; + return clip_embd_nbytes_by_img(ctx, nx, ny); } -size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) { +size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h) { clip_image_f32 img; img.nx = img_w; img.ny = img_h; - return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float); + return clip_n_output_tokens(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float); } int32_t clip_get_image_size(const struct clip_ctx * ctx) { @@ -2338,21 +2905,44 @@ size_t get_clip_image_grid_size(const struct clip_ctx * ctx) { return ctx->vision_model.hparams.image_grid_pinpoints.size(); } +// deprecated int clip_n_patches(const struct clip_ctx * ctx) { clip_image_f32 img; img.nx = ctx->vision_model.hparams.image_size; img.ny = ctx->vision_model.hparams.image_size; - return clip_n_patches_by_img(ctx, &img); + return clip_n_output_tokens(ctx, &img); } +// deprecated int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) { + return clip_n_output_tokens(ctx, img); +} + +int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) { + const auto & params = ctx->vision_model.hparams; + const int n_total = clip_n_output_tokens(ctx, img); + if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) { + return img->nx / (params.patch_size * 2) + (int)(img->nx % params.patch_size > 0); + } + return n_total; +} + +int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) { + const auto & params = ctx->vision_model.hparams; + if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) { + return img->ny / (params.patch_size * 2) + (int)(img->ny % params.patch_size > 0); + } + return 1; +} + +int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) { const auto & params = ctx->vision_model.hparams; int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size); if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2 || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) { n_patches /= 4; - } else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { + } else if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) { if (ctx->minicpmv_version == 2) { n_patches = 96; } @@ -2362,13 +2952,22 @@ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * i else if (ctx->minicpmv_version == 4) { n_patches = 64; } - } else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) { + else { + GGML_ABORT("Unknown minicpmv version"); + } + } else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) { int patch_size = params.patch_size * 2; int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0); int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0); n_patches = x_patch * y_patch; } else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { n_patches = 256; + } else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) { + n_patches /= ctx->vision_model.hparams.proj_scale_factor; + } else if (ctx->proj_type == 
PROJECTOR_TYPE_PIXTRAL) { + int n_patches_x = img->nx / params.patch_size; + int n_patches_y = img->ny / params.patch_size; + n_patches = n_patches_y*n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row } return n_patches; @@ -2461,11 +3060,6 @@ static std::vector> get_2d_sincos_pos_embed(int embed_dim, co } bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) { - if (!ctx->has_vision_encoder) { - LOG_ERR("%s: This gguf file seems to have no vision encoder\n", __func__); - return false; - } - clip_image_f32_batch imgs; clip_image_f32_ptr img_copy(clip_image_f32_init()); *img_copy = *img; @@ -2476,25 +3070,13 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3 bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) { const clip_image_f32_batch & imgs = *imgs_c_ptr; - - if (!ctx->has_vision_encoder) { - LOG_ERR("%s: This gguf file seems to have no vision encoder\n", __func__); - return false; - } - int batch_size = imgs.entries.size(); - if (ctx->has_llava_projector) { - GGML_ASSERT(batch_size == 1); // TODO: support multiple images - } - if (ctx->has_minicpmv_projector) { + + if (ctx->has_llava_projector + || ctx->proj_type == PROJECTOR_TYPE_MINICPMV + || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) { GGML_ASSERT(batch_size == 1); } - if (ctx->has_glm_projector) { - GGML_ASSERT(batch_size == 1); - ggml_tensor * boi = ctx->vision_model.boi_w; - ggml_backend_tensor_get(boi,vec,0,ggml_nbytes(boi)); - vec = (float*)(vec+ggml_nelements(boi)); //offset for boi - } // build the inference graph ggml_backend_sched_reset(ctx->sched.get()); @@ -2502,164 +3084,283 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima ggml_backend_sched_alloc_graph(ctx->sched.get(), gf); // set inputs - const auto & model = ctx->vision_model; + const auto & model = ctx->vision_model; const auto & hparams = model.hparams; - const int image_size = hparams.image_size; - int image_size_width = image_size; - int image_size_height = image_size; - if (ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger) { - image_size_width = imgs.entries[0]->nx; - image_size_height = imgs.entries[0]->ny; - } + const int image_size_width = imgs.entries[0]->nx; + const int image_size_height = imgs.entries[0]->ny; + const int patch_size = hparams.patch_size; const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); const int num_positions = num_patches + (model.class_embedding ? 
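    // all graph inputs are uploaded through small typed helpers defined just below (set_input_f32 /
    // set_input_i32): each helper looks the tensor up by name, asserts that it is flagged as a graph
    // input and that its type and element count match the provided buffer, and then copies the data
    // with ggml_backend_tensor_set.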
1 : 0); - const int pos_w = ctx->load_image_size.width / patch_size; + const int pos_w = ctx->load_image_size.width / patch_size; const int pos_h = ctx->load_image_size.height / patch_size; + const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl + + auto get_inp_tensor = [&gf](const char * name) { + struct ggml_tensor * inp = ggml_graph_get_tensor(gf, name); + if (inp == nullptr) { + GGML_ABORT("Failed to get tensor %s", name); + } + if (!(inp->flags & GGML_TENSOR_FLAG_INPUT)) { + GGML_ABORT("Tensor %s is not an input tensor", name); + } + return inp; + }; + + auto set_input_f32 = [&get_inp_tensor](const char * name, std::vector & values) { + ggml_tensor * cur = get_inp_tensor(name); + GGML_ASSERT(cur->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size()); + ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur)); + }; + + auto set_input_i32 = [&get_inp_tensor](const char * name, std::vector & values) { + ggml_tensor * cur = get_inp_tensor(name); + GGML_ASSERT(cur->type == GGML_TYPE_I32); + GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size()); + ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur)); + }; + + // set input pixel values { - struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw"); - float * data = (float *)malloc(ggml_nbytes(inp_raw)); + size_t nelem = 0; + for (const auto & img : imgs.entries) { + nelem += img->nx * img->ny * 3; + } + std::vector inp_raw(nelem); + + // layout of data (note: the channel dim is unrolled to better visualize the layout): + // + // ┌──W──┐ + // │ H │ channel = R + // ├─────┤ │ + // │ H │ channel = G + // ├─────┤ │ + // │ H │ channel = B + // └─────┘ │ + // ──────┘ x B for (size_t i = 0; i < imgs.entries.size(); i++) { const int nx = imgs.entries[i]->nx; const int ny = imgs.entries[i]->ny; - if (!(ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger)) { - GGML_ASSERT(nx == image_size && ny == image_size); - } - const int n = nx * ny; for (int b = 0; b < batch_size; b++) { - for (int k = 0; k < 3; k++) { - for (int y = 0; y < ny; y++) { - for (int x = 0; x < nx; x++) { - data[(b * 3 * n) + k * n + y * nx + x] = imgs.entries[b]->buf[3 * (y * nx + x) + k]; - } + float * batch_entry = inp_raw.data() + b * (3*n); + for (int y = 0; y < ny; y++) { + for (int x = 0; x < nx; x++) { + size_t base_src = 3*(y * nx + x); // idx of the first channel + size_t base_dst = y * nx + x; // idx of the first channel + batch_entry[ base_dst] = imgs.entries[b]->buf[base_src ]; + batch_entry[1*n + base_dst] = imgs.entries[b]->buf[base_src + 1]; + batch_entry[2*n + base_dst] = imgs.entries[b]->buf[base_src + 2]; } } } } - ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw)); - free(data); + set_input_f32("inp_raw", inp_raw); } - if (ctx->has_minicpmv_projector) { - { - // inspired from siglip: - // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit - // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316 - struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); - int* positions_data = (int*)malloc(ggml_nbytes(positions)); - int bucket_coords_h[1024]; - int bucket_coords_w[1024]; - for (int i = 0; i < pos_h; i++){ - bucket_coords_h[i] = std::floor(70.0*i/pos_h); - } - for (int i = 0; i < pos_w; i++){ - bucket_coords_w[i] = std::floor(70.0*i/pos_w); - } - for (int i = 0, id = 0; i < pos_h; i++){ - for (int j = 0; j < pos_w; j++){ - 
positions_data[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j]; - } - } - ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); - free(positions_data); - } - { - // inspired from resampler of Qwen-VL: - // -> https://huggingface.co/Qwen/Qwen-VL/tree/main - // -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23 - struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed"); - int embed_dim = 4096; - if (ctx->minicpmv_version == 2) { - embed_dim = 4096; - } - else if (ctx->minicpmv_version == 3) { - embed_dim = 3584; - } - else if (ctx->minicpmv_version == 4) { - embed_dim = 3584; - } - auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h)); - - float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed)); - for(int i=0;i < pos_w * pos_h; ++i){ - for(int j=0; j < embed_dim; ++j){ - pos_embed_data[i * embed_dim + j] = pos_embed_t[i][j]; - } - } - - ggml_backend_tensor_set(pos_embed, pos_embed_data, 0, ggml_nbytes(pos_embed)); - free(pos_embed_data); - } - } - else { - if (model.class_embedding) { - struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings"); - - void* zero_mem = malloc(ggml_nbytes(embeddings)); - memset(zero_mem, 0, ggml_nbytes(embeddings)); - ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings)); - free(zero_mem); - } - - if (ctx->has_qwen2vl_merger) { - struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); - - const int pw = image_size_width / patch_size; - const int ph = image_size_height / patch_size; - int* positions_data = (int*)malloc(ggml_nbytes(positions)); - - int ptr = 0; - for (int y = 0; y < ph; y+=2) + // set input per projector + switch (ctx->proj_type) { + case PROJECTOR_TYPE_MINICPMV: { - for (int x = 0; x < pw; x+=2) - { - for (int dy = 0; dy < 2; dy++) { - for (int dx = 0; dx < 2; dx++) { - positions_data[ptr] = y + dy; - positions_data[num_patches + ptr] = x + dx; - positions_data[num_patches * 2 + ptr] = y + dy; - positions_data[num_patches * 3 + ptr] = x + dx; - ptr++; + // inspired from siglip: + // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit + // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316 + std::vector positions(pos_h * pos_w); + int bucket_coords_h[1024]; + int bucket_coords_w[1024]; + for (int i = 0; i < pos_h; i++){ + bucket_coords_h[i] = std::floor(70.0*i/pos_h); + } + for (int i = 0; i < pos_w; i++){ + bucket_coords_w[i] = std::floor(70.0*i/pos_w); + } + for (int i = 0, id = 0; i < pos_h; i++){ + for (int j = 0; j < pos_w; j++){ + positions[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j]; + } + } + set_input_i32("positions", positions); + + // inspired from resampler of Qwen-VL: + // -> https://huggingface.co/Qwen/Qwen-VL/tree/main + // -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23 + int embed_dim = clip_n_mmproj_embd(ctx); + + // TODO @ngxson : this is very inefficient, can we do this using ggml_sin and ggml_cos? 
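                // get_2d_sincos_pos_embed() appears to implement the usual fixed 2-D sine/cosine position
                // embedding (MAE-style): half of embed_dim is a 1-D sin/cos embedding of the row index and
                // the other half the same for the column index, roughly (a sketch of the idea, not the exact
                // channel layout used by the helper):
                //
                //   for each frequency index k in [0, embed_dim/4):
                //     omega_k = 1 / 10000^(4k / embed_dim)
                //     row half    gets sin(omega_k * y) and cos(omega_k * y)
                //     column half gets sin(omega_k * x) and cos(omega_k * x)
                //
                // the resulting per-position vectors are copied into the flat pos_embed buffer below and are
                // added to the resampler keys (k = v + pos_embed) in the graph.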
+ auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h)); + + std::vector pos_embed(embed_dim * pos_w * pos_h); + for(int i = 0; i < pos_w * pos_h; ++i){ + for(int j = 0; j < embed_dim; ++j){ + pos_embed[i * embed_dim + j] = pos_embed_t[i][j]; + } + } + + set_input_f32("pos_embed", pos_embed); + } break; + case PROJECTOR_TYPE_QWEN2VL: + { + const int merge_ratio = 2; + const int pw = image_size_width / patch_size; + const int ph = image_size_height / patch_size; + std::vector positions(num_positions * 4); + int ptr = 0; + for (int y = 0; y < ph; y += merge_ratio) { + for (int x = 0; x < pw; x += merge_ratio) { + for (int dy = 0; dy < 2; dy++) { + for (int dx = 0; dx < 2; dx++) { + positions[ ptr] = y + dy; + positions[ num_patches + ptr] = x + dx; + positions[2 * num_patches + ptr] = y + dy; + positions[3 * num_patches + ptr] = x + dx; + ptr++; + } } } } - } - ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); - free(positions_data); - } - else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { - // do nothing - } - else { - struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); + set_input_i32("positions", positions); + } break; + case PROJECTOR_TYPE_QWEN25VL: + { + // pw * ph = number of tokens output by ViT after apply patch merger + // ipw * ipw = number of vision token been processed inside ViT + const int merge_ratio = 2; + const int pw = image_size_width / patch_size / merge_ratio; + const int ph = image_size_height / patch_size / merge_ratio; + const int ipw = image_size_width / patch_size; + const int iph = image_size_height / patch_size; - int* positions_data = (int*)malloc(ggml_nbytes(positions)); + std::vector idx (ph * pw); + std::vector inv_idx(ph * pw); + + if (use_window_attn) { + const int attn_window_size = 112; + const int grid_window = attn_window_size / patch_size / merge_ratio; + int dst = 0; + // [num_vision_tokens, num_vision_tokens] attention mask tensor + std::vector mask(pow(ipw * iph, 2), std::numeric_limits::lowest()); + int mask_row = 0; + + for (int y = 0; y < ph; y += grid_window) { + for (int x = 0; x < pw; x += grid_window) { + const int win_h = std::min(grid_window, ph - y); + const int win_w = std::min(grid_window, pw - x); + const int dst_0 = dst; + // group all tokens belong to the same window togather (to a continue range) + for (int dy = 0; dy < win_h; dy++) { + for (int dx = 0; dx < win_w; dx++) { + const int src = (y + dy) * pw + (x + dx); + GGML_ASSERT(src < (int)idx.size()); + GGML_ASSERT(dst < (int)inv_idx.size()); + idx [src] = dst; + inv_idx[dst] = src; + dst++; + } + } + + for (int r=0; r < win_h * win_w * merge_ratio * merge_ratio; r++) { + int row_offset = mask_row * (ipw * iph); + std::fill( + mask.begin() + row_offset + (dst_0 * merge_ratio * merge_ratio), + mask.begin() + row_offset + (dst * merge_ratio * merge_ratio), + 0.0); + mask_row++; + } + } + } + + set_input_i32("window_idx", idx); + set_input_i32("inv_window_idx", inv_idx); + set_input_f32("window_mask", mask); + } else { + for (int i = 0; i < ph * pw; i++) { + idx[i] = i; + } + } + + const int mpow = merge_ratio * merge_ratio; + std::vector positions(num_positions * 4); + + int ptr = 0; + for (int y = 0; y < iph; y += merge_ratio) { + for (int x = 0; x < ipw; x += merge_ratio) { + for (int dy = 0; dy < 2; dy++) { + for (int dx = 0; dx < 2; dx++) { + auto remap = idx[ptr / mpow]; + remap = (remap * mpow) + (ptr % mpow); + + positions[ remap] = y + dy; + positions[ num_patches + remap] = x + dx; + positions[2 
* num_patches + remap] = y + dy; + positions[3 * num_patches + remap] = x + dx; + ptr++; + } + } + } + } + + set_input_i32("positions", positions); + } break; + case PROJECTOR_TYPE_PIXTRAL: + { + // set the 2D positions + int n_patches_per_col = image_size_width / patch_size; + std::vector pos_data(num_positions); + // dimension H + for (int i = 0; i < num_positions; i++) { + pos_data[i] = i / n_patches_per_col; + } + set_input_i32("pos_h", pos_data); + // dimension W + for (int i = 0; i < num_positions; i++) { + pos_data[i] = i % n_patches_per_col; + } + set_input_i32("pos_w", pos_data); + } break; + case PROJECTOR_TYPE_GLM_EDGE: + { + // llava and other models + std::vector positions(num_positions); for (int i = 0; i < num_positions; i++) { - positions_data[i] = i; + positions[i] = i; } - ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); - free(positions_data); + set_input_i32("positions", positions); + } break; + case PROJECTOR_TYPE_MLP: + case PROJECTOR_TYPE_MLP_NORM: + case PROJECTOR_TYPE_LDP: + case PROJECTOR_TYPE_LDPV2: + { + // llava and other models + std::vector positions(num_positions); + for (int i = 0; i < num_positions; i++) { + positions[i] = i; + } + set_input_i32("positions", positions); - if (!ctx->has_glm_projector) { - struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches"); // The patches vector is used to get rows to index into the embeds with; // we should skip dim 0 only if we have CLS to avoid going out of bounds // when retrieving the rows. int patch_offset = model.class_embedding ? 1 : 0; - int* patches_data = (int*)malloc(ggml_nbytes(patches)); + std::vector patches(num_patches); for (int i = 0; i < num_patches; i++) { - patches_data[i] = i + patch_offset; + patches[i] = i + patch_offset; } - ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches)); - free(patches_data); - } - } + set_input_i32("patches", patches); + } break; + case PROJECTOR_TYPE_GEMMA3: + case PROJECTOR_TYPE_IDEFICS3: + { + // do nothing + } break; + default: + GGML_ABORT("Unknown projector type"); } ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads); @@ -2676,13 +3377,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima // copy the embeddings to the location passed by the user ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); - if (ctx->has_glm_projector) { - //eoi - ggml_tensor * eoi = ctx->vision_model.eoi_w; - int offset = ggml_nelements(embeddings); - ggml_backend_tensor_get(eoi, vec+offset, 0, ggml_nbytes(eoi)); - } - return true; } @@ -2822,56 +3516,52 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i } int clip_n_mmproj_embd(const struct clip_ctx * ctx) { - if (ctx->proj_type == PROJECTOR_TYPE_LDP) { - return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0]; + switch (ctx->proj_type) { + case PROJECTOR_TYPE_LDP: + return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0]; + case PROJECTOR_TYPE_LDPV2: + return ctx->vision_model.mm_model_peg_0_b->ne[0]; + case PROJECTOR_TYPE_MLP: + case PROJECTOR_TYPE_PIXTRAL: + return ctx->vision_model.mm_2_b->ne[0]; + case PROJECTOR_TYPE_MLP_NORM: + return ctx->vision_model.mm_3_b->ne[0]; + case PROJECTOR_TYPE_MINICPMV: + if (ctx->minicpmv_version == 2) { + return 4096; + } else if (ctx->minicpmv_version == 3) { + return 3584; + } else if (ctx->minicpmv_version == 4) { + return 3584; + } + GGML_ABORT("Unknown minicpmv version"); + case PROJECTOR_TYPE_GLM_EDGE: + return 
ctx->vision_model.mm_model_mlp_3_w->ne[1]; + case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: + return ctx->vision_model.mm_1_b->ne[0]; + case PROJECTOR_TYPE_GEMMA3: + return ctx->vision_model.mm_input_proj_w->ne[0]; + case PROJECTOR_TYPE_IDEFICS3: + return ctx->vision_model.projection->ne[1]; + default: + GGML_ABORT("Unknown projector type"); } - if (ctx->proj_type == PROJECTOR_TYPE_LDPV2) { - return ctx->vision_model.mm_model_peg_0_b->ne[0]; - } - if (ctx->proj_type == PROJECTOR_TYPE_MLP) { - return ctx->vision_model.mm_2_b->ne[0]; - } - if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) { - return ctx->vision_model.mm_3_b->ne[0]; - } - if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { - if (ctx->minicpmv_version == 2) { - return 4096; - } - else if (ctx->minicpmv_version == 3) { - return 3584; - } - else if (ctx->minicpmv_version == 4) { - return 3584; - } - } - if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE){ - return ctx->vision_model.mm_model_mlp_3_w->ne[1]; - } - if (ctx->proj_type == PROJECTOR_TYPE_MERGER) { - return ctx->vision_model.mm_1_b->ne[0]; - } - if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { - return ctx->vision_model.mm_input_proj_w->ne[0]; - } - - std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type]; - throw std::runtime_error(string_format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); } int clip_is_minicpmv(const struct clip_ctx * ctx) { - if (ctx->has_minicpmv_projector) { + if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) { return ctx->minicpmv_version; } return 0; } bool clip_is_glm(const struct clip_ctx * ctx) { - return ctx->has_glm_projector; + return ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE; } bool clip_is_qwen2vl(const struct clip_ctx * ctx) { - return ctx->has_qwen2vl_merger; + return ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL; } bool clip_is_llava(const struct clip_ctx * ctx) { @@ -2882,29 +3572,6 @@ bool clip_is_gemma3(const struct clip_ctx * ctx) { return ctx->proj_type == PROJECTOR_TYPE_GEMMA3; } -// Determine the number of encoder layers to iterate over -int get_deepest_feature_layer(const struct clip_ctx * ctx) { - // Get the index of the second to last layer; this is the - // default for models that have a llava projector - const auto & hparams = ctx->vision_model.hparams; - int n_layer = hparams.n_layer - 1; - int deepest_feature_layer = -1; - - // Handle other projectors; incrementing here indicates that we - // should use the last encoder layer for the vision features. - if (ctx->has_minicpmv_projector || ctx->has_glm_projector || ctx->has_qwen2vl_merger) { - n_layer += 1; - } - - // If we set explicit vision feature layers, only go up to the deepest one - for (const auto & feature_layer : hparams.vision_feature_layer) { - if (feature_layer > deepest_feature_layer) { - deepest_feature_layer = feature_layer; - } - } - return deepest_feature_layer < 0 ? 
n_layer : deepest_feature_layer; -} - bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) { clip_image_f32 clip_img; clip_img.buf.resize(h * w * 3); diff --git a/llama/llama.cpp/examples/llava/clip.h b/llama/llama.cpp/examples/llava/clip.h index 5fc45d3e2..0a53bd8eb 100644 --- a/llama/llama.cpp/examples/llava/clip.h +++ b/llama/llama.cpp/examples/llava/clip.h @@ -47,7 +47,7 @@ CLIP_API struct clip_ctx * clip_init(const char * fname, struct clip_context_par CLIP_API void clip_free(struct clip_ctx * ctx); CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx); -CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w); +CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h); CLIP_API int32_t clip_get_image_size (const struct clip_ctx * ctx); CLIP_API int32_t clip_get_patch_size (const struct clip_ctx * ctx); @@ -59,9 +59,20 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx); CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx); CLIP_API size_t get_clip_image_grid_size(const struct clip_ctx * ctx); -CLIP_API int clip_n_patches (const struct clip_ctx * ctx); -CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img); -CLIP_API int clip_n_mmproj_embd (const struct clip_ctx * ctx); +GGML_DEPRECATED(CLIP_API int clip_n_patches(const struct clip_ctx * ctx), + "use clip_n_output_tokens instead"); +GGML_DEPRECATED(CLIP_API int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img), + "use clip_n_output_tokens instead"); + +CLIP_API int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img); + +// for M-RoPE, this will be the number of token positions in X and Y directions +// for other models, X will be the total number of tokens and Y will be 1 +CLIP_API int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img); +CLIP_API int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img); + +// this should be equal to the embedding dimension of the text model +CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx); CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip); CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size); @@ -114,8 +125,6 @@ CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx); CLIP_API bool clip_is_llava(const struct clip_ctx * ctx); CLIP_API bool clip_is_gemma3(const struct clip_ctx * ctx); -CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx); - CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec); diff --git a/llama/llama.cpp/examples/llava/llava.cpp b/llama/llama.cpp/examples/llava/llava.cpp index 5eb40bcd1..bab027b50 100644 --- a/llama/llama.cpp/examples/llava/llava.cpp +++ b/llama/llama.cpp/examples/llava/llava.cpp @@ -112,7 +112,7 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair< } // Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out) -static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) { +static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector & 
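Caller-side, the clip.h changes above replace the static `clip_n_patches()` count with a per-image `clip_n_output_tokens()`, and `clip_embd_nbytes_by_img()` now takes width before height. A small buffer-sizing sketch under those signatures (the allocation strategy itself is illustrative):

```cpp
#include <vector>
#include "clip.h"

// Size the output buffer for one preprocessed image using the new API.
static std::vector<float> alloc_image_embd(clip_ctx * ctx, clip_image_f32 * img_res) {
    const int n_tokens = clip_n_output_tokens(ctx, img_res);  // was: clip_n_patches(ctx)
    const int n_embd   = clip_n_mmproj_embd(ctx);             // embedding dim of the text model
    return std::vector<float>((size_t) n_tokens * (size_t) n_embd);
}
```

For M-RoPE models, `clip_n_output_tokens_x()`/`clip_n_output_tokens_y()` expose the same count split into grid dimensions; for other models Y is 1, per the header comment.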
image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out, clip_image_f32 * img_input) { struct { struct ggml_context * ctx; } model; @@ -175,7 +175,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector model.ctx = ggml_init(params); - struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4 + struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_output_tokens(ctx_clip, img_input), num_images - 1); // example: 4096 x 576 x 4 // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false); // fill it with the image embeddings, ignoring the base for (size_t i = 1; i < num_images; i++) { @@ -214,8 +214,8 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context // append without newline tokens (default behavior in llava_arch when not using unpad ): - memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches - *n_img_pos_out = static_cast(result->ne[1]+clip_n_patches(ctx_clip)); + memcpy(image_embd_out + clip_n_output_tokens(ctx_clip, img_input) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches + *n_img_pos_out = static_cast(result->ne[1]+clip_n_output_tokens(ctx_clip, img_input)); // Debug: Test single segments // Current findings: sending base image, sending a segment embedding all works similar to python @@ -313,7 +313,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i], clip_embd_nbytes_by_img(ctx_clip, nx, ny)); - n_img_pos_out += clip_n_patches_by_img(ctx_clip, img_res); + n_img_pos_out += clip_n_output_tokens(ctx_clip, img_res); } *n_img_pos = n_img_pos_out; for (size_t i = 0; i < image_embd_v.size(); i++) { @@ -342,8 +342,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli } else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) { // flat / default llava-1.5 type embedding - *n_img_pos = clip_n_patches(ctx_clip); clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0); + *n_img_pos = clip_n_output_tokens(ctx_clip, img_res); bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096 if (!encoded) { LOG_ERR("Unable to encode image\n"); @@ -381,7 +381,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size); int n_img_pos_out; - clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out); + clip_image_f32 * img_input = clip_image_f32_get_img(img_res_v.get(), 0); + clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out, img_input); *n_img_pos = n_img_pos_out; for (size_t i = 0; i < image_embd_v.size(); i++) { diff --git a/llama/llama.cpp/include/llama.h b/llama/llama.cpp/include/llama.h index f91896e48..f1628e88f 100644 --- a/llama/llama.cpp/include/llama.h +++ b/llama/llama.cpp/include/llama.h @@ -111,6 +111,7 @@ extern "C" { 
LLAMA_VOCAB_PRE_TYPE_TRILLION = 31, LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32, LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33, + LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34, }; enum llama_rope_type { @@ -1237,6 +1238,7 @@ extern "C" { "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)"); /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 + /// Setting k <= 0 makes this a noop LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k); /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 diff --git a/llama/llama.cpp/src/llama-arch.cpp b/llama/llama.cpp/src/llama-arch.cpp index dd01df60a..eb7b5325e 100644 --- a/llama/llama.cpp/src/llama-arch.cpp +++ b/llama/llama.cpp/src/llama-arch.cpp @@ -20,6 +20,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_REFACT, "refact" }, { LLM_ARCH_BERT, "bert" }, { LLM_ARCH_NOMIC_BERT, "nomic-bert" }, + { LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" }, { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" }, { LLM_ARCH_BLOOM, "bloom" }, { LLM_ARCH_STABLELM, "stablelm" }, @@ -73,7 +74,6 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" }, { LLM_ARCH_PLM, "plm" }, { LLM_ARCH_BAILINGMOE, "bailingmoe" }, - { LLM_ARCH_MISTRAL3, "mistral3" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -109,6 +109,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" }, { LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" }, { LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" }, + { LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" }, { LLM_KV_POOLING_TYPE, "%s.pooling_type" }, { LLM_KV_LOGIT_SCALE, "%s.logit_scale" }, { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" }, @@ -511,6 +512,24 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_NOMIC_BERT_MOE, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, + { LLM_TENSOR_TOKEN_TYPES, "token_types" }, + { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + }, + }, { LLM_ARCH_JINA_BERT_V2, { @@ -1587,22 +1606,6 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, }, }, - { - LLM_ARCH_MISTRAL3, - { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, - { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, - { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, - { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, - } - }, { LLM_ARCH_UNKNOWN, { diff --git a/llama/llama.cpp/src/llama-arch.h b/llama/llama.cpp/src/llama-arch.h index b6227eebf..bc8a4f0bb 100644 --- a/llama/llama.cpp/src/llama-arch.h 
+++ b/llama/llama.cpp/src/llama-arch.h @@ -24,6 +24,7 @@ enum llm_arch { LLM_ARCH_REFACT, LLM_ARCH_BERT, LLM_ARCH_NOMIC_BERT, + LLM_ARCH_NOMIC_BERT_MOE, LLM_ARCH_JINA_BERT_V2, LLM_ARCH_BLOOM, LLM_ARCH_STABLELM, @@ -75,7 +76,6 @@ enum llm_arch { LLM_ARCH_CHAMELEON, LLM_ARCH_SOLAR, LLM_ARCH_WAVTOKENIZER_DEC, - LLM_ARCH_MISTRAL3, LLM_ARCH_PLM, LLM_ARCH_BAILINGMOE, LLM_ARCH_UNKNOWN, @@ -113,6 +113,7 @@ enum llm_kv { LLM_KV_EXPERT_WEIGHTS_SCALE, LLM_KV_EXPERT_WEIGHTS_NORM, LLM_KV_EXPERT_GATING_FUNC, + LLM_KV_MOE_EVERY_N_LAYERS, LLM_KV_POOLING_TYPE, LLM_KV_LOGIT_SCALE, LLM_KV_DECODER_START_TOKEN_ID, diff --git a/llama/llama.cpp/src/llama-chat.cpp b/llama/llama.cpp/src/llama-chat.cpp index 721faa4e8..735d2619c 100644 --- a/llama/llama.cpp/src/llama-chat.cpp +++ b/llama/llama.cpp/src/llama-chat.cpp @@ -50,8 +50,8 @@ static const std::map LLM_CHAT_TEMPLATES = { { "deepseek3", LLM_CHAT_TEMPLATE_DEEPSEEK_3 }, { "command-r", LLM_CHAT_TEMPLATE_COMMAND_R }, { "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 }, - { "chatglm3", LLM_CHAT_TEMPLATE_CHATGML_3 }, - { "chatglm4", LLM_CHAT_TEMPLATE_CHATGML_4 }, + { "chatglm3", LLM_CHAT_TEMPLATE_CHATGLM_3 }, + { "chatglm4", LLM_CHAT_TEMPLATE_CHATGLM_4 }, { "glmedge", LLM_CHAT_TEMPLATE_GLMEDGE }, { "minicpm", LLM_CHAT_TEMPLATE_MINICPM }, { "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 }, @@ -62,6 +62,7 @@ static const std::map LLM_CHAT_TEMPLATES = { { "yandex", LLM_CHAT_TEMPLATE_YANDEX }, { "bailing", LLM_CHAT_TEMPLATE_BAILING }, { "llama4", LLM_CHAT_TEMPLATE_LLAMA4 }, + { "smolvlm", LLM_CHAT_TEMPLATE_SMOLVLM }, }; llm_chat_template llm_chat_template_from_str(const std::string & name) { @@ -81,7 +82,9 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) { if (tmpl_contains("<|im_start|>")) { return tmpl_contains("<|im_sep|>") ? LLM_CHAT_TEMPLATE_PHI_4 - : LLM_CHAT_TEMPLATE_CHATML; + : tmpl_contains("") + ? LLM_CHAT_TEMPLATE_SMOLVLM // SmolVLM uses <|im_start|> as BOS, but it is NOT chatml + : LLM_CHAT_TEMPLATE_CHATML; } else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) { if (tmpl_contains("[SYSTEM_PROMPT]")) { return LLM_CHAT_TEMPLATE_MISTRAL_V7; @@ -119,8 +122,12 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) { } } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) { return LLM_CHAT_TEMPLATE_PHI_3; + } else if (tmpl_contains("[gMASK]")) { + return LLM_CHAT_TEMPLATE_CHATGLM_4; } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) { return tmpl_contains("") ? 
LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE; + } else if (tmpl_contains("<|{{ item['role'] }}|>") && tmpl_contains("<|begin_of_image|>")) { + return LLM_CHAT_TEMPLATE_GLMEDGE; } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) { return LLM_CHAT_TEMPLATE_ZEPHYR; } else if (tmpl_contains("bos_token + message['role']")) { @@ -149,9 +156,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) { return LLM_CHAT_TEMPLATE_LLAMA_3; } else if (tmpl_contains("[gMASK]sop")) { // chatglm3-6b - return LLM_CHAT_TEMPLATE_CHATGML_3; - } else if (tmpl_contains("[gMASK]")) { - return LLM_CHAT_TEMPLATE_CHATGML_4; + return LLM_CHAT_TEMPLATE_CHATGLM_3; } else if (tmpl_contains(LU8("<用户>"))) { // MiniCPM-3B-OpenHermes-2.5-v2-GGUF return LLM_CHAT_TEMPLATE_MINICPM; @@ -432,7 +437,7 @@ int32_t llm_chat_apply_template( if (add_ass) { ss << "<|start_header_id|>assistant<|end_header_id|>\n\n"; } - } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) { + } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_3) { // chatglm3-6b ss << "[gMASK]" << "sop"; for (auto message : chat) { @@ -442,7 +447,7 @@ int32_t llm_chat_apply_template( if (add_ass) { ss << "<|assistant|>"; } - } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) { + } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4 || tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) { ss << "[gMASK]" << ""; for (auto message : chat) { std::string role(message->role); @@ -451,14 +456,6 @@ int32_t llm_chat_apply_template( if (add_ass) { ss << "<|assistant|>"; } - } else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) { - for (auto message : chat) { - std::string role(message->role); - ss << "<|" << role << "|>" << "\n" << message->content; - } - if (add_ass) { - ss << "<|assistant|>"; - } } else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) { // MiniCPM-3B-OpenHermes-2.5-v2-GGUF for (auto message : chat) { @@ -620,7 +617,23 @@ int32_t llm_chat_apply_template( if (add_ass) { ss << "<|header_start|>assistant<|header_end|>\n\n"; } - } else { + } else if (tmpl == LLM_CHAT_TEMPLATE_SMOLVLM) { + // SmolVLM + ss << "<|im_start|>"; // uses <|im_start|> as BOS, but the actual content is NOT chatml + for (auto message : chat) { + std::string role(message->role); + if (role == "system") { + ss << message->content << "\n\n"; + } else if (role == "user") { + ss << "User: " << message->content << "\n"; + } else { + ss << "Assistant: " << message->content << "\n"; + } + } + if (add_ass) { + ss << "Assistant:"; + } + } else { // template not supported return -1; } diff --git a/llama/llama.cpp/src/llama-chat.h b/llama/llama.cpp/src/llama-chat.h index 34537ca21..3f5843466 100644 --- a/llama/llama.cpp/src/llama-chat.h +++ b/llama/llama.cpp/src/llama-chat.h @@ -29,8 +29,8 @@ enum llm_chat_template { LLM_CHAT_TEMPLATE_DEEPSEEK_3, LLM_CHAT_TEMPLATE_COMMAND_R, LLM_CHAT_TEMPLATE_LLAMA_3, - LLM_CHAT_TEMPLATE_CHATGML_3, - LLM_CHAT_TEMPLATE_CHATGML_4, + LLM_CHAT_TEMPLATE_CHATGLM_3, + LLM_CHAT_TEMPLATE_CHATGLM_4, LLM_CHAT_TEMPLATE_GLMEDGE, LLM_CHAT_TEMPLATE_MINICPM, LLM_CHAT_TEMPLATE_EXAONE_3, @@ -41,6 +41,7 @@ enum llm_chat_template { LLM_CHAT_TEMPLATE_YANDEX, LLM_CHAT_TEMPLATE_BAILING, LLM_CHAT_TEMPLATE_LLAMA4, + LLM_CHAT_TEMPLATE_SMOLVLM, LLM_CHAT_TEMPLATE_UNKNOWN, }; diff --git a/llama/llama.cpp/src/llama-context.cpp b/llama/llama.cpp/src/llama-context.cpp index 4b3e6a83e..77177c5ee 100644 --- a/llama/llama.cpp/src/llama-context.cpp +++ b/llama/llama.cpp/src/llama-context.cpp @@ -114,7 +114,7 @@ llama_context::llama_context( } if (n_ctx_per_seq > hparams.n_ctx_train) { - LLAMA_LOG_WARN("%s: 
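For one system message, one user message, and `add_ass == true`, the SmolVLM branch above renders the following prompt; the message contents are illustrative.

```cpp
// Expected output of the LLM_CHAT_TEMPLATE_SMOLVLM branch for the example conversation:
//   system: "You are a helpful assistant.", user: "Describe this image.", add_ass = true
const char * smolvlm_prompt =
    "<|im_start|>You are a helpful assistant.\n"
    "\n"
    "User: Describe this image.\n"
    "Assistant:";
```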
n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n", + LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n", __func__, n_ctx_per_seq, hparams.n_ctx_train); } @@ -469,8 +469,7 @@ ggml_tensor * llama_context::build_rope_shift( ggml_tensor * shift, ggml_tensor * factors, float freq_base, - float freq_scale, - ggml_backend_buffer * bbuf) const { + float freq_scale) const { const auto & n_ctx_orig = cparams.n_ctx_orig_yarn; const auto & yarn_ext_factor = cparams.yarn_ext_factor; @@ -492,17 +491,7 @@ ggml_tensor * llama_context::build_rope_shift( // dequantize to f32 -> RoPE -> quantize back tmp = ggml_cast(ctx0, cur, GGML_TYPE_F32); - if (bbuf) { - for (const auto & backend : backends) { - // Figure out which backend KV cache belongs to - if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(bbuf))) { - ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get()); - break; - } - } - } - - tmp = ggml_rope_ext_inplace(ctx0, tmp, + tmp = ggml_rope_ext(ctx0, tmp, shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); @@ -582,7 +571,7 @@ llm_graph_result_ptr llama_context::build_kv_self_shift( ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa), 0); - ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l, kv_self->k_l[il]->buffer); + ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l); ggml_build_forward_expand(gf, cur); } @@ -1510,8 +1499,6 @@ int32_t llama_context::output_reserve(int32_t n_outputs) { // set all ids as invalid (negative) std::fill(output_ids.begin(), output_ids.end(), -1); - ggml_backend_buffer_clear(buf_output.get(), 0); - this->n_outputs = 0; this->n_outputs_max = n_outputs_max; diff --git a/llama/llama.cpp/src/llama-context.h b/llama/llama.cpp/src/llama-context.h index a59ff8fd4..30f84bfd3 100644 --- a/llama/llama.cpp/src/llama-context.h +++ b/llama/llama.cpp/src/llama-context.h @@ -172,8 +172,7 @@ private: ggml_tensor * shift, ggml_tensor * factors, float freq_base, - float freq_scale, - ggml_backend_buffer * bbuf) const; + float freq_scale) const; llm_graph_result_ptr build_kv_self_shift( ggml_context * ctx0, diff --git a/llama/llama.cpp/src/llama-graph.cpp b/llama/llama.cpp/src/llama-graph.cpp index d740c1200..b67216a48 100644 --- a/llama/llama.cpp/src/llama-graph.cpp +++ b/llama/llama.cpp/src/llama-graph.cpp @@ -55,7 +55,21 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) { if (ubatch->pos && pos) { const int64_t n_tokens = ubatch->n_tokens; - ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*ggml_element_size(pos)); + if (ubatch->token && n_pos_per_embd == 4) { + // in case we're using M-RoPE with text tokens, convert the 1D positions to 4D + // the 3 first dims are the same, and 4th dim is all 0 + std::vector pos_data(n_tokens*n_pos_per_embd); + // copy the first dimension + for (int i = 0; i < n_tokens; ++i) { + pos_data[ i] = ubatch->pos[i]; + pos_data[ n_tokens + i] = ubatch->pos[i]; + pos_data[2 * n_tokens + i] = ubatch->pos[i]; + pos_data[3 * n_tokens + i] = 0; // 4th dim is 0 + } + ggml_backend_tensor_set(pos, pos_data.data(), 0, pos_data.size()*ggml_element_size(pos)); + } else { + ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_embd*ggml_element_size(pos)); + } } } @@ -71,7 +85,7 @@ void llm_graph_input_attn_temp::set_input(const 
llama_ubatch * ubatch) { ) * f_attn_temp_scale + 1.0; } - ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*n_pos_per_token*ggml_element_size(attn_scale)); + ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*ggml_element_size(attn_scale)); } } @@ -598,7 +612,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) : res (std::make_unique()) { } -int64_t llm_graph_context::n_pos_per_token() const { +int64_t llm_graph_context::n_pos_per_embd() const { return arch == LLM_ARCH_QWEN2VL ? 4 : 1; } @@ -809,6 +823,10 @@ ggml_tensor * llm_graph_context::build_ffn( if (down) { cur = build_lora_mm(down, cur); + if (arch == LLM_ARCH_GLM4) { + // GLM4 seems to have numerical issues with half-precision accumulators + ggml_mul_mat_set_prec(cur, GGML_PREC_F32); + } } if (down_b) { @@ -916,28 +934,35 @@ ggml_tensor * llm_graph_context::build_moe_ffn( ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] cb(up, "ffn_moe_up", il); - ggml_tensor * gate = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] - cb(gate, "ffn_moe_gate", il); + ggml_tensor * experts = nullptr; + if (gate_exps) { + cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + cb(cur, "ffn_moe_gate", il); + } else { + cur = up; + } switch (type_op) { case LLM_FFN_SILU: { - gate = ggml_silu(ctx0, gate); - cb(gate, "ffn_moe_silu", il); + cur = ggml_silu(ctx0, cur); + cb(cur, "ffn_moe_silu", il); } break; case LLM_FFN_GELU: { - gate = ggml_gelu(ctx0, gate); - cb(gate, "ffn_moe_gelu", il); + cur = ggml_gelu(ctx0, cur); + cb(cur, "ffn_moe_gelu", il); } break; default: GGML_ABORT("fatal error"); } - ggml_tensor * par = ggml_mul(ctx0, up, gate); // [n_ff, n_expert_used, n_tokens] - cb(par, "ffn_moe_gate_par", il); + if (gate_exps) { + cur = ggml_mul(ctx0, cur, up); // [n_ff, n_expert_used, n_tokens] + cb(cur, "ffn_moe_gate_par", il); + } - ggml_tensor * experts = build_lora_mm_id(down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens] + experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens] cb(experts, "ffn_moe_down", il); if (!weight_before_ffn) { @@ -1020,11 +1045,11 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const { } ggml_tensor * llm_graph_context::build_inp_pos() const { - auto inp = std::make_unique(n_pos_per_token()); + auto inp = std::make_unique(n_pos_per_embd()); auto & cur = inp->pos; - cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token()); + cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_embd()); ggml_set_input(cur); res->add_input(std::move(inp)); @@ -1033,11 +1058,12 @@ ggml_tensor * llm_graph_context::build_inp_pos() const { } ggml_tensor * llm_graph_context::build_inp_attn_scale() const { - auto inp = std::make_unique(n_pos_per_token(), hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale); + auto inp = std::make_unique(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale); auto & cur = inp->attn_scale; - cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens*n_pos_per_token()); + // this need to be 1x1xN for broadcasting + cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens); ggml_set_input(cur); res->add_input(std::move(inp)); diff --git a/llama/llama.cpp/src/llama-graph.h b/llama/llama.cpp/src/llama-graph.h index 260a2af21..0fe18150b 100644 --- a/llama/llama.cpp/src/llama-graph.h +++ 
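A standalone sketch of the buffer layout produced above for text tokens under M-RoPE (`n_pos_per_embd == 4`): the first three sections repeat the 1-D positions and the fourth section is all zeros. The token count and positions below are example values.

```cpp
#include <cstdio>
#include <cstdint>
#include <vector>

int main() {
    const int     n_tokens = 3;
    const int32_t pos_1d[] = {10, 11, 12};        // example ubatch->pos values

    std::vector<int32_t> pos_data(n_tokens * 4);
    for (int i = 0; i < n_tokens; ++i) {
        pos_data[               i] = pos_1d[i];   // dim 0
        pos_data[    n_tokens + i] = pos_1d[i];   // dim 1
        pos_data[2 * n_tokens + i] = pos_1d[i];   // dim 2
        pos_data[3 * n_tokens + i] = 0;           // dim 3 is always 0 for text
    }
    // layout: 10 11 12 | 10 11 12 | 10 11 12 | 0 0 0
    for (size_t i = 0; i < pos_data.size(); ++i) {
        printf("%d ", (int) pos_data[i]);
    }
    printf("\n");
    return 0;
}
```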
b/llama/llama.cpp/src/llama-graph.h @@ -91,29 +91,27 @@ public: class llm_graph_input_pos : public llm_graph_input_i { public: - llm_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {} + llm_graph_input_pos(int64_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {} virtual ~llm_graph_input_pos() = default; void set_input(const llama_ubatch * ubatch) override; ggml_tensor * pos = nullptr; // I32 [n_batch] - const int64_t n_pos_per_token = 1; + const int64_t n_pos_per_embd = 1; }; // temperature tuning, used by llama4 class llm_graph_input_attn_temp : public llm_graph_input_i { public: - llm_graph_input_attn_temp(int64_t n_pos_per_token, uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale) - : n_pos_per_token(n_pos_per_token), n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {} + llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale) + : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {} virtual ~llm_graph_input_attn_temp() = default; void set_input(const llama_ubatch * ubatch) override; ggml_tensor * attn_scale = nullptr; // F32 [n_batch] - const int64_t n_pos_per_token = 1; - const uint32_t n_attn_temp_floor_scale; const float f_attn_temp_scale; }; @@ -430,7 +428,7 @@ struct llm_graph_context { llm_graph_context(const llm_graph_params & params); - int64_t n_pos_per_token() const; + int64_t n_pos_per_embd() const; void cb(ggml_tensor * cur, const char * name, int il) const; diff --git a/llama/llama.cpp/src/llama-hparams.h b/llama/llama.cpp/src/llama-hparams.h index c8a34d521..b6fc7e6df 100644 --- a/llama/llama.cpp/src/llama-hparams.h +++ b/llama/llama.cpp/src/llama-hparams.h @@ -72,6 +72,7 @@ struct llama_hparams { float expert_weights_scale = 0.0; bool expert_weights_norm = false; uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE; + uint32_t moe_every_n_layers = 0; float f_norm_eps; float f_norm_rms_eps; diff --git a/llama/llama.cpp/src/llama-model.cpp b/llama/llama.cpp/src/llama-model.cpp index c8374159f..9d099f117 100644 --- a/llama/llama.cpp/src/llama-model.cpp +++ b/llama/llama.cpp/src/llama-model.cpp @@ -43,11 +43,13 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_770M: return "770M"; case LLM_TYPE_780M: return "780M"; case LLM_TYPE_0_5B: return "0.5B"; + case LLM_TYPE_0_6B: return "0.6B"; case LLM_TYPE_1B: return "1B"; case LLM_TYPE_1_3B: return "1.3B"; case LLM_TYPE_1_4B: return "1.4B"; case LLM_TYPE_1_5B: return "1.5B"; case LLM_TYPE_1_6B: return "1.6B"; + case LLM_TYPE_1_7B: return "1.7B"; case LLM_TYPE_1_8B: return "1.8B"; case LLM_TYPE_2B: return "2B"; case LLM_TYPE_2_8B: return "2.8B"; @@ -66,6 +68,7 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_15B: return "15B"; case LLM_TYPE_16B: return "16B"; case LLM_TYPE_20B: return "20B"; + case LLM_TYPE_27B: return "27B"; case LLM_TYPE_30B: return "30B"; case LLM_TYPE_32B: return "32B"; case LLM_TYPE_34B: return "34B"; @@ -74,6 +77,7 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_65B: return "65B"; case LLM_TYPE_70B: return "70B"; case LLM_TYPE_236B: return "236B"; + case LLM_TYPE_290B: return "290B"; case LLM_TYPE_314B: return "314B"; case LLM_TYPE_671B: return "671B"; case LLM_TYPE_SMALL: return "0.1B"; @@ -88,10 +92,10 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_16x3_8B: return "16x3.8B"; case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B"; case LLM_TYPE_57B_A14B: return "57B.A14B"; - case LLM_TYPE_27B: return "27B"; - 
case LLM_TYPE_290B: return "290B"; case LLM_TYPE_17B_16E: return "17Bx16E (Scout)"; case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)"; + case LLM_TYPE_30B_A3B: return "30B.A3B"; + case LLM_TYPE_235B_A22B: return "235B.A22B"; default: return "?B"; } } @@ -709,10 +713,12 @@ void llama_model::load_hparams(llama_model_loader & ml) { } } break; case LLM_ARCH_NOMIC_BERT: + case LLM_ARCH_NOMIC_BERT_MOE: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type); + ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0); if (hparams.n_layer == 12 && hparams.n_embd == 768) { type = LLM_TYPE_137M; @@ -805,6 +811,10 @@ void llama_model::load_hparams(llama_model_loader & ml) { { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); switch (hparams.n_layer) { + case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break; + case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break; + case 40: type = LLM_TYPE_14B; break; + case 64: type = LLM_TYPE_32B; break; default: type = LLM_TYPE_UNKNOWN; } } break; @@ -814,6 +824,8 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); switch (hparams.n_layer) { + case 48: type = LLM_TYPE_30B_A3B; break; + case 94: type = LLM_TYPE_235B_A22B; break; default: type = LLM_TYPE_UNKNOWN; } } break; @@ -1425,7 +1437,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; - case LLM_ARCH_MISTRAL3: break; default: throw std::runtime_error("unsupported model architecture"); } @@ -2133,6 +2144,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } break; case LLM_ARCH_BERT: case LLM_ARCH_NOMIC_BERT: + case LLM_ARCH_NOMIC_BERT_MOE: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0); @@ -2166,20 +2178,31 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0); } + if (arch == LLM_ARCH_NOMIC_BERT_MOE) { + layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0); + } + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0); - layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); - layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); - - if (arch == LLM_ARCH_BERT) { + if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) { layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); - layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0); - layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0); + layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff, n_expert}, 0); + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0); + layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0); } else { - layer.ffn_gate = 
create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); + + if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE) { + layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); + layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0); + layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0); + } else { + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + } } layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0); @@ -6074,6 +6097,11 @@ struct llm_build_bert : public llm_graph_context { cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); + if (model.arch == LLM_ARCH_NOMIC_BERT_MOE) { + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + } + Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); @@ -6126,13 +6154,29 @@ struct llm_build_bert : public llm_graph_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - if (model.arch == LLM_ARCH_BERT) { + if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) { + // MoE branch + cur = build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + nullptr, + model.layers[il].ffn_down_exps, + nullptr, + hparams.n_expert, + hparams.n_expert_used, + LLM_FFN_GELU, + false, false, + 0.0f, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il); + cb(cur, "ffn_moe_out", il); + } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) { cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_SEQ, il); + cb(cur, "ffn_out", il); } else if (model.arch == LLM_ARCH_JINA_BERT_V2) { cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, @@ -6140,6 +6184,7 @@ struct llm_build_bert : public llm_graph_context { model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); } else { cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, @@ -6147,8 +6192,8 @@ struct llm_build_bert : public llm_graph_context { model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); } - cb(cur, "ffn_out", il); // attentions bypass the intermediate layer cur = ggml_add(ctx0, cur, ffn_inp); @@ -13349,6 +13394,7 @@ llm_graph_result_ptr llama_model::build_graph( case LLM_ARCH_BERT: case LLM_ARCH_JINA_BERT_V2: case LLM_ARCH_NOMIC_BERT: + case LLM_ARCH_NOMIC_BERT_MOE: { llm = std::make_unique(*this, params, gf); } break; @@ -13705,7 +13751,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_CHAMELEON: case LLM_ARCH_SOLAR: case LLM_ARCH_BAILINGMOE: - case LLM_ARCH_MISTRAL3: return LLAMA_ROPE_TYPE_NORM; // the pairs of head values are offset by n_rot/2 @@ -13714,6 +13759,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_DBRX: case LLM_ARCH_BERT: case 
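Both the tensor loading and the graph build above gate the MoE path on the same interleaving rule. A tiny sketch of which layers it selects; `moe_every_n_layers = 2` is an example value of the `%s.moe_every_n_layers` key, not a value asserted by the patch.

```cpp
#include <cstdio>
#include <cstdint>

// Mirrors the condition used in load_tensors() and llm_build_bert above.
static bool is_moe_layer(uint32_t moe_every_n_layers, int il) {
    return moe_every_n_layers > 0 && il % (int) moe_every_n_layers == 1;
}

int main() {
    const uint32_t moe_every_n_layers = 2;   // example hparam value
    for (int il = 0; il < 8; ++il) {
        printf("layer %d: %s\n", il, is_moe_layer(moe_every_n_layers, il) ? "MoE FFN" : "dense FFN");
    }
    return 0;                                // layers 1, 3, 5, 7 get the expert tensors
}
```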
LLM_ARCH_NOMIC_BERT: + case LLM_ARCH_NOMIC_BERT_MOE: case LLM_ARCH_STABLELM: case LLM_ARCH_BITNET: case LLM_ARCH_QWEN: diff --git a/llama/llama.cpp/src/llama-model.h b/llama/llama.cpp/src/llama-model.h index 72bab5bee..6be91282a 100644 --- a/llama/llama.cpp/src/llama-model.h +++ b/llama/llama.cpp/src/llama-model.h @@ -40,11 +40,13 @@ enum llm_type { LLM_TYPE_770M, LLM_TYPE_780M, LLM_TYPE_0_5B, + LLM_TYPE_0_6B, LLM_TYPE_1B, LLM_TYPE_1_3B, LLM_TYPE_1_4B, LLM_TYPE_1_5B, LLM_TYPE_1_6B, + LLM_TYPE_1_7B, LLM_TYPE_1_8B, LLM_TYPE_2B, LLM_TYPE_2_8B, @@ -64,6 +66,7 @@ enum llm_type { LLM_TYPE_16B, LLM_TYPE_20B, LLM_TYPE_22B, + LLM_TYPE_27B, LLM_TYPE_30B, LLM_TYPE_32B, LLM_TYPE_34B, @@ -73,6 +76,7 @@ enum llm_type { LLM_TYPE_70B, LLM_TYPE_90B, LLM_TYPE_236B, + LLM_TYPE_290B, LLM_TYPE_314B, LLM_TYPE_671B, LLM_TYPE_SMALL, @@ -87,10 +91,10 @@ enum llm_type { LLM_TYPE_16x3_8B, LLM_TYPE_10B_128x3_66B, LLM_TYPE_57B_A14B, - LLM_TYPE_27B, - LLM_TYPE_290B, LLM_TYPE_17B_16E, // llama4 Scout LLM_TYPE_17B_128E, // llama4 Maverick + LLM_TYPE_30B_A3B, + LLM_TYPE_235B_A22B, }; struct llama_layer_posnet { diff --git a/llama/llama.cpp/src/llama-quant.cpp b/llama/llama.cpp/src/llama-quant.cpp index 8ae6dde87..223e1f3f9 100644 --- a/llama/llama.cpp/src/llama-quant.cpp +++ b/llama/llama.cpp/src/llama-quant.cpp @@ -744,10 +744,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // This used to be a regex, but has an extreme cost to compile times. bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'? - // don't quantize vision stuff - quantize &= name.find("v.") == std::string::npos; - quantize &= name.find("mm.") == std::string::npos; - // quantize only 2D and 3D tensors (experts) quantize &= (ggml_n_dims(tensor) >= 2); diff --git a/llama/llama.cpp/src/llama-sampling.cpp b/llama/llama.cpp/src/llama-sampling.cpp index b1a9dca3c..757310533 100644 --- a/llama/llama.cpp/src/llama-sampling.cpp +++ b/llama/llama.cpp/src/llama-sampling.cpp @@ -232,7 +232,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) // } if (k <= 0) { - k = cur_p->size; + return; } k = std::min(k, (int) cur_p->size); @@ -298,6 +298,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) } cur_p->sorted = true; } + cur_p->size = k; } diff --git a/llama/llama.cpp/src/llama-vocab.cpp b/llama/llama.cpp/src/llama-vocab.cpp index ba37df355..d6515ff65 100644 --- a/llama/llama.cpp/src/llama-vocab.cpp +++ b/llama/llama.cpp/src/llama-vocab.cpp @@ -1497,7 +1497,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { tokenizer_pre == "llama3" || tokenizer_pre == "llama-v3" || tokenizer_pre == "llama-bpe"|| - tokenizer_pre == "falcon3") { + tokenizer_pre == "falcon3" || + tokenizer_pre == "pixtral") { pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3; ignore_merges = true; add_bos = true; diff --git a/llama/llama.go b/llama/llama.go index 5fce0a622..3e157c0ac 100644 --- a/llama/llama.go +++ b/llama/llama.go @@ -2,6 +2,7 @@ package llama /* #cgo CFLAGS: -std=c11 +#cgo windows CFLAGS: -Wno-dll-attribute-on-redeclaration #cgo CXXFLAGS: -std=c++17 #cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/include #cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/common @@ -198,7 +199,6 @@ type ModelParams struct { NumGpuLayers int MainGpu int UseMmap bool - UseMlock bool TensorSplit []float32 Progress func(float32) VocabOnly bool @@ -217,7 +217,6 @@ func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) { cparams.n_gpu_layers = 
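The sampler change above makes `k <= 0` a true no-op (previously it expanded k to the full candidate list) and moves the truncation to a single `cur_p->size = k` at the end. The size arithmetic reduces to the following sketch, with sorting omitted:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

static size_t apply_top_k(size_t n_candidates, int32_t k) {
    if (k <= 0) {
        return n_candidates;                                  // noop: keep all candidates
    }
    return std::min<size_t>((size_t) k, n_candidates);        // cur_p->size = k
}

int main() {
    printf("k=40,  n=32000 -> %zu\n", apply_top_k(32000, 40));   // 40
    printf("k=0,   n=32000 -> %zu\n", apply_top_k(32000, 0));    // 32000 (unchanged)
    printf("k=100, n=50    -> %zu\n", apply_top_k(50, 100));     // 50
    return 0;
}
```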
C.int(params.NumGpuLayers) cparams.main_gpu = C.int32_t(params.MainGpu) cparams.use_mmap = C.bool(params.UseMmap) - cparams.use_mlock = C.bool(params.UseMlock) cparams.vocab_only = C.bool(params.VocabOnly) if len(params.TensorSplit) > 0 { @@ -461,24 +460,6 @@ func (m *Model) NEmbd() int { return int(C.llama_model_n_embd(m.c)) } -func Quantize(infile, outfile string, ftype uint32) error { - cinfile := C.CString(infile) - defer C.free(unsafe.Pointer(cinfile)) - - coutfile := C.CString(outfile) - defer C.free(unsafe.Pointer(coutfile)) - - params := C.llama_model_quantize_default_params() - params.nthread = -1 - params.ftype = ftype - - if rc := C.llama_model_quantize(cinfile, coutfile, ¶ms); rc != 0 { - return fmt.Errorf("llama_model_quantize: %d", rc) - } - - return nil -} - // vision processing type ClipContext struct { c *C.struct_clip_ctx @@ -606,9 +587,6 @@ type SamplingParams struct { PenaltyRepeat float32 PenaltyFreq float32 PenaltyPresent float32 - Mirostat int - MirostatTau float32 - MirostatEta float32 PenalizeNl bool Seed uint32 Grammar string @@ -625,9 +603,6 @@ func NewSamplingContext(model *Model, params SamplingParams) (*SamplingContext, cparams.penalty_repeat = C.float(params.PenaltyRepeat) cparams.penalty_freq = C.float(params.PenaltyFreq) cparams.penalty_present = C.float(params.PenaltyFreq) - cparams.mirostat = C.int32_t(params.Mirostat) - cparams.mirostat_tau = C.float(params.MirostatTau) - cparams.mirostat_eta = C.float(params.MirostatEta) cparams.seed = C.uint32_t(params.Seed) grammar := C.CString(params.Grammar) diff --git a/llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch b/llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch index 8f5c3a779..44aa70953 100644 --- a/llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch +++ b/llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch @@ -85,7 +85,7 @@ index e2617b06..242e50a7 100644 /** diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index a7febef7..31750b6f 100644 +index 9fb2134f..04ce764e 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -534,6 +534,7 @@ struct ggml_backend_cuda_buffer_context { @@ -125,10 +125,10 @@ index 50579227..2799a0a5 100644 static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) { diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m -index 266d8af4..12886cd3 100644 +index d92392ed..425524d0 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m -@@ -4759,6 +4759,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) +@@ -5077,6 +5077,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) } free(ctx); @@ -149,10 +149,10 @@ index 05a2f4e6..392cc18d 100644 static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) { diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp -index a0667b7d..bd83adc5 100644 +index 140a775f..e33c4ba0 100644 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp -@@ -468,6 +468,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) { +@@ -477,6 +477,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) { bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0); GGML_ASSERT(status); delete ctx; @@ -161,10 +161,10 
@@ index a0667b7d..bd83adc5 100644 static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) { diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp -index 1de34c96..4600f61e 100644 +index 66b6f2cc..e3e6deae 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp -@@ -316,6 +316,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try { +@@ -317,6 +317,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try { ggml_sycl_set_device(ctx->device); delete ctx; @@ -172,7 +172,7 @@ index 1de34c96..4600f61e 100644 } catch (sycl::exception const &exc) { std::cerr << exc.what() << "Exception caught at file:" << __FILE__ -@@ -761,6 +762,7 @@ struct ggml_backend_sycl_split_buffer_context { +@@ -762,6 +763,7 @@ struct ggml_backend_sycl_split_buffer_context { static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context; delete ctx; @@ -180,7 +180,7 @@ index 1de34c96..4600f61e 100644 } static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) { -@@ -1095,6 +1097,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_ +@@ -1096,6 +1098,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_ static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_sycl_host_free(buffer->context); @@ -189,10 +189,10 @@ index 1de34c96..4600f61e 100644 static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp -index 39f3cd34..c569a8a5 100644 +index c0bdb9e1..03d03064 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp -@@ -8653,6 +8653,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) { +@@ -8660,6 +8660,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; ggml_vk_destroy_buffer(ctx->dev_buffer); delete ctx; @@ -200,7 +200,7 @@ index 39f3cd34..c569a8a5 100644 } static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) { -@@ -8796,6 +8797,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe +@@ -8803,6 +8804,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()"); ggml_vk_host_free(vk_instance.devices[0], buffer->context); diff --git a/llama/patches/0002-pretokenizer.patch b/llama/patches/0002-pretokenizer.patch index e51b43730..ecdabe7e1 100644 --- a/llama/patches/0002-pretokenizer.patch +++ b/llama/patches/0002-pretokenizer.patch @@ -10,7 +10,7 @@ logs instead of throwing an error 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp -index 48060517..a35b498c 100644 +index 50ded286..a9ee9f03 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1491,16 +1491,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { @@ -31,7 +31,7 @@ index 48060517..a35b498c 100644 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT; } else if ( 
tokenizer_pre == "llama3" || -@@ -1634,7 +1625,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { +@@ -1635,7 +1626,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE; clean_spaces = false; } else { diff --git a/llama/patches/0003-embeddings.patch b/llama/patches/0003-embeddings.patch index c27dbd7b5..022a83f43 100644 --- a/llama/patches/0003-embeddings.patch +++ b/llama/patches/0003-embeddings.patch @@ -11,10 +11,10 @@ instead of forcing one or the error 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp -index 983385f8..32f59819 100644 +index 5a2eef9b..9c1fe93f 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp -@@ -1236,7 +1236,7 @@ int llama_context::decode(llama_batch & inp_batch) { +@@ -1225,7 +1225,7 @@ int llama_context::decode(llama_batch & inp_batch) { int64_t n_outputs_all = 0; // count outputs @@ -23,7 +23,7 @@ index 983385f8..32f59819 100644 for (uint32_t i = 0; i < n_tokens_all; ++i) { n_outputs_all += batch.logits[i] != 0; } -@@ -1348,7 +1348,7 @@ int llama_context::decode(llama_batch & inp_batch) { +@@ -1337,7 +1337,7 @@ int llama_context::decode(llama_batch & inp_batch) { // ggml_graph_dump_dot(gf, NULL, "llama.dot"); //} @@ -32,7 +32,7 @@ index 983385f8..32f59819 100644 auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr; if (t_embd && res->get_embd_pooled()) { -@@ -1492,7 +1492,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) { +@@ -1481,7 +1481,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) { const auto n_embd = hparams.n_embd; // TODO: use a per-batch flag for logits presence instead diff --git a/llama/patches/0004-clip-unicode.patch b/llama/patches/0004-clip-unicode.patch index 81d61a827..35f54fd3c 100644 --- a/llama/patches/0004-clip-unicode.patch +++ b/llama/patches/0004-clip-unicode.patch @@ -10,12 +10,12 @@ filesystems for paths that include wide characters 1 file changed, 39 insertions(+) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp -index 75970615..d57b4bd6 100644 +index ad3e7df1..b3218c78 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp -@@ -29,6 +29,19 @@ - #include +@@ -30,6 +30,19 @@ #include + #include +#if defined(_WIN32) +#define WIN32_LEAN_AND_MEAN @@ -33,7 +33,7 @@ index 75970615..d57b4bd6 100644 struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL}; //#define CLIP_DEBUG_FUNCTIONS -@@ -1430,7 +1443,29 @@ struct clip_model_loader { +@@ -1971,7 +1984,29 @@ struct clip_model_loader { { std::vector read_buf; @@ -63,7 +63,7 @@ index 75970615..d57b4bd6 100644 if (!fin) { throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str())); } -@@ -1457,7 +1492,11 @@ struct clip_model_loader { +@@ -1998,7 +2033,11 @@ struct clip_model_loader { ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); } } diff --git a/llama/patches/0005-solar-pro.patch b/llama/patches/0005-solar-pro.patch index 76ddc6197..bf0fe310e 100644 --- a/llama/patches/0005-solar-pro.patch +++ b/llama/patches/0005-solar-pro.patch @@ -15,10 +15,10 @@ adds support for the Solar Pro architecture 7 files changed, 248 insertions(+) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp -index 62e1480b..f754bc8f 100644 +index f2bc8ca7..5ab3f572 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp -@@ -68,6 +68,7 @@ static const std::map LLM_ARCH_NAMES = { +@@ -69,6 +69,7 @@ static const 
std::map LLM_ARCH_NAMES = { { LLM_ARCH_GRANITE, "granite" }, { LLM_ARCH_GRANITE_MOE, "granitemoe" }, { LLM_ARCH_CHAMELEON, "chameleon" }, @@ -26,7 +26,7 @@ index 62e1480b..f754bc8f 100644 { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" }, { LLM_ARCH_PLM, "plm" }, { LLM_ARCH_BAILINGMOE, "bailingmoe" }, -@@ -140,6 +141,7 @@ static const std::map LLM_KV_NAMES = { +@@ -142,6 +143,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" }, { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" }, { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, @@ -34,7 +34,7 @@ index 62e1480b..f754bc8f 100644 { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" }, { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" }, -@@ -1482,6 +1484,24 @@ static const std::map> LLM_TENSOR_N +@@ -1502,6 +1504,24 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, }, }, @@ -59,7 +59,7 @@ index 62e1480b..f754bc8f 100644 { LLM_ARCH_WAVTOKENIZER_DEC, { -@@ -1660,6 +1680,7 @@ static const std::map LLM_TENSOR_INFOS = { +@@ -1680,6 +1700,7 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, // this tensor is loaded for T5, but never used {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}}, @@ -68,10 +68,10 @@ index 62e1480b..f754bc8f 100644 {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, diff --git a/src/llama-arch.h b/src/llama-arch.h -index 98ca00a1..439aaeab 100644 +index 41a023da..525c1b7d 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h -@@ -72,6 +72,7 @@ enum llm_arch { +@@ -73,6 +73,7 @@ enum llm_arch { LLM_ARCH_GRANITE, LLM_ARCH_GRANITE_MOE, LLM_ARCH_CHAMELEON, @@ -79,7 +79,7 @@ index 98ca00a1..439aaeab 100644 LLM_ARCH_WAVTOKENIZER_DEC, LLM_ARCH_PLM, LLM_ARCH_BAILINGMOE, -@@ -144,6 +145,7 @@ enum llm_kv { +@@ -146,6 +147,7 @@ enum llm_kv { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, LLM_KV_ATTENTION_SLIDING_WINDOW, LLM_KV_ATTENTION_SCALE, @@ -87,7 +87,7 @@ index 98ca00a1..439aaeab 100644 LLM_KV_ATTENTION_KEY_LENGTH_MLA, LLM_KV_ATTENTION_VALUE_LENGTH_MLA, -@@ -344,6 +346,7 @@ enum llm_tensor { +@@ -346,6 +348,7 @@ enum llm_tensor { LLM_TENSOR_ENC_OUTPUT_NORM, LLM_TENSOR_CLS, LLM_TENSOR_CLS_OUT, @@ -115,7 +115,7 @@ index 90dfe7a7..8a667960 100644 if (il < n_layer) { return n_swa > 0 && n_swa_pattern > 0 && il % n_swa_pattern < (n_swa_pattern - 1); diff --git a/src/llama-hparams.h b/src/llama-hparams.h -index 80fcd65d..6e278945 100644 +index 7ee6a5b7..48dce407 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -55,6 +55,8 @@ struct llama_hparams { @@ -127,7 +127,7 @@ index 80fcd65d..6e278945 100644 uint32_t n_layer_dense_lead = 0; uint32_t n_lora_q = 0; uint32_t n_lora_kv = 0; -@@ -153,6 +155,9 @@ struct llama_hparams { +@@ -154,6 +156,9 @@ struct llama_hparams { // dimension of the recurrent state embeddings uint32_t n_embd_v_s() const; @@ -150,10 +150,10 @@ index ea73a8a7..a012aeae 100644 llama_model_loader::llama_model_loader( const std::string & fname, diff --git a/src/llama-model.cpp b/src/llama-model.cpp -index 6b7bfecf..aba42819 100644 +index 822e2bb2..572378c9 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp -@@ -1374,6 +1374,21 @@ void llama_model::load_hparams(llama_model_loader & ml) { +@@ -1386,6 +1386,21 @@ void llama_model::load_hparams(llama_model_loader & ml) { 
default: type = LLM_TYPE_UNKNOWN; } } break; @@ -175,7 +175,7 @@ index 6b7bfecf..aba42819 100644 case LLM_ARCH_WAVTOKENIZER_DEC: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); -@@ -3717,6 +3732,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) { +@@ -3741,6 +3756,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); @@ -210,7 +210,7 @@ index 6b7bfecf..aba42819 100644 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); -@@ -12296,6 +12339,165 @@ struct llm_build_chameleon : public llm_graph_context { +@@ -12342,6 +12385,165 @@ struct llm_build_chameleon : public llm_graph_context { } }; @@ -376,7 +376,7 @@ index 6b7bfecf..aba42819 100644 struct llm_build_wavtokenizer_dec : public llm_graph_context { llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { ggml_tensor * cur; -@@ -13045,6 +13247,10 @@ llm_graph_result_ptr llama_model::build_graph( +@@ -13092,6 +13294,10 @@ llm_graph_result_ptr llama_model::build_graph( { llm = std::make_unique(*this, params, gf); } break; @@ -387,7 +387,7 @@ index 6b7bfecf..aba42819 100644 case LLM_ARCH_WAVTOKENIZER_DEC: { llm = std::make_unique(*this, params, gf); -@@ -13191,6 +13397,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { +@@ -13238,6 +13444,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_GRANITE: case LLM_ARCH_GRANITE_MOE: case LLM_ARCH_CHAMELEON: @@ -396,18 +396,18 @@ index 6b7bfecf..aba42819 100644 return LLAMA_ROPE_TYPE_NORM; diff --git a/src/llama-model.h b/src/llama-model.h -index fd82d106..5865d5e9 100644 +index 95eca002..856e6042 100644 --- a/src/llama-model.h +++ b/src/llama-model.h -@@ -62,6 +62,7 @@ enum llm_type { +@@ -64,6 +64,7 @@ enum llm_type { LLM_TYPE_15B, LLM_TYPE_16B, LLM_TYPE_20B, + LLM_TYPE_22B, + LLM_TYPE_27B, LLM_TYPE_30B, LLM_TYPE_32B, - LLM_TYPE_34B, -@@ -307,6 +308,8 @@ struct llama_layer { +@@ -311,6 +312,8 @@ struct llama_layer { struct ggml_tensor * ffn_up_scale = nullptr; struct ggml_tensor * ffn_down_scale = nullptr; diff --git a/llama/patches/0006-add-mllama-support.patch b/llama/patches/0006-add-mllama-support.patch index e5fa0462c..9283224fe 100644 --- a/llama/patches/0006-add-mllama-support.patch +++ b/llama/patches/0006-add-mllama-support.patch @@ -5,7 +5,6 @@ Subject: [PATCH] add mllama support adds support for the llama 3.2 vision architecture --- - examples/llava/gemma3-cli.cpp | 3 +- examples/llava/llava.cpp | 5 +- examples/llava/mtmd.cpp | 6 +- ggml/src/ggml-backend-reg.cpp | 6 +- @@ -25,34 +24,13 @@ adds support for the llama 3.2 vision architecture src/llama-model.cpp | 309 +++++++++++++++++++++++++++++++++- src/llama-model.h | 12 ++ src/llama-quant.cpp | 4 +- - 20 files changed, 475 insertions(+), 22 deletions(-) + 19 files changed, 473 insertions(+), 21 deletions(-) -diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp -index 3d566475..654d1358 100644 ---- a/examples/llava/gemma3-cli.cpp -+++ b/examples/llava/gemma3-cli.cpp -@@ -106,7 +106,7 @@ struct decode_embd_batch { - std::vector seq_ids; - std::vector logits; - llama_batch batch; -- decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, 
llama_seq_id seq_id) { -+ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { - pos .resize(n_tokens); - n_seq_id.resize(n_tokens); - seq_ids .resize(n_tokens + 1); -@@ -118,6 +118,7 @@ struct decode_embd_batch { - /*n_tokens =*/ n_tokens, - /*tokens =*/ nullptr, - /*embd =*/ embd, -+ /*n_embd =*/ n_embd, - /*pos =*/ pos.data(), - /*n_seq_id =*/ n_seq_id.data(), - /*seq_id =*/ seq_ids.data(), diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp -index 03a22cbb..5eb40bcd 100644 +index c00d16ae..bab027b5 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp -@@ -456,7 +456,7 @@ struct llava_embd_batch { +@@ -457,7 +457,7 @@ struct llava_embd_batch { std::vector seq_ids; std::vector logits; llama_batch batch; @@ -61,7 +39,7 @@ index 03a22cbb..5eb40bcd 100644 pos .resize(n_tokens); n_seq_id.resize(n_tokens); seq_ids .resize(n_tokens + 1); -@@ -468,6 +468,7 @@ struct llava_embd_batch { +@@ -469,6 +469,7 @@ struct llava_embd_batch { /*n_tokens =*/ n_tokens, /*tokens =*/ nullptr, /*embd =*/ embd, @@ -69,7 +47,7 @@ index 03a22cbb..5eb40bcd 100644 /*pos =*/ pos.data(), /*n_seq_id =*/ n_seq_id.data(), /*seq_id =*/ seq_ids.data(), -@@ -491,7 +492,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_ +@@ -492,7 +493,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_ n_eval = n_batch; } float * embd = image_embed->embed+i*n_embd; @@ -79,19 +57,19 @@ index 03a22cbb..5eb40bcd 100644 LOG_ERR("%s : failed to eval\n", __func__); return false; diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp -index 3fd5bebc..f0cec596 100644 +index 7081fd73..c14ac501 100644 --- a/examples/llava/mtmd.cpp +++ b/examples/llava/mtmd.cpp -@@ -233,7 +233,7 @@ struct decode_embd_batch { +@@ -476,7 +476,7 @@ struct decode_embd_batch { std::vector seq_ids; std::vector logits; llama_batch batch; -- decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { -+ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { - pos .resize(n_tokens); +- decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) { ++ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) { + pos .resize(n_tokens * n_pos_per_embd); n_seq_id.resize(n_tokens); seq_ids .resize(n_tokens + 1); -@@ -245,6 +245,7 @@ struct decode_embd_batch { +@@ -487,6 +487,7 @@ struct decode_embd_batch { /*n_tokens =*/ n_tokens, /*tokens =*/ nullptr, /*embd =*/ embd, @@ -99,16 +77,16 @@ index 3fd5bebc..f0cec596 100644 /*pos =*/ pos.data(), /*n_seq_id =*/ n_seq_id.data(), /*seq_id =*/ seq_ids.data(), -@@ -311,7 +312,8 @@ int32_t mtmd_helper_eval(mtmd_context * ctx, - - int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get()); +@@ -610,7 +611,8 @@ int32_t mtmd_helper_eval(mtmd_context * ctx, + int32_t i_batch = 0; + int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch; float * embd = mtmd_get_output_embd(ctx); -- decode_embd_batch batch_img(embd, n_tokens, n_past, 0); +- decode_embd_batch batch_embd(embd, n_tokens, n_pos_per_embd, n_mmproj_embd); + int n_embd = llama_model_n_embd(llama_get_model(lctx)); -+ decode_embd_batch batch_img(embd, n_embd, n_tokens, n_past, 0); - int64_t t1 = ggml_time_ms(); - ret = 
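A condensed sketch of the batch-construction pattern the mllama patch above settles on, with the new `n_embd` field carried through. The field order follows the `llama_batch` change shown earlier in this patch; the struct name and layout here are illustrative, not the exact vendored helper.

```cpp
#include <cstdint>
#include <vector>
#include "llama.h"

// Build an embedding-only batch of n_tokens embeddings, each n_embd floats wide.
struct embd_batch_sketch {
    std::vector<llama_pos>      pos;
    std::vector<int32_t>        n_seq_id;
    std::vector<llama_seq_id>   seq_id_0;
    std::vector<llama_seq_id *> seq_ids;
    std::vector<int8_t>         logits;
    llama_batch batch;

    embd_batch_sketch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id)
        : pos(n_tokens), n_seq_id(n_tokens), seq_id_0(1, seq_id), seq_ids(n_tokens + 1), logits(n_tokens) {
        for (int32_t i = 0; i < n_tokens; i++) {
            pos[i]      = pos_0 + i;
            n_seq_id[i] = 1;
            seq_ids[i]  = seq_id_0.data();
            logits[i]   = 0;
        }
        seq_ids[n_tokens] = nullptr;
        batch = {
            /*n_tokens =*/ n_tokens,
            /*tokens   =*/ nullptr,
            /*embd     =*/ embd,
            /*n_embd   =*/ n_embd,      // new field added by this patch
            /*pos      =*/ pos.data(),
            /*n_seq_id =*/ n_seq_id.data(),
            /*seq_id   =*/ seq_ids.data(),
            /*logits   =*/ logits.data(),
        };
    }
};
```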
llama_decode(lctx, batch_img.batch); - if (ret != 0) { ++ decode_embd_batch batch_embd(embd, n_embd, n_tokens, n_past, 0); + + const int nx = mtmd_image_tokens_get_nx(chunk.tokens_image.get()); + const int ny = mtmd_image_tokens_get_ny(chunk.tokens_image.get()); diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 405d8e31..82ae1b5b 100644 --- a/ggml/src/ggml-backend-reg.cpp @@ -127,10 +105,10 @@ index 405d8e31..82ae1b5b 100644 register_backend(ggml_backend_rpc_reg()); #endif diff --git a/include/llama.h b/include/llama.h -index 5657fbf0..f91896e4 100644 +index 06c56395..f1628e88 100644 --- a/include/llama.h +++ b/include/llama.h -@@ -255,6 +255,7 @@ extern "C" { +@@ -256,6 +256,7 @@ extern "C" { llama_token * token; float * embd; @@ -138,7 +116,7 @@ index 5657fbf0..f91896e4 100644 llama_pos * pos; int32_t * n_seq_id; llama_seq_id ** seq_id; -@@ -357,6 +358,7 @@ extern "C" { +@@ -358,6 +359,7 @@ extern "C" { bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU bool flash_attn; // whether to use flash attention [EXPERIMENTAL] bool no_perf; // whether to measure performance timings @@ -146,7 +124,7 @@ index 5657fbf0..f91896e4 100644 // Abort callback // if it returns true, execution of llama_decode() will be aborted -@@ -458,6 +460,10 @@ extern "C" { +@@ -459,6 +461,10 @@ extern "C" { struct llama_context_params params), "use llama_init_from_model instead"); @@ -158,7 +136,7 @@ index 5657fbf0..f91896e4 100644 LLAMA_API void llama_free(struct llama_context * ctx); diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp -index f754bc8f..0568565f 100644 +index 5ab3f572..eb7b5325 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -6,6 +6,7 @@ @@ -169,7 +147,7 @@ index f754bc8f..0568565f 100644 { LLM_ARCH_LLAMA4, "llama4" }, { LLM_ARCH_DECI, "deci" }, { LLM_ARCH_FALCON, "falcon" }, -@@ -142,6 +143,7 @@ static const std::map LLM_KV_NAMES = { +@@ -144,6 +145,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" }, { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" }, @@ -177,7 +155,7 @@ index f754bc8f..0568565f 100644 { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" }, { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" }, -@@ -271,6 +273,40 @@ static const std::map> LLM_TENSOR_N +@@ -273,6 +275,40 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, }, }, @@ -218,7 +196,7 @@ index f754bc8f..0568565f 100644 { LLM_ARCH_DECI, { -@@ -1681,6 +1717,14 @@ static const std::map LLM_TENSOR_INFOS = { +@@ -1701,6 +1737,14 @@ static const std::map LLM_TENSOR_INFOS = { // this tensor is loaded for T5, but never used {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}}, {LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, @@ -234,7 +212,7 @@ index f754bc8f..0568565f 100644 {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, diff --git a/src/llama-arch.h b/src/llama-arch.h -index 439aaeab..6a989034 100644 +index 525c1b7d..bc8a4f0b 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -11,6 +11,7 @@ @@ -245,7 +223,7 @@ index 439aaeab..6a989034 100644 LLM_ARCH_DECI, LLM_ARCH_FALCON, LLM_ARCH_BAICHUAN, -@@ -146,6 +147,7 @@ enum llm_kv { +@@ -148,6 +149,7 @@ enum llm_kv { LLM_KV_ATTENTION_SLIDING_WINDOW, 
LLM_KV_ATTENTION_SCALE, LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, @@ -253,7 +231,7 @@ index 439aaeab..6a989034 100644 LLM_KV_ATTENTION_KEY_LENGTH_MLA, LLM_KV_ATTENTION_VALUE_LENGTH_MLA, -@@ -347,6 +349,14 @@ enum llm_tensor { +@@ -349,6 +351,14 @@ enum llm_tensor { LLM_TENSOR_CLS, LLM_TENSOR_CLS_OUT, LLM_TENSOR_BSKCN_TV, @@ -297,10 +275,10 @@ index 01d5ca57..8682b0e6 100644 batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc); } diff --git a/src/llama-context.cpp b/src/llama-context.cpp -index 32f59819..0343ba8a 100644 +index 9c1fe93f..cd06ad91 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp -@@ -862,7 +862,7 @@ float * llama_context::get_logits_ith(int32_t i) { +@@ -851,7 +851,7 @@ float * llama_context::get_logits_ith(int32_t i) { throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs)); } @@ -309,7 +287,7 @@ index 32f59819..0343ba8a 100644 } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what()); #ifndef NDEBUG -@@ -983,6 +983,10 @@ void llama_context::set_warmup(bool value) { +@@ -972,6 +972,10 @@ void llama_context::set_warmup(bool value) { cparams.warmup = value; } @@ -320,7 +298,7 @@ index 32f59819..0343ba8a 100644 void llama_context::set_adapter_lora( llama_adapter_lora * adapter, float scale) { -@@ -1058,7 +1062,7 @@ int llama_context::encode(llama_batch & inp_batch) { +@@ -1047,7 +1051,7 @@ int llama_context::encode(llama_batch & inp_batch) { const int64_t n_embd = hparams.n_embd; @@ -329,7 +307,7 @@ index 32f59819..0343ba8a 100644 const llama_ubatch ubatch = sbatch.split_simple(n_tokens); -@@ -1198,10 +1202,9 @@ int llama_context::decode(llama_batch & inp_batch) { +@@ -1187,10 +1191,9 @@ int llama_context::decode(llama_batch & inp_batch) { const llama_batch & batch = batch_allocr.batch; @@ -341,7 +319,7 @@ index 32f59819..0343ba8a 100644 const int64_t n_tokens_all = batch.n_tokens; const int64_t n_embd = hparams.n_embd; -@@ -1249,7 +1252,7 @@ int llama_context::decode(llama_batch & inp_batch) { +@@ -1238,7 +1241,7 @@ int llama_context::decode(llama_batch & inp_batch) { const bool logits_all = n_outputs_all == n_tokens_all; @@ -350,7 +328,7 @@ index 32f59819..0343ba8a 100644 /* simple_split */ !kv_self->recurrent, /* logits_all */ logits_all); -@@ -1483,12 +1486,11 @@ int llama_context::decode(llama_batch & inp_batch) { +@@ -1472,12 +1475,11 @@ int llama_context::decode(llama_batch & inp_batch) { int32_t llama_context::output_reserve(int32_t n_outputs) { const auto & hparams = model.hparams; @@ -364,7 +342,7 @@ index 32f59819..0343ba8a 100644 const auto n_embd = hparams.n_embd; // TODO: use a per-batch flag for logits presence instead -@@ -1558,7 +1560,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) { +@@ -1545,7 +1547,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) { void llama_context::output_reorder() { auto & out_ids = sbatch.out_ids; if (!out_ids.empty()) { @@ -373,7 +351,7 @@ index 32f59819..0343ba8a 100644 const uint32_t n_embd = model.hparams.n_embd; GGML_ASSERT((size_t) n_outputs == out_ids.size()); -@@ -2065,7 +2067,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) { +@@ -2052,7 +2054,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) { { LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__); @@ -382,7 +360,7 @@ index 32f59819..0343ba8a 100644 io.write(&logits_size, sizeof(logits_size)); -@@ -2248,6 +2250,7 @@ llama_context_params llama_context_default_params() { +@@ -2235,6 
+2237,7 @@ llama_context_params llama_context_default_params() { /*.offload_kqv =*/ true, /*.flash_attn =*/ false, /*.no_perf =*/ true, @@ -390,7 +368,7 @@ index 32f59819..0343ba8a 100644 /*.abort_callback =*/ nullptr, /*.abort_callback_data =*/ nullptr, }; -@@ -2375,6 +2378,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) { +@@ -2362,6 +2365,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) { ctx->set_warmup(warmup); } @@ -402,7 +380,7 @@ index 32f59819..0343ba8a 100644 ctx->synchronize(); } diff --git a/src/llama-context.h b/src/llama-context.h -index 04facb54..baa03276 100644 +index 5457f077..a50c4afa 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -65,6 +65,7 @@ struct llama_context { @@ -426,10 +404,10 @@ index 30e550f0..85ad91b9 100644 enum llama_pooling_type pooling_type; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp -index a85e9728..d740c120 100644 +index fabb9ca2..b67216a4 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp -@@ -546,6 +546,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) { +@@ -560,6 +560,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) { } } @@ -442,7 +420,7 @@ index a85e9728..d740c120 100644 // // llm_graph_context // -@@ -1506,6 +1512,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const { +@@ -1532,6 +1538,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const { return (llm_graph_input_attn_cross *) res->add_input(std::move(inp)); } @@ -469,7 +447,7 @@ index a85e9728..d740c120 100644 llm_graph_input_attn_cross * inp, ggml_cgraph * gf, diff --git a/src/llama-graph.h b/src/llama-graph.h -index d192dc14..260a2af2 100644 +index d0c8d321..0fe18150 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -86,6 +86,7 @@ public: @@ -480,7 +458,7 @@ index d192dc14..260a2af2 100644 }; class llm_graph_input_pos : public llm_graph_input_i { -@@ -285,6 +286,16 @@ public: +@@ -283,6 +284,16 @@ public: const llama_cross * cross = nullptr; }; @@ -497,7 +475,7 @@ index d192dc14..260a2af2 100644 // // llm_graph_result // -@@ -493,6 +504,7 @@ struct llm_graph_context { +@@ -491,6 +502,7 @@ struct llm_graph_context { ggml_tensor * build_inp_cls() const; ggml_tensor * build_inp_s_copy() const; ggml_tensor * build_inp_s_mask() const; @@ -518,7 +496,7 @@ index 8a667960..6a02de03 100644 + return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end(); +} diff --git a/src/llama-hparams.h b/src/llama-hparams.h -index 6e278945..c8a34d52 100644 +index 48dce407..b6fc7e6d 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -2,6 +2,8 @@ @@ -546,7 +524,7 @@ index 6e278945..c8a34d52 100644 uint32_t n_layer_dense_lead = 0; uint32_t n_lora_q = 0; -@@ -158,6 +162,9 @@ struct llama_hparams { +@@ -159,6 +163,9 @@ struct llama_hparams { // Block skip connection bool n_bskcn(uint32_t n, uint32_t il) const; @@ -593,10 +571,10 @@ index a012aeae..2e11507d 100644 bool llama_model_loader::get_arr(const std::string & key, std::array & result, bool required) { const int kid = gguf_find_key(meta.get(), key.c_str()); diff --git a/src/llama-model.cpp b/src/llama-model.cpp -index aba42819..d051696c 100644 +index 572378c9..9d099f11 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp -@@ -419,6 +419,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { +@@ -423,6 +423,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { // get general kv ml.get_key(LLM_KV_GENERAL_NAME, 
name, false); @@ -604,7 +582,7 @@ index aba42819..d051696c 100644 // everything past this point is not vocab-related if (hparams.vocab_only) { -@@ -430,6 +431,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { +@@ -434,6 +435,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer); ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false); ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false); @@ -612,7 +590,7 @@ index aba42819..d051696c 100644 if (arch == LLM_ARCH_WAVTOKENIZER_DEC) { ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features); -@@ -453,9 +455,11 @@ void llama_model::load_hparams(llama_model_loader & ml) { +@@ -457,9 +459,11 @@ void llama_model::load_hparams(llama_model_loader & ml) { std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0); std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0); std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0); @@ -624,7 +602,7 @@ index aba42819..d051696c 100644 // n_head_kv is optional, default to n_head hparams.n_head_kv_arr = hparams.n_head_arr; -@@ -508,7 +512,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { +@@ -512,7 +516,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false); @@ -633,7 +611,7 @@ index aba42819..d051696c 100644 if (hparams.n_rot != hparams.n_embd_head_k) { throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k)); } -@@ -571,6 +575,16 @@ void llama_model::load_hparams(llama_model_loader & ml) { +@@ -575,6 +579,16 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.use_kq_norm = false; } } break; @@ -650,7 +628,7 @@ index aba42819..d051696c 100644 case LLM_ARCH_DECI: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); -@@ -1550,7 +1564,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { +@@ -1562,7 +1576,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { const int64_t n_embd_head_v = hparams.n_embd_head_v; const int64_t n_ff = hparams.n_ff(); const int64_t n_embd_gqa = n_embd_v_gqa; @@ -659,7 +637,7 @@ index aba42819..d051696c 100644 const int64_t n_token_types = vocab.n_token_types(); const int64_t n_rot = hparams.n_rot; const int64_t n_expert = hparams.n_expert; -@@ -1803,6 +1817,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) { +@@ -1815,6 +1829,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } } } break; @@ -712,7 +690,7 @@ index aba42819..d051696c 100644 case LLM_ARCH_DECI: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); -@@ -4683,6 +4743,246 @@ struct llm_build_llama : public llm_graph_context { +@@ -4707,6 +4767,246 @@ struct llm_build_llama : public llm_graph_context { } }; @@ -959,7 +937,7 @@ index aba42819..d051696c 100644 struct llm_build_deci : public llm_graph_context { llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; -@@ -13017,6 +13317,10 @@ llm_graph_result_ptr llama_model::build_graph( +@@ -13063,6 +13363,10 @@ llm_graph_result_ptr llama_model::build_graph( { llm = std::make_unique(*this, params, gf); } break; @@ -970,7 +948,7 @@ index aba42819..d051696c 100644 case LLM_ARCH_DECI: { llm = std::make_unique(*this, params, gf); -@@ -13377,6 +13681,7 @@ llama_rope_type llama_model_rope_type(const 
llama_model * model) { +@@ -13424,6 +13728,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { // use what we call a normal RoPE, operating on pairs of consecutive head values case LLM_ARCH_LLAMA: case LLM_ARCH_LLAMA4: @@ -979,7 +957,7 @@ index aba42819..d051696c 100644 case LLM_ARCH_BAICHUAN: case LLM_ARCH_STARCODER: diff --git a/src/llama-model.h b/src/llama-model.h -index 5865d5e9..72bab5be 100644 +index 856e6042..6be91282 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -11,6 +11,7 @@ @@ -990,15 +968,15 @@ index 5865d5e9..72bab5be 100644 struct llama_cparams; struct llama_ubatch; -@@ -70,6 +71,7 @@ enum llm_type { +@@ -73,6 +74,7 @@ enum llm_type { LLM_TYPE_40B, LLM_TYPE_65B, LLM_TYPE_70B, + LLM_TYPE_90B, LLM_TYPE_236B, + LLM_TYPE_290B, LLM_TYPE_314B, - LLM_TYPE_671B, -@@ -310,6 +312,16 @@ struct llama_layer { +@@ -314,6 +316,16 @@ struct llama_layer { struct ggml_tensor * bskcn_tv = nullptr; diff --git a/llama/patches/0007-add-unpad-operator.patch b/llama/patches/0007-add-unpad-operator.patch index 116545d67..50acfc632 100644 --- a/llama/patches/0007-add-unpad-operator.patch +++ b/llama/patches/0007-add-unpad-operator.patch @@ -18,10 +18,10 @@ adds the unpad operator to GGML 10 files changed, 223 insertions(+), 2 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h -index 8fcc16df..d19fc167 100644 +index 1b8603e7..53ef31b2 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h -@@ -488,6 +488,7 @@ extern "C" { +@@ -489,6 +489,7 @@ extern "C" { GGML_OP_UPSCALE, // nearest interpolate GGML_OP_PAD, GGML_OP_PAD_REFLECT_1D, @@ -29,7 +29,7 @@ index 8fcc16df..d19fc167 100644 GGML_OP_ARANGE, GGML_OP_TIMESTEP_EMBEDDING, GGML_OP_ARGSORT, -@@ -1757,6 +1758,15 @@ extern "C" { +@@ -1777,6 +1778,15 @@ extern "C" { int p0, int p1); @@ -46,10 +46,10 @@ index 8fcc16df..d19fc167 100644 // timesteps: [N,] // return: [N, dim] diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c -index 50400328..432942bf 100644 +index 64405449..34624cca 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c -@@ -1960,6 +1960,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm +@@ -1964,6 +1964,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_pad_reflect_1d(params, tensor); } break; @@ -60,7 +60,7 @@ index 50400328..432942bf 100644 case GGML_OP_ARANGE: { ggml_compute_forward_arange(params, tensor); -@@ -2282,6 +2286,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { +@@ -2287,6 +2291,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { case GGML_OP_UPSCALE: case GGML_OP_PAD: case GGML_OP_PAD_REFLECT_1D: @@ -69,10 +69,10 @@ index 50400328..432942bf 100644 case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_ARGSORT: diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp -index 6050147b..66b8da68 100644 +index 7413192b..becdae07 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp -@@ -6531,6 +6531,61 @@ void ggml_compute_forward_pad_reflect_1d( +@@ -6703,6 +6703,61 @@ void ggml_compute_forward_pad_reflect_1d( } } @@ -135,10 +135,10 @@ index 6050147b..66b8da68 100644 static void ggml_compute_forward_arange_f32( diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h -index 410a3720..3eca1cf8 100644 +index dc081b9e..a7125555 100644 --- a/ggml/src/ggml-cpu/ops.h +++ b/ggml/src/ggml-cpu/ops.h -@@ -71,6 +71,7 @@ void ggml_compute_forward_pool_2d_back(const struct 
ggml_compute_params * params +@@ -72,6 +72,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst); @@ -147,10 +147,10 @@ index 410a3720..3eca1cf8 100644 void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst); diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index 31750b6f..0fef9522 100644 +index 04ce764e..491acccb 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu -@@ -2246,6 +2246,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg +@@ -2223,6 +2223,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_PAD: ggml_cuda_op_pad(ctx, dst); break; @@ -160,7 +160,7 @@ index 31750b6f..0fef9522 100644 case GGML_OP_ARANGE: ggml_cuda_op_arange(ctx, dst); break; -@@ -3222,6 +3225,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g +@@ -3197,6 +3200,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_UPSCALE: return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST; case GGML_OP_PAD: @@ -233,7 +233,7 @@ index 8fd386b0..e2ededc3 100644 void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m -index 12886cd3..b2e95a66 100644 +index 425524d0..112abef6 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -341,6 +341,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte @@ -244,7 +244,7 @@ index 12886cd3..b2e95a66 100644 GGML_METAL_KERNEL_TYPE_ARANGE_F32, GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, -@@ -1020,6 +1021,7 @@ @implementation GGMLMetalClass +@@ -1277,6 +1278,7 @@ @implementation GGMLMetalClass GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true); @@ -252,7 +252,7 @@ index 12886cd3..b2e95a66 100644 GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true); -@@ -1384,6 +1386,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex +@@ -1647,6 +1649,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex case GGML_OP_POOL_2D: case GGML_OP_PAD: case GGML_OP_PAD_REFLECT_1D: @@ -260,7 +260,7 @@ index 12886cd3..b2e95a66 100644 case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_ARGSORT: case GGML_OP_LEAKY_RELU: -@@ -3731,6 +3734,36 @@ static void ggml_metal_encode_node( +@@ -4047,6 +4050,36 @@ static bool ggml_metal_encode_node( 
const int nth = MIN(1024, ne0); @@ -298,7 +298,7 @@ index 12886cd3..b2e95a66 100644 } break; case GGML_OP_ARANGE: diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal -index 8d6e99e6..71f0f97f 100644 +index 9f4147e9..6ceb3cef 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -2975,6 +2975,51 @@ kernel void kernel_pad_reflect_1d_f32( @@ -354,10 +354,10 @@ index 8d6e99e6..71f0f97f 100644 device char * dst, constant ggml_metal_kargs_arange & args, diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c -index 950772c7..2276b631 100644 +index 7654ae17..3c57aff8 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c -@@ -963,6 +963,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { +@@ -923,6 +923,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "UPSCALE", "PAD", "PAD_REFLECT_1D", @@ -365,16 +365,16 @@ index 950772c7..2276b631 100644 "ARANGE", "TIMESTEP_EMBEDDING", "ARGSORT", -@@ -993,7 +994,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { +@@ -953,7 +954,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "OPT_STEP_ADAMW", }; --static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81"); -+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); +-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); ++static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", -@@ -1057,6 +1058,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { +@@ -1018,6 +1019,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "upscale(x)", "pad(x)", "pad_reflect_1d(x)", @@ -382,16 +382,16 @@ index 950772c7..2276b631 100644 "arange(start, stop, step)", "timestep_embedding(timesteps, dim, max_period)", "argsort(x)", -@@ -1087,7 +1089,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { +@@ -1048,7 +1050,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "adamw(x)", }; --static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81"); -+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); +-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); ++static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); -@@ -4262,6 +4264,25 @@ struct ggml_tensor * ggml_pad_reflect_1d( +@@ -4270,6 +4272,25 @@ struct ggml_tensor * ggml_pad_reflect_1d( return result; } diff --git a/llama/patches/0008-fix-deepseek-deseret-regex.patch b/llama/patches/0008-fix-deepseek-deseret-regex.patch index 9b2d33984..5b4753bf8 100644 --- a/llama/patches/0008-fix-deepseek-deseret-regex.patch +++ b/llama/patches/0008-fix-deepseek-deseret-regex.patch @@ -12,7 +12,7 @@ regex 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp -index a35b498c..032019c9 100644 +index a9ee9f03..1306864e 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -296,7 +296,7 @@ struct llm_tokenizer_bpe : llm_tokenizer { diff --git a/llama/patches/0009-maintain-ordering-for-rules-for-grammar.patch b/llama/patches/0009-maintain-ordering-for-rules-for-grammar.patch index 5504b1d31..4c2192887 100644 --- a/llama/patches/0009-maintain-ordering-for-rules-for-grammar.patch +++ b/llama/patches/0009-maintain-ordering-for-rules-for-grammar.patch @@ -8,10 +8,10 @@ Subject: [PATCH] maintain ordering for rules for grammar 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp -index 90679822..56043678 100644 
+index 5b3059c2..656b3eca 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp -@@ -346,7 +346,7 @@ private: +@@ -349,7 +349,7 @@ private: friend std::string build_grammar(const std::function & cb, const common_grammar_options & options); std::function _fetch_json; bool _dotall; diff --git a/llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch b/llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch index c9d4e9ad8..e4b2a4081 100644 --- a/llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch +++ b/llama/patches/0010-ensure-KV-cache-is-fully-defragmented.patch @@ -22,10 +22,10 @@ multiple batches of processing until everything is complete. 4 files changed, 51 insertions(+), 106 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp -index 0343ba8a..4b3e6a83 100644 +index cd06ad91..77177c5e 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp -@@ -594,13 +594,12 @@ llm_graph_result_ptr llama_context::build_kv_self_shift( +@@ -583,13 +583,12 @@ llm_graph_result_ptr llama_context::build_kv_self_shift( llm_graph_result_ptr llama_context::build_kv_self_defrag( ggml_context * ctx0, @@ -41,7 +41,7 @@ index 0343ba8a..4b3e6a83 100644 #if 0 // CPU defrag // -@@ -672,32 +671,20 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag( +@@ -661,32 +660,20 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag( ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size()); } #else @@ -79,7 +79,7 @@ index 0343ba8a..4b3e6a83 100644 ggml_tensor * view_v_src; ggml_tensor * view_v_dst; -@@ -705,34 +692,30 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag( +@@ -694,34 +681,30 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag( if (cparams.flash_attn) { // NOTE: the V cache is not transposed when using flash attention view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il], @@ -122,7 +122,7 @@ index 0343ba8a..4b3e6a83 100644 #endif return res; -@@ -741,8 +724,6 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag( +@@ -730,8 +713,6 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag( void llama_context::kv_self_update() { auto & kv = kv_self; @@ -131,7 +131,7 @@ index 0343ba8a..4b3e6a83 100644 if (kv->has_shift) { if (!kv->get_can_shift()) { GGML_ABORT("The current context does not support K-shift"); -@@ -763,8 +744,6 @@ void llama_context::kv_self_update() { +@@ -752,8 +733,6 @@ void llama_context::kv_self_update() { res->set_inputs(nullptr); graph_compute(gf, false); @@ -140,7 +140,7 @@ index 0343ba8a..4b3e6a83 100644 } { -@@ -779,49 +758,28 @@ void llama_context::kv_self_update() { +@@ -768,49 +747,28 @@ void llama_context::kv_self_update() { // defragment the KV cache if needed if (kv->do_defrag) { LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__); @@ -202,7 +202,7 @@ index 0343ba8a..4b3e6a83 100644 } enum llama_pooling_type llama_context::pooling_type() const { -@@ -1305,9 +1263,12 @@ int llama_context::decode(llama_batch & inp_batch) { +@@ -1294,9 +1252,12 @@ int llama_context::decode(llama_batch & inp_batch) { // find KV slot { if (!kv_self->find_slot(ubatch)) { @@ -219,7 +219,7 @@ index 0343ba8a..4b3e6a83 100644 if (!kv_self->recurrent) { diff --git a/src/llama-context.h b/src/llama-context.h -index baa03276..a59ff8fd 100644 +index a50c4afa..30f84bfd 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -5,6 +5,7 @@ @@ -230,7 +230,7 @@ index baa03276..a59ff8fd 100644 #include "ggml-cpp.h" -@@ -180,7 +181,8 @@ private: +@@ -179,7 +180,8 @@ private: 
llm_graph_result_ptr build_kv_self_defrag( ggml_context * ctx0, diff --git a/llama/patches/0012-add-phony-target-ggml-cpu-for-all-cpu-variants.patch b/llama/patches/0012-add-phony-target-ggml-cpu-for-all-cpu-variants.patch index eaba3c4c8..6de840a60 100644 --- a/llama/patches/0012-add-phony-target-ggml-cpu-for-all-cpu-variants.patch +++ b/llama/patches/0012-add-phony-target-ggml-cpu-for-all-cpu-variants.patch @@ -8,10 +8,10 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants 1 file changed, 2 insertions(+) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt -index f00700da..91d6a7d5 100644 +index 43d9fc4f..4c0d3824 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt -@@ -278,6 +278,7 @@ function(ggml_add_cpu_backend_variant tag_name) +@@ -279,6 +279,7 @@ function(ggml_add_cpu_backend_variant tag_name) endforeach() ggml_add_cpu_backend_variant_impl(${tag_name}) @@ -19,11 +19,11 @@ index f00700da..91d6a7d5 100644 endfunction() ggml_add_backend(CPU) -@@ -286,6 +287,7 @@ if (GGML_CPU_ALL_VARIANTS) +@@ -287,6 +288,7 @@ if (GGML_CPU_ALL_VARIANTS) if (NOT GGML_BACKEND_DL) message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL") endif() + add_custom_target(ggml-cpu) - ggml_add_cpu_backend_variant(sandybridge AVX) - ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 BMI2 FMA) - ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 BMI2 FMA AVX512) + ggml_add_cpu_backend_variant(x64) + ggml_add_cpu_backend_variant(sse42 SSE42) + ggml_add_cpu_backend_variant(sandybridge SSE42 AVX) diff --git a/llama/patches/0013-remove-amx.patch b/llama/patches/0013-remove-amx.patch index 0bbc0a3a3..c27032372 100644 --- a/llama/patches/0013-remove-amx.patch +++ b/llama/patches/0013-remove-amx.patch @@ -1,6 +1,6 @@ From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 From: jmorganca -Date: Tue, 8 Apr 2025 20:33:01 -0700 +Date: Thu, 1 May 2025 15:05:08 -0700 Subject: [PATCH] remove amx disable amx as it reduces performance on some systems @@ -9,16 +9,16 @@ disable amx as it reduces performance on some systems 1 file changed, 4 deletions(-) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt -index 91d6a7d5..d6b393a2 100644 +index 4c0d3824..79c26312 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt -@@ -293,10 +293,6 @@ if (GGML_CPU_ALL_VARIANTS) - ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 BMI2 FMA AVX512) - ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI) - ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 BMI2 FMA AVX_VNNI) +@@ -296,10 +296,6 @@ if (GGML_CPU_ALL_VARIANTS) + ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512) + ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI) + ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI) - if (NOT MSVC) - # MSVC doesn't support AMX -- ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8) +- ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8) - endif() elseif (GGML_CPU) ggml_add_cpu_backend_variant_impl("") diff --git a/llama/patches/0014-fix-string-arr-kv-loading.patch b/llama/patches/0014-fix-string-arr-kv-loading.patch index 01f1b71eb..5d94ca2c8 100644 --- a/llama/patches/0014-fix-string-arr-kv-loading.patch +++ b/llama/patches/0014-fix-string-arr-kv-loading.patch @@ -53,7 
+53,7 @@ index 381a9c7d..e45b453d 100644 } diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp -index 032019c9..ba37df35 100644 +index 1306864e..d6515ff6 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1459,7 +1459,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { diff --git a/llama/patches/0015-ollama-debug-tensor.patch b/llama/patches/0015-ollama-debug-tensor.patch index a192bdea8..79d997c75 100644 --- a/llama/patches/0015-ollama-debug-tensor.patch +++ b/llama/patches/0015-ollama-debug-tensor.patch @@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor 1 file changed, 6 insertions(+) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c -index 432942bf..6d4abe4c 100644 +index 34624cca..59bd3c62 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -15,6 +15,8 @@ @@ -20,7 +20,7 @@ index 432942bf..6d4abe4c 100644 #if defined(_MSC_VER) || defined(__MINGW32__) #include // using malloc.h with MSC/MINGW #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) -@@ -2854,6 +2856,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { +@@ -2859,6 +2861,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { ggml_compute_forward(¶ms, node); diff --git a/llama/patches/0016-add-model-quantizations.patch b/llama/patches/0016-add-model-quantizations.patch deleted file mode 100644 index 3d078b03f..000000000 --- a/llama/patches/0016-add-model-quantizations.patch +++ /dev/null @@ -1,96 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: jmorganca -Date: Tue, 8 Apr 2025 20:39:32 -0700 -Subject: [PATCH] add model quantizations - -a temporary patch to add model quantization for -models not supported in llama.cpp ---- - src/llama-arch.cpp | 17 +++++++++++++++++ - src/llama-arch.h | 1 + - src/llama-model.cpp | 2 ++ - src/llama-quant.cpp | 4 ++++ - 4 files changed, 24 insertions(+) - -diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp -index 0568565f..dd01df60 100644 ---- a/src/llama-arch.cpp -+++ b/src/llama-arch.cpp -@@ -73,6 +73,7 @@ static const std::map LLM_ARCH_NAMES = { - { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" }, - { LLM_ARCH_PLM, "plm" }, - { LLM_ARCH_BAILINGMOE, "bailingmoe" }, -+ { LLM_ARCH_MISTRAL3, "mistral3" }, - { LLM_ARCH_UNKNOWN, "(unknown)" }, - }; - -@@ -1586,6 +1587,22 @@ static const std::map> LLM_TENSOR_N - { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, - }, - }, -+ { -+ LLM_ARCH_MISTRAL3, -+ { -+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, -+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, -+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, -+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, -+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, -+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, -+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, -+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, -+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, -+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, -+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, -+ } -+ }, - { - LLM_ARCH_UNKNOWN, - { -diff --git a/src/llama-arch.h b/src/llama-arch.h -index 6a989034..b6227eeb 100644 ---- a/src/llama-arch.h -+++ b/src/llama-arch.h -@@ -75,6 +75,7 @@ enum llm_arch { - LLM_ARCH_CHAMELEON, - LLM_ARCH_SOLAR, - LLM_ARCH_WAVTOKENIZER_DEC, -+ LLM_ARCH_MISTRAL3, - LLM_ARCH_PLM, - LLM_ARCH_BAILINGMOE, - LLM_ARCH_UNKNOWN, -diff --git a/src/llama-model.cpp b/src/llama-model.cpp -index d051696c..c8374159 100644 ---- a/src/llama-model.cpp -+++ b/src/llama-model.cpp -@@ -1425,6 +1425,7 @@ void 
llama_model::load_hparams(llama_model_loader & ml) { - default: type = LLM_TYPE_UNKNOWN; - } - } break; -+ case LLM_ARCH_MISTRAL3: break; - default: throw std::runtime_error("unsupported model architecture"); - } - -@@ -13704,6 +13705,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { - case LLM_ARCH_CHAMELEON: - case LLM_ARCH_SOLAR: - case LLM_ARCH_BAILINGMOE: -+ case LLM_ARCH_MISTRAL3: - return LLAMA_ROPE_TYPE_NORM; - - // the pairs of head values are offset by n_rot/2 -diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp -index 223e1f3f..8ae6dde8 100644 ---- a/src/llama-quant.cpp -+++ b/src/llama-quant.cpp -@@ -744,6 +744,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: - // This used to be a regex, but has an extreme cost to compile times. - bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'? - -+ // don't quantize vision stuff -+ quantize &= name.find("v.") == std::string::npos; -+ quantize &= name.find("mm.") == std::string::npos; -+ - // quantize only 2D and 3D tensors (experts) - quantize &= (ggml_n_dims(tensor) >= 2); - diff --git a/llama/patches/0021-add-ollama-vocab-for-grammar-support.patch b/llama/patches/0016-add-ollama-vocab-for-grammar-support.patch similarity index 98% rename from llama/patches/0021-add-ollama-vocab-for-grammar-support.patch rename to llama/patches/0016-add-ollama-vocab-for-grammar-support.patch index 6193b755f..26a91ad9a 100644 --- a/llama/patches/0021-add-ollama-vocab-for-grammar-support.patch +++ b/llama/patches/0016-add-ollama-vocab-for-grammar-support.patch @@ -184,10 +184,10 @@ index f8c291de..2a3a62db 100644 const char * grammar_root, bool lazy, diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp -index d1497985..b1a9dca3 100644 +index c0a5f934..75731053 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp -@@ -1465,7 +1465,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) { +@@ -1466,7 +1466,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) { trigger_patterns_c.push_back(trigger_pattern.pattern.c_str()); } @@ -196,7 +196,7 @@ index d1497985..b1a9dca3 100644 ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(), ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size()); -@@ -1547,7 +1547,7 @@ static struct llama_sampler * llama_sampler_init_grammar_impl( +@@ -1548,7 +1548,7 @@ static struct llama_sampler * llama_sampler_init_grammar_impl( /* .vocab = */ vocab, /* .grammar_str = */ grammar_str, /* .grammar_root = */ grammar_root, diff --git a/llama/patches/0017-ggml-Don-t-assert-fail-when-tensor-data-changes-1322.patch b/llama/patches/0017-ggml-Don-t-assert-fail-when-tensor-data-changes-1322.patch new file mode 100644 index 000000000..b3424c9ef --- /dev/null +++ b/llama/patches/0017-ggml-Don-t-assert-fail-when-tensor-data-changes-1322.patch @@ -0,0 +1,38 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Jesse Gross +Date: Thu, 1 May 2025 13:46:10 -0700 +Subject: [PATCH] ggml: Don't assert fail when tensor data changes (#13222) + +The following scenario will cause an assertion failure in the graph +allocator: + - Build and allocate a graph containing a tensor with a non-NULL data + pointer + - Build and allocate a new graph where that data is NULL + +Result: +ggml-alloc.c:819: GGML_ASSERT(talloc->buffer_id >= 0) failed + +This happens during revalidation because we think that memory should +have been previously allocated based on 
the current graph but in +reality the previous graph was different. In this situation, we +should do a full reallocation pass. +--- + ggml/src/ggml-alloc.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c +index a3d3f690..5fd379f6 100644 +--- a/ggml/src/ggml-alloc.c ++++ b/ggml/src/ggml-alloc.c +@@ -816,7 +816,10 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * + static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) { + size_t node_size = 0; + if (!node->data && !node->view_src) { +- GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API ++ // If we previously had data but don't now then reallocate ++ if (talloc->buffer_id < 0) { ++ return false; ++ } + node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node); + } + return talloc->size_max >= node_size; diff --git a/llama/sampling_ext.cpp b/llama/sampling_ext.cpp index 6a025c906..78b889bd7 100644 --- a/llama/sampling_ext.cpp +++ b/llama/sampling_ext.cpp @@ -19,9 +19,6 @@ struct common_sampler *common_sampler_cinit(const struct llama_model *model, str sparams.penalty_repeat = params->penalty_repeat; sparams.penalty_freq = params->penalty_freq; sparams.penalty_present = params->penalty_present; - sparams.mirostat = params->mirostat; - sparams.mirostat_tau = params->mirostat_tau; - sparams.mirostat_eta = params->mirostat_eta; sparams.seed = params->seed; sparams.grammar = params->grammar; sparams.xtc_probability = 0.0; diff --git a/llama/sampling_ext.h b/llama/sampling_ext.h index a9e610ba2..3302e6efa 100644 --- a/llama/sampling_ext.h +++ b/llama/sampling_ext.h @@ -20,9 +20,6 @@ extern "C" float penalty_repeat; float penalty_freq; float penalty_present; - int32_t mirostat; - float mirostat_tau; - float mirostat_eta; uint32_t seed; char *grammar; }; diff --git a/llm/llm_windows.go b/llm/llm_windows.go index 915355a25..a350979ab 100644 --- a/llm/llm_windows.go +++ b/llm/llm_windows.go @@ -7,6 +7,7 @@ import ( const ( CREATE_DEFAULT_ERROR_MODE = 0x04000000 ABOVE_NORMAL_PRIORITY_CLASS = 0x00008000 + CREATE_NO_WINDOW = 0x08000000 ) var LlamaServerSysProcAttr = &syscall.SysProcAttr{ @@ -18,5 +19,5 @@ var LlamaServerSysProcAttr = &syscall.SysProcAttr{ // // Setting Above Normal priority class ensures when running as a "background service" // with "programs" given best priority, we aren't starved of cpu cycles - CreationFlags: CREATE_DEFAULT_ERROR_MODE | ABOVE_NORMAL_PRIORITY_CLASS, + CreationFlags: CREATE_DEFAULT_ERROR_MODE | ABOVE_NORMAL_PRIORITY_CLASS | CREATE_NO_WINDOW, } diff --git a/llm/memory_test.go b/llm/memory_test.go index 213784a02..1d4f7a98c 100644 --- a/llm/memory_test.go +++ b/llm/memory_test.go @@ -25,7 +25,7 @@ func TestEstimateGPULayers(t *testing.T) { defer f.Close() inputLayerCount := 5 - tensors := []ggml.Tensor{ + tensors := []*ggml.Tensor{ {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, {Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, {Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, diff --git a/llm/server.go b/llm/server.go index 82b514b2c..8884d105a 100644 --- a/llm/server.go +++ b/llm/server.go @@ -44,6 +44,7 @@ type LlamaServer 
interface { EstimatedVRAM() uint64 // Total VRAM across all GPUs EstimatedTotal() uint64 EstimatedVRAMByGPU(gpuID string) uint64 + Pid() int } // llmServer is an instance of the llama.cpp server @@ -216,10 +217,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a params = append(params, "--no-mmap") } - if opts.UseMLock { - params = append(params, "--mlock") - } - // TODO - NUMA support currently doesn't work properly params = append(params, "--parallel", strconv.Itoa(numParallel)) @@ -289,7 +286,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a params = append(params, "--mmproj", projectors[0]) } - // iterate through compatible GPU libraries such as 'cuda_v12', 'cuda_v11', 'rocm', etc. + // iterate through compatible GPU libraries such as 'cuda_v12', 'rocm', etc. // adding each library's respective path to the LD_LIBRARY_PATH, until finally running // without any LD_LIBRARY_PATH flags for { @@ -324,21 +321,23 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a pathEnv = "LD_LIBRARY_PATH" } - var libraryPaths []string + // Note: we always put our dependency paths first + // since these are the exact version we compiled/linked against + libraryPaths := []string{discover.LibOllamaPath} if libraryPath, ok := os.LookupEnv(pathEnv); ok { libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...) } + ggmlPaths := []string{discover.LibOllamaPath} if len(compatible) > 0 { c := compatible[0] if libpath, ok := libs[c]; ok { slog.Debug("adding gpu library", "path", libpath) - libraryPaths = append(libraryPaths, libpath) + libraryPaths = append([]string{libpath}, libraryPaths...) + ggmlPaths = append(ggmlPaths, libpath) } } - // Note: we always put the dependency path first - // since this was the exact version we compiled/linked against if gpus[0].DependencyPath != nil { slog.Debug("adding gpu dependency paths", "paths", gpus[0].DependencyPath) // assume gpus from the same library have the same dependency path @@ -369,6 +368,8 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a s.cmd.Stderr = s.status s.cmd.SysProcAttr = LlamaServerSysProcAttr + s.cmd.Env = append(s.cmd.Env, "OLLAMA_LIBRARY_PATH="+strings.Join(ggmlPaths, string(filepath.ListSeparator))) + envWorkarounds := [][2]string{} for _, gpu := range gpus { envWorkarounds = append(envWorkarounds, gpu.EnvWorkarounds...) 
@@ -406,7 +407,8 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a if envconfig.Debug() { filteredEnv := []string{} for _, ev := range s.cmd.Env { - if strings.HasPrefix(ev, "CUDA_") || + if strings.HasPrefix(ev, "OLLAMA_") || + strings.HasPrefix(ev, "CUDA_") || strings.HasPrefix(ev, "ROCR_") || strings.HasPrefix(ev, "ROCM_") || strings.HasPrefix(ev, "HIP_") || @@ -515,6 +517,9 @@ func (s *llmServer) getServerStatus(ctx context.Context) (ServerStatus, error) { if errors.Is(err, context.DeadlineExceeded) { return ServerStatusNotResponding, errors.New("server not responding") } + if strings.Contains(err.Error(), "connection refused") { + return ServerStatusNotResponding, errors.New("connection refused") + } return ServerStatusError, fmt.Errorf("health resp: %w", err) } defer resp.Body.Close() @@ -635,6 +640,13 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error { } } +func (s *llmServer) Pid() int { + if s.cmd != nil && s.cmd.Process != nil { + return s.cmd.Process.Pid + } + return -1 +} + var grammarJSON = ` root ::= object value ::= object | array | string | number | ("true" | "false" | "null") ws @@ -998,17 +1010,17 @@ func (s *llmServer) Close() error { s.llamaModelLock.Unlock() if s.cmd != nil { - slog.Debug("stopping llama server") + slog.Debug("stopping llama server", "pid", s.Pid()) if err := s.cmd.Process.Kill(); err != nil { return err } // if ProcessState is already populated, Wait already completed, no need to wait again if s.cmd.ProcessState == nil { - slog.Debug("waiting for llama server to exit") + slog.Debug("waiting for llama server to exit", "pid", s.Pid()) <-s.done } - slog.Debug("llama server stopped") + slog.Debug("llama server stopped", "pid", s.Pid()) } return nil diff --git a/llm/server_test.go b/llm/server_test.go index 6c8f7590b..b6a8705e5 100644 --- a/llm/server_test.go +++ b/llm/server_test.go @@ -16,7 +16,7 @@ func TestLLMServerCompletionFormat(t *testing.T) { // of a mess, and but it's good enough, until we can refactoring the // Completion method to be more testable. 
- ctx, cancel := context.WithCancel(context.Background()) + ctx, cancel := context.WithCancel(t.Context()) s := &llmServer{ sem: semaphore.NewWeighted(1), // required to prevent nil panic } diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index 177ac6fd0..aace1335d 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -312,6 +312,7 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, g, ctx := errgroup.WithContext(ctx) g.SetLimit(runtime.GOMAXPROCS(0)) for _, t := range meta.Tensors().Items() { + t := t g.Go(func() error { tts := make([]*C.struct_ggml_tensor, max(1, len(targets[t.Name]))) for i := range tts { @@ -341,6 +342,11 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, var s uint64 for s < t.Size() { + // Stop if either the parent context has been canceled or if any of the other tensors returned an error + if err := ctx.Err(); err != nil { + return err + } + n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Size()-s))]) if err != nil { slog.Warn("file read error", "file", r.Name(), "error", err) @@ -363,14 +369,6 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, }) } - // start a goroutine to cancel the errgroup if the parent context is done - go func() { - <-ctx.Done() - g.Go(func() error { - return ctx.Err() - }) - }() - if err := g.Wait(); err != nil { return nil, err } diff --git a/ml/backend/ggml/ggml/include/ggml-cpu.h b/ml/backend/ggml/ggml/include/ggml-cpu.h index f5e11f1e1..de77a875e 100644 --- a/ml/backend/ggml/ggml/include/ggml-cpu.h +++ b/ml/backend/ggml/ggml/include/ggml-cpu.h @@ -133,6 +133,11 @@ extern "C" { GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void); + GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t); + GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t); + GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t); + GGML_BACKEND_API void ggml_cpu_bf16_to_fp32(const ggml_bf16_t *, float *, int64_t); + #ifdef __cplusplus } #endif diff --git a/ml/backend/ggml/ggml/include/ggml-rpc.h b/ml/backend/ggml/ggml/include/ggml-rpc.h index c8b6097f7..1e6741127 100644 --- a/ml/backend/ggml/ggml/include/ggml-rpc.h +++ b/ml/backend/ggml/ggml/include/ggml-rpc.h @@ -7,7 +7,7 @@ extern "C" { #endif -#define RPC_PROTO_MAJOR_VERSION 1 +#define RPC_PROTO_MAJOR_VERSION 2 #define RPC_PROTO_MINOR_VERSION 0 #define RPC_PROTO_PATCH_VERSION 0 #define GGML_RPC_MAX_SERVERS 16 diff --git a/ml/backend/ggml/ggml/include/ggml.h b/ml/backend/ggml/ggml/include/ggml.h index d19fc1678..53ef31b22 100644 --- a/ml/backend/ggml/ggml/include/ggml.h +++ b/ml/backend/ggml/ggml/include/ggml.h @@ -393,8 +393,8 @@ extern "C" { // precision enum ggml_prec { - GGML_PREC_DEFAULT, - GGML_PREC_F32, + GGML_PREC_DEFAULT = 0, // stored as ggml_tensor.op_params, 0 by default + GGML_PREC_F32 = 10, }; // model file types @@ -481,6 +481,7 @@ extern "C" { GGML_OP_CONV_TRANSPOSE_1D, GGML_OP_IM2COL, GGML_OP_IM2COL_BACK, + GGML_OP_CONV_2D_DW, GGML_OP_CONV_TRANSPOSE_2D, GGML_OP_POOL_1D, GGML_OP_POOL_2D, @@ -678,6 +679,9 @@ extern "C" { GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1 GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2 + // true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN + GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor); 
+ GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1); GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1); @@ -1661,7 +1665,7 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); - // depthwise + // depthwise (via im2col and mul_mat) GGML_API struct ggml_tensor * ggml_conv_2d_dw( struct ggml_context * ctx, struct ggml_tensor * a, // convolution kernel @@ -1673,6 +1677,22 @@ extern "C" { int d0, // dilation dimension 0 int d1); // dilation dimension 1 + // Depthwise 2D convolution + // may be faster than ggml_conv_2d_dw, but not available in all backends + // a: KW KH 1 C convolution kernel + // b: W H C N input data + // res: W_out H_out C N + GGML_API struct ggml_tensor * ggml_conv_2d_dw_direct( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int stride0, + int stride1, + int pad0, + int pad1, + int dilation0, + int dilation1); + GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0( struct ggml_context * ctx, struct ggml_tensor * a, diff --git a/ml/backend/ggml/ggml/src/CMakeLists.txt b/ml/backend/ggml/ggml/src/CMakeLists.txt index d6b393a21..79c26312f 100644 --- a/ml/backend/ggml/ggml/src/CMakeLists.txt +++ b/ml/backend/ggml/ggml/src/CMakeLists.txt @@ -267,6 +267,7 @@ function(ggml_add_cpu_backend_variant tag_name) set(GGML_CPU_TAG_NAME ${tag_name}) # other: OPENMP LLAMAFILE CPU_HBM foreach (feat NATIVE + SSE42 AVX AVX2 BMI2 AVX_VNNI FMA F16C AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8 AMX_BF16) @@ -288,11 +289,13 @@ if (GGML_CPU_ALL_VARIANTS) message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL") endif() add_custom_target(ggml-cpu) - ggml_add_cpu_backend_variant(sandybridge AVX) - ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 BMI2 FMA) - ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 BMI2 FMA AVX512) - ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI) - ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 BMI2 FMA AVX_VNNI) + ggml_add_cpu_backend_variant(x64) + ggml_add_cpu_backend_variant(sse42 SSE42) + ggml_add_cpu_backend_variant(sandybridge SSE42 AVX) + ggml_add_cpu_backend_variant(haswell SSE42 AVX F16C AVX2 BMI2 FMA) + ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512) + ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI) + ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI) elseif (GGML_CPU) ggml_add_cpu_backend_variant_impl("") endif() diff --git a/ml/backend/ggml/ggml/src/ggml-alloc.c b/ml/backend/ggml/ggml/src/ggml-alloc.c index a3d3f6901..5fd379f6a 100644 --- a/ml/backend/ggml/ggml/src/ggml-alloc.c +++ b/ml/backend/ggml/ggml/src/ggml-alloc.c @@ -816,7 +816,10 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) { size_t node_size = 0; if (!node->data && !node->view_src) { - GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API + // If we previously had data but don't now then reallocate + if (talloc->buffer_id < 0) { + return false; + } node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node); } return talloc->size_max >= node_size; diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/CMakeLists.txt b/ml/backend/ggml/ggml/src/ggml-cpu/CMakeLists.txt index 
e73a3b69b..9a3085bef 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ml/backend/ggml/ggml/src/ggml-cpu/CMakeLists.txt @@ -222,7 +222,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name) elseif (GGML_AVX) list(APPEND ARCH_FLAGS /arch:AVX) list(APPEND ARCH_DEFINITIONS GGML_AVX) - else () + elseif (GGML_SSE42) list(APPEND ARCH_FLAGS /arch:SSE4.2) list(APPEND ARCH_DEFINITIONS GGML_SSE42) endif() @@ -237,8 +237,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name) if (GGML_NATIVE) list(APPEND ARCH_FLAGS -march=native) else () - list(APPEND ARCH_FLAGS -msse4.2) - list(APPEND ARCH_DEFINITIONS GGML_SSE42) + if (GGML_SSE42) + list(APPEND ARCH_FLAGS -msse4.2) + list(APPEND ARCH_DEFINITIONS GGML_SSE42) + endif() if (GGML_F16C) list(APPEND ARCH_FLAGS -mf16c) list(APPEND ARCH_DEFINITIONS GGML_F16C) @@ -350,10 +352,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name) # TODO: Separation to determine activation of VX/VXE/VXE2 if (${S390X_M} MATCHES "8561|8562") message(STATUS "z15 target") - list(APPEND ARCH_FLAGS -march=z15 -mtune=z15) + list(APPEND ARCH_FLAGS -march=z15) elseif (${S390X_M} MATCHES "3931") message(STATUS "z16 target") - list(APPEND ARCH_FLAGS -march=z16 -mtune=z16) + list(APPEND ARCH_FLAGS -march=z16) + elseif (${S390X_M} MATCHES "9175|9176") + # NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version. + message(STATUS "z17 target") + list(APPEND ARCH_FLAGS -march=z17) else() message(STATUS "Unknown target") message(WARNING "Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF.") diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/cpu-feats-x86.cpp b/ml/backend/ggml/ggml/src/ggml-cpu/cpu-feats-x86.cpp index 902ee4346..d775a0363 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/cpu-feats-x86.cpp +++ b/ml/backend/ggml/ggml/src/ggml-cpu/cpu-feats-x86.cpp @@ -263,7 +263,7 @@ void test_x86_is() { static int ggml_backend_cpu_x86_score() { // FIXME: this does not check for OS support - int score = 0; + int score = 1; cpuid_x86 is; #ifdef GGML_FMA diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c index 6d4abe4c7..59bd3c621 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c @@ -217,7 +217,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { .nrows = 1, }, [GGML_TYPE_F16] = { - .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row, + .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_fp16, .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16, .vec_dot_type = GGML_TYPE_F16, .nrows = 1, @@ -358,7 +358,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { .from_float = quantize_row_q8_K, }, [GGML_TYPE_BF16] = { - .from_float = (ggml_from_float_t) ggml_fp32_to_bf16_row, + .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_bf16, .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16, .vec_dot_type = GGML_TYPE_BF16, .nrows = 1, @@ -1934,6 +1934,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_im2col_back_f32(params, tensor); } break; + case GGML_OP_CONV_2D_DW: + { + ggml_compute_forward_conv_2d_dw(params, tensor); + } break; case GGML_OP_CONV_TRANSPOSE_2D: { ggml_compute_forward_conv_transpose_2d(params, tensor); @@ -2274,6 +2278,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { } break; case GGML_OP_IM2COL: case GGML_OP_IM2COL_BACK: 
+ case GGML_OP_CONV_2D_DW: case GGML_OP_CONV_TRANSPOSE_1D: case GGML_OP_CONV_TRANSPOSE_2D: { @@ -3172,6 +3177,93 @@ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct g return ggml_graph_compute(cgraph, &cplan); } +void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) { + int64_t i = 0; +#if defined(__F16C__) +#if defined(__AVX512F__) + for (; i + 15 < n; i += 16) { + __m512 x_vec = _mm512_loadu_ps(x + i); + __m256i y_vec = _mm512_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); + _mm256_storeu_si256((__m256i *)(y + i), y_vec); + } +#endif + for (; i + 7 < n; i += 8) { + __m256 x_vec = _mm256_loadu_ps(x + i); + __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); + _mm_storeu_si128((__m128i *)(y + i), y_vec); + } + for (; i + 3 < n; i += 4) { + __m128 x_vec = _mm_loadu_ps(x + i); + __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); + _mm_storel_epi64((__m128i *)(y + i), y_vec); + } +#endif + for (; i < n; ++i) { + y[i] = GGML_FP32_TO_FP16(x[i]); + } +} + +void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) { + int64_t i = 0; +#if defined(__F16C__) +#if defined(__AVX512F__) + for (; i + 15 < n; i += 16) { + __m256i x_vec = _mm256_loadu_si256((const __m256i *)(x + i)); + __m512 y_vec = _mm512_cvtph_ps(x_vec); + _mm512_storeu_ps(y + i, y_vec); + } +#endif + for (; i + 7 < n; i += 8) { + __m128i x_vec = _mm_loadu_si128((const __m128i *)(x + i)); + __m256 y_vec = _mm256_cvtph_ps(x_vec); + _mm256_storeu_ps(y + i, y_vec); + } + for (; i + 3 < n; i += 4) { + __m128i x_vec = _mm_loadl_epi64((const __m128i *)(x + i)); + __m128 y_vec = _mm_cvtph_ps(x_vec); + _mm_storeu_ps(y + i, y_vec); + } +#endif + for (; i < n; ++i) { + y[i] = GGML_FP16_TO_FP32(x[i]); + } +} + +void ggml_cpu_fp32_to_bf16(const float * x, ggml_bf16_t * y, int64_t n) { + int64_t i = 0; + for (; i < n; ++i) { + y[i] = GGML_FP32_TO_BF16(x[i]); + } +} + +void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) { + int64_t i = 0; +#if defined(__AVX2__) +#if defined(__AVX512F__) + for (; i + 15 < n; i += 16) { + _mm512_storeu_ps(y + i, + _mm512_castsi512_ps( + _mm512_slli_epi32( + _mm512_cvtepu16_epi32( + _mm256_loadu_si256( + (const __m256i *)(x + i))), + 16))); + } +#endif + for (; i + 7 < n; i += 8) { + _mm256_storeu_ps(y + i, + _mm256_castsi256_ps( + _mm256_slli_epi32( + _mm256_cvtepu16_epi32( + _mm_loadu_si128( + (const __m128i *)(x + i))), + 16))); + } +#endif + for (; i < n; i++) { + y[i] = GGML_BF16_TO_FP32(x[i]); + } +} int ggml_cpu_has_avx(void) { #if defined(__AVX__) diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp b/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp index 66b8da68f..becdae075 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp @@ -4222,7 +4222,7 @@ static void ggml_compute_forward_get_rows_f16( GGML_ASSERT(i01 >= 0 && i01 < ne01); - ggml_fp16_to_fp32_row( + ggml_cpu_fp16_to_fp32( (const ggml_fp16_t*) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); } @@ -4263,7 +4263,7 @@ static void ggml_compute_forward_get_rows_bf16( GGML_ASSERT(i01 >= 0 && i01 < ne01); - ggml_bf16_to_fp32_row( + ggml_cpu_bf16_to_fp32( (const ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); } @@ -6064,6 +6064,178 @@ void ggml_compute_forward_conv_transpose_2d( } } +// ggml_compute_forward_conv_2d_dw + +struct ggml_conv_2d_dw_params 
{ + int64_t channels; + int64_t batch; + int64_t src_w; + int64_t src_h; + int64_t dst_w; + int64_t dst_h; + int64_t knl_w; + int64_t knl_h; + int stride_x; + int stride_y; + int pad_x; + int pad_y; + int dilation_x; + int dilation_y; +}; + +static void ggml_compute_forward_conv_2d_dw_cwhn( + const ggml_compute_params * params, + const ggml_tensor * src, + const ggml_tensor * kernel, + ggml_tensor * dst, + const ggml_conv_2d_dw_params & p) { + + const int64_t c = p.channels; + const float * knl_data = (const float *)kernel->data; + + const int64_t rows_total = p.dst_h * p.batch; + const int64_t rows_per_thread = (rows_total + params->nth - 1) / params->nth; + const int64_t row_start = params->ith * rows_per_thread; + const int64_t row_end = MIN(row_start + rows_per_thread, rows_total); + +#ifdef GGML_SIMD + const int64_t pkg_size = GGML_F32_EPR; + const int64_t pkg_count = c / pkg_size; + const int64_t c_pkg_end = pkg_count * pkg_size; +#else + const int64_t c_pkg_end = 0; +#endif + + for (int64_t row = row_start; row < row_end; ++row) { + const int64_t dst_y = row % p.dst_h; + const float * src_data = (const float *)src->data + (row / p.dst_h) * p.src_w * p.src_h * c; + for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) { + float * dst_data = (float *)dst->data + (row * p.dst_w + dst_x) * c; + const int64_t src_y_base = dst_y * p.stride_y - p.pad_y; + const int64_t src_x_base = dst_x * p.stride_x - p.pad_x; + +#ifdef GGML_SIMD + // Vectorized loop + for (int64_t c_i = 0; c_i < c_pkg_end; c_i += pkg_size) { + GGML_F32_VEC sum = GGML_F32_VEC_ZERO; + for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) { + const int64_t src_y = src_y_base + knl_y * p.dilation_y; + if (src_y < 0 || src_y >= p.src_h) { + continue; + } + for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) { + const int64_t src_x = src_x_base + knl_x * p.dilation_x; + if (src_x < 0 || src_x >= p.src_w) { + continue; + } + GGML_F32_VEC k = GGML_F32_VEC_LOAD(knl_data + (knl_y * p.knl_w + knl_x) * c + c_i); + GGML_F32_VEC s = GGML_F32_VEC_LOAD(src_data + (src_y * p.src_w + src_x) * c + c_i); + sum = GGML_F32_VEC_FMA(sum, k, s); + } + } + GGML_F32_VEC_STORE(dst_data + c_i, sum); + } +#endif + // Scalar loop + for (int64_t c_i = c_pkg_end; c_i < c; ++c_i) { + float sum = 0.0f; + for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) { + const int64_t src_y = src_y_base + knl_y * p.dilation_y; + if (src_y < 0 || src_y >= p.src_h) { + continue; + } + for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) { + const int64_t src_x = src_x_base + knl_x * p.dilation_x; + if (src_x < 0 || src_x >= p.src_w) { + continue; + } + sum += knl_data[(knl_y * p.knl_w + knl_x) * c + c_i] + * src_data[(src_y * p.src_w + src_x) * c + c_i]; + } + } + dst_data[c_i] = sum; + } + } + } +} + +static void ggml_compute_forward_conv_2d_dw_whcn( + const ggml_compute_params * params, + const ggml_tensor * src, + const ggml_tensor * kernel, + ggml_tensor * dst, + const ggml_conv_2d_dw_params & p) { + + const int64_t n = p.channels * p.batch; + const int64_t per_thread = (n + params->nth - 1) / params->nth; + const int64_t start = params->ith * per_thread; + const int64_t end = MIN(start + per_thread, n); + + for (int64_t i = start; i < end; ++i) { + const float * knl_data = (const float *)kernel->data + (i % p.channels) * p.knl_w * p.knl_h; + const float * src_data = (const float *)src->data + i * p.src_w * p.src_h; + float * dst_data = (float *)dst->data + i * p.dst_w * p.dst_h; + + for (int64_t dst_y = 0; dst_y < p.dst_h; ++dst_y) { + for (int64_t dst_x = 0; dst_x < p.dst_w; 
++dst_x) { + + float sum = 0.0f; + for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) { + const int64_t src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y; + if (src_y < 0 || src_y >= p.src_h) { + continue; + } + for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) { + const int64_t src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x; + if (src_x < 0 || src_x >= p.src_w) { + continue; + } + sum += knl_data[knl_y * p.knl_w + knl_x] + * src_data[src_y * p.src_w + src_x]; + } + } + dst_data[dst_y * p.dst_w + dst_x] = sum; + } + } + } +} + +void ggml_compute_forward_conv_2d_dw( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * kernel = dst->src[0]; + const ggml_tensor * src = dst->src[1]; + ggml_conv_2d_dw_params p; + p.channels = src->ne[2]; + p.batch = src->ne[3]; + p.src_w = src->ne[0]; + p.src_h = src->ne[1]; + p.dst_w = dst->ne[0]; + p.dst_h = dst->ne[1]; + p.knl_w = kernel->ne[0]; + p.knl_h = kernel->ne[1]; + p.stride_x = dst->op_params[0]; + p.stride_y = dst->op_params[1]; + p.pad_x = dst->op_params[2]; + p.pad_y = dst->op_params[3]; + p.dilation_x = dst->op_params[4]; + p.dilation_y = dst->op_params[5]; + + GGML_ASSERT(kernel->ne[3] == p.channels); + GGML_ASSERT(dst->ne[3] == p.batch); + + if (ggml_is_contiguous(src)) { + ggml_compute_forward_conv_2d_dw_whcn(params, src, kernel, dst, p); + } else if (ggml_is_contiguous_channels(src)) { + // kernel should also have channels most contiguous in memory + GGML_ASSERT(kernel->nb[0] >= kernel->nb[2] && kernel->nb[1] >= kernel->nb[0]); + ggml_compute_forward_conv_2d_dw_cwhn(params, src, kernel, dst, p); + } else { + GGML_ABORT("non-contiguous memory layout not supported"); + } +} + // ggml_compute_forward_pool_1d_sk_p0 static void ggml_compute_forward_pool_1d_sk_p0( diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/ops.h b/ml/backend/ggml/ggml/src/ggml-cpu/ops.h index 3eca1cf86..a7125555e 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/ops.h +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ops.h @@ -65,6 +65,7 @@ void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * p void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_conv_2d_dw(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_pool_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params, struct ggml_tensor * dst); diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/simd-mappings.h b/ml/backend/ggml/ggml/src/ggml-cpu/simd-mappings.h index 04d10cec2..45c31cf1f 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/simd-mappings.h +++ b/ml/backend/ggml/ggml/src/ggml-cpu/simd-mappings.h @@ -341,7 +341,7 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) { #define GGML_F32_EPR 4 #define GGML_F32x4 vector float -#define GGML_F32x4_ZERO 0.0f +#define GGML_F32x4_ZERO {0.0f} #define GGML_F32x4_SET1 vec_splats #define GGML_F32x4_LOAD(p) vec_xl(0, p) #define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p) diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh 
b/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh index 8284a0017..2ea014e64 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh @@ -78,13 +78,13 @@ // Moore Threads #define GGML_CUDA_MUSA_ARCH_IS_QY1 (__MUSA_ARCH__ <= 210) -#define GGML_CUDA_CC_QY1 (GGML_MUSA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000 -#define GGML_CUDA_CC_QY2 (GGML_MUSA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000 -#define GGML_CUDA_CC_NG (GGML_MUSA_CC_OFFSET_MTHREADS + 0x310) // TBD +#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000 +#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000 +#define GGML_CUDA_CC_NG (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD #define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS && cc < GGML_CUDA_CC_OFFSET_AMD) #define GGML_CUDA_CC_IS_QY1(cc) (cc >= GGML_CUDA_CC_QY1 && cc < GGML_CUDA_CC_QY2) -#define GGML_CUDA_CC_IS_QY2(cc) (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NEXT) +#define GGML_CUDA_CC_IS_QY2(cc) (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NG) #define GGML_CUDA_CC_IS_NG(cc) (cc >= GGML_CUDA_CC_NG) #ifdef __CUDA_ARCH_LIST__ diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/convert.cu b/ml/backend/ggml/ggml/src/ggml-cuda/convert.cu index a224ec0e1..c6dec4276 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/convert.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/convert.cu @@ -1,6 +1,8 @@ #include "convert.cuh" #include "dequantize.cuh" +#include + #define CUDA_Q8_0_NE_ALIGN 2048 template @@ -570,30 +572,46 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t } template -static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) { - const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x; +static __global__ void convert_unary( + const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01, const int64_t ne02, + const int64_t s01, const int64_t s02, const int64_t s03) { + const int64_t i00 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x; - if (i >= k) { + if (i00 >= ne00) { return; } + const int64_t i01 = blockIdx.y; + const int64_t i02 = blockIdx.z % ne02; + const int64_t i03 = blockIdx.z / ne02; + const src_t * x = (const src_t *) vx; - y[i] = float(x[i]); + const int64_t ix = i03*s03 + i02*s02 + i01*s01 + i00; + const int64_t iy = ((i03*ne02 + i02)*ne01 + i01)*ne00 + i00; + y[iy] = float(x[ix]); } template -static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) { - const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; - convert_unary<<>>(vx, y, k); +static void convert_unary_cuda(const void * vx, dst_t * y, + const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03, + const int64_t s01, const int64_t s02, const int64_t s03, cudaStream_t stream) { + const dim3 num_blocks((ne00 + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE, ne01, ne02*ne03); + convert_unary<<>> + (vx, y, ne00, ne01, ne02, s01, s02, s03); +} + +template +static void convert_unary_cont_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) { + convert_unary_cuda(vx, y, k, 1, 1, 1, k, k, k, stream); } to_bf16_cuda_t ggml_get_to_bf16_cuda(ggml_type type) { switch (type) { case GGML_TYPE_F32: - return convert_unary_cuda; + return convert_unary_cont_cuda; case GGML_TYPE_F16: - return convert_unary_cuda; + return 
convert_unary_cont_cuda; default: return nullptr; } @@ -643,9 +661,9 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { case GGML_TYPE_IQ3_S: return dequantize_row_iq3_s_cuda; case GGML_TYPE_F32: - return convert_unary_cuda; + return convert_unary_cont_cuda; case GGML_TYPE_BF16: - return convert_unary_cuda; + return convert_unary_cont_cuda; default: return nullptr; } @@ -692,7 +710,18 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { case GGML_TYPE_IQ3_S: return dequantize_row_iq3_s_cuda; case GGML_TYPE_F16: - return convert_unary_cuda; + return convert_unary_cont_cuda; + case GGML_TYPE_BF16: + return convert_unary_cont_cuda; + default: + return nullptr; + } +} + +to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type) { + switch (type) { + case GGML_TYPE_F32: + return convert_unary_cuda; case GGML_TYPE_BF16: return convert_unary_cuda; default: diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/convert.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/convert.cuh index 411a13cf1..b65b98e08 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/convert.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/convert.cuh @@ -3,7 +3,7 @@ #define CUDA_DEQUANTIZE_BLOCK_SIZE 256 template -using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream); +using to_t_cuda_t = void (*)(const void * x, T * y, int64_t k, cudaStream_t stream); typedef to_t_cuda_t to_fp32_cuda_t; typedef to_t_cuda_t to_fp16_cuda_t; @@ -14,3 +14,13 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type); to_bf16_cuda_t ggml_get_to_bf16_cuda(ggml_type type); to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type); + +// TODO more general support for non-contiguous inputs + +template +using to_t_nc_cuda_t = void (*)(const void * x, T * y, + int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03, + int64_t s01, int64_t s02, int64_t s03, cudaStream_t stream); + +typedef to_t_nc_cuda_t to_fp16_nc_cuda_t; +to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu b/ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu index ed25646e8..2d46176ea 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu @@ -639,6 +639,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) { ctx.cuda_graph->graph_cpynode_index = graph_cpynode_index; } +#else + GGML_UNUSED(disable_indirection_for_this_node); #endif } diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/getrows.cu b/ml/backend/ggml/ggml/src/ggml-cuda/getrows.cu index 4cef53a98..ea8bf6916 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/getrows.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/getrows.cu @@ -33,8 +33,8 @@ static __global__ void k_get_rows( dfloat2 v; dequantize_kernel(src0_row, ib, iqs, v); - dst_row[iybs + iqs + 0] = v.x; - dst_row[iybs + iqs + y_offset] = v.y; + dst_row[iybs + iqs + 0] = float(v.x); + dst_row[iybs + iqs + y_offset] = float(v.y); } template @@ -60,7 +60,7 @@ static __global__ void k_get_rows_float( dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3; const src0_t * src0_row = (const src0_t *)((const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03); - dst_row[i00] = src0_row[i00]; + dst_row[i00] = float(src0_row[i00]); } template @@ -86,120 +86,159 @@ static __global__ void k_get_rows_back_float( dst[dst_row*ncols + col] = sum; } -template -static void get_rows_cuda( - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - 
const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) { - - GGML_TENSOR_BINARY_OP_LOCALS - +template +static void get_rows_cuda_q( + const void * src0_d, const int32_t * src1_d, dst_t * dst_d, + const int64_t ne00, const size_t nb01, const size_t nb02, const size_t nb03, + const int64_t ne10, const int64_t ne11, const int64_t ne12, const size_t nb10, const size_t nb11, const size_t nb12, + const size_t nb1, const size_t nb2, const size_t nb3, + cudaStream_t stream) { const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1); const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE); const dim3 block_nums(block_num_x, ne10, ne11*ne12); // strides in elements - //const size_t s0 = nb0 / ggml_element_size(dst); - const size_t s1 = nb1 / ggml_element_size(dst); - const size_t s2 = nb2 / ggml_element_size(dst); - const size_t s3 = nb3 / ggml_element_size(dst); + // const size_t s0 = nb0 / sizeof(dst_t); + const size_t s1 = nb1 / sizeof(dst_t); + const size_t s2 = nb2 / sizeof(dst_t); + const size_t s3 = nb3 / sizeof(dst_t); - const size_t s10 = nb10 / ggml_element_size(src1); - const size_t s11 = nb11 / ggml_element_size(src1); - const size_t s12 = nb12 / ggml_element_size(src1); - //const size_t s13 = nb13 / ggml_element_size(src1); + const size_t s10 = nb10 / sizeof(int32_t); + const size_t s11 = nb11 / sizeof(int32_t); + const size_t s12 = nb12 / sizeof(int32_t); + // const size_t s13 = nb13 / sizeof(int32_t); GGML_ASSERT(ne00 % 2 == 0); k_get_rows<<>>( - src0_dd, src1_dd, dst_dd, + src0_d, src1_d, dst_d, ne00, /*ne01, ne02, ne03,*/ /*ne10, ne11,*/ ne12, /*ne13,*/ /* s0,*/ s1, s2, s3, /* nb00,*/ nb01, nb02, nb03, s10, s11, s12/*, s13*/); - - GGML_UNUSED(dst); } -template +template static void get_rows_cuda_float( - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) { - - GGML_TENSOR_BINARY_OP_LOCALS - - GGML_ASSERT(ne13 == 1); - + const src0_t * src0_d, const int32_t * src1_d, dst_t * dst_d, + const int64_t ne00, const size_t nb01, const size_t nb02, const size_t nb03, + const int64_t ne10, const int64_t ne11, const int64_t ne12, const size_t nb10, const size_t nb11, const size_t nb12, + const size_t nb1, const size_t nb2, const size_t nb3, + cudaStream_t stream) { const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1); const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE; const dim3 block_nums(block_num_x, ne10, ne11*ne12); // strides in elements - //const size_t s0 = nb0 / ggml_element_size(dst); - const size_t s1 = nb1 / ggml_element_size(dst); - const size_t s2 = nb2 / ggml_element_size(dst); - const size_t s3 = nb3 / ggml_element_size(dst); + // const size_t s0 = nb0 / sizeof(dst_t); + const size_t s1 = nb1 / sizeof(dst_t); + const size_t s2 = nb2 / sizeof(dst_t); + const size_t s3 = nb3 / sizeof(dst_t); - const size_t s10 = nb10 / ggml_element_size(src1); - const size_t s11 = nb11 / ggml_element_size(src1); - const size_t s12 = nb12 / ggml_element_size(src1); - //const size_t s13 = nb13 / ggml_element_size(src1); + const size_t s10 = nb10 / sizeof(int32_t); + const size_t s11 = nb11 / sizeof(int32_t); + const size_t s12 = nb12 / sizeof(int32_t); + // const size_t s13 = nb13 / sizeof(int32_t); k_get_rows_float<<>>( - src0_dd, src1_dd, dst_dd, + src0_d, src1_d, dst_d, ne00, /*ne01, ne02, ne03,*/ /*ne10, ne11,*/ ne12, /*ne13,*/ /* s0,*/ s1, s2, s3, /* nb00,*/ nb01, nb02, nb03, s10, s11, 
s12/*, s13*/); +} - GGML_UNUSED(dst); +template +static void ggml_cuda_get_rows_switch_src0_type( + const void * src0_d, const ggml_type src0_type, const int32_t * src1_d, dst_t * dst_d, + const int64_t ne00, const size_t nb01, const size_t nb02, const size_t nb03, + const int64_t ne10, const int64_t ne11, const int64_t ne12, const size_t nb10, const size_t nb11, const size_t nb12, + const size_t nb1, const size_t nb2, const size_t nb3, + cudaStream_t stream) { + switch (src0_type) { + case GGML_TYPE_F16: + get_rows_cuda_float((const half *) src0_d, src1_d, dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); + break; + case GGML_TYPE_F32: + get_rows_cuda_float((const float *) src0_d, src1_d, dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); + break; + case GGML_TYPE_BF16: + get_rows_cuda_float((const nv_bfloat16 *) src0_d, src1_d, dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); + break; + case GGML_TYPE_Q4_0: + get_rows_cuda_q(src0_d, src1_d, dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); + break; + case GGML_TYPE_Q4_1: + get_rows_cuda_q(src0_d, src1_d, dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); + break; + case GGML_TYPE_Q5_0: + get_rows_cuda_q(src0_d, src1_d, dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); + break; + case GGML_TYPE_Q5_1: + get_rows_cuda_q(src0_d, src1_d, dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); + break; + case GGML_TYPE_Q8_0: + get_rows_cuda_q(src0_d, src1_d, dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); + break; + default: + // TODO: k-quants + GGML_ABORT("%s: unsupported src0 type: %s\n", __func__, ggml_type_name(src0_type)); + break; + } +} + +void get_rows_cuda( + const void * src0_d, ggml_type src0_type, const int32_t * src1_d, void * dst_d, ggml_type dst_type, + int64_t ne00, size_t nb01, size_t nb02, size_t nb03, + int64_t ne10, int64_t ne11, int64_t ne12, size_t nb10, size_t nb11, size_t nb12, + size_t nb1, size_t nb2, size_t nb3, + cudaStream_t stream) { + switch (dst_type) { + case GGML_TYPE_F32: + ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (float *) dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); + break; + case GGML_TYPE_F16: + ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (half *) dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); + break; + case GGML_TYPE_BF16: + ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (nv_bfloat16 *) dst_d, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); + break; + default: + GGML_ABORT("%s: unsupported dst type: %s\n", __func__, ggml_type_name(dst_type)); + break; + } } void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; - const void * src0_d = (const void *) src0->data; - const int32_t * src1_d = (const int32_t *) src1->data; - float * dst_d = (float *) dst->data; - cudaStream_t stream = ctx.stream(); + GGML_TENSOR_BINARY_OP_LOCALS + GGML_ASSERT(src1->type == GGML_TYPE_I32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(ne13 == 1); GGML_ASSERT(src0->nb[0] == 
ggml_type_size(src0->type)); GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type)); GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type)); - switch (src0->type) { - case GGML_TYPE_F16: - get_rows_cuda_float(src0, src1, dst, (const half *) src0_d, src1_d, dst_d, stream); - break; - case GGML_TYPE_F32: - get_rows_cuda_float(src0, src1, dst, (const float *) src0_d, src1_d, dst_d, stream); - break; - case GGML_TYPE_Q4_0: - get_rows_cuda(src0, src1, dst, src0_d, src1_d, dst_d, stream); - break; - case GGML_TYPE_Q4_1: - get_rows_cuda(src0, src1, dst, src0_d, src1_d, dst_d, stream); - break; - case GGML_TYPE_Q5_0: - get_rows_cuda(src0, src1, dst, src0_d, src1_d, dst_d, stream); - break; - case GGML_TYPE_Q5_1: - get_rows_cuda(src0, src1, dst, src0_d, src1_d, dst_d, stream); - break; - case GGML_TYPE_Q8_0: - get_rows_cuda(src0, src1, dst, src0_d, src1_d, dst_d, stream); - break; - default: - // TODO: k-quants - GGML_ABORT("%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type)); - break; - } + get_rows_cuda(src0->data, src0->type, (const int32_t *) src1->data, dst->data, dst->type, + ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream); } void ggml_cuda_op_get_rows_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/getrows.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/getrows.cuh index a1ca643f1..3c5bea5f4 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/getrows.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/getrows.cuh @@ -3,6 +3,13 @@ #define CUDA_GET_ROWS_BLOCK_SIZE 256 #define CUDA_GET_ROWS_BACK_BLOCK_SIZE 256 +void get_rows_cuda( + const void * src0_d, ggml_type src0_type, const int32_t * src1_d, void * dst_d, ggml_type dst_type, + int64_t ne00, size_t nb01, size_t nb02, size_t nb03, + int64_t ne10, int64_t ne11, int64_t ne12, size_t nb10, size_t nb11, size_t nb12, + size_t nb1, size_t nb2, size_t nb3, + cudaStream_t stream); + void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_get_rows_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu index 0fef9522d..491acccb4 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu @@ -1413,6 +1413,11 @@ static void ggml_cuda_op_mul_mat( const int64_t ne0 = dst->ne[0]; const int64_t ne1 = dst->ne[1]; + // const int64_t nb10 = src1->nb[0]; + const int64_t nb11 = src1->nb[1]; + const int64_t nb12 = src1->nb[2]; + const int64_t nb13 = src1->nb[3]; + const int64_t nb2 = dst->nb[2]; const int64_t nb3 = dst->nb[3]; @@ -1548,7 +1553,10 @@ static void ggml_cuda_op_mul_mat( dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), src_1_ddq_size); if (src1_on_device && src1_is_contiguous) { - quantize_src1(dev[id].src1_ddf, dev[id].src1_ddq, ne10, ne11, ne12*ne13, src1_padded_col_size, src0->type, stream); + quantize_src1( + dev[id].src1_ddf, nullptr, dev[id].src1_ddq, src0->type, ne10, + nb11/sizeof(float), nb12/sizeof(float), nb13/sizeof(float), + src1_padded_col_size, ne11, ne12, ne13, stream); CUDA_CHECK(cudaGetLastError()); } } @@ -1643,7 +1651,9 @@ static void ggml_cuda_op_mul_mat( } if (quantize_src1 && !src1_is_contiguous) { - quantize_src1(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, 1, src1_padded_col_size, src0->type, stream); + quantize_src1( + src1_ddf_i, nullptr, src1_ddq_i, src0->type, ne10, ne10, ne11*ne10, ne12*ne11*ne10, + 
src1_padded_col_size, src1_ncols, 1, 1, stream); CUDA_CHECK(cudaGetLastError()); } @@ -1713,15 +1723,15 @@ static __global__ void k_compute_batched_ptrs( size_t nb12, size_t nb13, size_t nbd2, size_t nbd3, int64_t r2, int64_t r3) { - int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x; - int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y; + const int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x; + const int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y; if (i13 >= ne13 || i12 >= ne12) { return; } - int64_t i03 = i13 / r3; - int64_t i02 = i12 / r2; + const int64_t i03 = i13 / r3; + const int64_t i02 = i12 / r2; ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03; ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13; @@ -1735,6 +1745,10 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer)); GGML_ASSERT(src0->type == GGML_TYPE_F16); + // Byte offsets and tensor dimensions are currently used in an inconsistent way for dst. + // As long as dst is contiguous this does not matter though. + GGML_ASSERT(ggml_is_contiguous(dst)); + GGML_TENSOR_BINARY_OP_LOCALS const int64_t ne_dst = ggml_nelements(dst); @@ -1743,21 +1757,31 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(), main_stream)); - void * src0_ddq = src0->data; - half * src0_f16 = (half *) src0_ddq; - float * src1_ddf = (float *) src1->data; - float * dst_ddf = (float *) dst->data; + const half * src0_f16 = (const half *) src0->data; + float * dst_ddf = (float *) dst->data; + + const half * src1_f16 = (const half *) src1->data; + const size_t ts_src1 = ggml_type_size(src1->type); + GGML_ASSERT(nb10 == ts_src1); + int64_t s11 = nb11 / ts_src1; + int64_t s12 = nb12 / ts_src1; + int64_t s13 = nb13 / ts_src1; + ggml_cuda_pool_alloc src1_f16_alloc(ctx.pool()); // convert src1 to fp16 - ggml_cuda_pool_alloc src1_f16_alloc(ctx.pool()); if (src1->type != GGML_TYPE_F16) { - const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type); + const to_fp16_nc_cuda_t to_fp16_cuda = ggml_get_to_fp16_nc_cuda(src1->type); const int64_t ne_src1 = ggml_nelements(src1); src1_f16_alloc.alloc(ne_src1); GGML_ASSERT(to_fp16_cuda != nullptr); - to_fp16_cuda(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream); + + to_fp16_cuda(src1_f16, src1_f16_alloc.get(), ne10, ne11, ne12, ne13, s11, s12, s13, main_stream); + + src1_f16 = src1_f16_alloc.get(); + s11 = ne10; + s12 = ne11*s11; + s13 = ne12*s12; } - half * src1_f16 = src1->type == GGML_TYPE_F16 ? 
(half *) src1_ddf : src1_f16_alloc.get(); ggml_cuda_pool_alloc dst_f16(ctx.pool()); char * dst_t; @@ -1817,13 +1841,13 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co int i02 = i12 / r2; CUBLAS_CHECK( - cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N, - ne01, ne11, ne10, - alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half), - (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float), - beta, ( char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01, - cu_compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + cublasGemmEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, + ne01, ne11, ne10, + alpha, (const char *) src0_f16 + i03*nb03 + i02*nb02, CUDA_R_16F, nb01/sizeof(half), + src1_f16 + i13*s13 + i12*s12, CUDA_R_16F, s11, + beta, ( char *) dst_t + i13*nbd3 + i12*nbd2, cu_data_type, ne0, + cu_compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } } } @@ -1834,15 +1858,15 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co CUBLAS_CHECK( cublasGemmStridedBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, ne01, ne11, ne10, - alpha, (const char *) src0_f16, CUDA_R_16F, nb01/nb00, nb02/nb00, // strideA - (const char *) src1_f16, CUDA_R_16F, nb11/nb10, nb12/nb10, // strideB - beta, ( char *) dst_t, cu_data_type, ne01, nb2/nb0, // strideC + alpha, src0_f16, CUDA_R_16F, nb01/nb00, nb02/nb00, // strideA + src1_f16, CUDA_R_16F, s11, s12, // strideB + beta, dst_t, cu_data_type, ne0, ne1*ne0, // strideC ne12*ne13, cu_compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } else { // use cublasGemmBatchedEx - const int ne23 = ne12*ne13; + const int64_t ne23 = ne12*ne13; ggml_cuda_pool_alloc ptrs_src(ctx.pool(), 2*ne23); ggml_cuda_pool_alloc< void *> ptrs_dst(ctx.pool(), 1*ne23); @@ -1854,8 +1878,8 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co ne12, ne13, ne23, nb02, nb03, - src1->type == GGML_TYPE_F16 ? nb12 : nb12/2, - src1->type == GGML_TYPE_F16 ? nb13 : nb13/2, + src1->type == GGML_TYPE_F16 ? nb12 : s12*sizeof(half), + src1->type == GGML_TYPE_F16 ? 
nb13 : s13*sizeof(half), nbd2, nbd3, r2, r3); CUDA_CHECK(cudaGetLastError()); @@ -1864,8 +1888,8 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co cublasGemmBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, ne01, ne11, ne10, alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/nb00, - (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, nb11/nb10, - beta, ( void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne01, + (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, s11, + beta, ( void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne0, ne23, cu_compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); @@ -1881,7 +1905,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft); - bool use_mul_mat_vec = (src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16) + bool use_mul_mat_vec = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && src0->ne[0] % 2 == 0 && src1->ne[1] == 1; bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) @@ -1922,12 +1946,16 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name); //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name); - if (!split && use_mul_mat_vec && (src0->ne[1] < MMV_MAX_ROWS || any_gpus_without_fp16_mma)) { + if (!split && use_mul_mat_vec && (src0->ne[1] <= MMV_MAX_ROWS || any_gpus_without_fp16_mma)) { // the custom F16 vector kernel can be used over batched cuBLAS GEMM // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention) - ggml_cuda_mul_mat_vec(ctx, src0, src1, dst); - } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) - && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { + ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst); + } else if (!split && use_mul_mat_vec_q) { + ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst); + } else if (!split && use_mul_mat_q) { + ggml_cuda_mul_mat_q(ctx, src0, src1, nullptr, dst); + } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) && + !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { // general KQ + KQV multi-batch without FlashAttention ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst); } else if (use_mul_mat_vec) { @@ -1941,196 +1969,145 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor } } -struct mmid_row_mapping { - int32_t i1; - int32_t i2; -}; - -static __global__ void k_copy_src1_to_contiguous(const char * __restrict__ src1_original, char * __restrict__ src1_contiguous, - int * __restrict__ cur_src1_row, mmid_row_mapping * __restrict__ row_mapping, - const char * __restrict ids, int64_t i02, size_t ids_nb1, size_t ids_nb0, - int64_t ne11, int64_t ne10, - size_t nb11, size_t nb12) { - int32_t iid1 = blockIdx.x; - int32_t 
id = blockIdx.y; - - const int32_t row_id_i = *(const int32_t *) (ids + iid1*ids_nb1 + id*ids_nb0); - - if (row_id_i != i02) { - return; - } - - const int64_t i11 = id % ne11; - const int64_t i12 = iid1; - - __shared__ int src1_row; - if (threadIdx.x == 0) { - src1_row = atomicAdd(cur_src1_row, 1); - row_mapping[src1_row] = {id, iid1}; - } - __syncthreads(); - - const float * src1_row_original = (const float *)(src1_original + i11*nb11 + i12*nb12); - float * src1_row_contiguous = (float *)(src1_contiguous + src1_row*nb11); - - for (int i = threadIdx.x; i < ne10; i += blockDim.x) { - src1_row_contiguous[i] = src1_row_original[i]; - } -} - -static __global__ void k_copy_dst_from_contiguous(char * __restrict__ dst_original, const char * __restrict__ dst_contiguous, - const mmid_row_mapping * __restrict__ row_mapping, - int64_t ne0, - size_t nb1, size_t nb2) { - int32_t i = blockIdx.x; - - const int32_t i1 = row_mapping[i].i1; - const int32_t i2 = row_mapping[i].i2; - - const float * dst_row_contiguous = (const float *)(dst_contiguous + i*nb1); - float * dst_row_original = (float *)(dst_original + i1*nb1 + i2*nb2); - - for (int j = threadIdx.x; j < ne0; j += blockDim.x) { - dst_row_original[j] = dst_row_contiguous[j]; - } -} - static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; const ggml_tensor * ids = dst->src[2]; + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft) && "mul_mat_id does not support split buffers"); + GGML_TENSOR_BINARY_OP_LOCALS - GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft) && "mul_mat_id does not support split buffers"); + const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; + + if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + if (ne2 == 1) { + if (ggml_is_quantized(src0->type)) { + ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst); + } else { + ggml_cuda_mul_mat_vec(ctx, src0, src1, ids, dst); + } + return; + } + + if (ggml_cuda_should_use_mmq(src0->type, cc, ne12)) { + ggml_cuda_mul_mat_q(ctx, src0, src1, ids, dst); + return; + } + } cudaStream_t stream = ctx.stream(); - const int64_t n_as = ne02; - const int64_t n_ids = ids->ne[0]; + GGML_ASSERT(nb12 % nb11 == 0); + GGML_ASSERT(nb2 % nb1 == 0); + + const ggml_type type_src1_sorted = (src0->type == GGML_TYPE_F16 && !fast_fp16_hardware_available(cc)) + || ggml_is_quantized(src0->type) ? 
GGML_TYPE_F32 : src0->type; + const ggml_type type_dst_sorted = GGML_TYPE_F32; + const size_t ts_src1_sorted = ggml_type_size(type_src1_sorted); + const size_t ts_dst_sorted = ggml_type_size(type_dst_sorted); + + const int64_t n_expert_used = ids->ne[0]; + const int64_t ne_get_rows = ne12 * n_expert_used; + + std::vector ids_to_sorted_host; + ids_to_sorted_host.reserve(2*ne_get_rows); + std::vector ids_from_sorted_host(ne_get_rows); + + ggml_cuda_pool_alloc ids_buf_dev(ctx.pool(), 2*ne_get_rows); + + std::vector tokens_per_expert(ne02); + + ggml_cuda_pool_alloc src1_sorted(ctx.pool(), ne12*n_expert_used*ne10*ts_src1_sorted); + ggml_cuda_pool_alloc dst_sorted(ctx.pool(), ne2 *n_expert_used* ne0*ts_dst_sorted); std::vector ids_host(ggml_nbytes(ids)); - const char * ids_dev = (const char *) ids->data; - CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids->data, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); - ggml_tensor src0_row = *src0; - ggml_tensor src1_row = *src1; - ggml_tensor dst_row = *dst; - - char * src0_original = (char *) src0->data; - char * src1_original = (char *) src1->data; - char * dst_original = (char *) dst->data; - - src0_row.ne[2] = 1; - src0_row.ne[3] = 1; - src0_row.nb[3] = nb02; - - src1_row.ne[1] = 1; - src1_row.ne[2] = 1; - src1_row.ne[3] = 1; - src1_row.nb[2] = nb11; - src1_row.nb[3] = nb11; - - dst_row.ne[1] = 1; - dst_row.ne[2] = 1; - dst_row.ne[3] = 1; - dst_row.nb[2] = nb1; - dst_row.nb[3] = nb1; - - if (ne12 == 1) { - for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) { - for (int64_t id = 0; id < n_ids; id++) { - const int32_t i02 = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]); - - GGML_ASSERT(i02 >= 0 && i02 < n_as); - - const int64_t i11 = id % ne11; - const int64_t i12 = iid1; - - const int64_t i1 = id; - const int64_t i2 = i12; - - src0_row.data = src0_original + i02*nb02; - src1_row.data = src1_original + i11*nb11 + i12*nb12; - dst_row.data = dst_original + i1*nb1 + i2*nb2; - - ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row); - } - } - } else { - ggml_cuda_pool_alloc src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1)); - ggml_cuda_pool_alloc dst_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst)); - - src1_row.data = src1_contiguous.get(); - dst_row.data = dst_contiguous.get(); - - for (int64_t i02 = 0; i02 < n_as; i02++) { - int64_t num_src1_rows = 0; - - for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) { - for (int64_t id = 0; id < n_ids; id++) { - const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]); - - GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as); - - if (row_id_i != i02) { - continue; - } - - num_src1_rows++; + for (int64_t i02 = 0; i02 < ne02; ++i02) { // expert matrices + for (int64_t i12 = 0; i12 < ne12; ++i12) { // tokens + for (int64_t iex = 0; iex < n_expert_used; ++iex) { + const int32_t expert_to_use = *(const int32_t *)(ids_host.data() + i12*ids->nb[1] + iex*ids->nb[0]); + assert(expert_to_use >= 0 && expert_to_use < ne02); + if (expert_to_use == i02) { + ids_from_sorted_host[i12*n_expert_used + iex] = ids_to_sorted_host.size(); + ids_to_sorted_host.push_back(i12*ne11 + iex % ne11); + tokens_per_expert[i02]++; + break; } } - - if (num_src1_rows == 0) { - continue; - } - - ggml_cuda_pool_alloc dev_cur_src1_row(ctx.pool(), 1); - ggml_cuda_pool_alloc dev_row_mapping(ctx.pool(), num_src1_rows); - 
CUDA_CHECK(cudaMemsetAsync(dev_cur_src1_row.get(), 0, sizeof(int), stream)); - - { - dim3 block_dims(std::min((unsigned int)ne10, 768u)); - dim3 grid_dims(ids->ne[1], n_ids); - k_copy_src1_to_contiguous<<>>( - src1_original, src1_contiguous.get(), - dev_cur_src1_row.get(), dev_row_mapping.get(), - ids_dev, i02, ids->nb[1], ids->nb[0], - ne11, ne10, - nb11, nb12); - CUDA_CHECK(cudaGetLastError()); - } - - src0_row.data = src0_original + i02*nb02; - - GGML_ASSERT(nb11 == sizeof(float)*ne10); - GGML_ASSERT(nb1 == sizeof(float)*ne0); - - src1_row.ne[1] = num_src1_rows; - src1_row.nb[1] = nb11; - src1_row.nb[2] = num_src1_rows*nb11; - src1_row.nb[3] = num_src1_rows*nb11; - - dst_row.ne[1] = num_src1_rows; - dst_row.nb[1] = nb1; - dst_row.nb[2] = num_src1_rows*nb1; - dst_row.nb[3] = num_src1_rows*nb1; - - ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row); - - { - dim3 block_dims(std::min((unsigned int)ne0, 768u)); - dim3 grid_dims(num_src1_rows); - k_copy_dst_from_contiguous<<>>( - dst_original, dst_contiguous.get(), - dev_row_mapping.get(), - ne0, - nb1, nb2); - CUDA_CHECK(cudaGetLastError()); - } } } + GGML_ASSERT(ids_to_sorted_host.size() == size_t(ne_get_rows)); + + ids_to_sorted_host.insert(ids_to_sorted_host.end(), ids_from_sorted_host.begin(), ids_from_sorted_host.end()); + + CUDA_CHECK(cudaMemcpyAsync(ids_buf_dev.ptr, ids_to_sorted_host.data(), 2*ne_get_rows*sizeof(int32_t), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + const int32_t * ids_to_sorted = ids_buf_dev.ptr + 0*ne_get_rows; + const int32_t * ids_from_sorted = ids_buf_dev.ptr + 1*ne_get_rows; + + get_rows_cuda(src1->data, src1->type, ids_to_sorted, src1_sorted.ptr, type_src1_sorted, + ne10, nb11, nb12, nb13, + ne_get_rows, 1, 1, sizeof(int32_t), ne_get_rows*sizeof(int32_t), ne_get_rows*sizeof(int32_t), + ne10*ts_src1_sorted, ne_get_rows*ne10*ts_src1_sorted, ne_get_rows*ne10*ts_src1_sorted, stream); + CUDA_CHECK(cudaGetLastError()); + + char * src1_data_cur = (char *) src1_sorted.ptr; + char * dst_data_cur = (char *) dst_sorted.ptr; + for (int64_t i02 = 0; i02 < ne02; ++i02) { + if (tokens_per_expert[i02] == 0) { + continue; + } + + ggml_tensor src0_slice = *src0; + src0_slice.ne[2] = 1; + src0_slice.nb[3] = src0_slice.nb[2]; + src0_slice.data = (char *) src0->data + i02*nb02; + + ggml_tensor src1_slice; + memset(&src1_slice, 0, sizeof(src1_slice)); + src1_slice.buffer = src1->buffer; + src1_slice.type = type_src1_sorted; + src1_slice.ne[0] = ne10; + src1_slice.ne[1] = tokens_per_expert[i02]; + src1_slice.ne[2] = 1; + src1_slice.ne[3] = 1; + src1_slice.nb[0] = ts_src1_sorted; + src1_slice.nb[1] = src1_slice.ne[0] * src1_slice.nb[0]; + src1_slice.nb[2] = src1_slice.ne[1] * src1_slice.nb[1]; + src1_slice.nb[3] = src1_slice.ne[2] * src1_slice.nb[2]; + src1_slice.data = src1_data_cur; + + ggml_tensor dst_slice; + memset(&dst_slice, 0, sizeof(dst_slice)); + dst_slice.buffer = dst->buffer; + dst_slice.type = type_dst_sorted; + dst_slice.ne[0] = ne0; + dst_slice.ne[1] = tokens_per_expert[i02]; + dst_slice.ne[2] = 1; + dst_slice.ne[3] = 1; + dst_slice.nb[0] = ts_dst_sorted; + dst_slice.nb[1] = dst_slice.ne[0] * dst_slice.nb[0]; + dst_slice.nb[2] = dst_slice.ne[1] * dst_slice.nb[1]; + dst_slice.nb[3] = dst_slice.ne[2] * dst_slice.nb[2]; + dst_slice.data = dst_data_cur; + + ggml_cuda_mul_mat(ctx, &src0_slice, &src1_slice, &dst_slice); + CUDA_CHECK(cudaGetLastError()); + + src1_data_cur += src1_slice.nb[2]; + dst_data_cur += dst_slice.nb[2]; + } + + get_rows_cuda(dst_sorted.ptr, type_dst_sorted, 
ids_from_sorted, dst->data, dst->type, + ne0, ne0*ts_dst_sorted, ne_get_rows*ne0*ts_dst_sorted, ne_get_rows*ne0*ts_dst_sorted, + ne_get_rows, 1, 1, sizeof(int32_t), ne_get_rows*sizeof(int32_t), ne_get_rows*sizeof(int32_t), + nb1, nb2, nb3, stream); } static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) { @@ -2495,7 +2472,7 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud #endif } - if (node->op == GGML_OP_MUL_MAT_ID) { + if (node->op == GGML_OP_MUL_MAT_ID && node->ne[2] != 1) { use_cuda_graph = false; // This node type is not supported by CUDA graph capture #ifndef NDEBUG GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported node type\n", __func__); @@ -3209,9 +3186,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g } case GGML_OP_ROPE: case GGML_OP_ROPE_BACK: { - const size_t ts = ggml_type_size(op->src[0]->type); - const int64_t ne0_012 = op->src[0]->ne[0] * op->src[0]->ne[1] * op->src[0]->ne[2]; - return op->src[0]->nb[0] == ts && op->src[0]->nb[3] == ne0_012*ts; + return op->src[0]->nb[0] == ggml_type_size(op->src[0]->type) && ggml_is_contiguous_2(op->src[0]); } case GGML_OP_IM2COL: case GGML_OP_POOL_2D: diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cu b/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cu index b36b43d54..f397a7e03 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cu @@ -1,37 +1,10 @@ #include "mmq.cuh" +#include "quantize.cuh" -void ggml_cuda_op_mul_mat_q( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, - const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, - const int64_t src1_padded_row_size, cudaStream_t stream) { +#include - const int64_t ne00 = src0->ne[0]; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - GGML_ASSERT(ne10 % QK8_1 == 0); - - const int64_t ne0 = dst->ne[0]; - - const int64_t row_diff = row_high - row_low; - const int64_t stride00 = ne00 / ggml_blck_size(src0->type); - - int id = ggml_cuda_get_device(); - const int cc = ggml_cuda_info().devices[id].cc; - - // the main device has a larger memory buffer to hold the results from all GPUs - // nrows_dst == nrows of the matrix that the kernel writes into - const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff; - - // The stream-k decomposition is only faster for recent NVIDIA GPUs. - // Also its fixup needs to allocate a temporary buffer in the memory pool. - // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer. 
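// Illustrative sketch (not part of the patch): the reworked ggml_cuda_mul_mat_id earlier in
// this patch (ggml-cuda.cu) replaces the old copy-to-contiguous kernels with a gather,
// per-expert GEMM, scatter flow. Stripped of the ggml_tensor plumbing, the intent is roughly
// the sketch below; gather(), gemm() and scatter() are hypothetical stand-ins for
// get_rows_cuda and ggml_cuda_mul_mat on contiguous slices.
//
//   gather(src1, ids_to_sorted, src1_sorted);        // group src1 rows by expert
//   int64_t row = 0;
//   for (int64_t e = 0; e < n_experts; ++e) {
//       const int64_t nrows = tokens_per_expert[e];
//       if (nrows == 0) continue;
//       gemm(src0_expert[e], &src1_sorted[row], &dst_sorted[row], nrows);  // one dense matmul per expert
//       row += nrows;
//   }
//   scatter(dst_sorted, ids_from_sorted, dst);       // write rows back in the caller's layout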
- const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) && - ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && src1_ncols == ne11; - const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k}; - - switch (src0->type) { +static void ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) { + switch (args.type_x) { case GGML_TYPE_Q4_0: mul_mat_q_case(ctx, args, stream); break; @@ -90,10 +63,195 @@ void ggml_cuda_op_mul_mat_q( GGML_ABORT("fatal error"); break; } +} + +void ggml_cuda_mul_mat_q( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) { + GGML_ASSERT( src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + GGML_ASSERT(!ids || ids->type == GGML_TYPE_I32); // Optional, used for batched GGML_MUL_MAT_ID. + + GGML_TENSOR_BINARY_OP_LOCALS; + + cudaStream_t stream = ctx.stream(); + const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; + + const size_t ts_src0 = ggml_type_size(src0->type); + const size_t ts_src1 = ggml_type_size(src1->type); + const size_t ts_dst = ggml_type_size(dst->type); + + GGML_ASSERT( nb00 == ts_src0); + GGML_ASSERT( nb10 == ts_src1); + GGML_ASSERT( nb0 == ts_dst); + GGML_ASSERT(!ids || ids->nb[0] == ggml_type_size(ids->type)); + + const char * src0_d = (const char *) src0->data; + const float * src1_d = (const float *) src1->data; + float * dst_d = (float *) dst->data; + + const int64_t ne10_padded = GGML_PAD(ne10, MATRIX_ROW_PADDING); + + const int64_t s01 = src0->nb[1] / ts_src0; + const int64_t s1 = dst->nb[1] / ts_dst; + const int64_t s02 = src0->nb[2] / ts_src0; + const int64_t s2 = dst->nb[2] / ts_dst; + const int64_t s03 = src0->nb[3] / ts_src0; + const int64_t s3 = dst->nb[3] / ts_dst; + + const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA; + + if (!ids) { + const size_t nbytes_src1_q8_1 = ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1 + + get_mmq_x_max_host(cc)*sizeof(block_q8_1_mmq); + ggml_cuda_pool_alloc src1_q8_1(ctx.pool(), nbytes_src1_q8_1); + + { + const int64_t s11 = src1->nb[1] / ts_src1; + const int64_t s12 = src1->nb[2] / ts_src1; + const int64_t s13 = src1->nb[3] / ts_src1; + quantize_mmq_q8_1_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type, + ne10, s11, s12, s13, ne10_padded, ne11, ne12, ne13, stream); + } + + const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int)); + const int64_t s13 = ne12*s12; + + const mmq_args args = { + src0_d, src0->type, (const int *) src1_q8_1.ptr, nullptr, nullptr, dst_d, + ne00, ne01, ne1, s01, s1, + ne02, ne12, s02, s12, s2, + ne03, ne13, s03, s13, s3, + use_stream_k}; + ggml_cuda_mul_mat_q_switch_type(ctx, args, stream); + return; + } + + GGML_ASSERT(ne13 == 1); + GGML_ASSERT(nb12 % nb11 == 0); + GGML_ASSERT(nb2 % nb1 == 0); + + const int64_t n_expert_used = ids->ne[0]; + const int64_t ne_get_rows = ne12 * n_expert_used; + + std::vector ids_host(ggml_nbytes(ids)); + std::vector ids_src1_host; + ids_src1_host.reserve(ne_get_rows); + std::vector ids_dst_host; + ids_dst_host.reserve(ne_get_rows); + std::vector tokens_per_expert_host(ne02); + std::vector expert_bounds_host(ne02 + 1); + ggml_cuda_pool_alloc ids_buf_dev(ctx.pool()); + + CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids->data, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream)); + 
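// Illustrative sketch (not part of the patch): once the router ids have been copied to the
// host and the stream synchronized just below, the loop and prefix sum that follow build
// three flat index lists packed into a single device buffer: ids_src1 (which src1 row feeds
// each expert-sorted slot), ids_dst (where each sorted output row is written back in dst)
// and expert_bounds (prefix sums over tokens_per_expert). A hypothetical consumer would walk
// the rows belonging to expert e like this; all names here are assumptions for illustration:
//
//   for (int64_t e = 0; e < ne02; ++e) {
//       for (int32_t r = expert_bounds[e]; r < expert_bounds[e + 1]; ++r) {
//           const int32_t src1_row = ids_src1[r];   // input row quantized for this expert
//           const int32_t dst_row  = ids_dst[r];    // output row the result is scattered to
//           // ... per-row tile work for expert e ...
//       }
//   }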
CUDA_CHECK(cudaStreamSynchronize(stream)); + + for (int64_t i02 = 0; i02 < ne02; ++i02) { // expert matrices + for (int64_t i12 = 0; i12 < ne12; ++i12) { // tokens + for (int64_t iex = 0; iex < n_expert_used; ++iex) { + const int32_t expert_to_use = *(const int32_t *)(ids_host.data() + i12*ids->nb[1] + iex*ids->nb[0]); + assert(expert_to_use >= 0 && expert_to_use < ne02); + if (expert_to_use == i02) { + ids_src1_host.push_back(i12*(nb12/nb11) + iex % ne11); + ids_dst_host.push_back(i12*ne1 + iex); + tokens_per_expert_host[i02]++; + break; + } + } + } + } + + int32_t cumsum = 0; + for (int64_t i = 0; i < ne02; ++i) { + expert_bounds_host[i] = cumsum; + cumsum += tokens_per_expert_host[i]; + } + expert_bounds_host[ne02] = cumsum; + + std::vector ids_buf_host; + ids_buf_host.reserve(ids_src1_host.size() + ids_dst_host.size() + expert_bounds_host.size()); + ids_buf_host.insert(ids_buf_host.end(), ids_src1_host.begin(), ids_src1_host.end()); + ids_buf_host.insert(ids_buf_host.end(), ids_dst_host.begin(), ids_dst_host.end()); + ids_buf_host.insert(ids_buf_host.end(), expert_bounds_host.begin(), expert_bounds_host.end()); + ids_buf_dev.alloc(ids_buf_host.size() + get_mmq_x_max_host(cc)); // Expert bounds are padded on device. + CUDA_CHECK(cudaMemcpyAsync(ids_buf_dev.ptr, ids_buf_host.data(), ids_buf_host.size()*sizeof(int32_t), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + const int32_t * ids_src1_dev = ids_buf_dev.ptr; + const int32_t * ids_dst_dev = ids_src1_dev + ids_src1_host.size(); + const int32_t * expert_bounds_dev = ids_dst_dev + ids_dst_host.size(); + + const size_t nbytes_src1_q8_1 = ne12*n_expert_used*ne10_padded * sizeof(block_q8_1)/QK8_1 + + get_mmq_x_max_host(cc)*sizeof(block_q8_1_mmq); + ggml_cuda_pool_alloc src1_q8_1(ctx.pool(), nbytes_src1_q8_1); + + const int64_t ne11_flat = ne12*n_expert_used; + const int64_t ne12_flat = 1; + const int64_t ne13_flat = 1; + + { + const int64_t s11 = src1->nb[1] / ts_src1; + const int64_t s12 = src1->nb[2] / ts_src1; + const int64_t s13 = src1->nb[2] / ts_src1; + quantize_mmq_q8_1_cuda(src1_d, ids_src1_dev, src1_q8_1.get(), src0->type, + ne10, s11, s12, s13, ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream); + } + + const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int)); + const int64_t s13 = ne12*s12; + + // Note that ne02 is used instead of ne12 because the number of y channels determines the z dimension of the CUDA grid. 
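// Worked example (illustrative only, not part of the patch): with ne12 = 3 tokens,
// n_expert_used = 2 and ne02 = 4 experts, suppose the router picks
//   token 0 -> {2, 0},   token 1 -> {0, 3},   token 2 -> {2, 3}.
// The host loop above then yields
//   tokens_per_expert = {2, 0, 2, 2}        // experts 0..3
//   expert_bounds     = {0, 2, 2, 4, 6}     // prefix sums, ne02 + 1 entries
// so expert 0 owns sorted rows [0, 2), expert 2 owns [2, 4) and expert 3 owns [4, 6),
// which is exactly the partitioning assumed by the mmq_args constructed below.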
+ const mmq_args args = { + src0_d, src0->type, (const int *) src1_q8_1.ptr, ids_dst_dev, expert_bounds_dev, dst_d, + ne00, ne01, ne_get_rows, s01, s1, + ne02, ne02, s02, s12, s2, + ne03, ne13, s03, s13, s3, + use_stream_k}; + + ggml_cuda_mul_mat_q_switch_type(ctx, args, stream); +} + +void ggml_cuda_op_mul_mat_q( + ggml_backend_cuda_context & ctx, + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, + const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, + const int64_t src1_padded_row_size, cudaStream_t stream) { + + const int64_t ne00 = src0->ne[0]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + GGML_ASSERT(ne10 % QK8_1 == 0); + + const int64_t ne0 = dst->ne[0]; + + const int64_t row_diff = row_high - row_low; + const int64_t stride01 = ne00 / ggml_blck_size(src0->type); + + const int id = ggml_cuda_get_device(); + const int cc = ggml_cuda_info().devices[id].cc; + + // the main device has a larger memory buffer to hold the results from all GPUs + // nrows_dst == nrows of the matrix that the kernel writes into + const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff; + + // The stream-k decomposition is only faster for recent NVIDIA GPUs. + // Also its fixup needs to allocate a temporary buffer in the memory pool. + // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer. + const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) && + ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && src1_ncols == ne11; + const mmq_args args = { + src0_dd_i, src0->type, (const int *) src1_ddq_i, nullptr, nullptr, dst_dd_i, + ne00, row_diff, src1_ncols, stride01, nrows_dst, + 1, 1, 0, 0, 0, + 1, 1, 0, 0, 0, + use_stream_k}; + + ggml_cuda_mul_mat_q_switch_type(ctx, args, stream); GGML_UNUSED(src1); GGML_UNUSED(dst); GGML_UNUSED(src1_ddf_i); + GGML_UNUSED(src1_padded_row_size); } bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) { diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cuh index 532358018..8c93e8326 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cuh @@ -13,9 +13,10 @@ using namespace ggml_cuda_mma; #define MMQ_ITER_K 256 #define MMQ_NWARPS 8 -typedef void (*load_tiles_mmq_t)(const char * __restrict__ x, int * x_tile, const int & kbx0, const int & i_max, const int & stride); -typedef void (*vec_dot_mmq_t)(const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00); -typedef void (*mmq_write_back_t)(const float * __restrict__ sum, float * __restrict__ dst, const int & stride, const int & i_max, const int & j_max); +typedef void (*load_tiles_mmq_t)(const char * __restrict__ x, int * x_tile, const int kbx0, const int i_max, const int stride); +typedef void (*vec_dot_mmq_t)(const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00); +typedef void (*mmq_write_back_t)(const float * __restrict__ sum, const int32_t * __restrict__ get_rows_to_sorted, + float * __restrict__ dst, const int stride, const int i_max, const int j_max); enum mmq_q8_1_ds_layout { MMQ_Q8_1_DS_LAYOUT_D4, @@ -155,25 +156,27 @@ static constexpr __device__ int get_mmq_y_device() { #define MMQ_DP4A_TXS_Q6_K tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE/QI6_K + mmq_y/QI6_K, mmq_y*WARP_SIZE/8 + 
mmq_y/8} static constexpr __host__ __device__ tile_x_sizes mmq_get_dp4a_tile_x_sizes(ggml_type type, int mmq_y) { - return type == GGML_TYPE_Q4_0 ? MMQ_DP4A_TXS_Q4_0 : - type == GGML_TYPE_Q4_1 ? MMQ_DP4A_TXS_Q4_1 : - type == GGML_TYPE_Q5_0 ? MMQ_DP4A_TXS_Q8_0 : - type == GGML_TYPE_Q5_1 ? MMQ_DP4A_TXS_Q8_1 : - type == GGML_TYPE_Q8_0 ? MMQ_DP4A_TXS_Q8_0 : - type == GGML_TYPE_Q2_K ? MMQ_DP4A_TXS_Q2_K : - type == GGML_TYPE_Q3_K ? MMQ_DP4A_TXS_Q3_K : - type == GGML_TYPE_Q4_K ? MMQ_DP4A_TXS_Q4_K : - type == GGML_TYPE_Q5_K ? MMQ_DP4A_TXS_Q5_K : - type == GGML_TYPE_Q6_K ? MMQ_DP4A_TXS_Q6_K : - type == GGML_TYPE_IQ2_XXS ? MMQ_DP4A_TXS_Q8_0 : - type == GGML_TYPE_IQ2_XS ? MMQ_DP4A_TXS_Q8_0_16 : - type == GGML_TYPE_IQ2_S ? MMQ_DP4A_TXS_Q8_0_16 : - type == GGML_TYPE_IQ3_XXS ? MMQ_DP4A_TXS_Q8_0 : - type == GGML_TYPE_IQ3_S ? MMQ_DP4A_TXS_Q8_0 : - type == GGML_TYPE_IQ1_S ? MMQ_DP4A_TXS_Q8_0 : - type == GGML_TYPE_IQ4_XS ? MMQ_DP4A_TXS_Q8_0 : - type == GGML_TYPE_IQ4_NL ? MMQ_DP4A_TXS_Q8_0 : - tile_x_sizes{0, 0, 0}; + switch (type) { + case GGML_TYPE_Q4_0: return MMQ_DP4A_TXS_Q4_0; + case GGML_TYPE_Q4_1: return MMQ_DP4A_TXS_Q4_1; + case GGML_TYPE_Q5_0: return MMQ_DP4A_TXS_Q8_0; + case GGML_TYPE_Q5_1: return MMQ_DP4A_TXS_Q8_1; + case GGML_TYPE_Q8_0: return MMQ_DP4A_TXS_Q8_0; + case GGML_TYPE_Q2_K: return MMQ_DP4A_TXS_Q2_K; + case GGML_TYPE_Q3_K: return MMQ_DP4A_TXS_Q3_K; + case GGML_TYPE_Q4_K: return MMQ_DP4A_TXS_Q4_K; + case GGML_TYPE_Q5_K: return MMQ_DP4A_TXS_Q5_K; + case GGML_TYPE_Q6_K: return MMQ_DP4A_TXS_Q6_K; + case GGML_TYPE_IQ2_XXS: return MMQ_DP4A_TXS_Q8_0; + case GGML_TYPE_IQ2_XS: return MMQ_DP4A_TXS_Q8_0_16; + case GGML_TYPE_IQ2_S: return MMQ_DP4A_TXS_Q8_0_16; + case GGML_TYPE_IQ3_XXS: return MMQ_DP4A_TXS_Q8_0; + case GGML_TYPE_IQ3_S: return MMQ_DP4A_TXS_Q8_0; + case GGML_TYPE_IQ1_S: return MMQ_DP4A_TXS_Q8_0; + case GGML_TYPE_IQ4_XS: return MMQ_DP4A_TXS_Q8_0; + case GGML_TYPE_IQ4_NL: return MMQ_DP4A_TXS_Q8_0; + default: return tile_x_sizes{0, 0, 0}; + } } #define MMQ_MMA_TILE_X_K_Q8_0 (2*WARP_SIZE + 2*WARP_SIZE/QI8_0 + 4) @@ -189,25 +192,27 @@ static_assert(MMQ_MMA_TILE_X_K_Q3_K % 8 == 4, "Wrong padding."); static_assert(MMQ_MMA_TILE_X_K_Q6_K % 8 == 4, "Wrong padding."); static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) { - return type == GGML_TYPE_Q4_0 ? MMQ_MMA_TILE_X_K_Q8_0 : - type == GGML_TYPE_Q4_1 ? MMQ_MMA_TILE_X_K_Q8_1 : - type == GGML_TYPE_Q5_0 ? MMQ_MMA_TILE_X_K_Q8_0 : - type == GGML_TYPE_Q5_1 ? MMQ_MMA_TILE_X_K_Q8_1 : - type == GGML_TYPE_Q8_0 ? MMQ_MMA_TILE_X_K_Q8_0 : - type == GGML_TYPE_Q2_K ? MMQ_MMA_TILE_X_K_Q2_K : - type == GGML_TYPE_Q3_K ? MMQ_MMA_TILE_X_K_Q3_K : - type == GGML_TYPE_Q4_K ? MMQ_MMA_TILE_X_K_Q8_1 : - type == GGML_TYPE_Q5_K ? MMQ_MMA_TILE_X_K_Q8_1 : - type == GGML_TYPE_Q6_K ? MMQ_MMA_TILE_X_K_Q6_K : - type == GGML_TYPE_IQ2_XXS ? MMQ_MMA_TILE_X_K_Q8_0 : - type == GGML_TYPE_IQ2_XS ? MMQ_MMA_TILE_X_K_Q3_K : - type == GGML_TYPE_IQ2_S ? MMQ_MMA_TILE_X_K_Q3_K : - type == GGML_TYPE_IQ3_XXS ? MMQ_MMA_TILE_X_K_Q8_0 : - type == GGML_TYPE_IQ3_S ? MMQ_MMA_TILE_X_K_Q8_0 : - type == GGML_TYPE_IQ1_S ? MMQ_MMA_TILE_X_K_Q8_0 : - type == GGML_TYPE_IQ4_XS ? MMQ_MMA_TILE_X_K_Q8_0 : - type == GGML_TYPE_IQ4_NL ? 
MMQ_MMA_TILE_X_K_Q8_0 : - 0; + switch (type) { + case GGML_TYPE_Q4_0: return MMQ_MMA_TILE_X_K_Q8_0; + case GGML_TYPE_Q4_1: return MMQ_MMA_TILE_X_K_Q8_1; + case GGML_TYPE_Q5_0: return MMQ_MMA_TILE_X_K_Q8_0; + case GGML_TYPE_Q5_1: return MMQ_MMA_TILE_X_K_Q8_1; + case GGML_TYPE_Q8_0: return MMQ_MMA_TILE_X_K_Q8_0; + case GGML_TYPE_Q2_K: return MMQ_MMA_TILE_X_K_Q2_K; + case GGML_TYPE_Q3_K: return MMQ_MMA_TILE_X_K_Q3_K; + case GGML_TYPE_Q4_K: return MMQ_MMA_TILE_X_K_Q8_1; + case GGML_TYPE_Q5_K: return MMQ_MMA_TILE_X_K_Q8_1; + case GGML_TYPE_Q6_K: return MMQ_MMA_TILE_X_K_Q6_K; + case GGML_TYPE_IQ2_XXS: return MMQ_MMA_TILE_X_K_Q8_0; + case GGML_TYPE_IQ2_XS: return MMQ_MMA_TILE_X_K_Q3_K; + case GGML_TYPE_IQ2_S: return MMQ_MMA_TILE_X_K_Q3_K; + case GGML_TYPE_IQ3_XXS: return MMQ_MMA_TILE_X_K_Q8_0; + case GGML_TYPE_IQ3_S: return MMQ_MMA_TILE_X_K_Q8_0; + case GGML_TYPE_IQ1_S: return MMQ_MMA_TILE_X_K_Q8_0; + case GGML_TYPE_IQ4_XS: return MMQ_MMA_TILE_X_K_Q8_0; + case GGML_TYPE_IQ4_NL: return MMQ_MMA_TILE_X_K_Q8_0; + default: return 0; + } } #define MMQ_TILE_Y_K (WARP_SIZE + WARP_SIZE/QI8_1) @@ -229,7 +234,7 @@ static constexpr __device__ int mmq_get_granularity_device(const int /* mmq_x */ // ------------------------------------------------------------ template static __device__ __forceinline__ void load_tiles_q4_0( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -285,7 +290,7 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q4_0_q8_1_dp4a( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_0, mmq_y); const int * x_qs = (const int *) x; @@ -324,7 +329,7 @@ static __device__ __forceinline__ void vec_dot_q4_0_q8_1_dp4a( } template static __device__ __forceinline__ void load_tiles_q4_1( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -380,7 +385,7 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q4_1_q8_1_dp4a( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_1, mmq_y); const int * x_qs = (const int *) x; @@ -419,7 +424,7 @@ static __device__ __forceinline__ void vec_dot_q4_1_q8_1_dp4a( } template static __device__ __forceinline__ void load_tiles_q5_0( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -491,7 +496,7 @@ template static __device__ __forceinlin } template static __device__ __forceinline__ void load_tiles_q5_1( - const char * __restrict__ x, int * __restrict__ 
x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -561,7 +566,7 @@ template static __device__ __forceinlin } template static __device__ __forceinline__ void load_tiles_q8_0( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -617,7 +622,7 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q8_0_q8_1_dp4a( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q8_0, mmq_y); const int * x_qs = (const int *) x; @@ -647,7 +652,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_dp4a( template static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { typedef tile<16, 8, int> tile_A; typedef tile< 8, 8, int> tile_B; @@ -728,7 +733,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma( template static __device__ __forceinline__ void vec_dot_q8_1_q8_1_dp4a( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q5_1, mmq_y); const int * x_qs = (const int *) x; @@ -758,7 +763,7 @@ static __device__ __forceinline__ void vec_dot_q8_1_q8_1_dp4a( template static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { typedef tile<16, 8, int> tile_A; typedef tile< 8, 8, int> tile_B; @@ -835,7 +840,7 @@ static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma( template static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_dp4a( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { constexpr tile_x_sizes txs = MMQ_DP4A_TXS_Q8_0_16; const int * x_qs = (const int *) x; @@ -867,7 +872,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_dp4a( template static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { #ifdef NEW_MMA_AVAILABLE typedef tile<16, 4, int> tile_A; @@ -951,7 +956,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma( } template static __device__ __forceinline__ void load_tiles_q2_K( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + 
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -1007,7 +1012,7 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q2_K_q8_1_dp4a( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q2_K, mmq_y); const int * x_qs = (const int *) x; @@ -1070,7 +1075,7 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_dp4a( template static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { #ifdef NEW_MMA_AVAILABLE typedef tile<16, 4, int> tile_A; @@ -1197,7 +1202,7 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma( } template static __device__ __forceinline__ void load_tiles_q3_K( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -1294,7 +1299,7 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q3_K_q8_1_dp4a( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q3_K, mmq_y); const int * x_qs = (const int *) x; @@ -1336,7 +1341,7 @@ static __device__ __forceinline__ int unpack_scales_q45_K(const int * scales, co } template static __device__ __forceinline__ void load_tiles_q4_K( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -1433,7 +1438,7 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q4_K_q8_1_dp4a( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_K, mmq_y); const int * x_qs = (const int *) x; @@ -1465,7 +1470,7 @@ static __device__ __forceinline__ void vec_dot_q4_K_q8_1_dp4a( } template static __device__ __forceinline__ void load_tiles_q5_K( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -1574,7 +1579,7 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q5_K_q8_1_dp4a( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, 
float * __restrict__ sum, const int k00) { constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q5_K, mmq_y); const int * x_qs = (const int *) x; @@ -1606,7 +1611,7 @@ static __device__ __forceinline__ void vec_dot_q5_K_q8_1_dp4a( } template static __device__ __forceinline__ void load_tiles_q6_K( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -1689,7 +1694,7 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q6_K_q8_1_dp4a( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q6_K, mmq_y); const int * x_qs = (const int *) x; @@ -1722,7 +1727,7 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_dp4a( template static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma( - const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) { + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { #ifdef NEW_MMA_AVAILABLE typedef tile<16, 4, int> tile_A; @@ -1831,7 +1836,7 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma( } template static __device__ __forceinline__ void load_tiles_iq4_nl( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -1889,7 +1894,7 @@ template static __device__ __forceinlin } template static __device__ __forceinline__ void load_tiles_iq2_xxs( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -1947,7 +1952,7 @@ template static __device__ __forceinlin } template static __device__ __forceinline__ void load_tiles_iq2_xs( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -2003,7 +2008,7 @@ template static __device__ __forceinlin } template static __device__ __forceinline__ void load_tiles_iq2_s( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -2066,7 +2071,7 @@ template static __device__ __forceinlin } template static __device__ __forceinline__ void load_tiles_iq3_xxs( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ 
-2122,7 +2127,7 @@ template static __device__ __forceinlin } template static __device__ __forceinline__ void load_tiles_iq3_s( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -2185,7 +2190,7 @@ template static __device__ __forceinlin } template static __device__ __forceinline__ void load_tiles_iq1_s( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -2241,7 +2246,7 @@ template static __device__ __forceinlin } template static __device__ __forceinline__ void load_tiles_iq4_xs( - const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) { + const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { #ifdef NEW_MMA_AVAILABLE int * x_qs = (int *) x_tile; @@ -2302,8 +2307,8 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void mmq_write_back_dp4a( - const float * __restrict__ sum, float * __restrict__ dst, const int & stride, const int & i_max, const int & j_max) { - + const float * __restrict__ sum, const int32_t * __restrict__ ids_dst, float * __restrict__ dst, + const int stride, const int i_max, const int j_max) { #pragma unroll for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { const int j = j0 + threadIdx.y; @@ -2320,15 +2325,15 @@ static __device__ __forceinline__ void mmq_write_back_dp4a( continue; } - dst[j*stride + i] = sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE]; + dst[ids_dst[j]*stride + i] = sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE]; } } } template static __device__ __forceinline__ void mmq_write_back_mma( - const float * __restrict__ sum, float * __restrict__ dst, const int & stride, const int & i_max, const int & j_max) { - + const float * __restrict__ sum, const int * __restrict__ ids_dst, float * __restrict__ dst, + const int stride, const int i_max, const int j_max) { typedef tile<16, 8, int> tile_C; constexpr int granularity = mmq_get_granularity_device(mmq_x); @@ -2358,7 +2363,7 @@ static __device__ __forceinline__ void mmq_write_back_mma( continue; } - dst[j*stride + i] = sum[(j0/tile_C::J + n)*tile_C::ne + l]; + dst[ids_dst[j]*stride + i] = sum[(j0/tile_C::J + n)*tile_C::ne + l]; } } } @@ -2514,17 +2519,18 @@ struct mmq_type_traits { }; template -static __device__ void mul_mat_q_process_tile( - const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, float * __restrict__ tmp_fixup, - const int & ne00, const int & ne01, const int & stride01, const int & ne10, const int & ne11, const int & stride11, const int & ne0, - const int & it, const int & jt, const int & kb0_start, const int & kb0_stop) { +static __device__ __forceinline__ void mul_mat_q_process_tile( + const char * __restrict__ x, const int offset_x, const int * __restrict__ y, + const int * __restrict__ ids_dst, float * __restrict__ dst, float * __restrict__ tmp_fixup, + const int nrows_x, const int ncols_y, const int stride_row_x, const int stride_col_dst, + const int tile_x_max_i, const int tile_y_max_j, const int kb0_start, const int kb0_stop) { constexpr int qk = 
ggml_cuda_type_traits::qk; constexpr int mmq_y = get_mmq_y_device(); constexpr load_tiles_mmq_t load_tiles = mmq_type_traits::load_tiles; - extern __shared__ char data_mul_mat_q[]; - int * tile_y = (int *) data_mul_mat_q; + extern __shared__ int data_mul_mat_q[]; + int * tile_y = data_mul_mat_q + mmq_x; int * tile_x = tile_y + GGML_PAD(mmq_x*(WARP_SIZE + WARP_SIZE/QI8_1), nwarps*WARP_SIZE); #ifdef NEW_MMA_AVAILABLE @@ -2539,16 +2545,11 @@ static __device__ void mul_mat_q_process_tile( float sum[mmq_x*mmq_y / (nwarps*WARP_SIZE)] = {0.0f}; - const int tile_x_max_i = ne01 - it*mmq_y - 1; - const int tile_y_max_j = ne11 - jt*mmq_x - 1; - - const int * y = (const int *) yc + jt*(mmq_x*sizeof(block_q8_1_mmq)/sizeof(int)); - for (int kb0 = kb0_start; kb0 < kb0_stop; kb0 += blocks_per_iter) { - load_tiles(x, tile_x, stride01*it*mmq_y + kb0, tile_x_max_i, stride01); + load_tiles(x, tile_x, offset_x + kb0, tile_x_max_i, stride_row_x); { - const int * by0 = y + stride11*(kb0*(qk*sizeof(block_q8_1_mmq) / (4*QK8_1*sizeof(int))) + 0*sizeof(block_q8_1_mmq)/sizeof(int)); + const int * by0 = y + ncols_y*(kb0*(qk*sizeof(block_q8_1_mmq) / (4*QK8_1*sizeof(int))) + 0*sizeof(block_q8_1_mmq)/sizeof(int)); #pragma unroll for (int l0 = 0; l0 < mmq_x*MMQ_TILE_Y_K; l0 += nwarps*WARP_SIZE) { int l = l0 + threadIdx.y*WARP_SIZE + threadIdx.x; @@ -2564,7 +2565,7 @@ static __device__ void mul_mat_q_process_tile( __syncthreads(); { - const int * by0 = y + stride11*(kb0*(qk*sizeof(block_q8_1_mmq) / (4*QK8_1*sizeof(int))) + 1*sizeof(block_q8_1_mmq)/sizeof(int)); + const int * by0 = y + ncols_y*(kb0*(qk*sizeof(block_q8_1_mmq) / (4*QK8_1*sizeof(int))) + 1*sizeof(block_q8_1_mmq)/sizeof(int)); #pragma unroll for (int l0 = 0; l0 < mmq_x*MMQ_TILE_Y_K; l0 += nwarps*WARP_SIZE) { int l = l0 + threadIdx.y*WARP_SIZE + threadIdx.x; @@ -2581,12 +2582,10 @@ static __device__ void mul_mat_q_process_tile( } if (fixup) { - write_back(sum, tmp_fixup + blockIdx.x*(mmq_x*mmq_y), mmq_y, mmq_y, mmq_x); + write_back(sum, ids_dst, tmp_fixup + blockIdx.x*(mmq_x*mmq_y), mmq_y, mmq_y, mmq_x); } else { - write_back(sum, dst + jt*mmq_x*ne0 + it*mmq_y, ne0, tile_x_max_i, tile_y_max_j); + write_back(sum, ids_dst, dst, stride_col_dst, tile_x_max_i, tile_y_max_j); } - - GGML_UNUSED(ne00); GGML_UNUSED(ne10); } @@ -2605,8 +2604,11 @@ template #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) static __global__ void mul_mat_q( - const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, float * __restrict__ tmp_fixup, - const int ne00, const int ne01, const int stride01, const int ne10, const int ne11, const int stride11, const int ne0) { + const char * __restrict__ x, const int * __restrict__ y, const int32_t * __restrict__ ids_dst, + const int32_t * __restrict__ expert_bounds, float * __restrict__ dst, float * __restrict__ tmp_fixup, + const int ncols_x, const int nrows_x, const int ncols_y, const int stride_row_x, const int stride_col_dst, + const int channel_ratio, const int nchannels_y, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, + const int sample_ratio, const int nsamples_y, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) { // Skip unused template specializations for faster compilation: if (mmq_x > get_mmq_x_max_device() || mmq_x % mmq_get_granularity_device(mmq_x) != 0) { @@ -2617,26 +2619,85 @@ static __global__ void mul_mat_q( constexpr int qk = ggml_cuda_type_traits::qk; constexpr int 
mmq_y = get_mmq_y_device(); + const int ntx = (ncols_y + mmq_x - 1) / mmq_x; // Number of tiles x + const int nty = (nrows_x + mmq_y - 1) / mmq_y; // Number of tiles y + + // Initialize the ids for writing back data with just the index. + // For regular matrix multiplications this is never changed. + // For MoE the correct indices are loaded from ids_dst. + extern __shared__ int ids_dst_shared[]; // Stored at beginning of shared memory. +#pragma unroll + for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) { + const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x; + + if (j0 + nwarps*WARP_SIZE > mmq_x && j >= mmq_x) { + break; + } + + ids_dst_shared[j] = j; + } + // On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead: #if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA { + const int wt = blockIdx.z / nchannels_y; + const int zt = blockIdx.z - wt*nchannels_y; + const int jt = blockIdx.y; + const int it = blockIdx.x; + + // Defaults for regular matrix multiplication: + int col_low = 0; + int col_high = ncols_y; + int col_diff = ncols_y; + int offset_y = wt*stride_sample_y + zt*stride_channel_y; + int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst; + + if (ids_dst) { + col_low = expert_bounds[zt + 0]; + col_high = expert_bounds[zt + 1]; + col_diff = col_high - col_low; + + offset_y = 0; + offset_dst = 0; + + if (jt*mmq_x >= col_diff) { + return; + } + +#pragma unroll + for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) { + const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x; + + if (j0 + nwarps*WARP_SIZE > mmq_x && j >= mmq_x) { + break; + } + + ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j]; + } + } + + offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int)); + offset_dst += it*mmq_y; + + const int tile_x_max_i = nrows_x - it*mmq_y - 1; + const int tile_y_max_j = col_diff - jt*mmq_x - 1; + + const int offset_x = (wt/sample_ratio)*stride_sample_x + (zt/channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x; + constexpr bool fixup = false; mul_mat_q_process_tile - (x, yc, dst, tmp_fixup, ne00, ne01, stride01, ne10, ne11, stride11, ne0, - blockIdx.x, blockIdx.y, 0, ne00/qk); + (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, ncols_y, stride_row_x, stride_col_dst, + tile_x_max_i, tile_y_max_j, 0, ncols_x/qk); return; } #endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA - const int64_t blocks_per_ne00 = ne00 / qk; + const int64_t blocks_per_ne00 = ncols_x / qk; constexpr int blocks_per_iter = MMQ_ITER_K / qk; - const int ntx = (ne11 + mmq_x - 1) / mmq_x; // Number of tiles x - const int nty = (ne01 + mmq_y - 1) / mmq_y; // Number of tiles y - // kbc == k block continuous, current index in continuous ijk space. 
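
In the MoE branch above, a block's z coordinate picks an expert and expert_bounds turns that into a contiguous window of destination columns; a tile whose first column starts past that window has no work and exits early. Below is a simplified host-side model of that bookkeeping, assuming the expert_bounds layout sketched earlier; all names are illustrative.

// Illustrative model of the per-block MoE bookkeeping, not the kernel itself.
#include <cstdint>
#include <vector>

struct TileWork {
    bool    has_work;
    int32_t col_low;   // first (token, slot) pair owned by this expert
    int32_t col_diff;  // number of pairs routed to this expert
};

// zt plays the role of blockIdx.z (the expert), jt of blockIdx.y (the column tile).
static TileWork tile_work_for_block(const std::vector<int32_t> & expert_bounds,
        int zt, int jt, int mmq_x) {
    const int32_t col_low  = expert_bounds[zt + 0];
    const int32_t col_high = expert_bounds[zt + 1];
    const int32_t col_diff = col_high - col_low;

    // Mirrors "if (jt*mmq_x >= col_diff) return;" in the kernel.
    return {jt*mmq_x < col_diff, col_low, col_diff};
}

// The block then stages the scatter indices for its mmq_x columns; the device
// buffer is padded so the kernel can skip the bounds check done here.
static std::vector<int32_t> stage_ids_dst(const std::vector<int32_t> & ids_dst,
        int32_t col_low, int jt, int mmq_x) {
    std::vector<int32_t> staged(mmq_x, 0);
    for (int j = 0; j < mmq_x; ++j) {
        const size_t idx = size_t(col_low) + size_t(jt)*mmq_x + j;
        if (idx < ids_dst.size()) {
            staged[j] = ids_dst[idx];
        }
    }
    return staged;
}

For the regular (non-MoE) path the kernel instead keeps col_low = 0 and col_diff = ncols_y, which is why ids_dst_shared is pre-initialized with the identity mapping.
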
- int64_t kbc = (int64_t) blockIdx.x *blocks_per_ne00*ntx*nty / gridDim.x; - int64_t kbc_stop = (int64_t)(blockIdx.x + 1)*blocks_per_ne00*ntx*nty / gridDim.x; + int64_t kbc = (int64_t) blockIdx.x *nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x; + int64_t kbc_stop = (int64_t)(blockIdx.x + 1)*nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x; kbc -= (kbc % blocks_per_ne00) % blocks_per_iter; kbc_stop -= (kbc_stop % blocks_per_ne00) % blocks_per_iter; @@ -2645,13 +2706,64 @@ static __global__ void mul_mat_q( int kb0_start = kbc % blocks_per_ne00; int kb0_stop = min(blocks_per_ne00, kb0_start + kbc_stop - kbc); while (kbc < kbc_stop && kb0_stop == blocks_per_ne00) { - const int jt = kbc / (blocks_per_ne00*nty); // j index of current tile. - const int it = (kbc - jt*(blocks_per_ne00*nty)) / blocks_per_ne00; // i index of current tile. + int tmp = kbc; + const int it = tmp / (nsamples_y*nchannels_y*ntx*blocks_per_ne00); + tmp -= it * (nsamples_y*nchannels_y*ntx*blocks_per_ne00); + const int wt = tmp / (nchannels_y*ntx*blocks_per_ne00); + tmp -= wt * (nchannels_y*ntx*blocks_per_ne00); + const int zt = tmp / (ntx*blocks_per_ne00); + tmp -= zt * (ntx*blocks_per_ne00); + const int jt = tmp / blocks_per_ne00; + + // Defaults for regular matrix multiplication: + int col_low = 0; + int col_high = ncols_y; + int col_diff = ncols_y; + int offset_y = wt*stride_sample_y + zt*stride_channel_y; + int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst; + + if (ids_dst) { + col_low = expert_bounds[zt + 0]; + col_high = expert_bounds[zt + 1]; + col_diff = col_high - col_low; + + offset_y = 0; + offset_dst = 0; + + if (jt*mmq_x >= col_diff) { + kbc += blocks_per_ne00; + kbc -= kbc % blocks_per_ne00; + + kb0_start = 0; + kb0_stop = min(blocks_per_ne00, kbc_stop - kbc); + + continue; + } + +#pragma unroll + for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) { + const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x; + + if (j0 + nwarps*WARP_SIZE > mmq_x && j >= mmq_x) { + break; + } + + ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j]; + } + } + + offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int)); + offset_dst += it*mmq_y; + + const int tile_x_max_i = nrows_x - it*mmq_y - 1; + const int tile_y_max_j = col_diff - jt*mmq_x - 1; + + const int offset_x = (wt/sample_ratio)*stride_sample_x + (zt/channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x; constexpr bool fixup = false; // All but (potentially) the last iterations write their data to dst rather than the fixup buffer. 
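
The stream-k variant flattens all work, samples x channels x column tiles x row tiles x K blocks, into one contiguous range and hands each CUDA block an equal slice of it; the division chain above then recovers (it, wt, zt, jt) from the flat index. A host-side restatement of that arithmetic, ignoring the blocks_per_iter alignment for brevity; names are local to this example.

// Illustrative restatement of the stream-k index math; not the kernel.
#include <cstdint>

struct WorkCoords {
    int it; // row-tile index into x
    int wt; // sample index
    int zt; // channel index (the expert in the MoE case)
    int jt; // column-tile index into y
    int kb; // K-block offset inside the (it, wt, zt, jt) tile
};

static WorkCoords decode_kbc(int64_t kbc, int64_t nsamples_y, int64_t nchannels_y,
        int64_t ntx, int64_t blocks_per_ne00) {
    WorkCoords c;
    int64_t tmp = kbc;
    c.it = int(tmp / (nsamples_y*nchannels_y*ntx*blocks_per_ne00));
    tmp -= c.it * (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
    c.wt = int(tmp / (nchannels_y*ntx*blocks_per_ne00));
    tmp -= c.wt * (nchannels_y*ntx*blocks_per_ne00);
    c.zt = int(tmp / (ntx*blocks_per_ne00));
    tmp -= c.zt * (ntx*blocks_per_ne00);
    c.jt = int(tmp / blocks_per_ne00);
    c.kb = int(tmp % blocks_per_ne00);
    return c;
}

// Equal slicing of the flat range across gridDim.x blocks, as in the kernel:
static void block_slice(int bidx, int grid_dim_x, int64_t total_kblocks,
        int64_t & kbc, int64_t & kbc_stop) {
    kbc      = (int64_t) bidx      * total_kblocks / grid_dim_x;
    kbc_stop = (int64_t)(bidx + 1) * total_kblocks / grid_dim_x;
}
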
mul_mat_q_process_tile - (x, yc, dst, tmp_fixup, ne00, ne01, stride01, ne10, ne11, stride11, ne0, - it, jt, kb0_start, kb0_stop); + (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, ncols_y, stride_row_x, stride_col_dst, + tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop); kbc += blocks_per_ne00; kbc -= kbc % blocks_per_ne00; @@ -2664,55 +2776,106 @@ static __global__ void mul_mat_q( return; } - const int jt = kbc / (blocks_per_ne00*nty); - const int it = (kbc - jt*(blocks_per_ne00*nty)) / blocks_per_ne00; + int tmp = kbc; + const int it = tmp / (nsamples_y*nchannels_y*ntx*blocks_per_ne00); + tmp -= it * (nsamples_y*nchannels_y*ntx*blocks_per_ne00); + const int wt = tmp / (nchannels_y*ntx*blocks_per_ne00); + tmp -= wt * (nchannels_y*ntx*blocks_per_ne00); + const int zt = tmp / (ntx*blocks_per_ne00); + tmp -= zt * (ntx*blocks_per_ne00); + const int jt = tmp / blocks_per_ne00; + + // Defaults for regular matrix multiplication: + int col_low = 0; + int col_high = ncols_y; + int col_diff = ncols_y; + int offset_y = wt*stride_sample_y + zt*stride_channel_y; + int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst; + + if (ids_dst) { + col_low = expert_bounds[zt + 0]; + col_high = expert_bounds[zt + 1]; + col_diff = col_high - col_low; + + offset_y = 0; + offset_dst = 0; + + if (jt*mmq_x >= col_diff) { + return; + } + + // The memory layout for the fixup buffer is always contiguous, therefore reset ids: +#pragma unroll + for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) { + const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x; + + if (j0 + nwarps*WARP_SIZE > mmq_x && j >= mmq_x) { + break; + } + + ids_dst_shared[j] = j; + } + } + + offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int)); + offset_dst += it*mmq_y; + + const int tile_x_max_i = nrows_x - it*mmq_y - 1; + const int tile_y_max_j = col_diff - jt*mmq_x - 1; + + const int offset_x = (wt/sample_ratio)*stride_sample_x + (zt/channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x; constexpr bool fixup = true; // Last index writes its data to fixup buffer to avoid data races with other blocks. 
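
Within its slice a block finishes whole tiles straight into dst, and only the final, partially covered tile (if any) is diverted to the fixup buffer, which is what the fixup flag above toggles. A conceptual host-side planner for one block's slice, again ignoring the blocks_per_iter rounding; all names are illustrative.

// Conceptual planner for one block's stream-k slice; purely illustrative.
#include <algorithm>
#include <cstdint>
#include <vector>

struct TileJob {
    int64_t tile;      // flat (it, wt, zt, jt) tile index
    int     kb0_start; // first K block this CUDA block computes for the tile
    int     kb0_stop;  // one past the last K block it computes
    bool    to_fixup;  // partial result goes to tmp_fixup instead of dst
};

static std::vector<TileJob> plan_block(int64_t kbc, int64_t kbc_stop, int blocks_per_ne00) {
    std::vector<TileJob> jobs;
    while (kbc < kbc_stop) {
        const int64_t tile      = kbc / blocks_per_ne00;
        const int     kb0_start = int(kbc % blocks_per_ne00);
        const int     kb0_stop  = int(std::min<int64_t>(blocks_per_ne00, kb0_start + (kbc_stop - kbc)));
        // Only the block that reaches the end of a tile's K range writes dst directly;
        // otherwise its partial sum must be combined later by the fixup kernel.
        const bool to_fixup = kb0_stop != blocks_per_ne00;
        jobs.push_back({tile, kb0_start, kb0_stop, to_fixup});
        kbc = (tile + 1)*int64_t(blocks_per_ne00); // jump to the next tile boundary
    }
    return jobs;
}

Because at most the last entry of a slice can be partial, the launch code further down only allocates tmp_fixup when ntx*nty*ntzw % nsm != 0, i.e. when slice boundaries cannot all land on tile boundaries.
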
mul_mat_q_process_tile - (x, yc, dst, tmp_fixup, ne00, ne01, stride01, ne10, ne11, stride11, ne0, - it, jt, kb0_start, kb0_stop); + (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, ncols_y, stride_row_x, stride_col_dst, + tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop); } template static __global__ void mul_mat_q_stream_k_fixup( - float * __restrict__ dst, const float * __restrict__ tmp_last_tile, const int ne00, const int ne01, const int ne11, const int ne0, const int block_num_mmq) { - + const int32_t * ids_dst, const int32_t * expert_bounds, float * __restrict__ dst, const float * __restrict__ tmp_last_tile, + const int ncols_x, const int nrows_x, const int ncols_y, const int stride_col_dst, + const int nchannels_y, const int stride_channel_dst, const int nsamples_y, const int stride_sample_dst) { constexpr int mmq_y = get_mmq_y_device(); constexpr int qk = ggml_cuda_type_traits::qk; constexpr int blocks_per_iter = MMQ_ITER_K / qk; - const int64_t blocks_per_ne00 = ne00 / qk; + const int64_t blocks_per_ne00 = ncols_x / qk; float sum[mmq_x*mmq_y / (nwarps*WARP_SIZE)] = {0.0f}; - const int ntx = (ne11 + mmq_x - 1) / mmq_x; - const int nty = (ne01 + mmq_y - 1) / mmq_y; + const int ntx = (ncols_y + mmq_x - 1) / mmq_x; + const int nty = (nrows_x + mmq_y - 1) / mmq_y; + + const int bidx0 = blockIdx.x; + + // kbc == k block continuous, current index in continuous ijk space. + int64_t kbc0 = (int64_t) bidx0 *nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x; + int64_t kbc0_stop = (int64_t)(bidx0 + 1)*nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x; + + kbc0 -= (kbc0 % blocks_per_ne00) % blocks_per_iter; + kbc0_stop -= (kbc0_stop % blocks_per_ne00) % blocks_per_iter; + + const bool did_not_have_any_data = kbc0 == kbc0_stop; + const bool wrote_beginning_of_tile = kbc0 % blocks_per_ne00 == 0; + const bool did_not_write_last = kbc0/blocks_per_ne00 == kbc0_stop/blocks_per_ne00 && kbc0_stop % blocks_per_ne00 != 0; + if (did_not_have_any_data || wrote_beginning_of_tile || did_not_write_last) { + return; + } bool any_fixup = false; - const int bidx_start = ((blockIdx.y*nty + blockIdx.x) * block_num_mmq) / (gridDim.y*gridDim.x); - const int bidx_stop = ((blockIdx.y*nty + blockIdx.x + 1) * block_num_mmq + gridDim.y*gridDim.x - 1) / (gridDim.y*gridDim.x); + // Iterate over previous blocks and sum up partial sums written to fixup buffer. + // All CUDA blocks that get here must have a previous block that needs a fixup. 
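
The fixup kernel recomputes the same slice boundaries and bails out early in the three cases spelled out above: only a block that finished a tile whose earlier K blocks were computed by previous blocks has anything to accumulate. Restated host-side, reusing the slice arithmetic from the earlier sketches; names are illustrative.

// Illustrative restatement of the fixup early-out conditions; not the kernel.
#include <cstdint>

static bool block_needs_fixup(int bidx, int grid_dim_x, int64_t total_kblocks,
        int blocks_per_ne00, int blocks_per_iter) {
    int64_t kbc0      = (int64_t) bidx      * total_kblocks / grid_dim_x;
    int64_t kbc0_stop = (int64_t)(bidx + 1) * total_kblocks / grid_dim_x;

    kbc0      -= (kbc0      % blocks_per_ne00) % blocks_per_iter;
    kbc0_stop -= (kbc0_stop % blocks_per_ne00) % blocks_per_iter;

    const bool did_not_have_any_data   = kbc0 == kbc0_stop;
    const bool wrote_beginning_of_tile = kbc0 % blocks_per_ne00 == 0;
    const bool did_not_write_last      = kbc0/blocks_per_ne00 == kbc0_stop/blocks_per_ne00 &&
                                         kbc0_stop % blocks_per_ne00 != 0;

    return !(did_not_have_any_data || wrote_beginning_of_tile || did_not_write_last);
}

When it does run, the backward walk over bidx-1, bidx-2, ... keeps adding partial tiles from tmp_fixup until it reaches a block that started at or before the beginning of the tile this block finished.
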
+ int64_t bidx = bidx0 - 1; + int64_t kbc_stop = kbc0; + while(true) { + int64_t kbc = bidx*nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x; + kbc -= (kbc % blocks_per_ne00) % blocks_per_iter; - int64_t kbc_0; - int64_t kbc_stop_0 = (int64_t) bidx_start*blocks_per_ne00*ntx*nty / block_num_mmq; - - for (int bidx = bidx_start; bidx < bidx_stop; ++bidx) { - kbc_0 = kbc_stop_0; - kbc_stop_0 = (int64_t) (bidx + 1)*blocks_per_ne00*ntx*nty / block_num_mmq; - - const int64_t kbc = kbc_0 - (kbc_0 % blocks_per_ne00) % blocks_per_iter; - const int64_t kbc_stop = kbc_stop_0 - (kbc_stop_0 % blocks_per_ne00) % blocks_per_iter; - - // Skip fixup tile if the MMQ CUDA block never wrote anything to it: - if (kbc == kbc_stop || kbc_stop % blocks_per_ne00 == 0) { - continue; - } - - const int jt = kbc_stop / (blocks_per_ne00*nty); - const int it = (kbc_stop - jt*(blocks_per_ne00*nty)) / blocks_per_ne00; - - // Skip fixup tile if it's unrelated to the output tile assigned to this CUDA block: - if ((unsigned)it != blockIdx.x || (unsigned)jt != blockIdx.y) { + if (kbc == kbc_stop) { // Did not have any data. + bidx--; + kbc_stop = kbc; continue; } @@ -2729,16 +2892,71 @@ static __global__ void mul_mat_q_stream_k_fixup( sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE] += tmp_last_tile[bidx*(mmq_x*mmq_y) + j*mmq_y + i]; } } + + // If this block started in a previous tile we are done and don't need to combine additional partial results. + if (kbc % blocks_per_ne00 == 0 || kbc/blocks_per_ne00 < kbc0/blocks_per_ne00) { + break; + } + bidx--; + kbc_stop = kbc; } if (!any_fixup) { return; } - dst += blockIdx.y*mmq_x*ne0 + blockIdx.x*mmq_y; + int tmp = kbc0; + const int it = tmp / (nsamples_y*nchannels_y*ntx*blocks_per_ne00); + tmp -= it * (nsamples_y*nchannels_y*ntx*blocks_per_ne00); + const int wt = tmp / (nchannels_y*ntx*blocks_per_ne00); + tmp -= wt * (nchannels_y*ntx*blocks_per_ne00); + const int zt = tmp / (ntx*blocks_per_ne00); + tmp -= zt * (ntx*blocks_per_ne00); + const int jt = tmp / blocks_per_ne00; - const int i_max = ne01 - blockIdx.x*mmq_y - 1; - const int j_max = ne11 - blockIdx.y*mmq_x - 1; + if (!ids_dst) { + const int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst + it*mmq_y; + dst += offset_dst; + + const int i_max = nrows_x - it*mmq_y - 1; + const int j_max = ncols_y - jt*mmq_x - 1; + +#pragma unroll + for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { + const int j = j0 + threadIdx.y; + + if (j > j_max) { + return; + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) { + const int i = i0 + threadIdx.x; + + if (need_check && i > i_max) { + continue; + } + + dst[j*stride_col_dst + i] += sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE]; + } + } + return; + } + + __shared__ int ids_dst_shared[mmq_x]; + const int col_low = expert_bounds[zt + 0]; + const int col_high = expert_bounds[zt + 1]; + const int col_diff = col_high - col_low; + + for (int j = threadIdx.y*WARP_SIZE + threadIdx.x; j < mmq_x; j += nwarps*WARP_SIZE) { + ids_dst_shared[j] = ids_dst[col_low + j]; + } + + const int offset_dst = it*mmq_y; + dst += offset_dst; + + const int i_max = nrows_x - it*mmq_y - 1; + const int j_max = col_diff - jt*mmq_x - 1; #pragma unroll for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { @@ -2756,26 +2974,27 @@ static __global__ void mul_mat_q_stream_k_fixup( continue; } - dst[j*ne0 + i] += sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE]; + dst[ids_dst_shared[j]*stride_col_dst + i] += sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE]; } } } struct 
mmq_args { - const char * x; const char * y; float * dst; - int64_t ne00; int64_t ne01; int64_t stride01; - int64_t ne10; int64_t ne11; int64_t stride11; - int64_t ne0; + const char * x; ggml_type type_x; const int * y; const int32_t * ids_dst; const int32_t * expert_bounds; float * dst; + int64_t ncols_x; int64_t nrows_x; int64_t ncols_y; int64_t stride_row_x; int64_t nrows_dst; + int64_t nchannels_x; int64_t nchannels_y; int64_t stride_channel_x; int64_t stride_channel_y; int64_t stride_channel_dst; + int64_t nsamples_x; int64_t nsamples_y; int64_t stride_sample_x; int64_t stride_sample_y; int64_t stride_sample_dst; bool use_stream_k; }; template -static int mmq_get_shmem(const int mmq_x, const int mmq_y, const int cc) { +static size_t mmq_get_nbytes_shared(const int mmq_x, const int mmq_y, const int cc) { const tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(type, mmq_y); const int mmq_tile_x_k = mmq_get_mma_tile_x_k(type); - const int shmem_x = new_mma_available(cc) ? mmq_y*mmq_tile_x_k*sizeof(int) : txs.qs*sizeof(int) + txs.dm*sizeof(half2) + txs.sc*sizeof(int); - const int shmem_y = mmq_x*sizeof(block_q8_1_mmq); - return shmem_x + GGML_PAD(shmem_y, MMQ_NWARPS*WARP_SIZE*sizeof(int)); + const size_t nbs_ids = mmq_x*sizeof(int); + const size_t nbs_x = new_mma_available(cc) ? mmq_y*mmq_tile_x_k*sizeof(int) : txs.qs*sizeof(int) + txs.dm*sizeof(half2) + txs.sc*sizeof(int); + const size_t nbs_y = mmq_x*sizeof(block_q8_1_mmq); + return nbs_ids + nbs_x + GGML_PAD(nbs_y, MMQ_NWARPS*WARP_SIZE*sizeof(int)); } template @@ -2787,86 +3006,114 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a const dim3 block_dims(WARP_SIZE, MMQ_NWARPS, 1); - const int shmem = mmq_get_shmem(mmq_x, mmq_y, cc); + const int nbytes_shared = mmq_get_nbytes_shared(mmq_x, mmq_y, cc); #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA) - static bool shmem_limit_raised[GGML_CUDA_MAX_DEVICES] = {false}; - if (!shmem_limit_raised[id]) { - CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem)); - CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem)); - shmem_limit_raised[id] = true; + static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false}; + if (!shared_memory_limit_raised[id]) { + CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared)); + CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared)); + shared_memory_limit_raised[id] = true; } #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA) - const int nty = (args.ne01 + mmq_y - 1) / mmq_y; - const int ntx = (args.ne11 + mmq_x - 1) / mmq_x; - const dim3 block_nums_xy_tiling(nty, ntx, 1); + const int nty = (args.nrows_x + mmq_y - 1) / mmq_y; + const int ntx = (args.ncols_y + mmq_x - 1) / mmq_x; + const int ntzw = args.nchannels_y * args.nsamples_y; + const dim3 block_nums_xy_tiling(nty, ntx, ntzw); + + GGML_ASSERT(args.nchannels_y % args.nchannels_x == 0); + GGML_ASSERT(args.nsamples_y % args.nsamples_x == 0); + const int channel_ratio = args.nchannels_y / args.nchannels_x; + const int sample_ratio = args.nsamples_y / args.nsamples_x; if (!args.use_stream_k) { - if (args.ne01 % mmq_y == 0) { + if (args.nrows_x % mmq_y == 0) { constexpr bool need_check = false; - mul_mat_q<<>> - (args.x, args.y, args.dst, nullptr, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, 
args.stride11, args.ne0); + mul_mat_q<<>> + (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr, + args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst, + channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst, + sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst); } else { constexpr bool need_check = true; - mul_mat_q<<>> - (args.x, args.y, args.dst, nullptr, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0); + mul_mat_q<<>> + (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr, + args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst, + channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst, + sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst); } return; } - const dim3 block_nums_mmq(nsm, 1, 1); + const dim3 block_nums_stream_k(nsm, 1, 1); + const bool fixup_needed = ntx*nty*ntzw % nsm != 0; ggml_cuda_pool & pool = ctx.pool(id); - ggml_cuda_pool_alloc tmp_fixup(pool, block_nums_mmq.x * mmq_x*mmq_y); + ggml_cuda_pool_alloc tmp_fixup(pool); + if (fixup_needed) { + tmp_fixup.alloc(block_nums_stream_k.x * mmq_x*mmq_y); + } - if (args.ne01 % mmq_y == 0) { + if (args.nrows_x % mmq_y == 0) { constexpr bool need_check = false; - mul_mat_q<<>> - (args.x, args.y, args.dst, tmp_fixup.ptr, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0); + mul_mat_q<<>> + (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, + args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst, + channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst, + sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst); - mul_mat_q_stream_k_fixup<<>> - (args.dst, tmp_fixup.ptr, args.ne00, args.ne01, args.ne11, args.ne0, block_nums_mmq.x); + if (!fixup_needed) { + return; + } + + mul_mat_q_stream_k_fixup<<>> + (args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_y, + args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst); } else { constexpr bool need_check = true; - mul_mat_q<<>> - (args.x, args.y, args.dst, tmp_fixup.ptr, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0); + mul_mat_q<<>> + (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, + args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst, + channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst, + sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst); - mul_mat_q_stream_k_fixup<<>> - (args.dst, tmp_fixup.ptr, args.ne00, args.ne01, args.ne11, args.ne0, block_nums_mmq.x); + if (!fixup_needed) { + return; + } + + mul_mat_q_stream_k_fixup<<>> + (args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_y, + args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst); } } template void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) { - const int id = ggml_cuda_get_device(); - const int cc = ggml_cuda_info().devices[id].cc; - const int smpbo 
= ggml_cuda_info().devices[id].smpbo; + const int id = ggml_cuda_get_device(); + const int cc = ggml_cuda_info().devices[id].cc; + const size_t smpbo = ggml_cuda_info().devices[id].smpbo; const int mmq_x_max = get_mmq_x_max_host(cc); const int mmq_y = get_mmq_y_host(cc); - const int block_num_y = (args.ne01 + mmq_y - 1) / mmq_y; - const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA; int mmq_x_best = 0; - int nparts_best = INT_MAX; + int ntiles_x_best = INT_MAX; - for (int mmq_x = 8; mmq_x <= mmq_x_max && nparts_best > 1; mmq_x += 8) { + for (int mmq_x = 8; mmq_x <= mmq_x_max && ntiles_x_best > 1; mmq_x += 8) { const int granularity = mmq_get_granularity_host(mmq_x, cc); - if (mmq_x % granularity != 0 || mmq_get_shmem(mmq_x, mmq_y, cc) > smpbo) { + if (mmq_x % granularity != 0 || mmq_get_nbytes_shared(mmq_x, mmq_y, cc) > smpbo) { continue; } - const int ntiles_x = (args.ne11 + mmq_x - 1) / mmq_x; - const int nwaves_xy_tiling = ntiles_x*block_num_y; - const int nparts = use_stream_k ? ntiles_x : nwaves_xy_tiling; + const int ntiles_x = (args.ncols_y + mmq_x - 1) / mmq_x; - if (nparts < nparts_best) { - mmq_x_best = mmq_x; - nparts_best = nparts; + if (ntiles_x < ntiles_x_best) { + mmq_x_best = mmq_x; + ntiles_x_best = ntiles_x; } } @@ -2950,6 +3197,9 @@ extern DECL_MMQ_CASE(GGML_TYPE_IQ4_XS); // ------------------------------------------------------------------------------------------------------------------------- +void ggml_cuda_mul_mat_q( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst); + void ggml_cuda_op_mul_mat_q( ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/mmv.cu b/ml/backend/ggml/ggml/src/ggml-cuda/mmv.cu index b39961cd1..d8c385e23 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/mmv.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/mmv.cu @@ -4,18 +4,23 @@ template static __global__ void mul_mat_vec( - const T * __restrict__ x, const float * __restrict__ y, float * __restrict__ dst, const int64_t ncols2, const int64_t stride_row, + const T * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, float * __restrict__ dst, + const int64_t ncols2, const int64_t nchannels_y, const int64_t stride_row, const int64_t channel_ratio, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t sample_ratio, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst) { - const int64_t row = blockIdx.x; - const int64_t channel = blockIdx.y; - const int64_t sample = blockIdx.z; - const int tid = threadIdx.x; - constexpr int warp_size = ggml_cuda_get_physical_warp_size(); + const int64_t row = blockIdx.x; + const int64_t channel_dst = blockIdx.y; + const int64_t channel_x = ids ? ids[channel_dst] : channel_dst / channel_ratio; + const int64_t channel_y = ids ? 
channel_dst % nchannels_y : channel_dst; + const int64_t sample_dst = blockIdx.z; + const int64_t sample_x = sample_dst / sample_ratio; + const int64_t sample_y = sample_dst; + const int tid = threadIdx.x; + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); - x += (sample/sample_ratio)*stride_sample_x + (channel/channel_ratio)*stride_channel_x + row*stride_row; - y += sample *stride_sample_y + channel *stride_channel_y; - dst += sample *stride_sample_dst + channel *stride_channel_dst; + x += sample_x *stride_sample_x + channel_x *stride_channel_x + row*stride_row; + y += sample_y *stride_sample_y + channel_y *stride_channel_y; + dst += sample_dst*stride_sample_dst + channel_dst*stride_channel_dst; const float2 * y2 = (const float2 *) y; @@ -31,12 +36,19 @@ static __global__ void mul_mat_vec( float sumf = 0.0f; - if constexpr (std::is_same::value) { + if constexpr (std::is_same::value) { + const float2 * x2 = (const float2 *) x; + + for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) { + const float2 tmpx = x2[col2]; + const float2 tmpy = y2[col2]; + sumf += tmpx.x*tmpy.x; + sumf += tmpx.y*tmpy.y; + } + } else if constexpr (std::is_same::value) { const half2 * x2 = (const half2 *) x; if (std::is_same::value) { - sumf = 0.0f; - for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) { const float2 tmpx = __half22float2(x2[col2]); const float2 tmpy = y2[col2]; @@ -59,8 +71,6 @@ static __global__ void mul_mat_vec( } } else if constexpr (std::is_same::value) { const int * x2 = (const int *) x; - sumf = 0.0f; - for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) { const int tmpx = x2[col2]; const float2 tmpy = y2[col2]; @@ -92,17 +102,17 @@ static __global__ void mul_mat_vec( template static void launch_mul_mat_vec_cuda( - const T * x, const float * y, float * dst, - const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y, + const T * x, const float * y, const int32_t * ids, float * dst, + const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x, - const int64_t nsamples_y, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, + const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, cudaStream_t stream) { GGML_ASSERT(ncols % 2 == 0); GGML_ASSERT(stride_row % 2 == 0); - GGML_ASSERT(nchannels_y % nchannels_x == 0); - GGML_ASSERT(nsamples_y % nsamples_x == 0); - const int64_t channel_ratio = nchannels_y / nchannels_x; - const int64_t sample_ratio = nsamples_y / nsamples_x; + GGML_ASSERT(ids || nchannels_dst % nchannels_x == 0); + GGML_ASSERT( nsamples_dst % nsamples_x == 0); + const int64_t channel_ratio = nchannels_dst / nchannels_x; + const int64_t sample_ratio = nsamples_dst / nsamples_x; int device; int warp_size; @@ -124,48 +134,48 @@ static void launch_mul_mat_vec_cuda( } const int smem = warp_size*sizeof(float); - const dim3 block_nums(nrows, nchannels_y, nsamples_y); + const dim3 block_nums(nrows, nchannels_dst, nsamples_dst); const dim3 block_dims(block_size_best, 1, 1); switch (block_size_best) { case 32: { mul_mat_vec<<>> - (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, 
stride_sample_y, stride_sample_dst); + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, + stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 64: { mul_mat_vec<<>> - (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, + stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 96: { mul_mat_vec<<>> - (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, + stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 128: { mul_mat_vec<<>> - (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, + stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 160: { mul_mat_vec<<>> - (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, + stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 192: { mul_mat_vec<<>> - (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, + stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 224: { mul_mat_vec<<>> - (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, + stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 256: { mul_mat_vec<<>> - (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, + stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; default: { GGML_ABORT("fatal error"); @@ -175,28 +185,28 @@ static void launch_mul_mat_vec_cuda( template static void mul_mat_vec_cuda( - const T * x, const float * y, float * dst, - const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y, + const T * x, const float * y, const int32_t * ids, float * dst, + const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, 
const int64_t nchannels_y, const int64_t nchannels_dst, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x, - const int64_t nsamples_y, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, + const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, enum ggml_prec prec, cudaStream_t stream) { - switch (prec) { - case GGML_PREC_DEFAULT: { + if constexpr(std::is_same::value) { + if (prec == GGML_PREC_DEFAULT) { launch_mul_mat_vec_cuda - (x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_y, stride_sample_x, stride_sample_y, stride_sample_dst, stream); - } break; - case GGML_PREC_F32: { - launch_mul_mat_vec_cuda - (x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_y, stride_sample_x, stride_sample_y, stride_sample_dst, stream); - } break; + (x, y, ids, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + return; + } } + launch_mul_mat_vec_cuda + (x, y, ids, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); } -void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); +void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) { + GGML_ASSERT( src1->type == GGML_TYPE_F32); + GGML_ASSERT(!ids || ids->type == GGML_TYPE_I32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); GGML_TENSOR_BINARY_OP_LOCALS; @@ -204,21 +214,24 @@ void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * const size_t ts_src1 = ggml_type_size(src1->type); const size_t ts_dst = ggml_type_size(dst->type); - GGML_ASSERT(ne11 == 1); - GGML_ASSERT(ne12 == ne2); + GGML_ASSERT(!ids || ne12 == 1); // Implementation is only correct for batch size 1. GGML_ASSERT(ne13 == ne3); - GGML_ASSERT(nb00 == ts_src0); - GGML_ASSERT(nb10 == ts_src1); - GGML_ASSERT(nb0 == ts_dst); + GGML_ASSERT( nb00 == ts_src0); + GGML_ASSERT( nb10 == ts_src1); + GGML_ASSERT(!ids || ids->nb[0] == ggml_type_size(ids->type)); + GGML_ASSERT( nb0 == ts_dst); const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32; - const float * src1_d = (const float *) src1->data; - float * dst_d = (float *) dst->data; + const float * src1_d = (const float *) src1->data; + const int32_t * ids_d = ids ? 
(const int32_t *) ids->data : nullptr; + float * dst_d = (float *) dst->data; const int64_t s01 = src0->nb[1] / ts_src0; + const int64_t s11 = src1->nb[1] / ts_src1; + const int64_t s1 = dst->nb[1] / ts_dst; const int64_t s02 = src0->nb[2] / ts_src0; const int64_t s12 = src1->nb[2] / ts_src1; const int64_t s2 = dst->nb[2] / ts_dst; @@ -226,14 +239,33 @@ void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * const int64_t s13 = src1->nb[3] / ts_src1; const int64_t s3 = dst->nb[3] / ts_dst; + // For MUL_MAT_ID the memory layout is different than for MUL_MAT: + const int64_t ncols_dst = ids ? ne2 : ne1; + const int64_t nchannels_y = ids ? ne11 : ne12; + const int64_t nchannels_dst = ids ? ne1 : ne2; + const int64_t stride_channel_dst = ids ? s1 : s2; + const int64_t stride_channel_y = ids ? s11 : s12; + + GGML_ASSERT(ncols_dst == 1); + switch (src0->type) { + case GGML_TYPE_F32: { + const float * src0_d = (const float *) src0->data; + mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, s01, + ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, + ne03, ne3, s03, s13, s3, prec, ctx.stream()); + } break; case GGML_TYPE_F16: { const half * src0_d = (const half *) src0->data; - mul_mat_vec_cuda(src0_d, src1_d, dst_d, ne00, ne01, s01, ne02, ne12, s02, s12, s2, ne03, ne13, s03, s13, s3, prec, ctx.stream()); + mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, s01, + ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, + ne03, ne3, s03, s13, s3, prec, ctx.stream()); } break; case GGML_TYPE_BF16: { const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0->data; - mul_mat_vec_cuda(src0_d, src1_d, dst_d, ne00, ne01, s01, ne02, ne12, s02, s12, s2, ne03, ne13, s03, s13, s3, prec, ctx.stream()); + mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, s01, + ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, + ne03, ne3, s03, s13, s3, prec, ctx.stream()); } break; default: GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type)); @@ -262,27 +294,34 @@ void ggml_cuda_op_mul_mat_vec( const int64_t stride_row = ne00; const int64_t nchannels_x = 1; const int64_t nchannels_y = 1; + const int64_t nchannels_dst = 1; const int64_t stride_channel_x = 0; const int64_t stride_channel_y = 0; const int64_t stride_channel_dst = 0; const int64_t nsamples_x = 1; - const int64_t nsamples_y = 1; + const int64_t nsamples_dst = 1; const int64_t stride_sample_x = 0; const int64_t stride_sample_y = 0; const int64_t stride_sample_dst = 0; switch (src0->type) { + case GGML_TYPE_F32: { + const float * src0_d = (const float *) src0_dd_i; + mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, stride_row, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream); + } break; case GGML_TYPE_F16: { const half * src0_d = (const half *) src0_dd_i; - mul_mat_vec_cuda(src0_d, src1_ddf_i, dst_dd_i, ne00, row_diff, stride_row, - nchannels_x, nchannels_y, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_y, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream); + mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, stride_row, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, 
stream); } break; case GGML_TYPE_BF16: { const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0_dd_i; - mul_mat_vec_cuda(src0_d, src1_ddf_i, dst_dd_i, ne00, row_diff, stride_row, - nchannels_x, nchannels_y, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_y, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream); + mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, stride_row, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream); } break; default: GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type)); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/mmv.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/mmv.cuh index 78a1cd4a6..756e7e1cc 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/mmv.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/mmv.cuh @@ -3,7 +3,7 @@ // maximum number of src0 rows with which to use mul_mat_vec over cuBLAS if FP16 tensor cores are available #define MMV_MAX_ROWS 512 -void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst); void ggml_cuda_op_mul_mat_vec( ggml_backend_cuda_context & ctx, diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cu b/ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cu index eef8585a7..132c466fd 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cu @@ -1,50 +1,57 @@ #include "mmvq.cuh" +#include "quantize.cuh" #include "vecdotq.cuh" +#include + typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs); static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type) { - return type == GGML_TYPE_Q4_0 ? vec_dot_q4_0_q8_1 : - type == GGML_TYPE_Q4_1 ? vec_dot_q4_1_q8_1 : - type == GGML_TYPE_Q5_0 ? vec_dot_q5_0_q8_1 : - type == GGML_TYPE_Q5_1 ? vec_dot_q5_1_q8_1 : - type == GGML_TYPE_Q8_0 ? vec_dot_q8_0_q8_1 : - type == GGML_TYPE_Q2_K ? vec_dot_q2_K_q8_1 : - type == GGML_TYPE_Q3_K ? vec_dot_q3_K_q8_1 : - type == GGML_TYPE_Q4_K ? vec_dot_q4_K_q8_1 : - type == GGML_TYPE_Q5_K ? vec_dot_q5_K_q8_1 : - type == GGML_TYPE_Q6_K ? vec_dot_q6_K_q8_1 : - type == GGML_TYPE_IQ2_XXS ? vec_dot_iq2_xxs_q8_1 : - type == GGML_TYPE_IQ2_XS ? vec_dot_iq2_xs_q8_1 : - type == GGML_TYPE_IQ2_S ? vec_dot_iq2_s_q8_1 : - type == GGML_TYPE_IQ3_XXS ? vec_dot_iq3_xxs_q8_1 : - type == GGML_TYPE_IQ1_S ? vec_dot_iq1_s_q8_1 : - type == GGML_TYPE_IQ1_M ? vec_dot_iq1_m_q8_1 : - type == GGML_TYPE_IQ4_NL ? vec_dot_iq4_nl_q8_1 : - type == GGML_TYPE_IQ4_XS ? vec_dot_iq4_xs_q8_1 : - type == GGML_TYPE_IQ3_S ? 
vec_dot_iq3_s_q8_1 : - nullptr; + switch (type) { + case GGML_TYPE_Q4_0: return vec_dot_q4_0_q8_1; + case GGML_TYPE_Q4_1: return vec_dot_q4_1_q8_1; + case GGML_TYPE_Q5_0: return vec_dot_q5_0_q8_1; + case GGML_TYPE_Q5_1: return vec_dot_q5_1_q8_1; + case GGML_TYPE_Q8_0: return vec_dot_q8_0_q8_1; + case GGML_TYPE_Q2_K: return vec_dot_q2_K_q8_1; + case GGML_TYPE_Q3_K: return vec_dot_q3_K_q8_1; + case GGML_TYPE_Q4_K: return vec_dot_q4_K_q8_1; + case GGML_TYPE_Q5_K: return vec_dot_q5_K_q8_1; + case GGML_TYPE_Q6_K: return vec_dot_q6_K_q8_1; + case GGML_TYPE_IQ2_XXS: return vec_dot_iq2_xxs_q8_1; + case GGML_TYPE_IQ2_XS: return vec_dot_iq2_xs_q8_1; + case GGML_TYPE_IQ2_S: return vec_dot_iq2_s_q8_1; + case GGML_TYPE_IQ3_XXS: return vec_dot_iq3_xxs_q8_1; + case GGML_TYPE_IQ1_S: return vec_dot_iq1_s_q8_1; + case GGML_TYPE_IQ1_M: return vec_dot_iq1_m_q8_1; + case GGML_TYPE_IQ4_NL: return vec_dot_iq4_nl_q8_1; + case GGML_TYPE_IQ4_XS: return vec_dot_iq4_xs_q8_1; + case GGML_TYPE_IQ3_S: return vec_dot_iq3_s_q8_1; + default: return nullptr; + } } static constexpr __device__ int get_vdr_mmvq(ggml_type type) { - return type == GGML_TYPE_Q4_0 ? VDR_Q4_0_Q8_1_MMVQ : - type == GGML_TYPE_Q4_1 ? VDR_Q4_1_Q8_1_MMVQ : - type == GGML_TYPE_Q5_0 ? VDR_Q5_0_Q8_1_MMVQ : - type == GGML_TYPE_Q5_1 ? VDR_Q5_1_Q8_1_MMVQ : - type == GGML_TYPE_Q8_0 ? VDR_Q8_0_Q8_1_MMVQ : - type == GGML_TYPE_Q2_K ? VDR_Q2_K_Q8_1_MMVQ : - type == GGML_TYPE_Q3_K ? VDR_Q3_K_Q8_1_MMVQ : - type == GGML_TYPE_Q4_K ? VDR_Q4_K_Q8_1_MMVQ : - type == GGML_TYPE_Q5_K ? VDR_Q5_K_Q8_1_MMVQ : - type == GGML_TYPE_Q6_K ? VDR_Q6_K_Q8_1_MMVQ : - type == GGML_TYPE_IQ2_XXS ? VDR_IQ2_XXS_Q8_1_MMVQ : - type == GGML_TYPE_IQ2_XS ? VDR_IQ2_XS_Q8_1_MMVQ : - type == GGML_TYPE_IQ2_S ? VDR_IQ2_S_Q8_1_MMVQ : - type == GGML_TYPE_IQ3_XXS ? VDR_IQ3_XXS_Q8_1_MMVQ : - type == GGML_TYPE_IQ3_S ? VDR_IQ3_S_Q8_1_MMVQ : - type == GGML_TYPE_IQ4_NL ? VDR_IQ4_NL_Q8_1_MMVQ : - type == GGML_TYPE_IQ4_XS ? 
VDR_IQ4_XS_Q8_1_MMVQ : - 1; + switch (type) { + case GGML_TYPE_Q4_0: return VDR_Q4_0_Q8_1_MMVQ; + case GGML_TYPE_Q4_1: return VDR_Q4_1_Q8_1_MMVQ; + case GGML_TYPE_Q5_0: return VDR_Q5_0_Q8_1_MMVQ; + case GGML_TYPE_Q5_1: return VDR_Q5_1_Q8_1_MMVQ; + case GGML_TYPE_Q8_0: return VDR_Q8_0_Q8_1_MMVQ; + case GGML_TYPE_Q2_K: return VDR_Q2_K_Q8_1_MMVQ; + case GGML_TYPE_Q3_K: return VDR_Q3_K_Q8_1_MMVQ; + case GGML_TYPE_Q4_K: return VDR_Q4_K_Q8_1_MMVQ; + case GGML_TYPE_Q5_K: return VDR_Q5_K_Q8_1_MMVQ; + case GGML_TYPE_Q6_K: return VDR_Q6_K_Q8_1_MMVQ; + case GGML_TYPE_IQ2_XXS: return VDR_IQ2_XXS_Q8_1_MMVQ; + case GGML_TYPE_IQ2_XS: return VDR_IQ2_XS_Q8_1_MMVQ; + case GGML_TYPE_IQ2_S: return VDR_IQ2_S_Q8_1_MMVQ; + case GGML_TYPE_IQ3_XXS: return VDR_IQ3_XXS_Q8_1_MMVQ; + case GGML_TYPE_IQ3_S: return VDR_IQ3_S_Q8_1_MMVQ; + case GGML_TYPE_IQ4_NL: return VDR_IQ4_NL_Q8_1_MMVQ; + case GGML_TYPE_IQ4_XS: return VDR_IQ4_XS_Q8_1_MMVQ; + default: return 1; + } } enum mmvq_parameter_table_id { @@ -73,9 +80,9 @@ static __host__ mmvq_parameter_table_id get_device_table_id(int cc) { return MMVQ_PARAMETERS_GENERIC; } -static constexpr __host__ __device__ int calc_nwarps(int ncols_y, mmvq_parameter_table_id table_id) { +static constexpr __host__ __device__ int calc_nwarps(int ncols_dst, mmvq_parameter_table_id table_id) { if (table_id == MMVQ_PARAMETERS_GENERIC) { - switch (ncols_y) { + switch (ncols_dst) { case 1: case 2: case 3: @@ -90,7 +97,7 @@ static constexpr __host__ __device__ int calc_nwarps(int ncols_y, mmvq_paramete return 1; } } else if (table_id == MMVQ_PARAMETERS_GCN) { - switch (ncols_y) { + switch (ncols_dst) { case 1: case 2: case 3: @@ -107,9 +114,9 @@ static constexpr __host__ __device__ int calc_nwarps(int ncols_y, mmvq_paramete return 1; } -static constexpr __host__ __device__ int calc_rows_per_block(int ncols_y, int table_id) { +static constexpr __host__ __device__ int calc_rows_per_block(int ncols_dst, int table_id) { if (table_id == MMVQ_PARAMETERS_GENERIC || table_id == MMVQ_PARAMETERS_GCN) { - switch (ncols_y) { + switch (ncols_dst) { case 1: return 1; case 2: @@ -127,19 +134,21 @@ static constexpr __host__ __device__ int calc_rows_per_block(int ncols_y, int ta return 1; } -template +template // tell the compiler to use as many registers as it wants, see nwarps definition below -__launch_bounds__(calc_nwarps(ncols_y, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1) +__launch_bounds__(calc_nwarps(ncols_dst, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1) static __global__ void mul_mat_vec_q( - const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) { + const void * __restrict__ vx, const void * __restrict__ vy, const int32_t * __restrict__ ids, float * __restrict__ dst, + const int ncols_x, const int nchannels_y, const int stride_row_x, const int stride_col_y, const int stride_col_dst, + const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, + const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) { constexpr int qk = ggml_cuda_type_traits::qk; constexpr int qi = ggml_cuda_type_traits::qi; constexpr int vdr = get_vdr_mmvq(type); constexpr mmvq_parameter_table_id table_id = get_device_table_id(); - constexpr int nwarps = calc_nwarps(ncols_y, table_id); - constexpr int rows_per_cuda_block = calc_rows_per_block(ncols_y, table_id); + constexpr int nwarps = 
calc_nwarps(ncols_dst, table_id); + constexpr int rows_per_cuda_block = calc_rows_per_block(ncols_dst, table_id); constexpr int warp_size = ggml_cuda_get_physical_warp_size(); constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type); @@ -147,13 +156,21 @@ static __global__ void mul_mat_vec_q( const int tid = warp_size*threadIdx.y + threadIdx.x; const int row0 = rows_per_cuda_block*blockIdx.x; const int blocks_per_row_x = ncols_x / qk; - const int blocks_per_col_y = nrows_y / QK8_1; constexpr int blocks_per_iter = vdr * nwarps*warp_size / qi; - // partial sum for each thread - float tmp[ncols_y][rows_per_cuda_block] = {{0.0f}}; + // The MUL_MAT_ID code path with ids != nullptr is only implemented for ncols_dst == 1. + const int channel_dst = blockIdx.y; + const int channel_x = ncols_dst == 1 && ids ? ids[channel_dst] : channel_dst / channel_ratio; + const int channel_y = ncols_dst == 1 && ids ? channel_dst % nchannels_y : channel_dst; + const int sample_dst = blockIdx.z; + const int sample_x = sample_dst / sample_ratio; + const int sample_y = sample_dst; - const block_q8_1 * y = (const block_q8_1 *) vy; + // partial sum for each thread + float tmp[ncols_dst][rows_per_cuda_block] = {{0.0f}}; + + const block_q8_1 * y = ((const block_q8_1 *) vy) + sample_y*stride_sample_y + channel_y*stride_channel_y; + const int kbx_offset = sample_x*stride_sample_x + channel_x*stride_channel_x + row0*stride_row_x; for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) { const int kby = kbx * (qk/QK8_1); // y block index that aligns with kbx @@ -162,18 +179,19 @@ static __global__ void mul_mat_vec_q( const int kqs = vdr * (tid % (qi/vdr)); #pragma unroll - for (int j = 0; j < ncols_y; ++j) { + for (int j = 0; j < ncols_dst; ++j) { #pragma unroll for (int i = 0; i < rows_per_cuda_block; ++i) { - tmp[j][i] += vec_dot_q_cuda(vx, &y[j*blocks_per_col_y + kby], (row0 + i)*blocks_per_row_x + kbx, kqs); + tmp[j][i] += vec_dot_q_cuda( + vx, &y[j*stride_col_y + kby], kbx_offset + i*stride_row_x + kbx, kqs); } } } - __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][warp_size]; + __shared__ float tmp_shared[nwarps-1 > 0 ? 
nwarps-1 : 1][ncols_dst][rows_per_cuda_block][warp_size]; if (threadIdx.y > 0) { #pragma unroll - for (int j = 0; j < ncols_y; ++j) { + for (int j = 0; j < ncols_dst; ++j) { #pragma unroll for (int i = 0; i < rows_per_cuda_block; ++i) { tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i]; @@ -185,9 +203,11 @@ static __global__ void mul_mat_vec_q( return; } + dst += sample_dst*stride_sample_dst + channel_dst*stride_channel_dst + row0; + // sum up partial sums and write back result #pragma unroll - for (int j = 0; j < ncols_y; ++j) { + for (int j = 0; j < ncols_dst; ++j) { #pragma unroll for (int i = 0; i < rows_per_cuda_block; ++i) { #pragma unroll @@ -197,88 +217,121 @@ static __global__ void mul_mat_vec_q( tmp[j][i] = warp_reduce_sum(tmp[j][i]); } - if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + threadIdx.x < (unsigned)nrows_dst)) { - dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x]; + if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + int(threadIdx.x) < stride_col_dst)) { + dst[j*stride_col_dst + threadIdx.x] = tmp[j][threadIdx.x]; } } - - GGML_UNUSED(nrows_x); } -static std::pair calc_launch_params(const int ncols_y, const int nrows_x, const int warp_size, const mmvq_parameter_table_id table_id) { - const int64_t nblocks = (nrows_x + calc_rows_per_block(ncols_y, table_id) - 1) / calc_rows_per_block(ncols_y, table_id); - const dim3 block_nums(nblocks, 1, 1); - const dim3 block_dims(warp_size, calc_nwarps(ncols_y, table_id), 1); +static std::pair calc_launch_params( + const int ncols_dst, const int nrows_x, const int nchannels_y, const int nsamples_y, + const int warp_size, const mmvq_parameter_table_id table_id) { + const int64_t nblocks = (nrows_x + calc_rows_per_block(ncols_dst, table_id) - 1) / calc_rows_per_block(ncols_dst, table_id); + const dim3 block_nums(nblocks, nchannels_y, nsamples_y); + const dim3 block_dims(warp_size, calc_nwarps(ncols_dst, table_id), 1); return {block_nums, block_dims}; } template -static void mul_mat_vec_q_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { +static void mul_mat_vec_q_switch_ncols_dst( + const void * vx, const void * vy, const int32_t * ids, float * dst, + const int ncols_x, const int nrows_x, const int ncols_dst, + const int stride_row_x, const int stride_col_y, const int stride_col_dst, + const int nchannels_x, const int nchannels_y, const int nchannels_dst, + const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, + const int nsamples_x, const int nsamples_dst, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst, + cudaStream_t stream) { GGML_ASSERT(ncols_x % ggml_blck_size(type) == 0); - GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE); + GGML_ASSERT(ncols_dst <= MMVQ_MAX_BATCH_SIZE); + + const int channel_ratio = nchannels_dst / nchannels_x; + const int sample_ratio = nsamples_dst / nsamples_x; const int device = ggml_cuda_get_device(); const int warp_size = ggml_cuda_info().devices[device].warp_size; const mmvq_parameter_table_id table_id = get_device_table_id(ggml_cuda_info().devices[device].cc); - switch (ncols_y) { + GGML_ASSERT(!ids || ncols_dst == 1); + switch (ncols_dst) { case 1: { - constexpr int c_ncols_y = 1; - std::pair dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id); - mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + constexpr int 
c_ncols_dst = 1; + std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); + mul_mat_vec_q<<>> + (vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); break; } case 2: { - constexpr int c_ncols_y = 2; - std::pair dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id); - mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + constexpr int c_ncols_dst = 2; + std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); + mul_mat_vec_q<<>> + (vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); break; } case 3: { - constexpr int c_ncols_y = 3; - std::pair dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id); - mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + constexpr int c_ncols_dst = 3; + std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); + mul_mat_vec_q<<>> + (vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); break; } case 4: { - constexpr int c_ncols_y = 4; - std::pair dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id); - mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + constexpr int c_ncols_dst = 4; + std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); + mul_mat_vec_q<<>> + (vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); break; } case 5: { - constexpr int c_ncols_y = 5; - std::pair dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id); - mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + constexpr int c_ncols_dst = 5; + std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); + mul_mat_vec_q<<>> + (vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); break; } case 6: { - constexpr int c_ncols_y = 6; - std::pair dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id); - mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + constexpr int c_ncols_dst = 6; + std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); + mul_mat_vec_q<<>> + (vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); break; } case 7: { - constexpr int c_ncols_y = 7; - std::pair dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id); - mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, 
nrows_dst); + constexpr int c_ncols_dst = 7; + std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); + mul_mat_vec_q<<>> + (vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); break; } case 8: { - constexpr int c_ncols_y = 8; - std::pair dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id); - mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + constexpr int c_ncols_dst = 8; + std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); + mul_mat_vec_q<<>> + (vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); break; } default: @@ -287,137 +340,213 @@ static void mul_mat_vec_q_cuda( } } -static void mul_mat_vec_q4_0_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); +static void mul_mat_vec_q_switch_type( + const void * vx, const ggml_type type_x, const void * vy, const int32_t * ids, float * dst, + const int ncols_x, const int nrows_x, const int ncols_dst, + const int stride_row_x, const int stride_col_y, const int stride_col_dst, + const int nchannels_x, const int nchannels_y, const int nchannels_dst, + const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, + const int nsamples_x, const int nsamples_dst, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst, + cudaStream_t stream) { + switch (type_x) { + case GGML_TYPE_Q4_0: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_Q4_1: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_Q5_0: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_Q5_1: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_Q8_0: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + 
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_Q2_K: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_Q3_K: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_Q4_K: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_Q5_K: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_Q6_K: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_IQ2_XXS: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_IQ2_XS: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_IQ2_S: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_IQ3_XXS: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_IQ1_S: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, 
stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_IQ1_M: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_IQ4_NL: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_IQ4_XS: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + case GGML_TYPE_IQ3_S: + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); + break; + default: + GGML_ABORT("fatal error"); + break; + } } -static void mul_mat_vec_q4_1_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { +void ggml_cuda_mul_mat_vec_q( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) { + GGML_ASSERT( src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + GGML_ASSERT(!ids || ids->type == GGML_TYPE_I32); // Optional, used for batched GGML_MUL_MAT_ID. - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} + GGML_TENSOR_BINARY_OP_LOCALS; -static void mul_mat_vec_q5_0_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { + cudaStream_t stream = ctx.stream(); - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} + const size_t ts_src0 = ggml_type_size(src0->type); + const size_t ts_src1 = ggml_type_size(src1->type); + const size_t ts_dst = ggml_type_size(dst->type); -static void mul_mat_vec_q5_1_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { + GGML_ASSERT( nb00 == ts_src0); + GGML_ASSERT( nb10 == ts_src1); + GGML_ASSERT( nb0 == ts_dst); + GGML_ASSERT(!ids || ids->nb[0] == ggml_type_size(ids->type)); - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} + GGML_ASSERT(!ids || ne12 == 1); // Implementation is only correct for batch size 1. 
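// --- editor's aside (illustration only, not part of the patch) ------------------
// With a non-null `ids` tensor the kernels in this patch compute a gathered
// (MUL_MAT_ID) matrix-vector product: each destination channel selects its expert
// matrix via ids[channel_dst] and its input vector via channel_dst % nchannels_y,
// which is why the batch size is pinned to 1 above. A minimal CPU reference of
// that indexing, with simplified flat layouts and hypothetical names, is sketched
// below.
#include <cstdint>

static void mul_mat_id_vec_ref(
        const float * x, const float * y, const int32_t * ids, float * dst,
        const int64_t ncols, const int64_t nrows, const int64_t nchannels_y, const int64_t nchannels_dst) {
    for (int64_t cd = 0; cd < nchannels_dst; ++cd) {
        const int64_t cx = ids[cd];          // expert (src0 channel) for this output channel
        const int64_t cy = cd % nchannels_y; // input vector that feeds it
        for (int64_t r = 0; r < nrows; ++r) {
            float sum = 0.0f;
            for (int64_t c = 0; c < ncols; ++c) {
                sum += x[(cx*nrows + r)*ncols + c] * y[cy*ncols + c];
            }
            dst[cd*nrows + r] = sum;
        }
    }
}
// --- end of editor's aside -------------------------------------------------------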
-static void mul_mat_vec_q8_0_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { + const float * src1_d = (const float *) src1->data; + const int32_t * ids_d = ids ? (const int32_t *) ids->data : nullptr; + float * dst_d = (float *) dst->data; - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} + const int64_t ne10_padded = GGML_PAD(ne10, MATRIX_ROW_PADDING); + ggml_cuda_pool_alloc src1_q8_1(ctx.pool(), ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1); + { + const int64_t s11 = src1->nb[1] / ts_src1; + const int64_t s12 = src1->nb[2] / ts_src1; + const int64_t s13 = src1->nb[3] / ts_src1; + quantize_row_q8_1_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type, ne10, s11, s12, s13, ne10_padded, ne11, ne12, ne13, stream); + } -static void mul_mat_vec_q2_K_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { + const int64_t s01 = src0->nb[1] / ts_src0; + const int64_t s11 = ne10_padded / QK8_1; + const int64_t s1 = dst->nb[1] / ts_dst; + const int64_t s02 = src0->nb[2] / ts_src0; + const int64_t s2 = dst->nb[2] / ts_dst; + const int64_t s03 = src0->nb[3] / ts_src0; + const int64_t s3 = dst->nb[3] / ts_dst; - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} + const int64_t s12 = ne11*s11; + const int64_t s13 = ne12*s12; -static void mul_mat_vec_q3_K_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { + // For MUL_MAT_ID the memory layout is different than for MUL_MAT: + const int64_t ncols_dst = ids ? ne2 : ne1; + const int64_t nchannels_y = ids ? ne11 : ne12; + const int64_t nchannels_dst = ids ? ne1 : ne2; + const int64_t stride_col_dst = ids ? s2 : s1; + const int64_t stride_col_y = ids ? s12 : s11; + const int64_t stride_channel_dst = ids ? s1 : s2; + const int64_t stride_channel_y = ids ? 
s11 : s12; - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_q4_K_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_q5_K_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_q6_K_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_iq2_xxs_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_iq2_xs_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_iq2_s_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_iq3_xxs_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_iq1_s_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_iq1_m_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_iq4_nl_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_iq4_xs_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void 
mul_mat_vec_iq3_s_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); + mul_mat_vec_q_switch_type( + src0->data, src0->type, src1_q8_1.get(), ids_d, dst_d, ne00, + ne01, ncols_dst, s01, stride_col_y, stride_col_dst, + ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, + ne03, ne3, s03, s13, s3, stream); } void ggml_cuda_op_mul_mat_vec_q( @@ -440,68 +569,12 @@ void ggml_cuda_op_mul_mat_vec_q( // nrows_dst == nrows of the matrix that the kernel writes into const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff; - switch (src0->type) { - case GGML_TYPE_Q4_0: - mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_Q4_1: - mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_Q5_0: - mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_Q5_1: - mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_Q8_0: - mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_Q2_K: - mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_Q3_K: - mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_Q4_K: - mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_Q5_K: - mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_Q6_K: - mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_IQ2_XXS: - mul_mat_vec_iq2_xxs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_IQ2_XS: - mul_mat_vec_iq2_xs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_IQ2_S: - mul_mat_vec_iq2_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_IQ3_XXS: - mul_mat_vec_iq3_xxs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_IQ1_S: - mul_mat_vec_iq1_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_IQ1_M: - mul_mat_vec_iq1_m_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_IQ4_NL: - mul_mat_vec_iq4_nl_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, 
src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_IQ4_XS: - mul_mat_vec_iq4_xs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_IQ3_S: - mul_mat_vec_iq3_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - default: - GGML_ABORT("fatal error"); - break; - } + const int stride_row_x = ne00 / ggml_blck_size(src0->type); + const int stride_col_y = src1_padded_row_size / QK8_1; + + mul_mat_vec_q_switch_type( + src0_dd_i, src0->type, src1_ddq_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row_x, stride_col_y, nrows_dst, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, stream); GGML_UNUSED(src1); GGML_UNUSED(dst); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cuh index d9e42fdd6..39dc7d33e 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cuh @@ -2,6 +2,9 @@ #define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels. +void ggml_cuda_mul_mat_vec_q(ggml_backend_cuda_context & ctx, + const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst); + void ggml_cuda_op_mul_mat_vec_q( ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/quantize.cu b/ml/backend/ggml/ggml/src/ggml-cuda/quantize.cu index 1702e4ce2..931a45ad3 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/quantize.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/quantize.cu @@ -1,30 +1,40 @@ #include "quantize.cuh" #include -static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx0_padded) { - const int64_t ix0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x; +static __global__ void quantize_q8_1( + const float * __restrict__ x, void * __restrict__ vy, + const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03, + const int64_t ne0, const int ne1, const int ne2) { + const int64_t i0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x; - if (ix0 >= kx0_padded) { + if (i0 >= ne0) { return; } - const int64_t ix1 = blockIdx.y; + const int64_t i1 = blockIdx.y; + const int64_t i2 = blockIdx.z % ne2; + const int64_t i3 = blockIdx.z / ne2; - const int64_t i_padded = ix1*kx0_padded + ix0; + const int64_t & i00 = i0; + const int64_t & i01 = i1; + const int64_t & i02 = i2; + const int64_t & i03 = i3; + + const int64_t i_cont = ((i3*ne2 + i2) * ne1 + i1) * ne0 + i0; block_q8_1 * y = (block_q8_1 *) vy; - const int64_t ib = i_padded / QK8_1; // block index - const int64_t iqs = i_padded % QK8_1; // quant index + const int64_t ib = i_cont / QK8_1; // block index + const int64_t iqs = i_cont % QK8_1; // quant index - const float xi = ix0 < kx ? x[ix1*kx + ix0] : 0.0f; + const float xi = i0 < ne00 ? x[i03*s03 + i02*s02 + i01*s01 + i00] : 0.0f; float amax = fabsf(xi); float sum = xi; amax = warp_reduce_max(amax); - sum = warp_reduce_sum(sum); + sum = warp_reduce_sum(sum); - const float d = amax / 127; + const float d = amax / 127; const int8_t q = amax == 0.0f ? 
0 : roundf(xi / d); y[ib].qs[iqs] = q; @@ -39,29 +49,38 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest template static __global__ void quantize_mmq_q8_1( - const float * __restrict__ x, void * __restrict__ vy, const int64_t kx0, const int64_t kx1, const int64_t kx0_padded) { + const float * __restrict__ x, const int32_t * __restrict__ ids, void * __restrict__ vy, + const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03, + const int64_t ne0, const int ne1, const int ne2) { constexpr int vals_per_scale = ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6 ? 64 : 32; constexpr int vals_per_sum = ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6 ? 16 : 32; - const int64_t ix0 = ((int64_t)blockDim.x*blockIdx.x + threadIdx.x)*4; + const int64_t i0 = ((int64_t)blockDim.x*blockIdx.x + threadIdx.x)*4; - if (ix0 >= kx0_padded) { + if (i0 >= ne0) { return; } - const float4 * x4 = (const float4 *) x; + const int64_t i1 = blockIdx.y; + const int64_t i2 = blockIdx.z % ne2; + const int64_t i3 = blockIdx.z / ne2; - const int64_t ix1 = kx1*blockIdx.z + blockIdx.y; + const int64_t i00 = i0; + const int64_t i01 = ids ? ids[i1] : i1; + const int64_t i02 = i2; + const int64_t i03 = i3; + + const float4 * x4 = (const float4 *) x; block_q8_1_mmq * y = (block_q8_1_mmq *) vy; const int64_t ib0 = blockIdx.z*((int64_t)gridDim.y*gridDim.x*blockDim.x/QK8_1); // first block of channel - const int64_t ib = ib0 + (ix0 / (4*QK8_1))*kx1 + blockIdx.y; // block index in channel - const int64_t iqs = ix0 % (4*QK8_1); // quant index in block + const int64_t ib = ib0 + (i0 / (4*QK8_1))*ne1 + blockIdx.y; // block index in channel + const int64_t iqs = i0 % (4*QK8_1); // quant index in block // Load 4 floats per thread and calculate max. abs. value between them: - const float4 xi = ix0 < kx0 ? x4[(ix1*kx0 + ix0)/4] : make_float4(0.0f, 0.0f, 0.0f, 0.0f); + const float4 xi = i0 < ne00 ? x4[(i03*s03 + i02*s02 + i01*s01 + i00)/4] : make_float4(0.0f, 0.0f, 0.0f, 0.0f); float amax = fabsf(xi.x); amax = fmaxf(amax, fabsf(xi.y)); amax = fmaxf(amax, fabsf(xi.z)); @@ -77,7 +96,7 @@ static __global__ void quantize_mmq_q8_1( if (ds_layout != MMQ_Q8_1_DS_LAYOUT_D4) { sum = xi.x + xi.y + xi.z + xi.w; - // Exchange calculate sum across vals_per_sum/4 threads. + // Calculate sums across vals_per_sum/4 threads. 
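// --- editor's aside (illustration only, not part of the patch) ------------------
// The #pragma unroll loop that follows is the standard XOR-butterfly shuffle
// reduction, here run over vals_per_sum/4 lanes. For reference, the same pattern
// summed across a full warp (warp size 32 assumed) looks like this:
static __device__ float warp_sum_xor(float v) {
#pragma unroll
    for (int offset = 16; offset > 0; offset >>= 1) {
        v += __shfl_xor_sync(0xFFFFFFFF, v, offset, 32);
    }
    return v;
}
// --- end of editor's aside -------------------------------------------------------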
#pragma unroll for (int offset = vals_per_sum/8; offset > 0; offset >>= 1) { sum += __shfl_xor_sync(0xFFFFFFFF, sum, offset, WARP_SIZE); @@ -127,40 +146,40 @@ static __global__ void quantize_mmq_q8_1( } void quantize_row_q8_1_cuda( - const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, - const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream) { + const float * x, const int32_t * ids, void * vy, const ggml_type type_src0, + const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03, + const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) { + GGML_ASSERT(!ids); + GGML_ASSERT(ne0 % QK8_1 == 0); - GGML_ASSERT(kx0_padded % QK8_1 == 0); - - const int64_t block_num_x = (kx0_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE; - const dim3 num_blocks(block_num_x, kx1*channels, 1); + const int64_t block_num_x = (ne0 + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE; + const dim3 num_blocks(block_num_x, ne1, ne2*ne3); const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1); - quantize_q8_1<<>>(x, vy, kx0, kx0_padded); - - GGML_UNUSED(type_x); + quantize_q8_1<<>>(x, vy, ne00, s01, s02, s03, ne0, ne1, ne2); + GGML_UNUSED(type_src0); } void quantize_mmq_q8_1_cuda( - const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, - const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream) { + const float * x, const int32_t * ids, void * vy, const ggml_type type_src0, + const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03, + const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) { + GGML_ASSERT(ne0 % (4*QK8_1) == 0); - GGML_ASSERT(kx0_padded % (4*QK8_1) == 0); - - const int64_t block_num_x = (kx0_padded + 4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ - 1) / (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ); - const dim3 num_blocks(block_num_x, kx1, channels); + const int64_t block_num_x = (ne0 + 4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ - 1) / (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ); + const dim3 num_blocks(block_num_x, ne1, ne2*ne3); const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE_MMQ, 1, 1); - switch (mmq_get_q8_1_ds_layout(type_x)) { + switch (mmq_get_q8_1_ds_layout(type_src0)) { case MMQ_Q8_1_DS_LAYOUT_D4: quantize_mmq_q8_1 - <<>>(x, vy, kx0, kx1, kx0_padded); + <<>>(x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2); break; case MMQ_Q8_1_DS_LAYOUT_DS4: quantize_mmq_q8_1 - <<>>(x, vy, kx0, kx1, kx0_padded); + <<>>(x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2); break; case MMQ_Q8_1_DS_LAYOUT_D2S6: quantize_mmq_q8_1 - <<>>(x, vy, kx0, kx1, kx0_padded); + <<>>(x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2); break; default: GGML_ABORT("fatal error"); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/quantize.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/quantize.cuh index 03bf322b9..725ab5244 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/quantize.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/quantize.cuh @@ -12,13 +12,16 @@ static_assert(MATRIX_ROW_PADDING % CUDA_QUANTIZE_BLOCK_SIZE == 0, "Risk static_assert(MATRIX_ROW_PADDING % (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ) == 0, "Risk of out-of-bounds access."); typedef void (*quantize_cuda_t)( - const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded, - const ggml_type type_x, cudaStream_t stream); + const float * x, const int32_t * ids, void * vy, + ggml_type type_src0, int64_t ne00, int64_t s01, int64_t s02, int64_t s03, + 
int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, cudaStream_t stream);
 
 void quantize_row_q8_1_cuda(
-    const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
-    const ggml_type type_x, cudaStream_t stream);
+    const float * x, const int32_t * ids, void * vy,
+    ggml_type type_src0, int64_t ne00, int64_t s01, int64_t s02, int64_t s03,
+    int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, cudaStream_t stream);
 
 void quantize_mmq_q8_1_cuda(
-    const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
-    const ggml_type type_x, cudaStream_t stream);
+    const float * x, const int32_t * ids, void * vy,
+    ggml_type type_src0, int64_t ne00, int64_t s01, int64_t s02, int64_t s03,
+    int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, cudaStream_t stream);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/vecdotq.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/vecdotq.cuh
index 40091a0ef..ba195e1d1 100644
--- a/ml/backend/ggml/ggml/src/ggml-cuda/vecdotq.cuh
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/vecdotq.cuh
@@ -1,3 +1,5 @@
+#pragma once
+
 #include "common.cuh"
 
 #include <cstdint>
diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal
index 297933adb..223dc1807 100644
--- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal
+++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal
@@ -5690,7 +5690,7 @@ kernel void kernel_flash_attn_ext(
 
     {
         float S[Q] = { [0 ... Q-1] = 0.0f };
-        float M[Q] = { [0 ... Q-1] = -__FLT16_MAX__/2 };
+        float M[Q] = { [0 ... Q-1] = -__FLT_MAX__/2 };
 
         // thread indices inside the simdgroup
         // TODO: see if we can utilize quad-group functions for better performance
@@ -5950,7 +5950,7 @@ kernel void kernel_flash_attn_ext(
         // reduce the warps sequentially
         for (ushort sg = 1; sg < nsg; ++sg) {
             float S = { 0.0f };
-            float M = { -__FLT16_MAX__/2 };
+            float M = { -__FLT_MAX__/2 };
 
             threadgroup_barrier(mem_flags::mem_threadgroup);
 
@@ -6197,7 +6197,7 @@ kernel void kernel_flash_attn_ext_vec(
 
     {
         float S = 0.0f;
-        float M = -__FLT16_MAX__/2;
+        float M = -__FLT_MAX__/2;
 
         // thread indices inside the simdgroup
         const short tx = tiisg%NL;
diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m
index b2e95a66c..112abef68 100644
--- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m
+++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m
@@ -44,8 +44,8 @@ static struct ggml_backend_device g_ggml_backend_metal_device;
 // note: assumes single GPU device - the default one
 // TODO: support multiple GPU devices
 static struct ggml_backend_metal_device_context {
-    id<MTLDevice> mtl_device;
-    int           mtl_device_ref_count;
+    id<MTLDevice>  mtl_device;
+    int            mtl_device_ref_count;
     id<MTLLibrary> mtl_library;
 
     bool has_simdgroup_reduction;
@@ -491,7 +491,259 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_COUNT
 };
 
+//
+// ggml_metal_heap
+//
+
+struct ggml_metal_heap {
+    // number of times the heap was unused
+    int n_unused;
+
+    // total number of buffer allocations in this heap across all computes
+    int64_t n_alloc;
+
+    // current offset in the heap - we reset this after each node in order to reuse the memory
+    size_t offs;
+
+    // the currently allocated MTLBuffer objects in this heap
+    id<MTLHeap> obj;
+
+    NSMutableArray * bufs;
+};
+
+static struct ggml_metal_heap * ggml_metal_heap_init(id<MTLDevice> device, size_t size) {
+    struct ggml_metal_heap * heap = calloc(1, sizeof(struct ggml_metal_heap));
+
+    MTLHeapDescriptor
* desc = [[MTLHeapDescriptor alloc] init];
+    desc.storageMode = MTLStorageModePrivate;
+    desc.cpuCacheMode = MTLCPUCacheModeDefaultCache;
+    desc.type = MTLHeapTypePlacement;
+    desc.size = size;
+
+    heap->n_unused = 0;
+    heap->n_alloc = 0;
+
+    heap->obj = [device newHeapWithDescriptor:desc];
+    if (!heap->obj) {
+        GGML_LOG_ERROR("%s: error: failed to create MTLHeap with size %zu\n", __func__, size);
+
+        free(heap);
+
+        return false;
+    }
+
+    [desc release];
+
+    heap->bufs = [[NSMutableArray alloc] init];
+
+    return heap;
+}
+
+static void ggml_metal_heap_reset(struct ggml_metal_heap * heap) {
+    heap->offs = 0;
+
+    // count how many graph computes the heap ended up being unused
+    if ([heap->bufs count] > 0) {
+        heap->n_unused = 0;
+    } else {
+        heap->n_unused++;
+    }
+
+    for (id buf in heap->bufs) {
+        [buf release];
+    }
+    [heap->bufs removeAllObjects];
+
+    // tell the OS that it can reuse this memory if needed
+    // ref: https://developer.apple.com/documentation/metal/mtlpurgeablestate?language=objc
+    [heap->obj setPurgeableState:MTLPurgeableStateVolatile];
+}
+
+static void ggml_metal_heap_free(struct ggml_metal_heap * heap) {
+    if (heap == nil) {
+        return;
+    }
+
+    ggml_metal_heap_reset(heap);
+
+    [heap->obj release];
+    [heap->bufs release];
+
+    free(heap);
+}
+
+@interface ggml_metal_heap_ptr : NSObject
+
+@property (nonatomic, assign) struct ggml_metal_heap * data;
+
+@end
+
+@implementation ggml_metal_heap_ptr
+@end
+
+//
+// ggml_metal_mem_pool
+//
+
+struct ggml_metal_mem_pool {
+    id device;
+
+    int n_heaps; // total number of heaps ever created (including those that were removed)
+
+    NSMutableArray * heaps;
+    NSMutableArray * heaps_to_remove;
+};
+
+static struct ggml_metal_mem_pool * ggml_metal_mem_pool_init(void) {
+    struct ggml_metal_mem_pool * mem_pool = calloc(1, sizeof(struct ggml_metal_mem_pool));
+
+    mem_pool->n_heaps = 0;
+
+    mem_pool->heaps = [[NSMutableArray alloc] init];
+    mem_pool->heaps_to_remove = [[NSMutableArray alloc] init];
+
+    return mem_pool;
+}
+
+static void ggml_metal_mem_pool_free(struct ggml_metal_mem_pool * mem_pool) {
+    GGML_LOG_DEBUG("%s: freeing memory pool, num heaps = %zu (total = %d)\n", __func__, [mem_pool->heaps count], mem_pool->n_heaps);
+
+    size_t size_all = 0;
+    size_t size_cur = 0;
+
+    for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) {
+        GGML_LOG_DEBUG("%s: heap: %p\n", __func__, (void *) ptr.data);
+        GGML_LOG_DEBUG("%s:   n_alloc:  %" PRId64 "\n", __func__, ptr.data->n_alloc);
+        GGML_LOG_DEBUG("%s:   n_unused: %d\n", __func__, ptr.data->n_unused);
+        GGML_LOG_DEBUG("%s:   size:     %.2f MiB\n", __func__, [ptr.data->obj size] / 1024.0 / 1024.0);
+        GGML_LOG_DEBUG("%s:   bufs:     %zu\n", __func__, [ptr.data->bufs count]);
+
+        if ([ptr.data->bufs count] > 0) {
+            size_cur += [ptr.data->obj size];
+        }
+        size_all += [ptr.data->obj size];
+
+        ggml_metal_heap_free(ptr.data);
+        [ptr release];
+    }
+    [mem_pool->heaps release];
+    [mem_pool->heaps_to_remove release];
+
+    if (size_all > 0) {
+        GGML_LOG_DEBUG("%s:   size_all: %.2f MiB\n", __func__, size_all / 1024.0 / 1024.0);
+        GGML_LOG_DEBUG("%s:   size_cur: %.2f MiB\n", __func__, size_cur / 1024.0 / 1024.0);
+    }
+
+    free(mem_pool);
+}
+
+static void ggml_metal_mem_pool_reset(struct ggml_metal_mem_pool * mem_pool) {
+    for (NSUInteger i = 0; i < [mem_pool->heaps count]; i++) {
+        ggml_metal_heap_ptr * ptr = [mem_pool->heaps objectAtIndex:i];
+
+        struct ggml_metal_heap * heap = ptr.data;
+        ggml_metal_heap_reset(heap);
+
+        // if the heap hasn't been used for a while, remove it
+        if (heap->n_unused >= 128) {
+            [mem_pool->heaps_to_remove addObject:@(i)];
+        }
+    }
+
+    if (mem_pool->heaps_to_remove.count > 0) {
+        for (NSUInteger i = 0; i < [mem_pool->heaps_to_remove count]; i++) {
+            NSUInteger index = [[mem_pool->heaps_to_remove objectAtIndex:i] intValue];
+            ggml_metal_heap_ptr * ptr = [mem_pool->heaps objectAtIndex:index];
+
+            struct ggml_metal_heap * heap = ptr.data;
+            ggml_metal_heap_free(heap);
+
+            [mem_pool->heaps removeObjectAtIndex:index];
+            [ptr release];
+        }
+
+        [mem_pool->heaps_to_remove removeAllObjects];
+    }
+}
+
+static void ggml_metal_mem_pool_clear(struct ggml_metal_mem_pool * mem_pool) {
+    for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) {
+        ptr.data->offs = 0;
+    }
+}
+
+static id ggml_metal_mem_pool_alloc(struct ggml_metal_mem_pool * mem_pool, size_t size) {
+    const size_t alignment = 32;
+
+    const size_t size_aligned = GGML_PAD(size, alignment);
+
+    // try one of the existing heaps
+    for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) {
+        struct ggml_metal_heap * heap = ptr.data;
+        if (heap->offs + size_aligned <= [heap->obj size]) {
+            // if this is the first buffer in the heap for the current command buffer, tell the OS that
+            // it cannot free the memory used by the heap
+            // ref: https://developer.apple.com/documentation/metal/mtlpurgeablestate?language=objc
+            if ([heap->bufs count] == 0) {
+                [heap->obj setPurgeableState:MTLPurgeableStateNonVolatile];
+            }
+
+            id buf = [heap->obj newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate offset:heap->offs];
+            if (buf == nil) {
+                GGML_LOG_ERROR("%s: error: failed to create MTLBuffer with size %zu\n", __func__, size_aligned);
+                return nil;
+            }
+
+            heap->n_alloc++;
+            heap->offs += size_aligned;
+
+            [heap->bufs addObject:buf];
+
+            return buf;
+        }
+    }
+
+    // create a new heap that can fit this buffer
+    ggml_metal_heap_ptr * heap_ptr = [ggml_metal_heap_ptr new];
+
+    struct ggml_metal_heap * heap = ggml_metal_heap_init(mem_pool->device, size_aligned);
+    if (heap == NULL) {
+        GGML_LOG_ERROR("%s: error: failed to create heap of size %zu\n", __func__, size_aligned);
+        return NULL;
+    }
+
+    //GGML_LOG_DEBUG("%s: creating new heap of size %zu, got %zu\n", __func__, size_aligned, [heap->obj size]);
+
+    heap_ptr.data = heap;
+    ggml_metal_heap_reset(heap);
+
+    [heap->obj setPurgeableState:MTLPurgeableStateNonVolatile];
+    id buf = [heap->obj newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate offset:heap->offs];
+    if (buf == nil) {
+        GGML_LOG_ERROR("%s: error: failed to create MTLBuffer with size %zu\n", __func__, size_aligned);
+        return NULL;
+    }
+
+    heap->n_alloc++;
+    heap->offs += size_aligned;
+
+    [heap->bufs addObject:buf];
+
+    [mem_pool->heaps addObject:heap_ptr];
+    mem_pool->n_heaps++;
+
+    return buf;
+}
+
+struct ggml_metal_command_buffer {
+    id obj;
+
+    // each command buffer has a memory pool from which it can allocate temporary buffers during the compute
+    struct ggml_metal_mem_pool * mem_pool;
+};
+
 struct ggml_backend_metal_context {
+    id device;
     id queue;
     dispatch_queue_t d_queue;
@@ -516,7 +768,7 @@ struct ggml_backend_metal_context {
     void (^encode_async)(size_t ith);
 
     // n_cb command buffers + 1 used by the main thread
-    id command_buffers[GGML_METAL_MAX_COMMAND_BUFFERS + 1];
+    struct ggml_metal_command_buffer cmd_bufs[GGML_METAL_MAX_COMMAND_BUFFERS + 1];
 
     // abort ggml_metal_graph_compute if callback returns true
     ggml_abort_callback abort_callback;
@@ -706,9 +958,11 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
     struct ggml_backend_metal_device_context * ctx_dev = dev->context;
 
     id device =
ggml_backend_metal_device_acq(ctx_dev); + GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]); - ctx->queue = [device newCommandQueue]; + ctx->device = device; + ctx->queue = [device newCommandQueue]; if (ctx->queue == nil) { GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__); return NULL; @@ -769,7 +1023,10 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de ctx->gf = nil; ctx->encode_async = nil; for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) { - ctx->command_buffers[i] = nil; + ctx->cmd_bufs[i].obj = nil; + + ctx->cmd_bufs[i].mem_pool = ggml_metal_mem_pool_init(); + ctx->cmd_bufs[i].mem_pool->device = device; } #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15) @@ -1183,6 +1440,12 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { [ctx->queue release]; + for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) { + // ctx->cmd_bufs[i].obj is auto released + + ggml_metal_mem_pool_free(ctx->cmd_bufs[i].mem_pool); + } + dispatch_release(ctx->d_queue); free(ctx); @@ -1489,10 +1752,11 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex } } -static void ggml_metal_encode_node( +static bool ggml_metal_encode_node( ggml_backend_t backend, int idx, - id encoder) { + id encoder, + struct ggml_metal_mem_pool * mem_pool) { struct ggml_backend_metal_context * ctx = backend->context; struct ggml_backend_metal_device_context * ctx_dev = backend->device->context; @@ -1508,7 +1772,7 @@ static void ggml_metal_encode_node( struct ggml_tensor * dst = node; if (ggml_is_empty(dst)) { - return; + return true; } switch (dst->op) { @@ -1519,7 +1783,7 @@ static void ggml_metal_encode_node( case GGML_OP_PERMUTE: { // noop -> next node - } return; + } return true; default: { } break; @@ -1530,6 +1794,8 @@ static void ggml_metal_encode_node( GGML_ABORT("unsupported op"); } + ggml_metal_mem_pool_clear(mem_pool); + const int64_t ne00 = src0 ? src0->ne[0] : 0; const int64_t ne01 = src0 ? src0->ne[1] : 0; const int64_t ne02 = src0 ? 
src0->ne[2] : 0; @@ -2176,26 +2442,76 @@ static void ggml_metal_encode_node( const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); - ggml_metal_kargs_soft_max args = { +// use this branch to test the ggml_metal_mem_pool functionality +#if 0 + // cpy to tmp buffer in MTLHeap + + id h_src0 = h_src0 = ggml_metal_mem_pool_alloc(mem_pool, ggml_nbytes(src0)); + if (!h_src0) { + GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, ggml_nbytes(src0)); + return false; + } + + offs_src0 = 0; + + ggml_metal_kargs_cpy args_cpy = { /*.ne00 =*/ ne00, /*.ne01 =*/ ne01, /*.ne02 =*/ ne02, - /*.scale =*/ scale, - /*.max_bias =*/ max_bias, - /*.m0 =*/ m0, - /*.m1 =*/ m1, + /*.ne03 =*/ ne03, + /*.nb00 =*/ nb00, + /*.nb01 =*/ nb01, + /*.nb02 =*/ nb02, + /*.nb03 =*/ nb03, + /*.ne0 =*/ ne00, + /*.ne1 =*/ ne01, + /*.ne2 =*/ ne02, + /*.ne3 =*/ ne03, + /*.nb0 =*/ nb00, + /*.nb1 =*/ nb01, + /*.nb2 =*/ nb02, + /*.nb3 =*/ nb03, + }; + + if (src0->type == GGML_TYPE_F16) { + [encoder setComputePipelineState:ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline]; + } else { + [encoder setComputePipelineState:ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline]; + } + [encoder setBytes:&args_cpy length:sizeof(args_cpy) atIndex:0]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; + [encoder setBuffer:h_src0 offset:0 atIndex:2]; + + GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0); + int nth_cpy = MIN(1024, ne00 / ggml_blck_size(src0->type)); + + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth_cpy, 1, 1)]; + +#else + id h_src0 = id_src0; +#endif + // softmax + + ggml_metal_kargs_soft_max args = { + /*.ne00 =*/ ne00, + /*.ne01 =*/ ne01, + /*.ne02 =*/ ne02, + /*.scale =*/ scale, + /*.max_bias =*/ max_bias, + /*.m0 =*/ m0, + /*.m1 =*/ m1, /*.n_head_log2 =*/ n_head_log2, }; [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:h_src0 offset:offs_src0 atIndex:0]; if (id_src1) { - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; } else { - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; + [encoder setBuffer:h_src0 offset:offs_src0 atIndex:1]; } - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&args length:sizeof(args) atIndex:3]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&args length:sizeof(args) atIndex:3]; [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; @@ -4634,6 +4950,8 @@ static void ggml_metal_encode_node( GGML_ABORT("fatal error"); } } + + return true; } static enum ggml_status ggml_metal_graph_compute( @@ -4687,25 +5005,25 @@ static enum ggml_status ggml_metal_graph_compute( } // the main thread commits the first few commands immediately - // command_buffer[n_cb] + // cmd_buf[n_cb] { - id command_buffer = [ctx->queue commandBufferWithUnretainedReferences]; - ctx->command_buffers[n_cb] = command_buffer; + id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; + ctx->cmd_bufs[n_cb].obj = cmd_buf; - [command_buffer enqueue]; + [cmd_buf enqueue]; ctx->encode_async(n_cb); } // prepare the rest of the command buffers asynchronously - // command_buffer[0.. n_cb) + // cmd_buf[0.. 
n_cb) for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) { - id command_buffer = [ctx->queue commandBufferWithUnretainedReferences]; - ctx->command_buffers[cb_idx] = command_buffer; + id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; + ctx->cmd_bufs[cb_idx].obj = cmd_buf; // always enqueue the first two command buffers // enqueue all of the command buffers if we don't need to abort if (cb_idx < 2 || ctx->abort_callback == NULL) { - [command_buffer enqueue]; + [cmd_buf enqueue]; } } @@ -4714,14 +5032,14 @@ static enum ggml_status ggml_metal_graph_compute( // wait for completion and check status of each command buffer // needed to detect if the device ran out-of-memory for example (#1881) { - id command_buffer = ctx->command_buffers[n_cb]; - [command_buffer waitUntilCompleted]; + id cmd_buf = ctx->cmd_bufs[n_cb].obj; + [cmd_buf waitUntilCompleted]; - MTLCommandBufferStatus status = [command_buffer status]; + MTLCommandBufferStatus status = [cmd_buf status]; if (status != MTLCommandBufferStatusCompleted) { GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status); if (status == MTLCommandBufferStatusError) { - GGML_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]); + GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); } return GGML_STATUS_FAILED; @@ -4729,20 +5047,20 @@ static enum ggml_status ggml_metal_graph_compute( } for (int i = 0; i < n_cb; ++i) { - id command_buffer = ctx->command_buffers[i]; - [command_buffer waitUntilCompleted]; + id cmd_buf = ctx->cmd_bufs[i].obj; + [cmd_buf waitUntilCompleted]; - MTLCommandBufferStatus status = [command_buffer status]; + MTLCommandBufferStatus status = [cmd_buf status]; if (status != MTLCommandBufferStatusCompleted) { GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); if (status == MTLCommandBufferStatusError) { - GGML_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]); + GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); } return GGML_STATUS_FAILED; } - id next_buffer = (i + 1 < n_cb ? ctx->command_buffers[i + 1] : nil); + id next_buffer = (i + 1 < n_cb ? 
ctx->cmd_bufs[i + 1].obj : nil); if (!next_buffer) { continue; } @@ -5126,8 +5444,9 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { const int n_nodes_per_cb = ctx->n_nodes_per_cb; - id command_buffer = ctx->command_buffers[cb_idx]; - id encoder = [command_buffer computeCommandEncoder]; + id cmd_buf = ctx->cmd_bufs[cb_idx].obj; + + id encoder = [cmd_buf computeCommandEncoder]; int node_start = 0; int node_end = n_nodes_0; @@ -5139,22 +5458,29 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { const bool should_capture = ctx->capture_next_compute; + struct ggml_metal_mem_pool * mem_pool = ctx->cmd_bufs[cb_idx].mem_pool; + ggml_metal_mem_pool_reset(mem_pool); + for (int idx = node_start; idx < node_end; ++idx) { if (should_capture) { [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]]; } - ggml_metal_encode_node(backend, idx, encoder); + const bool res = ggml_metal_encode_node(backend, idx, encoder, mem_pool); if (should_capture) { [encoder popDebugGroup]; } + + if (!res) { + break; + } } [encoder endEncoding]; if (cb_idx < 2 || ctx->abort_callback == NULL) { - [command_buffer commit]; + [cmd_buf commit]; } }); } diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal index 71f0f97ff..6ceb3cef7 100644 --- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal @@ -3237,7 +3237,7 @@ kernel void kernel_flash_attn_ext( { float S[Q] = { [0 ... Q-1] = 0.0f }; - float M[Q] = { [0 ... Q-1] = -__FLT16_MAX__/2 }; + float M[Q] = { [0 ... Q-1] = -__FLT_MAX__/2 }; // thread indices inside the simdgroup // TODO: see if we can utilize quad-group functions for better performance @@ -3497,7 +3497,7 @@ kernel void kernel_flash_attn_ext( // reduce the warps sequentially for (ushort sg = 1; sg < nsg; ++sg) { float S = { 0.0f }; - float M = { -__FLT16_MAX__/2 }; + float M = { -__FLT_MAX__/2 }; threadgroup_barrier(mem_flags::mem_threadgroup); @@ -3744,7 +3744,7 @@ kernel void kernel_flash_attn_ext_vec( { float S = 0.0f; - float M = -__FLT16_MAX__/2; + float M = -__FLT_MAX__/2; // thread indices inside the simdgroup const short tx = tiisg%NL; diff --git a/ml/backend/ggml/ggml/src/ggml.c b/ml/backend/ggml/ggml/src/ggml.c index 2276b6312..3c57aff8b 100644 --- a/ml/backend/ggml/ggml/src/ggml.c +++ b/ml/backend/ggml/ggml/src/ggml.c @@ -4,6 +4,7 @@ #include "ggml-backend.h" #include "ggml-impl.h" #include "ggml-threading.h" +#include "ggml-cpu.h" #include "ggml.h" // FIXME: required here for quantization functions @@ -382,58 +383,16 @@ void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) { } } -// FIXME: these functions must detect the instruction set at runtime, since they are part of the core ggml library -// currently, the ggml_cpu_has_* functions are entirely compile-time void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) { - int64_t i = 0; -#if defined(__F16C__) - //if (ggml_cpu_has_f16c()) { - for (; i + 7 < n; i += 8) { - __m256 x_vec = _mm256_loadu_ps(x + i); - __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); - _mm_storeu_si128((__m128i *)(y + i), y_vec); - } - for(; i + 3 < n; i += 4) { - __m128 x_vec = _mm_loadu_ps(x + i); - __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); - _mm_storel_epi64((__m128i *)(y + i), y_vec); - } - //} -#endif - for (; i < n; i++) { + int i = 0; + for (; i < 
n; ++i) {
         y[i] = GGML_FP32_TO_FP16(x[i]);
     }
 }
 
 void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
-    int64_t i = 0;
-#if defined(__AVX512F__)
-    //if (ggml_cpu_has_avx512()) {
-        for (; i + 16 <= n; i += 16) {
-            _mm512_storeu_ps(y + i,
-                             _mm512_castsi512_ps(
-                                 _mm512_slli_epi32(
-                                     _mm512_cvtepu16_epi32(
-                                         _mm256_loadu_si256(
-                                             (const __m256i *)(x + i))),
-                                     16)));
-        }
-    //}
-#endif
-#if defined(__AVX2__)
-    //if (ggml_cpu_has_avx2()) {
-        for (; i + 8 <= n; i += 8) {
-            _mm256_storeu_ps(y + i,
-                             _mm256_castsi256_ps(
-                                 _mm256_slli_epi32(
-                                     _mm256_cvtepu16_epi32(
-                                         _mm_loadu_si128(
-                                             (const __m128i *)(x + i))),
-                                     16)));
-        }
-    //}
-#endif
-    for (; i < n; i++) {
+    int i = 0;
+    for (; i < n; ++i) {
         y[i] = GGML_BF16_TO_FP32(x[i]);
     }
 }
@@ -956,6 +915,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CONV_TRANSPOSE_1D",
     "IM2COL",
     "IM2COL_BACK",
+    "CONV_2D_DW",
     "CONV_TRANSPOSE_2D",
     "POOL_1D",
     "POOL_2D",
@@ -994,7 +954,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "OPT_STEP_ADAMW",
 };
 
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -1051,6 +1011,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "conv_transpose_1d(x)",
     "im2col(x)",
     "im2col_back(x)",
+    "conv_2d_dw(x)",
     "conv_transpose_2d(x)",
     "pool_1d(x)",
     "pool_2d(x)",
@@ -1089,7 +1050,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "adamw(x)",
 };
 
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -1346,6 +1307,13 @@ bool ggml_is_permuted(const struct ggml_tensor * tensor) {
     return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3];
 }
 
+bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor) {
+    return
+        tensor->nb[0] > tensor->nb[2] &&
+        tensor->nb[1] > tensor->nb[0] &&
+        tensor->nb[2] == ggml_type_size(tensor->type);
+}
+
 static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
@@ -4052,6 +4020,46 @@ struct ggml_tensor * ggml_conv_2d_dw(
     return result;
 }
 
+// ggml_conv_2d_dw_direct
+
+struct ggml_tensor * ggml_conv_2d_dw_direct(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        int stride0,
+        int stride1,
+        int pad0,
+        int pad1,
+        int dilation0,
+        int dilation1) {
+    GGML_ASSERT(a->ne[2] == 1);
+    GGML_ASSERT(a->ne[3] == b->ne[2]);
+    int64_t ne[4];
+    ne[0] = ggml_calc_conv_output_size(b->ne[0], a->ne[0], stride0, pad0, dilation0);
+    ne[1] = ggml_calc_conv_output_size(b->ne[1], a->ne[1], stride1, pad1, dilation1);
+    ne[2] = b->ne[2];
+    ne[3] = b->ne[3];
+
+    struct ggml_tensor * result = ggml_new_tensor(ctx, b->type, 4, ne);
+
+    if (ggml_is_contiguous_channels(b)) {
+        // Result will be permuted the same way as input (CWHN order)
+        const int64_t type_size = ggml_type_size(result->type);
+        GGML_ASSERT(ggml_blck_size(result->type) == 1);
+        result->nb[0] = result->ne[2] * type_size;
+        result->nb[1] = result->ne[0] * result->nb[0];
+        result->nb[2] = type_size;
+    }
+
+    int32_t params[] = { stride0, stride1, pad0, pad1, dilation0, dilation1 };
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op = GGML_OP_CONV_2D_DW;
+    result->src[0] = a;
+    result->src[1] = b;
+    return result;
+}
+
 // ggml_conv_transpose_2d_p0
 
 static int64_t
ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) { diff --git a/ml/backend/ggml/ggml/src/ggml.go b/ml/backend/ggml/ggml/src/ggml.go index afc1e1edd..91f1f1ada 100644 --- a/ml/backend/ggml/ggml/src/ggml.go +++ b/ml/backend/ggml/ggml/src/ggml.go @@ -3,6 +3,7 @@ package ggml // #cgo CXXFLAGS: -std=c++17 // #cgo CPPFLAGS: -DNDEBUG -DGGML_USE_CPU // #cgo CPPFLAGS: -I${SRCDIR}/../include -I${SRCDIR}/ggml-cpu +// #cgo windows CFLAGS: -Wno-dll-attribute-on-redeclaration // #cgo windows LDFLAGS: -lmsvcrt -static -static-libgcc -static-libstdc++ // #include // #include "ggml-backend.h" @@ -57,26 +58,20 @@ var OnceLoad = sync.OnceFunc(func() { exe = "." } - // PATH, LD_LIBRARY_PATH, and DYLD_LIBRARY_PATH are often - // set by the parent process, however, use a default value - // if the environment variable is not set. - var name, value string + var value string switch runtime.GOOS { case "darwin": - // On macOS, DYLD_LIBRARY_PATH is often not set, so - // we use the directory of the executable as the default. - name = "DYLD_LIBRARY_PATH" value = filepath.Dir(exe) case "windows": - name = "PATH" value = filepath.Join(filepath.Dir(exe), "lib", "ollama") default: - name = "LD_LIBRARY_PATH" value = filepath.Join(filepath.Dir(exe), "..", "lib", "ollama") } - paths, ok := os.LookupEnv(name) + // Avoid potentially loading incompatible GGML libraries + paths, ok := os.LookupEnv("OLLAMA_LIBRARY_PATH") if !ok { + slog.Debug("OLLAMA_LIBRARY_PATH not set, falling back to default", "search", value) paths = value } diff --git a/ml/backend/ggml/quantization.go b/ml/backend/ggml/quantization.go new file mode 100644 index 000000000..bb31e455d --- /dev/null +++ b/ml/backend/ggml/quantization.go @@ -0,0 +1,83 @@ +package ggml + +// #cgo CPPFLAGS: -I${SRCDIR}/ggml/src +// #include +// #include +// #include "ggml.h" +// #include "ggml-cpu.h" +// #include "ggml-backend.h" +// #include "ggml-quants.h" +import "C" + +import ( + "unsafe" + + fsggml "github.com/ollama/ollama/fs/ggml" +) + +// convertToF32 converts (dequantizes) the raw data to F32 so we can then quantize it +func ConvertToF32(data []byte, dtype uint32, nelements uint64) []float32 { + f32s := make([]float32, nelements) + elems := C.int64_t(nelements) + switch dtype { + case C.GGML_TYPE_F16: + C.ggml_fp16_to_fp32_row((*C.uint16_t)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) + case C.GGML_TYPE_Q4_0: + C.dequantize_row_q4_0((*C.block_q4_0)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) + case C.GGML_TYPE_Q4_1: + C.dequantize_row_q4_1((*C.block_q4_1)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) + case C.GGML_TYPE_Q5_0: + C.dequantize_row_q5_0((*C.block_q5_0)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) + case C.GGML_TYPE_Q5_1: + C.dequantize_row_q5_1((*C.block_q5_1)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) + case C.GGML_TYPE_Q8_0: + C.dequantize_row_q8_0((*C.block_q8_0)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) + case C.GGML_TYPE_Q2_K: + C.dequantize_row_q2_K((*C.block_q2_K)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) + case C.GGML_TYPE_Q3_K: + C.dequantize_row_q3_K((*C.block_q3_K)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) + case C.GGML_TYPE_Q4_K: + C.dequantize_row_q4_K((*C.block_q4_K)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) + case C.GGML_TYPE_Q5_K: + C.dequantize_row_q5_K((*C.block_q5_K)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) + case C.GGML_TYPE_Q6_K: + 
C.dequantize_row_q6_K((*C.block_q6_K)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) + case C.GGML_TYPE_BF16: + C.ggml_bf16_to_fp32_row((*C.ggml_bf16_t)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems) + default: + panic("unsupported quantization format") + } + return f32s +} + +func Quantize(newType fsggml.TensorType, f32s []float32, shape []uint64) []byte { + buf := make([]byte, len(f32s)*4) // upper bound on size + nPerRow := C.int64_t(shape[0]) + nrows := C.int64_t(1) + if len(shape) > 1 { + nrows = C.int64_t(shape[1]) + } + shape2 := C.int64_t(1) + if len(shape) > 2 { + shape2 = C.int64_t(shape[2]) + } + nelements_matrix := nPerRow * nrows + newSize := C.size_t(0) + for i03 := C.int64_t(0); i03 < shape2; i03++ { + f32s_03 := i03 * nelements_matrix + buf_03 := C.int64_t(C.ggml_row_size(uint32(newType), nPerRow)) * i03 * nrows + newSize += C.ggml_quantize_chunk( + uint32(newType), + (*C.float)(&f32s[f32s_03]), + unsafe.Pointer((uintptr)(unsafe.Pointer(&buf[0]))+uintptr(buf_03)), + 0, + nrows, + nPerRow, + nil) + } + return buf[:newSize] +} + +func QuantizationVersion() uint32 { + return uint32(C.GGML_QNT_VERSION) +} diff --git a/parser/parser.go b/parser/parser.go index a14ac5ff4..96eae9c04 100644 --- a/parser/parser.go +++ b/parser/parser.go @@ -39,7 +39,17 @@ func (f Modelfile) String() string { return sb.String() } -var deprecatedParameters = []string{"penalize_newline"} +var deprecatedParameters = []string{ + "penalize_newline", + "low_vram", + "f16_kv", + "logits_all", + "vocab_only", + "use_mlock", + "mirostat", + "mirostat_tau", + "mirostat_eta", +} // CreateRequest creates a new *api.CreateRequest from an existing Modelfile func (f Modelfile) CreateRequest(relativeDir string) (*api.CreateRequest, error) { @@ -139,10 +149,28 @@ func fileDigestMap(path string) (map[string]string, error) { var files []string if fi.IsDir() { - files, err = filesForModel(path) + fs, err := filesForModel(path) if err != nil { return nil, err } + + for _, f := range fs { + f, err := filepath.EvalSymlinks(f) + if err != nil { + return nil, err + } + + rel, err := filepath.Rel(path, f) + if err != nil { + return nil, err + } + + if !filepath.IsLocal(rel) { + return nil, fmt.Errorf("insecure path: %s", rel) + } + + files = append(files, f) + } } else { files = []string{path} } @@ -215,11 +243,11 @@ func filesForModel(path string) ([]string, error) { return nil, err } - for _, safetensor := range matches { - if ct, err := detectContentType(safetensor); err != nil { + for _, match := range matches { + if ct, err := detectContentType(match); err != nil { return nil, err } else if ct != contentType { - return nil, fmt.Errorf("invalid content type: expected %s for %s", ct, safetensor) + return nil, fmt.Errorf("invalid content type: expected %s for %s", ct, match) } } diff --git a/parser/parser_test.go b/parser/parser_test.go index 097c058fb..7d5a808ba 100644 --- a/parser/parser_test.go +++ b/parser/parser_test.go @@ -478,11 +478,7 @@ func TestParseFileParameters(t *testing.T) { "num_gqa 1": {"num_gqa", "1"}, "num_gpu 1": {"num_gpu", "1"}, "main_gpu 1": {"main_gpu", "1"}, - "low_vram true": {"low_vram", "true"}, - "logits_all true": {"logits_all", "true"}, - "vocab_only true": {"vocab_only", "true"}, "use_mmap true": {"use_mmap", "true"}, - "use_mlock true": {"use_mlock", "true"}, "num_thread 1": {"num_thread", "1"}, "num_keep 1": {"num_keep", "1"}, "seed 1": {"seed", "1"}, @@ -496,9 +492,6 @@ func TestParseFileParameters(t *testing.T) { "repeat_penalty 1.0": {"repeat_penalty", "1.0"}, 
"presence_penalty 1.0": {"presence_penalty", "1.0"}, "frequency_penalty 1.0": {"frequency_penalty", "1.0"}, - "mirostat 1": {"mirostat", "1"}, - "mirostat_tau 1.0": {"mirostat_tau", "1.0"}, - "mirostat_eta 1.0": {"mirostat_eta", "1.0"}, "penalize_newline true": {"penalize_newline", "true"}, "stop ### User:": {"stop", "### User:"}, "stop ### User: ": {"stop", "### User:"}, @@ -769,7 +762,7 @@ func getSHA256Digest(t *testing.T, r io.Reader) (string, int64) { return fmt.Sprintf("sha256:%x", h.Sum(nil)), n } -func createBinFile(t *testing.T, kv map[string]any, ti []ggml.Tensor) (string, string) { +func createBinFile(t *testing.T, kv map[string]any, ti []*ggml.Tensor) (string, string) { t.Helper() f, err := os.CreateTemp(t.TempDir(), "testbin.*.gguf") diff --git a/progress/bar.go b/progress/bar.go index 410b6e23f..f3d21a8fd 100644 --- a/progress/bar.go +++ b/progress/bar.go @@ -64,7 +64,7 @@ func formatDuration(d time.Duration) string { func (b *Bar) String() string { termWidth, _, err := term.GetSize(int(os.Stderr.Fd())) if err != nil { - termWidth = 80 + termWidth = defaultTermWidth } var pre strings.Builder diff --git a/progress/progress.go b/progress/progress.go index 0cd0ea1f9..9f54275ec 100644 --- a/progress/progress.go +++ b/progress/progress.go @@ -4,8 +4,16 @@ import ( "bufio" "fmt" "io" + "os" "sync" "time" + + "golang.org/x/term" +) + +const ( + defaultTermWidth = 80 + defaultTermHeight = 24 ) type State interface { @@ -83,6 +91,11 @@ func (p *Progress) Add(key string, state State) { } func (p *Progress) render() { + _, termHeight, err := term.GetSize(int(os.Stderr.Fd())) + if err != nil { + termHeight = defaultTermHeight + } + p.mu.Lock() defer p.mu.Unlock() @@ -102,8 +115,9 @@ func (p *Progress) render() { fmt.Fprint(p.w, "\033[1G") // render progress lines - for i, state := range p.states { - fmt.Fprint(p.w, state.String(), "\033[K") + maxHeight := min(len(p.states), termHeight) + for i := len(p.states) - maxHeight; i < len(p.states); i++ { + fmt.Fprint(p.w, p.states[i].String(), "\033[K") if i < len(p.states)-1 { fmt.Fprint(p.w, "\n") } diff --git a/runner/llamarunner/runner.go b/runner/llamarunner/runner.go index d8169be40..5341d4fb1 100644 --- a/runner/llamarunner/runner.go +++ b/runner/llamarunner/runner.go @@ -583,9 +583,6 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) { PenaltyRepeat: req.Options.RepeatPenalty, PenaltyFreq: req.Options.FrequencyPenalty, PenaltyPresent: req.Options.PresencePenalty, - Mirostat: req.Options.Mirostat, - MirostatTau: req.Options.MirostatTau, - MirostatEta: req.Options.MirostatEta, Seed: uint32(req.Options.Seed), Grammar: req.Grammar, } @@ -820,7 +817,6 @@ func Execute(args []string) error { threads := fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation") verbose := fs.Bool("verbose", false, "verbose output (default: disabled)") noMmap := fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)") - mlock := fs.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing") tensorSplit := fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions") multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users") @@ -876,7 +872,6 @@ func Execute(args []string) error { NumGpuLayers: *nGpuLayers, MainGpu: *mainGpu, UseMmap: !*noMmap && lpaths.String() == "", - UseMlock: *mlock, TensorSplit: tensorSplitFloats, Progress: 
func(progress float32) { server.progress = progress diff --git a/runner/ollamarunner/cache.go b/runner/ollamarunner/cache.go index 01f435e4b..2138d7988 100644 --- a/runner/ollamarunner/cache.go +++ b/runner/ollamarunner/cache.go @@ -284,7 +284,7 @@ func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int32) error { copy(newInputs[numKeep:], slot.Inputs[numKeep+discard:]) // Reset the cache - _ = c.cache.Remove(slot.Id, 0, -1) + _ = c.cache.Remove(slot.Id, 0, math.MaxInt32) slot.Inputs = []input.Input{} // Return error with inputs that need to be reprocessed diff --git a/runner/ollamarunner/runner.go b/runner/ollamarunner/runner.go index 0ac543888..b028a7216 100644 --- a/runner/ollamarunner/runner.go +++ b/runner/ollamarunner/runner.go @@ -34,14 +34,10 @@ import ( _ "github.com/ollama/ollama/model/models" ) -type contextList struct { - list []ml.Context -} - type Sequence struct { // ctxs are used for allocating tensors that last the lifetime of the sequence, such as // multimodal embeddings - ctxs *contextList + ctxs []ml.Context // batch index iBatch int @@ -177,8 +173,10 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe // inputs processes the prompt and images into a list of inputs // by splitting the prompt on [img-] tags, tokenizing text and // decoding images -func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, *contextList, error) { +func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, []ml.Context, error) { var inputs []input.Input + var ctxs []ml.Context + var parts []string var matches [][]string @@ -192,13 +190,6 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, * parts = []string{prompt} } - var contexts contextList - runtime.AddCleanup(&contexts, func(ctxs []ml.Context) { - for _, ctx := range ctxs { - ctx.Close() - } - }, contexts.list) - postTokenize := false for i, part := range parts { // text - tokenize @@ -228,7 +219,8 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, * } ctx := s.model.Backend().NewContext() - contexts.list = append(contexts.list, ctx) + runtime.SetFinalizer(ctx, func(c ml.Context) { c.Close() }) + ctxs = append(ctxs, ctx) imageEmbeddings, err := multimodalProcessor.EncodeMultimodal(ctx, images[imageIndex].Data) if err != nil { return nil, nil, err @@ -251,7 +243,7 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, * } } - return inputs, &contexts, nil + return inputs, ctxs, nil } type Server struct { @@ -826,7 +818,6 @@ func Execute(args []string) error { threads := fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation") verbose := fs.Bool("verbose", false, "verbose output (default: disabled)") _ = fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)") - _ = fs.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing") tensorSplit := fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions") multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users") diff --git a/scripts/build_windows.ps1 b/scripts/build_windows.ps1 index 60485df85..eaac2c600 100644 --- a/scripts/build_windows.ps1 +++ b/scripts/build_windows.ps1 @@ -27,7 +27,6 @@ function checkEnv() { $env:VCToolsRedistDir=(get-item "${MSVC_INSTALL}\VC\Redist\MSVC\*")[0] } # Locate CUDA versions - 
# Note: this assumes every version found will be built $cudaList=(get-item "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v*\bin\" -ea 'silentlycontinue') if ($cudaList.length -eq 0) { $d=(get-command -ea 'silentlycontinue' nvcc).path @@ -94,19 +93,6 @@ function buildOllama() { $hashEnv = @{} Get-ChildItem env: | foreach { $hashEnv[$_.Name] = $_.Value } - if ("$script:CUDA_DIRS".Contains("v11")) { - $hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V11")) { $v11="$_" }} - $env:CUDAToolkit_ROOT=$hashEnv[$v11] - write-host "Building CUDA v11 backend libraries" - # Note: cuda v11 requires msvc 2019 so force the older generator - # to avoid 2022 (or newer) from being used as the default - & cmake --fresh --preset "CUDA 11" -G "Visual Studio 16 2019" --install-prefix $script:DIST_DIR - if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} - & cmake --build --preset "CUDA 11" --config Release --parallel $script:JOBS - if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} - & cmake --install build --component "CUDA" --strip - if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} - } if ("$script:CUDA_DIRS".Contains("v12")) { $hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V12")) { $v12="$_" }} $env:CUDAToolkit_ROOT=$hashEnv[$v12] @@ -121,7 +107,7 @@ function buildOllama() { if ($env:HIP_PATH) { write-host "Building ROCm backend libraries" if (-Not (get-command -ErrorAction silent ninja)) { - $NINJA_DIR=(gci -path (Get-CimInstance MSFT_VSInstance -Namespace root/cimv2/vs)[0].InstallLocation -r -fi ninja.exe) | split-path -parent + $NINJA_DIR=(gci -path (Get-CimInstance MSFT_VSInstance -Namespace root/cimv2/vs)[0].InstallLocation -r -fi ninja.exe).Directory.FullName $env:PATH="$NINJA_DIR;$env:PATH" } $env:HIPCXX="${env:HIP_PATH}\bin\clang++.exe" diff --git a/scripts/env.sh b/scripts/env.sh index c5e6f530a..65a970bdc 100644 --- a/scripts/env.sh +++ b/scripts/env.sh @@ -10,9 +10,7 @@ OLLAMA_COMMON_BUILD_ARGS="--build-arg=VERSION \ --build-arg=GOFLAGS \ --build-arg=OLLAMA_CUSTOM_CPU_DEFS \ --build-arg=OLLAMA_SKIP_CUDA_GENERATE \ - --build-arg=OLLAMA_SKIP_CUDA_11_GENERATE \ --build-arg=OLLAMA_SKIP_CUDA_12_GENERATE \ - --build-arg=CUDA_V11_ARCHITECTURES \ --build-arg=CUDA_V12_ARCHITECTURES \ --build-arg=OLLAMA_SKIP_ROCM_GENERATE \ --build-arg=OLLAMA_FAST_BUILD \ diff --git a/server/create.go b/server/create.go index 50e669db0..7ffa60a22 100644 --- a/server/create.go +++ b/server/create.go @@ -15,6 +15,7 @@ import ( "path/filepath" "slices" "strings" + "sync/atomic" "github.com/gin-gonic/gin" @@ -23,7 +24,6 @@ import ( "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/format" "github.com/ollama/ollama/fs/ggml" - "github.com/ollama/ollama/llama" "github.com/ollama/ollama/template" "github.com/ollama/ollama/types/errtypes" "github.com/ollama/ollama/types/model" @@ -425,9 +425,14 @@ func createModel(r api.CreateRequest, name model.Name, baseLayers []*layerGGML, func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.ProgressResponse)) (*layerGGML, error) { ft := layer.GGML.KV().FileType() - fn(api.ProgressResponse{Status: fmt.Sprintf("quantizing %s model to %s", ft, quantizeType)}) - - want, err := ggml.ParseFileType(quantizeType) + var doneBytes atomic.Uint64 + totalBytes := uint64(layer.Size) - layer.GGML.Tensors().Offset + fnWrap := func(n uint64) { + done := doneBytes.Add(n) + progress := float32(done) / float32(totalBytes) + fn(api.ProgressResponse{Status: fmt.Sprintf("quantizing %s model to %s", ft, quantizeType), Digest: "0", Total: layer.Size, Completed: int64(progress * 
float32(layer.Size))}) + } + ftype, err := ggml.ParseFileType(quantizeType) if err != nil { return nil, err } @@ -436,6 +441,11 @@ func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.Progr if err != nil { return nil, err } + fp, err := os.Open(blob) + if err != nil { + return nil, err + } + defer fp.Close() temp, err := os.CreateTemp(filepath.Dir(blob), quantizeType) if err != nil { @@ -444,15 +454,15 @@ func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.Progr defer temp.Close() defer os.Remove(temp.Name()) - if err := llama.Quantize(blob, temp.Name(), uint32(want)); err != nil { + if err := quantize(fp, temp, layer.GGML, ftype, fnWrap); err != nil { return nil, err } - + temp.Seek(0, io.SeekStart) + fn(api.ProgressResponse{Status: "verifying conversion"}) newLayer, err := NewLayer(temp, layer.MediaType) if err != nil { return nil, err } - if _, err := temp.Seek(0, io.SeekStart); err != nil { return nil, err } @@ -462,7 +472,6 @@ func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.Progr slog.Error(fmt.Sprintf("error decoding ggml: %s\n", err)) return nil, err } - return &layerGGML{newLayer, f}, nil } diff --git a/server/images.go b/server/images.go index be629f4cb..352f10f2b 100644 --- a/server/images.go +++ b/server/images.go @@ -106,6 +106,11 @@ func (m *Model) Capabilities() []model.Capability { capabilities = append(capabilities, model.CapabilityInsert) } + // Check for vision capability in projector-based models + if len(m.ProjectorPaths) > 0 { + capabilities = append(capabilities, model.CapabilityVision) + } + return capabilities } diff --git a/server/images_test.go b/server/images_test.go index 22e5b7e6a..363b298e1 100644 --- a/server/images_test.go +++ b/server/images_test.go @@ -3,6 +3,7 @@ package server import ( "bytes" "encoding/binary" + "errors" "os" "path/filepath" "strings" @@ -91,11 +92,7 @@ func createMockGGUFData(architecture string, vision bool) []byte { func TestModelCapabilities(t *testing.T) { // Create a temporary directory for test files - tempDir, err := os.MkdirTemp("", "model_capabilities_test") - if err != nil { - t.Fatalf("Failed to create temp directory: %v", err) - } - defer os.RemoveAll(tempDir) + tempDir := t.TempDir() // Create different types of mock model files completionModelPath := filepath.Join(tempDir, "model.bin") @@ -104,21 +101,13 @@ func TestModelCapabilities(t *testing.T) { // Create a simple model file for tests that don't depend on GGUF content simpleModelPath := filepath.Join(tempDir, "simple_model.bin") - err = os.WriteFile(completionModelPath, createMockGGUFData("llama", false), 0o644) - if err != nil { - t.Fatalf("Failed to create completion model file: %v", err) - } - err = os.WriteFile(visionModelPath, createMockGGUFData("llama", true), 0o644) - if err != nil { - t.Fatalf("Failed to create completion model file: %v", err) - } - err = os.WriteFile(embeddingModelPath, createMockGGUFData("bert", false), 0o644) - if err != nil { - t.Fatalf("Failed to create embedding model file: %v", err) - } - err = os.WriteFile(simpleModelPath, []byte("dummy model data"), 0o644) - if err != nil { - t.Fatalf("Failed to create simple model file: %v", err) + if err := errors.Join( + os.WriteFile(completionModelPath, createMockGGUFData("llama", false), 0o644), + os.WriteFile(visionModelPath, createMockGGUFData("llama", true), 0o644), + os.WriteFile(embeddingModelPath, createMockGGUFData("bert", false), 0o644), + os.WriteFile(simpleModelPath, []byte("dummy model data"), 0o644), + ); err != 
nil { + t.Fatalf("Failed to create model files: %v", err) } toolsInsertTemplate, err := template.Parse("{{ .prompt }}{{ if .tools }}{{ .tools }}{{ end }}{{ if .suffix }}{{ .suffix }}{{ end }}") @@ -236,27 +225,18 @@ func TestModelCapabilities(t *testing.T) { func TestModelCheckCapabilities(t *testing.T) { // Create a temporary directory for test files - tempDir, err := os.MkdirTemp("", "model_check_capabilities_test") - if err != nil { - t.Fatalf("Failed to create temp directory: %v", err) - } - defer os.RemoveAll(tempDir) + tempDir := t.TempDir() visionModelPath := filepath.Join(tempDir, "vision_model.bin") simpleModelPath := filepath.Join(tempDir, "model.bin") embeddingModelPath := filepath.Join(tempDir, "embedding_model.bin") - err = os.WriteFile(simpleModelPath, []byte("dummy model data"), 0o644) - if err != nil { - t.Fatalf("Failed to create simple model file: %v", err) - } - err = os.WriteFile(visionModelPath, createMockGGUFData("llama", true), 0o644) - if err != nil { - t.Fatalf("Failed to create vision model file: %v", err) - } - err = os.WriteFile(embeddingModelPath, createMockGGUFData("bert", false), 0o644) - if err != nil { - t.Fatalf("Failed to create embedding model file: %v", err) + if err := errors.Join( + os.WriteFile(simpleModelPath, []byte("dummy model data"), 0o644), + os.WriteFile(visionModelPath, createMockGGUFData("llama", true), 0o644), + os.WriteFile(embeddingModelPath, createMockGGUFData("bert", false), 0o644), + ); err != nil { + t.Fatalf("Failed to create model files: %v", err) } toolsInsertTemplate, err := template.Parse("{{ .prompt }}{{ if .tools }}{{ .tools }}{{ end }}{{ if .suffix }}{{ .suffix }}{{ end }}") diff --git a/server/internal/cmd/opp/internal/safetensors/safetensors.go b/server/internal/cmd/opp/internal/safetensors/safetensors.go deleted file mode 100644 index 7a45b91df..000000000 --- a/server/internal/cmd/opp/internal/safetensors/safetensors.go +++ /dev/null @@ -1,224 +0,0 @@ -// safetensors provides a reader for the safetensor directories and files. -package safetensors - -import ( - "encoding/json" - "fmt" - "io" - "io/fs" - "iter" - "slices" - "strconv" - "strings" -) - -// Tensor represents a single tensor in a safetensors file. -// -// It's zero value is not valid. Use [Model.Tensors] to get valid tensors. -// -// It is not safe for use across multiple goroutines. 
-type Tensor struct { - name string - dataType string - shape []int64 - - fsys fs.FS - fname string // entry name in fsys - offset int64 - size int64 -} - -type Model struct { - fsys fs.FS -} - -func Read(fsys fs.FS) (*Model, error) { - return &Model{fsys: fsys}, nil -} - -func (m *Model) Tensors() iter.Seq2[*Tensor, error] { - return func(yield func(*Tensor, error) bool) { - entries, err := fs.Glob(m.fsys, "*.safetensors") - if err != nil { - yield(nil, err) - return - } - for _, e := range entries { - tt, err := m.readTensors(e) - if err != nil { - yield(nil, err) - return - } - for _, t := range tt { - if !yield(t, nil) { - return - } - } - } - } -} - -func (m *Model) readTensors(fname string) ([]*Tensor, error) { - f, err := m.fsys.Open(fname) - if err != nil { - return nil, err - } - defer f.Close() - - finfo, err := f.Stat() - if err != nil { - return nil, err - } - - headerSize, err := readInt64(f) - if err != nil { - return nil, err - } - - data := make([]byte, headerSize) - _, err = io.ReadFull(f, data) - if err != nil { - return nil, err - } - - var raws map[string]json.RawMessage - if err := json.Unmarshal(data, &raws); err != nil { - return nil, err - } - - endOfHeader := 8 + headerSize // 8 bytes for header size plus the header itself - - // TODO(bmizerany): do something with metadata? This could be another - // header read if needed. We also need to figure out if the metadata is - // present in only one .safetensors file or if each file may have their - // own and if it needs to follow each tensor. Currently, I (bmizerany) - // am only seeing them show up with one entry for file type which is - // always "pt". - - tt := make([]*Tensor, 0, len(raws)) - for name, raw := range raws { - if name == "__metadata__" { - // TODO(bmizerany): do something with metadata? - continue - } - var v struct { - DataType string `json:"dtype"` - Shape []int64 `json:"shape"` - Offsets []int64 `json:"data_offsets"` - } - if err := json.Unmarshal(raw, &v); err != nil { - return nil, fmt.Errorf("error unmarshalling layer %q: %w", name, err) - } - if len(v.Offsets) != 2 { - return nil, fmt.Errorf("invalid offsets for %q: %v", name, v.Offsets) - } - - // TODO(bmizerany): after collecting, validate all offests make - // tensors contiguous? - begin := endOfHeader + v.Offsets[0] - end := endOfHeader + v.Offsets[1] - if err := checkBeginEnd(finfo.Size(), begin, end); err != nil { - return nil, err - } - - // TODO(bmizerany): just yield.. 
don't be silly and make a slice :) - tt = append(tt, &Tensor{ - name: name, - dataType: v.DataType, - shape: v.Shape, - fsys: m.fsys, - fname: fname, - offset: begin, - size: end - begin, - }) - } - return tt, nil -} - -func checkBeginEnd(size, begin, end int64) error { - if begin < 0 { - return fmt.Errorf("begin must not be negative: %d", begin) - } - if end < 0 { - return fmt.Errorf("end must not be negative: %d", end) - } - if end < begin { - return fmt.Errorf("end must be >= begin: %d < %d", end, begin) - } - if end > size { - return fmt.Errorf("end must be <= size: %d > %d", end, size) - } - return nil -} - -func readInt64(r io.Reader) (int64, error) { - var v uint64 - var buf [8]byte - if _, err := io.ReadFull(r, buf[:]); err != nil { - return 0, err - } - for i := range buf { - v |= uint64(buf[i]) << (8 * i) - } - return int64(v), nil -} - -type Shape []int64 - -func (s Shape) String() string { - var b strings.Builder - b.WriteByte('[') - for i, v := range s { - if i > 0 { - b.WriteByte(',') - } - b.WriteString(strconv.FormatInt(v, 10)) - } - b.WriteByte(']') - return b.String() -} - -func (t *Tensor) Name() string { return t.name } -func (t *Tensor) DataType() string { return t.dataType } -func (t *Tensor) Size() int64 { return t.size } -func (t *Tensor) Shape() Shape { return slices.Clone(t.shape) } - -func (t *Tensor) Reader() (io.ReadCloser, error) { - f, err := t.fsys.Open(t.fname) - if err != nil { - return nil, err - } - r := newSectionReader(f, t.offset, t.size) - rc := struct { - io.Reader - io.Closer - }{r, f} - return rc, nil -} - -// newSectionReader returns a new io.Reader that reads from r starting at -// offset. It is a convenience function for creating a io.SectionReader when r -// may not be an io.ReaderAt. -// -// If r is already a ReaderAt, it is returned directly, otherwise if r is an -// io.Seeker, a new io.ReaderAt is returned that wraps r after seeking to the -// beginning of the file. -// -// If r is an io.Seeker, -// or slow path. The slow path is used when r does not implement io.ReaderAt, -// in which case it must discard the data it reads. -func newSectionReader(r io.Reader, offset, n int64) io.Reader { - if r, ok := r.(io.ReaderAt); ok { - return io.NewSectionReader(r, offset, n) - } - if r, ok := r.(io.ReadSeeker); ok { - r.Seek(offset, io.SeekStart) - return io.LimitReader(r, n) - } - // Discard to offset and return a limited reader. - _, err := io.CopyN(io.Discard, r, offset) - if err != nil { - return nil - } - return io.LimitReader(r, n) -} diff --git a/server/internal/cmd/opp/opp.go b/server/internal/cmd/opp/opp.go deleted file mode 100644 index 6976927c7..000000000 --- a/server/internal/cmd/opp/opp.go +++ /dev/null @@ -1,375 +0,0 @@ -package main - -import ( - "bytes" - "cmp" - "context" - "encoding/json" - "errors" - "flag" - "fmt" - "io" - "log" - "mime" - "net/http" - "os" - "runtime" - "strings" - "sync" - "sync/atomic" - "time" - - "github.com/ollama/ollama/server/internal/cache/blob" - "github.com/ollama/ollama/server/internal/client/ollama" - "github.com/ollama/ollama/server/internal/cmd/opp/internal/safetensors" - "golang.org/x/sync/errgroup" -) - -var stdout io.Writer = os.Stdout - -const usage = `Opp is a tool for pushing and pulling Ollama models. - -Usage: - - opp [flags] - -Commands: - - push Upload a model to the Ollama server. - pull Download a model from the Ollama server. - import Import a model from a local safetensor directory. - -Examples: - - # Pull a model from the Ollama server. 
- opp pull library/llama3.2:latest - - # Push a model to the Ollama server. - opp push username/my_model:8b - - # Import a model from a local safetensor directory. - opp import /path/to/safetensor - -Envionment Variables: - - OLLAMA_MODELS - The directory where models are pushed and pulled from - (default ~/.ollama/models). -` - -func main() { - flag.Usage = func() { - fmt.Fprint(os.Stderr, usage) - } - flag.Parse() - - ctx := context.Background() - - err := func() error { - switch cmd := flag.Arg(0); cmd { - case "pull": - rc, err := ollama.DefaultRegistry() - if err != nil { - log.Fatal(err) - } - - return cmdPull(ctx, rc) - case "push": - rc, err := ollama.DefaultRegistry() - if err != nil { - log.Fatal(err) - } - return cmdPush(ctx, rc) - case "import": - c, err := ollama.DefaultCache() - if err != nil { - log.Fatal(err) - } - return cmdImport(ctx, c) - default: - if cmd == "" { - flag.Usage() - } else { - fmt.Fprintf(os.Stderr, "unknown command %q\n", cmd) - } - os.Exit(2) - return errors.New("unreachable") - } - }() - if err != nil { - fmt.Fprintf(os.Stderr, "opp: %v\n", err) - os.Exit(1) - } -} - -func cmdPull(ctx context.Context, rc *ollama.Registry) error { - model := flag.Arg(1) - if model == "" { - flag.Usage() - os.Exit(1) - } - - tr := http.DefaultTransport.(*http.Transport).Clone() - // TODO(bmizerany): configure transport? - rc.HTTPClient = &http.Client{Transport: tr} - - var mu sync.Mutex - p := make(map[blob.Digest][2]int64) // digest -> [total, downloaded] - - var pb bytes.Buffer - printProgress := func() { - pb.Reset() - mu.Lock() - for d, s := range p { - // Write progress to a buffer first to avoid blocking - // on stdout while holding the lock. - stamp := time.Now().Format("2006/01/02 15:04:05") - fmt.Fprintf(&pb, "%s %s pulling %d/%d (%.1f%%)\n", stamp, d.Short(), s[1], s[0], 100*float64(s[1])/float64(s[0])) - if s[0] == s[1] { - delete(p, d) - } - } - mu.Unlock() - io.Copy(stdout, &pb) - } - - ctx = ollama.WithTrace(ctx, &ollama.Trace{ - Update: func(l *ollama.Layer, n int64, err error) { - if err != nil && !errors.Is(err, ollama.ErrCached) { - fmt.Fprintf(stdout, "opp: pull %s ! %v\n", l.Digest.Short(), err) - return - } - - mu.Lock() - p[l.Digest] = [2]int64{l.Size, n} - mu.Unlock() - }, - }) - - errc := make(chan error) - go func() { - errc <- rc.Pull(ctx, model) - }() - - t := time.NewTicker(time.Second) - defer t.Stop() - for { - select { - case <-t.C: - printProgress() - case err := <-errc: - printProgress() - return err - } - } -} - -func cmdPush(ctx context.Context, rc *ollama.Registry) error { - args := flag.Args()[1:] - flag := flag.NewFlagSet("push", flag.ExitOnError) - flagFrom := flag.String("from", "", "Use the manifest from a model by another name.") - flag.Usage = func() { - fmt.Fprintf(os.Stderr, "Usage: opp push \n") - flag.PrintDefaults() - } - flag.Parse(args) - - model := flag.Arg(0) - if model == "" { - return fmt.Errorf("missing model argument") - } - - from := cmp.Or(*flagFrom, model) - m, err := rc.ResolveLocal(from) - if err != nil { - return err - } - - ctx = ollama.WithTrace(ctx, &ollama.Trace{ - Update: func(l *ollama.Layer, n int64, err error) { - switch { - case errors.Is(err, ollama.ErrCached): - fmt.Fprintf(stdout, "opp: uploading %s %d (existed)", l.Digest.Short(), n) - case err != nil: - fmt.Fprintf(stdout, "opp: uploading %s %d ! 
%v\n", l.Digest.Short(), n, err) - case n == 0: - l := m.Layer(l.Digest) - mt, p, _ := mime.ParseMediaType(l.MediaType) - mt, _ = strings.CutPrefix(mt, "application/vnd.ollama.image.") - switch mt { - case "tensor": - fmt.Fprintf(stdout, "opp: uploading tensor %s %s\n", l.Digest.Short(), p["name"]) - default: - fmt.Fprintf(stdout, "opp: uploading %s %s\n", l.Digest.Short(), l.MediaType) - } - } - }, - }) - - return rc.Push(ctx, model, &ollama.PushParams{ - From: from, - }) -} - -type trackingReader struct { - io.Reader - n *atomic.Int64 -} - -func (r *trackingReader) Read(p []byte) (n int, err error) { - n, err = r.Reader.Read(p) - r.n.Add(int64(n)) - return n, err -} - -func cmdImport(ctx context.Context, c *blob.DiskCache) error { - args := flag.Args()[1:] - flag := flag.NewFlagSet("import", flag.ExitOnError) - flagAs := flag.String("as", "", "Import using the provided name.") - flag.Usage = func() { - fmt.Fprintf(os.Stderr, "Usage: opp import \n") - flag.PrintDefaults() - } - flag.Parse(args) - if *flagAs == "" { - return fmt.Errorf("missing -as flag") - } - as := ollama.CompleteName(*flagAs) - - dir := cmp.Or(flag.Arg(0), ".") - fmt.Fprintf(os.Stderr, "Reading %s\n", dir) - - m, err := safetensors.Read(os.DirFS(dir)) - if err != nil { - return err - } - - var total int64 - var tt []*safetensors.Tensor - for t, err := range m.Tensors() { - if err != nil { - return err - } - tt = append(tt, t) - total += t.Size() - } - - var n atomic.Int64 - done := make(chan error) - go func() { - layers := make([]*ollama.Layer, len(tt)) - var g errgroup.Group - g.SetLimit(runtime.GOMAXPROCS(0)) - var ctxErr error - for i, t := range tt { - if ctx.Err() != nil { - // The context may cancel AFTER we exit the - // loop, and so if we use ctx.Err() after the - // loop we may report it as the error that - // broke the loop, when it was not. This can - // manifest as a false-negative, leading the - // user to think their import failed when it - // did not, so capture it if and only if we - // exit the loop because of a ctx.Err() and - // report it. 
- ctxErr = ctx.Err() - break - } - g.Go(func() (err error) { - rc, err := t.Reader() - if err != nil { - return err - } - defer rc.Close() - tr := &trackingReader{rc, &n} - d, err := c.Import(tr, t.Size()) - if err != nil { - return err - } - if err := rc.Close(); err != nil { - return err - } - - layers[i] = &ollama.Layer{ - Digest: d, - Size: t.Size(), - MediaType: mime.FormatMediaType("application/vnd.ollama.image.tensor", map[string]string{ - "name": t.Name(), - "dtype": t.DataType(), - "shape": t.Shape().String(), - }), - } - - return nil - }) - } - - done <- func() error { - if err := errors.Join(g.Wait(), ctxErr); err != nil { - return err - } - m := &ollama.Manifest{Layers: layers} - data, err := json.MarshalIndent(m, "", " ") - if err != nil { - return err - } - d := blob.DigestFromBytes(data) - err = blob.PutBytes(c, d, data) - if err != nil { - return err - } - return c.Link(as, d) - }() - }() - - fmt.Fprintf(stdout, "Importing %d tensors from %s\n", len(tt), dir) - - csiHideCursor(stdout) - defer csiShowCursor(stdout) - - csiSavePos(stdout) - writeProgress := func() { - csiRestorePos(stdout) - nn := n.Load() - fmt.Fprintf(stdout, "Imported %s/%s bytes (%d%%)%s\n", - formatNatural(nn), - formatNatural(total), - nn*100/total, - ansiClearToEndOfLine, - ) - } - - ticker := time.NewTicker(time.Second) - defer ticker.Stop() - for { - select { - case <-ticker.C: - writeProgress() - case err := <-done: - writeProgress() - fmt.Println() - fmt.Println("Successfully imported", as) - return err - } - } -} - -func formatNatural(n int64) string { - switch { - case n < 1024: - return fmt.Sprintf("%d B", n) - case n < 1024*1024: - return fmt.Sprintf("%.1f KB", float64(n)/1024) - case n < 1024*1024*1024: - return fmt.Sprintf("%.1f MB", float64(n)/(1024*1024)) - default: - return fmt.Sprintf("%.1f GB", float64(n)/(1024*1024*1024)) - } -} - -const ansiClearToEndOfLine = "\033[K" - -func csiSavePos(w io.Writer) { fmt.Fprint(w, "\033[s") } -func csiRestorePos(w io.Writer) { fmt.Fprint(w, "\033[u") } -func csiHideCursor(w io.Writer) { fmt.Fprint(w, "\033[?25l") } -func csiShowCursor(w io.Writer) { fmt.Fprint(w, "\033[?25h") } diff --git a/server/internal/internal/backoff/backoff_test.go b/server/internal/internal/backoff/backoff_test.go index 11ace22a8..f474118f0 100644 --- a/server/internal/internal/backoff/backoff_test.go +++ b/server/internal/internal/backoff/backoff_test.go @@ -3,7 +3,6 @@ package backoff import ( - "context" "testing" "testing/synctest" "time" @@ -29,7 +28,7 @@ func TestLoopAllocs(t *testing.T) { } func BenchmarkLoop(b *testing.B) { - ctx := context.Background() + ctx := b.Context() synctest.Run(func() { for n := range Loop(ctx, 100*time.Millisecond) { if n == b.N { diff --git a/server/model.go b/server/model.go index e733fbdb1..2149ff855 100644 --- a/server/model.go +++ b/server/model.go @@ -64,7 +64,7 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe } defer blob.Close() - f, _, err := ggml.Decode(blob, 1024) + f, _, err := ggml.Decode(blob, -1) if err != nil { return nil, err } diff --git a/server/modelpath_test.go b/server/modelpath_test.go index 849e0fa73..96429f958 100644 --- a/server/modelpath_test.go +++ b/server/modelpath_test.go @@ -1,7 +1,6 @@ package server import ( - "os" "path/filepath" "testing" @@ -11,9 +10,7 @@ import ( func TestGetBlobsPath(t *testing.T) { // GetBlobsPath expects an actual directory to exist - dir, err := os.MkdirTemp("", "ollama-test") - require.NoError(t, err) - defer os.RemoveAll(dir) + tempDir := 
t.TempDir() tests := []struct { name string @@ -24,19 +21,19 @@ func TestGetBlobsPath(t *testing.T) { { "empty digest", "", - filepath.Join(dir, "blobs"), + filepath.Join(tempDir, "blobs"), nil, }, { "valid with colon", "sha256:456402914e838a953e0cf80caa6adbe75383d9e63584a964f504a7bbb8f7aad9", - filepath.Join(dir, "blobs", "sha256-456402914e838a953e0cf80caa6adbe75383d9e63584a964f504a7bbb8f7aad9"), + filepath.Join(tempDir, "blobs", "sha256-456402914e838a953e0cf80caa6adbe75383d9e63584a964f504a7bbb8f7aad9"), nil, }, { "valid with dash", "sha256-456402914e838a953e0cf80caa6adbe75383d9e63584a964f504a7bbb8f7aad9", - filepath.Join(dir, "blobs", "sha256-456402914e838a953e0cf80caa6adbe75383d9e63584a964f504a7bbb8f7aad9"), + filepath.Join(tempDir, "blobs", "sha256-456402914e838a953e0cf80caa6adbe75383d9e63584a964f504a7bbb8f7aad9"), nil, }, { @@ -60,7 +57,7 @@ func TestGetBlobsPath(t *testing.T) { } for _, tc := range tests { t.Run(tc.name, func(t *testing.T) { - t.Setenv("OLLAMA_MODELS", dir) + t.Setenv("OLLAMA_MODELS", tempDir) got, err := GetBlobsPath(tc.digest) diff --git a/server/prompt_test.go b/server/prompt_test.go index 62aec86a9..b81c01eef 100644 --- a/server/prompt_test.go +++ b/server/prompt_test.go @@ -2,7 +2,6 @@ package server import ( "bytes" - "context" "image" "image/png" "testing" @@ -318,7 +317,7 @@ func TestChatPrompt(t *testing.T) { t.Run(tt.name, func(t *testing.T) { model := tt.model opts := api.Options{Runner: api.Runner{NumCtx: tt.limit}} - prompt, images, err := chatPrompt(context.TODO(), &model, mockRunner{}.Tokenize, &opts, tt.msgs, nil) + prompt, images, err := chatPrompt(t.Context(), &model, mockRunner{}.Tokenize, &opts, tt.msgs, nil) if tt.error == nil && err != nil { t.Fatal(err) } else if tt.error != nil && err != tt.error { diff --git a/server/quantization.go b/server/quantization.go new file mode 100644 index 000000000..80bc093db --- /dev/null +++ b/server/quantization.go @@ -0,0 +1,274 @@ +package server + +import ( + "fmt" + "io" + "log/slog" + "maps" + "os" + "strings" + "unsafe" + + fsggml "github.com/ollama/ollama/fs/ggml" + "github.com/ollama/ollama/ml/backend/ggml" +) + +type quantizer struct { + *os.File + offset uint64 + from, to *fsggml.Tensor + progressFn func(n uint64) +} + +func (q quantizer) WriteTo(w io.Writer) (int64, error) { + quantize := q.from.Kind != q.to.Kind + sr := io.NewSectionReader(q, int64(q.offset), int64(q.from.Size())) + if !quantize { + n, err := io.Copy(w, sr) + q.progressFn(q.from.Size()) + return n, err + } + data, err := io.ReadAll(sr) + if err != nil { + slog.Warn("file read error", "tensor", q.from.Name, "file", q.Name(), "error", err) + return 0, fmt.Errorf("unable to read tensor %s from %s: %s", q.from.Name, q.Name(), err) + } + var f32s []float32 + newType := fsggml.TensorType(q.to.Kind) + if fsggml.TensorType(q.from.Kind) == fsggml.TensorTypeF32 { + f32s = unsafe.Slice((*float32)(unsafe.Pointer(&data[0])), q.from.Elements()) + } else { + f32s = ggml.ConvertToF32(data, q.from.Kind, q.from.Elements()) + } + data = ggml.Quantize(newType, f32s, q.from.Shape) + n, err := w.Write(data) + q.progressFn(q.from.Size()) + return int64(n), err +} + +type quantizeState struct { + nAttnV int // Number of attn_*v* weight tensors + nFfnDown int // Number of ffn_down tensors + iAttnV int // Running counter of number of attn_v tensors that have been processed + iFfnDown int // Running counter of number of ffn_down tensors that have been processed + hasOutput bool // used to figure out if a model shares tok_embd with the output weight +} + +func 
useMoreBits(iLayer, nLayers int) bool { + return iLayer < (nLayers/8) || iLayer >= 7*nLayers/8 || (iLayer-nLayers/8)%3 == 2 +} + +func getTensorNewType(kv fsggml.KV, qs *quantizeState, newType fsggml.TensorType, name string, shape []uint64, ftype fsggml.FileType) fsggml.TensorType { + // Ported from llama_tensor_get_type, removed unsupported quantization types + nExperts := max(1, kv.Uint("expert_count", 0)) + if name == "output.weight" || name == "output_norm.weight" || (!qs.hasOutput && name == "token_embd.weight") { + nx := shape[0] + qk_k := newType.BlockSize() + if nx%qk_k != 0 { + newType = fsggml.TensorTypeQ8_0 + } else if newType != fsggml.TensorTypeQ8_0 { + newType = fsggml.TensorTypeQ6_K + } + } else if strings.Contains(name, "attn_v.weight") { + if ftype == fsggml.FileTypeQ2_K { + if kv.GQA() >= 4 { + newType = fsggml.TensorTypeQ4_K + } else { + newType = fsggml.TensorTypeQ3_K + } + } else if ftype == fsggml.FileTypeQ2_K_S && kv.GQA() >= 4 { + newType = fsggml.TensorTypeQ4_K + } else if ftype == fsggml.FileTypeQ3_K_M { + if qs.iAttnV < 2 { + newType = fsggml.TensorTypeQ5_K + } else { + newType = fsggml.TensorTypeQ4_K + } + } else if ftype == fsggml.FileTypeQ3_K_L { + newType = fsggml.TensorTypeQ5_K + } else if (ftype == fsggml.FileTypeQ4_K_M || ftype == fsggml.FileTypeQ5_K_M) && + useMoreBits(qs.iAttnV, qs.nAttnV) { + newType = fsggml.TensorTypeQ6_K + } else if ftype == fsggml.FileTypeQ4_K_S && qs.iAttnV < 4 { + newType = fsggml.TensorTypeQ5_K + } + + // TODO + // if (qs.model.type == LLM_TYPE_70B) { + // // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is + // // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with + // // nearly negligible increase in model size by quantizing this tensor with more bits: + // if (newType == GGML_TYPE_Q3_K || newType == GGML_TYPE_Q4_K) newType = GGML_TYPE_Q5_K; + // } + + if nExperts == 8 { + // for the 8-expert model, bumping this to Q8_0 trades just ~128MB + newType = fsggml.TensorTypeQ8_0 + } + qs.iAttnV++ + } else if strings.Contains(name, "attn_k.weight") { + if nExperts == 8 { + // for the 8-expert model, bumping this to Q8_0 trades just ~128MB + newType = fsggml.TensorTypeQ8_0 + } + } else if strings.Contains(name, "ffn_down") { + iLayer := qs.iFfnDown + n_layer := qs.nFfnDown + if ftype == fsggml.FileTypeQ2_K { + newType = fsggml.TensorTypeQ3_K + } else if ftype == fsggml.FileTypeQ2_K_S { + if iLayer < n_layer/8 { + newType = fsggml.TensorTypeQ4_K + } + } else if ftype == fsggml.FileTypeQ3_K_M { + if iLayer < n_layer/16 { + newType = fsggml.TensorTypeQ5_K + } else if useMoreBits(iLayer, n_layer) { + newType = fsggml.TensorTypeQ4_K + } else { + newType = fsggml.TensorTypeQ3_K + } + } else if ftype == fsggml.FileTypeQ3_K_L { + newType = fsggml.TensorTypeQ5_K + } else if ftype == fsggml.FileTypeQ4_K_M { + if useMoreBits(iLayer, n_layer) { + newType = fsggml.TensorTypeQ6_K + } + } else if ftype == fsggml.FileTypeQ5_K_M && useMoreBits(iLayer, n_layer) { + newType = fsggml.TensorTypeQ6_K + } else if ftype == fsggml.FileTypeQ4_K_S && iLayer < n_layer/8 { + newType = fsggml.TensorTypeQ5_K + } + qs.iFfnDown++ + } else if strings.Contains(name, "attn_output.weight") { + if nExperts == 8 { + if ftype == fsggml.FileTypeQ2_K || ftype == fsggml.FileTypeQ3_K_S || ftype == fsggml.FileTypeQ3_K_M || + ftype == fsggml.FileTypeQ4_K_S || ftype == fsggml.FileTypeQ4_K_M { + newType = fsggml.TensorTypeQ5_K + } + } else { + if ftype == fsggml.FileTypeQ2_K { + 
newType = fsggml.TensorTypeQ3_K + } else if ftype == fsggml.FileTypeQ3_K_M { + newType = fsggml.TensorTypeQ4_K + } else if ftype == fsggml.FileTypeQ3_K_L { + newType = fsggml.TensorTypeQ5_K + } + } + } else if strings.Contains(name, "attn_qkv.weight") { + if ftype == fsggml.FileTypeQ3_K_M || ftype == fsggml.FileTypeQ3_K_L { + newType = fsggml.TensorTypeQ4_K + } else if ftype == fsggml.FileTypeQ4_K_M { + newType = fsggml.TensorTypeQ5_K + } else if ftype == fsggml.FileTypeQ5_K_M { + newType = fsggml.TensorTypeQ6_K + } + } + + if newType.IsQuantized() { + nx := shape[0] + ny := uint64(1) + if len(shape) > 1 { + ny = shape[1] + } + qk_k := newType.BlockSize() + if nx%qk_k != 0 { + slog.Warn(fmt.Sprintf("tensor cols %d x %d are not divisible by %d, required for %s. Falling back to quantization %s", nx, ny, qk_k, newType.String(), fsggml.TensorTypeF16.String())) + newType = fsggml.TensorTypeF16 + } + } + return newType +} + +func quantize(in, out *os.File, orig *fsggml.GGML, newFileType fsggml.FileType, progressFn func(n uint64)) error { + kv := maps.Clone(orig.KV()) + kv["general.file_type"] = newFileType + // kv["general.quantization_version"] = ggml.QuantizationVersion() + qs := &quantizeState{} + // Build up the quantize state so newType can adjust types + layerCount := 0 + for k, l := range orig.Tensors().GroupLayers() { + if strings.HasPrefix(k, "blk.") { + layerCount++ + } + for _, tensor := range l { + if strings.Contains(tensor.Name, "attn_v.weight") || + strings.Contains(tensor.Name, "attn_qkv.weight") || + strings.Contains(tensor.Name, "attn_kv_b.weight") { + qs.nAttnV++ + } else if tensor.Name == "output.weight" { + qs.hasOutput = true + } + } + } + qs.nFfnDown = layerCount + + origTensors := orig.Tensors().Items() + outputTensors := make([]*fsggml.Tensor, len(origTensors)) + for i, tensor := range origTensors { + tensor := tensor + newType := newType(tensor, kv, qs, newFileType) + newTensor := &fsggml.Tensor{ + Name: tensor.Name, + Shape: tensor.Shape, + Kind: uint32(newType), + } + outputTensors[i] = newTensor + outputTensors[i].WriterTo = quantizer{ + File: in, + offset: orig.Tensors().Offset + tensor.Offset, + from: tensor, + to: newTensor, + progressFn: progressFn, + } + } + return fsggml.WriteGGUF(out, kv, outputTensors) +} + +func newType(t *fsggml.Tensor, kv fsggml.KV, qs *quantizeState, ftype fsggml.FileType) fsggml.TensorType { + defaultType := ftype.ToTensorType() + name := t.Name + quantize := strings.HasSuffix(name, "weight") + + // don't quantize vision stuff + quantize = quantize && (!strings.Contains(name, "v.") || strings.Contains(name, "_v.")) + quantize = quantize && !strings.Contains(name, "mm.") + + // quantize only 2D and 3D tensors (experts) + quantize = quantize && (len(t.Shape) >= 2) + + // do not quantize norm tensors + quantize = quantize && !strings.Contains(name, "_norm.weight") + + // do not quantize expert gating tensors + quantize = quantize && !strings.Contains(name, "ffn_gate_inp.weight") + + // do not quantize positional embeddings and token types (BERT) + quantize = quantize && (name != "position_embd.weight") + quantize = quantize && (name != "token_types.weight") + + // do not quantize Mamba's small yet 2D weights + // NOTE: can't use LLM_TN here because the layer number is not known + quantize = quantize && !strings.Contains(name, "ssm_conv1d.weight") + + // do not quantize RWKV's time_mix_first tensors + quantize = quantize && !strings.Contains(name, "time_mix_first.weight") + quantize = quantize && !strings.Contains(name, "time_mix_w1.weight") 
+ quantize = quantize && !strings.Contains(name, "time_mix_w2.weight") + quantize = quantize && !strings.Contains(name, "time_mix_decay_w1.weight") + quantize = quantize && !strings.Contains(name, "time_mix_decay_w2.weight") + quantize = quantize && !strings.Contains(name, "time_mix_lerp_fused.weight") + + // do not quantize relative position bias (T5) + quantize = quantize && !strings.Contains(name, "attn_rel_b.weight") + + newType := fsggml.TensorType(t.Kind) + if quantize { + // get more optimal quantization type based on the tensor shape, layer, etc. + newType = getTensorNewType(kv, qs, defaultType, t.Name, t.Shape, ftype) + if newType != defaultType { + slog.Debug("tensor quantization adjusted for better quality", "name", t.Name, "requested", defaultType, "quantization", newType) + } + } + return newType +} diff --git a/server/quantization_test.go b/server/quantization_test.go new file mode 100644 index 000000000..b7e133507 --- /dev/null +++ b/server/quantization_test.go @@ -0,0 +1,882 @@ +package server + +import ( + "bytes" + "fmt" + "math" + "os" + "strings" + "testing" + + fsggml "github.com/ollama/ollama/fs/ggml" + "github.com/ollama/ollama/ml/backend/ggml" +) + +func TestGetTensorNewType(t *testing.T) { + cases := []struct { + name string + kv map[string]any + qs quantizeState + newType fsggml.TensorType + tensor_name string + shape []uint64 + ftype fsggml.FileType + expected fsggml.TensorType + expectedPanic string + }{ + { + name: "output_unsupported", + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "output.weight", + shape: []uint64{100, 100}, + ftype: fsggml.FileTypeF32, + expected: fsggml.TensorTypeF16, + }, + { + name: "output_Q8", + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "output.weight", + shape: []uint64{1024, 1024}, + ftype: fsggml.FileTypeF32, + expected: fsggml.TensorTypeQ6_K, + }, + { + name: "attn_v.weight_q4_k", + kv: map[string]any{ + "general.architecture": "foo", + "foo.attention.head_count": uint32(4), + "foo.attention.head_count_kv": uint32(1), + }, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "blk.0.attn_v.weight", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ2_K, + expected: fsggml.TensorTypeQ4_K, + }, + { + name: "attn_v.weight_q3_k", + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "blk.0.attn_v.weight", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ2_K, + expected: fsggml.TensorTypeQ3_K, + }, + { + name: "attn_v.weight_q2_k_s_q4_k", + kv: map[string]any{ + "general.architecture": "foo", + "foo.attention.head_count": uint32(4), + "foo.attention.head_count_kv": uint32(1), + }, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "blk.0.attn_v.weight", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ2_K_S, + expected: fsggml.TensorTypeQ4_K, + }, + { + name: "attn_v.weight_q3_k_m", + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "blk.0.attn_v.weight", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ3_K_M, + expected: fsggml.TensorTypeQ5_K, + }, + { + name: "attn_v.weight_q3_k_m_i", + qs: quantizeState{ + iAttnV: 2, + }, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "blk.0.attn_v.weight", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ3_K_M, + expected: fsggml.TensorTypeQ4_K, + }, + { + name: "attn_v.weight_q3_k_l", + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "blk.0.attn_v.weight", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ3_K_L, + expected: fsggml.TensorTypeQ5_K, + }, + { + name: 
"attn_v.weight_q4_k_m", + qs: quantizeState{ + iAttnV: 2, + nAttnV: 3 * 8, + }, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "blk.0.attn_v.weight", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ4_K_M, + expected: fsggml.TensorTypeQ6_K, + }, + { + name: "attn_v.weight_q4_k_s", + qs: quantizeState{}, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "blk.0.attn_v.weight", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ4_K_S, + expected: fsggml.TensorTypeQ5_K, + }, + { + name: "attn_v.weight_8_expert", + qs: quantizeState{}, + kv: map[string]any{ + "general.architecture": "foo", + "foo.expert_count": uint32(8), + }, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "blk.0.attn_v.weight", + shape: []uint64{256}, + ftype: fsggml.FileTypeF32, + expected: fsggml.TensorTypeQ8_0, + }, + { + name: "attn_k.weight_8_expert", + qs: quantizeState{}, + kv: map[string]any{ + "general.architecture": "foo", + "foo.expert_count": uint32(8), + }, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "blk.0.attn_k.weight", + shape: []uint64{256}, + ftype: fsggml.FileTypeF32, + expected: fsggml.TensorTypeQ8_0, + }, + { + name: "ffn_down_q2_k", + qs: quantizeState{}, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "ffn_down", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ2_K, + expected: fsggml.TensorTypeQ3_K, + }, + { + name: "ffn_down_q2_k_s", + qs: quantizeState{}, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "ffn_down", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ2_K_S, + expected: fsggml.TensorTypeQ4_0, + }, + { + name: "ffn_down_q2_k_s_layers", + qs: quantizeState{ + iFfnDown: 2, + nFfnDown: 3 * 8, + }, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "ffn_down", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ2_K_S, + expected: fsggml.TensorTypeQ4_K, + }, + { + name: "ffn_down_q3_k_m_base", + qs: quantizeState{ + iFfnDown: 1, + nFfnDown: 8, + }, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "ffn_down", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ3_K_M, + expected: fsggml.TensorTypeQ3_K, + }, + { + name: "ffn_down_q3_k_m_16", + qs: quantizeState{ + iFfnDown: 2, + nFfnDown: 3 * 16, + }, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "ffn_down", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ3_K_M, + expected: fsggml.TensorTypeQ5_K, + }, + { + name: "ffn_down_q3_k_m_8", + qs: quantizeState{ + iFfnDown: 2, + nFfnDown: 3 * 8, + }, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "ffn_down", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ3_K_M, + expected: fsggml.TensorTypeQ4_K, + }, + { + name: "ffn_down_q3_k_l", + qs: quantizeState{}, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "ffn_down", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ3_K_L, + expected: fsggml.TensorTypeQ5_K, + }, + { + name: "ffn_down_q4_k_m", + qs: quantizeState{ + iFfnDown: 1, + nFfnDown: 8, + }, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "ffn_down", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ4_K_M, + expected: fsggml.TensorTypeQ4_0, + }, + { + name: "ffn_down_q4_k_m_6", + qs: quantizeState{ + iFfnDown: 2, + nFfnDown: 3 * 8, + }, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "ffn_down", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ4_K_M, + expected: fsggml.TensorTypeQ6_K, + }, + { + name: "ffn_down_q5_k_m", + qs: 
quantizeState{ + iFfnDown: 2, + nFfnDown: 3 * 8, + }, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "ffn_down", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ5_K_M, + expected: fsggml.TensorTypeQ6_K, + }, + { + name: "ffn_down_q4_k_s", + qs: quantizeState{ + iFfnDown: 2, + nFfnDown: 3 * 8, + }, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "ffn_down", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ4_K_S, + expected: fsggml.TensorTypeQ5_K, + }, + { + name: "attn_output.weight_8_expert", + qs: quantizeState{}, + kv: map[string]any{ + "general.architecture": "foo", + "foo.expert_count": uint32(8), + }, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "blk.0.attn_output.weight", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ2_K, + expected: fsggml.TensorTypeQ5_K, + }, + { + name: "attn_output.weight_q2", + qs: quantizeState{}, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "blk.0.attn_output.weight", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ2_K, + expected: fsggml.TensorTypeQ3_K, + }, + { + name: "attn_output.weight_q3_k_m", + qs: quantizeState{}, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "blk.0.attn_output.weight", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ3_K_M, + expected: fsggml.TensorTypeQ4_K, + }, + { + name: "attn_output.weight_q3_k_l", + qs: quantizeState{}, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "blk.0.attn_output.weight", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ3_K_L, + expected: fsggml.TensorTypeQ5_K, + }, + { + name: "attn_qkv.weight_q3_k_m", + qs: quantizeState{}, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "blk.0.attn_qkv.weight", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ3_K_M, + expected: fsggml.TensorTypeQ4_K, + }, + { + name: "attn_qkv.weight_q4_k_m", + qs: quantizeState{}, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "blk.0.attn_qkv.weight", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ4_K_M, + expected: fsggml.TensorTypeQ5_K, + }, + { + name: "attn_qkv.weight_q5_k_m", + qs: quantizeState{}, + kv: map[string]any{}, + newType: fsggml.TensorTypeQ4_0, + tensor_name: "blk.0.attn_qkv.weight", + shape: []uint64{256}, + ftype: fsggml.FileTypeQ5_K_M, + expected: fsggml.TensorTypeQ6_K, + }, + } + for _, tt := range cases { + t.Run(tt.name, func(t *testing.T) { + if tt.expectedPanic != "" { + defer func() { + e := recover() + if !strings.Contains(fmt.Sprintf("%v", e), tt.expectedPanic) { + t.Fatalf("incorrect panic\ngot: %v\nexpected: %s", e, tt.expectedPanic) + } + }() + } else { + defer func() { + e := recover() + if e != nil { + t.Fatalf("hit unexpected panic %v", e) + } + }() + } + ret := getTensorNewType(tt.kv, &tt.qs, tt.newType, tt.tensor_name, tt.shape, tt.ftype) + if ret != tt.expected { + t.Fatalf("incorrect type returned\ngot: %d\nexpected: %d", ret, tt.expected) + } + }) + } +} + +func TestQuantizeModel(t *testing.T) { + cases := []struct { + name string + kv map[string]any + tensors []*fsggml.Tensor + newType string + expectedTensorTypes map[string]fsggml.TensorType + }{ + { + name: "f16_q4_k", + kv: map[string]any{ + "general.architecture": "foo", + }, + tensors: []*fsggml.Tensor{ + { + Name: "blk.0.attn.weight", Kind: uint32(fsggml.TensorTypeF16), + Offset: uint64(0), Shape: []uint64{512, 2}, + WriterTo: bytes.NewReader( + append(append(append(quantBytes[fsggml.TensorTypeF16], quantBytes[fsggml.TensorTypeF16]...), 
quantBytes[fsggml.TensorTypeF16]...), quantBytes[fsggml.TensorTypeF16]...), + ), + }, + { + Name: "output.weight", Kind: uint32(fsggml.TensorTypeF16), + Offset: uint64(0), Shape: []uint64{256, 4}, + WriterTo: bytes.NewReader( + append(append(append(quantBytes[fsggml.TensorTypeF16], quantBytes[fsggml.TensorTypeF16]...), quantBytes[fsggml.TensorTypeF16]...), quantBytes[fsggml.TensorTypeF16]...), + ), + }, + }, + newType: "Q4_K", + expectedTensorTypes: map[string]fsggml.TensorType{ + "blk.0.attn.weight": fsggml.TensorTypeQ4_K, + "output.weight": fsggml.TensorTypeQ6_K, + }, + }, + { + name: "f32_q4_k", + kv: map[string]any{ + "general.architecture": "foo", + }, + tensors: []*fsggml.Tensor{ + { + Name: "blk.0.attn_v.weight", Kind: uint32(fsggml.TensorTypeF32), + Offset: uint64(0), Shape: []uint64{512, 2}, + WriterTo: bytes.NewReader( + append(append(append(quantBytes[fsggml.TensorTypeF32], quantBytes[fsggml.TensorTypeF32]...), quantBytes[fsggml.TensorTypeF32]...), quantBytes[fsggml.TensorTypeF32]...), + ), + }, + { + Name: "output.weight", Kind: uint32(fsggml.TensorTypeF32), + Offset: uint64(0), Shape: []uint64{512}, + WriterTo: bytes.NewReader(append(quantBytes[fsggml.TensorTypeF32], quantBytes[fsggml.TensorTypeF32]...)), + }, + }, + newType: "Q4_K", + expectedTensorTypes: map[string]fsggml.TensorType{ + "blk.0.attn_v.weight": fsggml.TensorTypeQ6_K, + "output.weight": fsggml.TensorTypeF32, + }, + }, + { + name: "f16_q8_0", + kv: map[string]any{ + "general.architecture": "foo", + }, + tensors: []*fsggml.Tensor{ + { + Name: "blk.0.attn.weight", Kind: uint32(fsggml.TensorTypeF16), + Offset: uint64(0), Shape: []uint64{32, 16, 2}, + WriterTo: bytes.NewReader( + append(append(append(quantBytes[fsggml.TensorTypeF16], quantBytes[fsggml.TensorTypeF16]...), quantBytes[fsggml.TensorTypeF16]...), quantBytes[fsggml.TensorTypeF16]...), + ), + }, + { + Name: "output.weight", Kind: uint32(fsggml.TensorTypeF16), + Offset: uint64(0), Shape: []uint64{256, 4}, + WriterTo: bytes.NewReader( + append(append(append(quantBytes[fsggml.TensorTypeF16], quantBytes[fsggml.TensorTypeF16]...), quantBytes[fsggml.TensorTypeF16]...), quantBytes[fsggml.TensorTypeF16]...), + ), + }, + }, + newType: "Q8_0", + expectedTensorTypes: map[string]fsggml.TensorType{ + "blk.0.attn.weight": fsggml.TensorTypeQ8_0, + "output.weight": fsggml.TensorTypeQ8_0, + }, + }, + } + + for _, tt := range cases { + t.Run(tt.name, func(t *testing.T) { + f, err := os.CreateTemp(t.TempDir(), tt.name) + if err != nil { + t.Fatal(err.Error()) + } + defer f.Close() + err = fsggml.WriteGGUF(f, tt.kv, tt.tensors) + if err != nil { + t.Fatalf("failed to create initial model: %s", err) + } + fp, err := os.Open(f.Name()) + if err != nil { + t.Fatal(err.Error()) + } + defer fp.Close() + meta, _, err := fsggml.Decode(fp, -1) + if err != nil { + t.Fatal(err.Error()) + } + progressCalled := false + progress := func(n uint64) { + // fmt.Fprintf(os.Stderr, "progress: %f\n", p) + progressCalled = true + } + tmp, err := os.CreateTemp(t.TempDir(), tt.name+".out") + if err != nil { + t.Fatal(err.Error()) + } + defer tmp.Close() + ftype, err := fsggml.ParseFileType(tt.newType) + if err != nil { + t.Fatal(err.Error()) + } + + err = quantize(fp, tmp, meta, ftype, progress) + if err != nil { + t.Fatalf("error during quantize: %s", err) + } + if !progressCalled { + t.Fatalf("progress was not reported") + } + // Now attempt to load it back and make sure types match expected + fpNew, err := os.Open(tmp.Name()) + if err != nil { + t.Fatalf("failed to load the quantized model %s: 
%s", tmp.Name(), err) + } + defer fpNew.Close() + newMeta, _, err := fsggml.Decode(fpNew, -1) + if err != nil { + t.Fatalf("failed to load the quantized model %s: %s", tmp.Name(), err) + } + tensors := newMeta.Tensors() + for _, l := range tensors.GroupLayers() { + for _, tensor := range l { + if fsggml.TensorType(tensor.Kind) != tt.expectedTensorTypes[tensor.Name] { + t.Fatalf("incorrect output type for %s\ngot:%s\nexpected:%s", tensor.Name, fsggml.TensorType(tensor.Kind), tt.expectedTensorTypes[tensor.Name]) + } + } + } + }) + } +} + +func TestConvertToF32(t *testing.T) { + expected := make([]float32, 256) + for i := range expected { + expected[i] = float32(i) + } + for dtype, data := range quantBytes { + // Skip the no-op + if dtype == fsggml.TensorTypeF32 { + continue + } + t.Run(dtype.String(), func(t *testing.T) { + fp32 := ggml.ConvertToF32(data, uint32(dtype), 256) + similarity := cosineSimilarity(expected, fp32) + if similarity < 0.999 { + t.Fatalf("Results not similar enough: %s %f", dtype.String(), similarity) + } + }) + } +} + +func dotProduct[V float32 | float64](v1, v2 []V) V { + var result V = 0 + for i := range v1 { + result += v1[i] * v2[i] + } + return result +} + +func magnitude[V float32 | float64](v []V) V { + var result V = 0 + for _, val := range v { + result += val * val + } + return V(math.Sqrt(float64(result))) +} + +func cosineSimilarity[V float32 | float64](v1, v2 []V) V { + return dotProduct(v1, v2) / (magnitude(v1) * magnitude(v2)) +} + +// Precomputed quantized data - arange 256 +// # For gguf-py supported types +// import gguf +// import numpy as np +// print(repr(gguf.quantize(np.arange(256, dtype=np.float16), gguf.GGMLQuantizationType.Q4_0))) +// +// For types not supported by gguf-py converted via ggml_fp32_to_fp16_row and quantize_XXX +// +// data := make([]byte, 256*2) +// fp32 := make([]float32, 256) +// for i := range 256 { +// fp32[i] = float32(i) +// } +// l := C.quantize_q6_K((*C.float)(&fp32[0]), unsafe.Pointer(&data[0]), 1, 256, nil) +// for i := range data[:int(l)] { +// fmt.Printf("%d, ", data[i]) +// } +var ( + quantBytes = map[fsggml.TensorType][]byte{ + fsggml.TensorTypeQ4_0: { + 192, 195, 72, 72, 55, 55, 55, 55, 38, 38, 38, 38, 21, + 21, 21, 21, 4, 4, 224, 199, 36, 36, 36, 36, 19, 19, + 19, 19, 19, 19, 19, 19, 2, 2, 2, 2, 240, 201, 19, + 19, 18, 18, 18, 18, 18, 18, 18, 18, 2, 2, 2, 2, + 1, 1, 240, 203, 18, 18, 18, 18, 18, 18, 18, 18, 1, + 1, 1, 1, 1, 1, 1, 1, 248, 204, 18, 18, 17, 17, + 17, 17, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 248, + 205, 17, 17, 17, 17, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 248, 206, 17, 17, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 248, 207, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, + }, + fsggml.TensorTypeQ4_1: { + 34, 64, 0, 0, 128, 128, 145, 145, 162, 162, 179, 179, 196, + 196, 213, 213, 230, 230, 247, 247, 34, 64, 0, 80, 128, 128, + 145, 145, 162, 162, 179, 179, 196, 196, 213, 213, 230, 230, 247, + 247, 34, 64, 0, 84, 128, 128, 145, 145, 162, 162, 179, 179, + 196, 196, 213, 213, 230, 230, 247, 247, 34, 64, 0, 86, 128, + 128, 145, 145, 162, 162, 179, 179, 196, 196, 213, 213, 230, 230, + 247, 247, 34, 64, 0, 88, 128, 128, 145, 145, 162, 162, 179, + 179, 196, 196, 213, 213, 230, 230, 247, 247, 34, 64, 0, 89, + 128, 128, 145, 145, 162, 162, 179, 179, 196, 196, 213, 213, 230, + 230, 247, 247, 34, 64, 0, 90, 128, 128, 145, 145, 162, 162, + 179, 179, 196, 196, 213, 213, 230, 230, 247, 247, 34, 64, 0, + 91, 128, 128, 145, 145, 162, 162, 179, 179, 196, 196, 213, 213, + 230, 230, 247, 247, + }, + fsggml.TensorTypeQ5_0: 
{ + 192, 191, 1, 0, 0, 0, 128, 127, 127, 110, 110, 93, 93, + 76, 76, 59, 59, 42, 42, 25, 25, 8, 224, 195, 0, 0, + 0, 0, 72, 72, 55, 55, 55, 55, 38, 38, 38, 38, 21, + 21, 21, 21, 4, 4, 240, 197, 0, 0, 0, 0, 53, 37, + 37, 37, 37, 36, 36, 20, 20, 20, 20, 19, 19, 3, 3, + 3, 240, 199, 0, 0, 0, 0, 36, 36, 36, 36, 19, 19, + 19, 19, 19, 19, 19, 19, 2, 2, 2, 2, 248, 200, 0, + 0, 0, 0, 35, 19, 19, 19, 19, 19, 19, 18, 18, 18, + 18, 2, 2, 2, 2, 2, 248, 201, 0, 0, 0, 0, 19, + 19, 18, 18, 18, 18, 18, 18, 18, 18, 2, 2, 2, 2, + 1, 1, 248, 202, 0, 0, 0, 0, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 2, 2, 1, 1, 1, 1, 1, 248, 203, + 0, 0, 0, 0, 18, 18, 18, 18, 18, 18, 18, 18, 1, + 1, 1, 1, 1, 1, 1, 1, + }, + fsggml.TensorTypeQ5_1: { + 0, 60, 0, 0, 0, 0, 255, 255, 0, 17, 34, 51, 68, + 85, 102, 119, 136, 153, 170, 187, 204, 221, 238, 255, 0, 60, + 0, 80, 0, 0, 255, 255, 0, 17, 34, 51, 68, 85, 102, + 119, 136, 153, 170, 187, 204, 221, 238, 255, 0, 60, 0, 84, + 0, 0, 255, 255, 0, 17, 34, 51, 68, 85, 102, 119, 136, + 153, 170, 187, 204, 221, 238, 255, 0, 60, 0, 86, 0, 0, + 255, 255, 0, 17, 34, 51, 68, 85, 102, 119, 136, 153, 170, + 187, 204, 221, 238, 255, 0, 60, 0, 88, 0, 0, 255, 255, + 0, 17, 34, 51, 68, 85, 102, 119, 136, 153, 170, 187, 204, + 221, 238, 255, 0, 60, 0, 89, 0, 0, 255, 255, 0, 17, + 34, 51, 68, 85, 102, 119, 136, 153, 170, 187, 204, 221, 238, + 255, 0, 60, 0, 90, 0, 0, 255, 255, 0, 17, 34, 51, + 68, 85, 102, 119, 136, 153, 170, 187, 204, 221, 238, 255, 0, + 60, 0, 91, 0, 0, 255, 255, 0, 17, 34, 51, 68, 85, + 102, 119, 136, 153, 170, 187, 204, 221, 238, 255, + }, + fsggml.TensorTypeQ8_0: { + 208, 51, 0, 4, 8, 12, 16, 20, 25, 29, 33, 37, 41, + 45, 49, 53, 57, 61, 66, 70, 74, 78, 82, 86, 90, 94, + 98, 102, 107, 111, 115, 119, 123, 127, 240, 55, 65, 67, 69, + 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95, + 97, 99, 101, 103, 105, 107, 109, 111, 113, 115, 117, 119, 121, + 123, 125, 127, 252, 57, 86, 87, 88, 90, 91, 92, 94, 95, + 96, 98, 99, 100, 102, 103, 104, 106, 107, 108, 110, 111, 112, + 114, 115, 116, 118, 119, 120, 122, 123, 124, 126, 127, 0, 60, + 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, + 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, + 122, 123, 124, 125, 126, 127, 2, 61, 102, 103, 104, 105, 105, + 106, 107, 108, 109, 109, 110, 111, 112, 113, 113, 114, 115, 116, + 117, 117, 118, 119, 120, 121, 121, 122, 123, 124, 125, 125, 126, + 127, 4, 62, 106, 107, 108, 108, 109, 110, 110, 111, 112, 112, + 113, 114, 114, 115, 116, 116, 117, 118, 118, 119, 120, 120, 121, + 122, 122, 123, 124, 124, 125, 126, 126, 127, 6, 63, 109, 110, + 110, 111, 112, 112, 113, 113, 114, 114, 115, 116, 116, 117, 117, + 118, 118, 119, 120, 120, 121, 121, 122, 122, 123, 124, 124, 125, + 125, 126, 126, 127, 4, 64, 112, 112, 113, 113, 114, 114, 115, + 115, 116, 116, 117, 117, 118, 118, 119, 119, 120, 120, 121, 121, + 122, 122, 123, 123, 124, 124, 125, 125, 126, 126, 127, 127, + }, + fsggml.TensorTypeBF16: { + 0, 0, 128, 63, 0, 64, 64, 64, 128, 64, 160, 64, 192, + 64, 224, 64, 0, 65, 16, 65, 32, 65, 48, 65, 64, 65, + 80, 65, 96, 65, 112, 65, 128, 65, 136, 65, 144, 65, 152, + 65, 160, 65, 168, 65, 176, 65, 184, 65, 192, 65, 200, 65, + 208, 65, 216, 65, 224, 65, 232, 65, 240, 65, 248, 65, 0, + 66, 4, 66, 8, 66, 12, 66, 16, 66, 20, 66, 24, 66, + 28, 66, 32, 66, 36, 66, 40, 66, 44, 66, 48, 66, 52, + 66, 56, 66, 60, 66, 64, 66, 68, 66, 72, 66, 76, 66, + 80, 66, 84, 66, 88, 66, 92, 66, 96, 66, 100, 66, 104, + 66, 108, 66, 112, 66, 116, 66, 120, 66, 124, 66, 128, 66, + 130, 66, 132, 66, 134, 66, 
136, 66, 138, 66, 140, 66, 142, + 66, 144, 66, 146, 66, 148, 66, 150, 66, 152, 66, 154, 66, + 156, 66, 158, 66, 160, 66, 162, 66, 164, 66, 166, 66, 168, + 66, 170, 66, 172, 66, 174, 66, 176, 66, 178, 66, 180, 66, + 182, 66, 184, 66, 186, 66, 188, 66, 190, 66, 192, 66, 194, + 66, 196, 66, 198, 66, 200, 66, 202, 66, 204, 66, 206, 66, + 208, 66, 210, 66, 212, 66, 214, 66, 216, 66, 218, 66, 220, + 66, 222, 66, 224, 66, 226, 66, 228, 66, 230, 66, 232, 66, + 234, 66, 236, 66, 238, 66, 240, 66, 242, 66, 244, 66, 246, + 66, 248, 66, 250, 66, 252, 66, 254, 66, 0, 67, 1, 67, + 2, 67, 3, 67, 4, 67, 5, 67, 6, 67, 7, 67, 8, + 67, 9, 67, 10, 67, 11, 67, 12, 67, 13, 67, 14, 67, + 15, 67, 16, 67, 17, 67, 18, 67, 19, 67, 20, 67, 21, + 67, 22, 67, 23, 67, 24, 67, 25, 67, 26, 67, 27, 67, + 28, 67, 29, 67, 30, 67, 31, 67, 32, 67, 33, 67, 34, + 67, 35, 67, 36, 67, 37, 67, 38, 67, 39, 67, 40, 67, + 41, 67, 42, 67, 43, 67, 44, 67, 45, 67, 46, 67, 47, + 67, 48, 67, 49, 67, 50, 67, 51, 67, 52, 67, 53, 67, + 54, 67, 55, 67, 56, 67, 57, 67, 58, 67, 59, 67, 60, + 67, 61, 67, 62, 67, 63, 67, 64, 67, 65, 67, 66, 67, + 67, 67, 68, 67, 69, 67, 70, 67, 71, 67, 72, 67, 73, + 67, 74, 67, 75, 67, 76, 67, 77, 67, 78, 67, 79, 67, + 80, 67, 81, 67, 82, 67, 83, 67, 84, 67, 85, 67, 86, + 67, 87, 67, 88, 67, 89, 67, 90, 67, 91, 67, 92, 67, + 93, 67, 94, 67, 95, 67, 96, 67, 97, 67, 98, 67, 99, + 67, 100, 67, 101, 67, 102, 67, 103, 67, 104, 67, 105, 67, + 106, 67, 107, 67, 108, 67, 109, 67, 110, 67, 111, 67, 112, + 67, 113, 67, 114, 67, 115, 67, 116, 67, 117, 67, 118, 67, + 119, 67, 120, 67, 121, 67, 122, 67, 123, 67, 124, 67, 125, + 67, 126, 67, 127, 67, + }, + fsggml.TensorTypeF16: { + 0, 0, 0, 60, 0, 64, 0, 66, 0, 68, 0, 69, 0, 70, 0, 71, 0, + 72, 128, 72, 0, 73, 128, 73, 0, 74, 128, 74, 0, 75, 128, 75, + 0, 76, 64, 76, 128, 76, 192, 76, 0, 77, 64, 77, 128, 77, 192, + 77, 0, 78, 64, 78, 128, 78, 192, 78, 0, 79, 64, 79, 128, 79, + 192, 79, 0, 80, 32, 80, 64, 80, 96, 80, 128, 80, 160, 80, + 192, 80, 224, 80, 0, 81, 32, 81, 64, 81, 96, 81, 128, 81, + 160, 81, 192, 81, 224, 81, 0, 82, 32, 82, 64, 82, 96, 82, + 128, 82, 160, 82, 192, 82, 224, 82, 0, 83, 32, 83, 64, 83, + 96, 83, 128, 83, 160, 83, 192, 83, 224, 83, 0, 84, 16, 84, + 32, 84, 48, 84, 64, 84, 80, 84, 96, 84, 112, 84, 128, 84, + 144, 84, 160, 84, 176, 84, 192, 84, 208, 84, 224, 84, 240, + 84, 0, 85, 16, 85, 32, 85, 48, 85, 64, 85, 80, 85, 96, 85, + 112, 85, 128, 85, 144, 85, 160, 85, 176, 85, 192, 85, 208, + 85, 224, 85, 240, 85, 0, 86, 16, 86, 32, 86, 48, 86, 64, + 86, 80, 86, 96, 86, 112, 86, 128, 86, 144, 86, 160, 86, + 176, 86, 192, 86, 208, 86, 224, 86, 240, 86, 0, 87, 16, + 87, 32, 87, 48, 87, 64, 87, 80, 87, 96, 87, 112, 87, 128, + 87, 144, 87, 160, 87, 176, 87, 192, 87, 208, 87, 224, 87, + 240, 87, 0, 88, 8, 88, 16, 88, 24, 88, 32, 88, 40, 88, + 48, 88, 56, 88, 64, 88, 72, 88, 80, 88, 88, 88, 96, 88, + 104, 88, 112, 88, 120, 88, 128, 88, 136, 88, 144, 88, 152, + 88, 160, 88, 168, 88, 176, 88, 184, 88, 192, 88, 200, 88, + 208, 88, 216, 88, 224, 88, 232, 88, 240, 88, 248, 88, 0, + 89, 8, 89, 16, 89, 24, 89, 32, 89, 40, 89, 48, 89, 56, 89, + 64, 89, 72, 89, 80, 89, 88, 89, 96, 89, 104, 89, 112, 89, + 120, 89, 128, 89, 136, 89, 144, 89, 152, 89, 160, 89, 168, + 89, 176, 89, 184, 89, 192, 89, 200, 89, 208, 89, 216, 89, + 224, 89, 232, 89, 240, 89, 248, 89, 0, 90, 8, 90, 16, 90, + 24, 90, 32, 90, 40, 90, 48, 90, 56, 90, 64, 90, 72, 90, 80, + 90, 88, 90, 96, 90, 104, 90, 112, 90, 120, 90, 128, 90, + 136, 90, 144, 90, 152, 90, 160, 90, 168, 90, 176, 90, 184, + 90, 192, 90, 
200, 90, 208, 90, 216, 90, 224, 90, 232, 90, + 240, 90, 248, 90, 0, 91, 8, 91, 16, 91, 24, 91, 32, 91, 40, + 91, 48, 91, 56, 91, 64, 91, 72, 91, 80, 91, 88, 91, 96, 91, + 104, 91, 112, 91, 120, 91, 128, 91, 136, 91, 144, 91, 152, + 91, 160, 91, 168, 91, 176, 91, 184, 91, 192, 91, 200, 91, + 208, 91, 216, 91, 224, 91, 232, 91, 240, 91, 248, 91, + }, + fsggml.TensorTypeF32: { + 0, 0, 0, 0, 0, 0, 128, 63, 0, 0, 0, 64, 0, 0, 64, 64, 0, 0, 128, + 64, 0, 0, 160, 64, 0, 0, 192, 64, 0, 0, 224, 64, 0, 0, 0, 65, 0, + 0, 16, 65, 0, 0, 32, 65, 0, 0, 48, 65, 0, 0, 64, 65, 0, 0, 80, 65, + 0, 0, 96, 65, 0, 0, 112, 65, 0, 0, 128, 65, 0, 0, 136, 65, 0, 0, + 144, 65, 0, 0, 152, 65, 0, 0, 160, 65, 0, 0, 168, 65, 0, 0, 176, + 65, 0, 0, 184, 65, 0, 0, 192, 65, 0, 0, 200, 65, 0, 0, 208, 65, 0, + 0, 216, 65, 0, 0, 224, 65, 0, 0, 232, 65, 0, 0, 240, 65, 0, 0, 248, + 65, 0, 0, 0, 66, 0, 0, 4, 66, 0, 0, 8, 66, 0, 0, 12, 66, 0, 0, 16, + 66, 0, 0, 20, 66, 0, 0, 24, 66, 0, 0, 28, 66, 0, 0, 32, 66, 0, 0, + 36, 66, 0, 0, 40, 66, 0, 0, 44, 66, 0, 0, 48, 66, 0, 0, 52, 66, 0, + 0, 56, 66, 0, 0, 60, 66, 0, 0, 64, 66, 0, 0, 68, 66, 0, 0, 72, 66, + 0, 0, 76, 66, 0, 0, 80, 66, 0, 0, 84, 66, 0, 0, 88, 66, 0, 0, 92, 66, + 0, 0, 96, 66, 0, 0, 100, 66, 0, 0, 104, 66, 0, 0, 108, 66, 0, 0, 112, + 66, 0, 0, 116, 66, 0, 0, 120, 66, 0, 0, 124, 66, 0, 0, 128, 66, 0, 0, + 130, 66, 0, 0, 132, 66, 0, 0, 134, 66, 0, 0, 136, 66, 0, 0, 138, 66, + 0, 0, 140, 66, 0, 0, 142, 66, 0, 0, 144, 66, 0, 0, 146, 66, 0, 0, 148, + 66, 0, 0, 150, 66, 0, 0, 152, 66, 0, 0, 154, 66, 0, 0, 156, 66, 0, 0, + 158, 66, 0, 0, 160, 66, 0, 0, 162, 66, 0, 0, 164, 66, 0, 0, 166, 66, + 0, 0, 168, 66, 0, 0, 170, 66, 0, 0, 172, 66, 0, 0, 174, 66, 0, 0, 176, + 66, 0, 0, 178, 66, 0, 0, 180, 66, 0, 0, 182, 66, 0, 0, 184, 66, 0, 0, + 186, 66, 0, 0, 188, 66, 0, 0, 190, 66, 0, 0, 192, 66, 0, 0, 194, 66, 0, + 0, 196, 66, 0, 0, 198, 66, 0, 0, 200, 66, 0, 0, 202, 66, 0, 0, 204, 66, + 0, 0, 206, 66, 0, 0, 208, 66, 0, 0, 210, 66, 0, 0, 212, 66, 0, 0, 214, 66, + 0, 0, 216, 66, 0, 0, 218, 66, 0, 0, 220, 66, 0, 0, 222, 66, 0, 0, 224, 66, + 0, 0, 226, 66, 0, 0, 228, 66, 0, 0, 230, 66, 0, 0, 232, 66, 0, 0, 234, 66, + 0, 0, 236, 66, 0, 0, 238, 66, 0, 0, 240, 66, 0, 0, 242, 66, 0, 0, 244, 66, + 0, 0, 246, 66, 0, 0, 248, 66, 0, 0, 250, 66, 0, 0, 252, 66, 0, 0, 254, 66, + 0, 0, 0, 67, 0, 0, 1, 67, 0, 0, 2, 67, 0, 0, 3, 67, 0, 0, 4, 67, 0, 0, 5, 67, + 0, 0, 6, 67, 0, 0, 7, 67, 0, 0, 8, 67, 0, 0, 9, 67, 0, 0, 10, 67, 0, 0, 11, + 67, 0, 0, 12, 67, 0, 0, 13, 67, 0, 0, 14, 67, 0, 0, 15, 67, 0, 0, 16, 67, + 0, 0, 17, 67, 0, 0, 18, 67, 0, 0, 19, 67, 0, 0, 20, 67, 0, 0, 21, 67, 0, 0, + 22, 67, 0, 0, 23, 67, 0, 0, 24, 67, 0, 0, 25, 67, 0, 0, 26, 67, 0, 0, 27, + 67, 0, 0, 28, 67, 0, 0, 29, 67, 0, 0, 30, 67, 0, 0, 31, 67, 0, 0, 32, 67, + 0, 0, 33, 67, 0, 0, 34, 67, 0, 0, 35, 67, 0, 0, 36, 67, 0, 0, 37, 67, 0, 0, + 38, 67, 0, 0, 39, 67, 0, 0, 40, 67, 0, 0, 41, 67, 0, 0, 42, 67, 0, 0, 43, 67, + 0, 0, 44, 67, 0, 0, 45, 67, 0, 0, 46, 67, 0, 0, 47, 67, 0, 0, 48, 67, 0, 0, + 49, 67, 0, 0, 50, 67, 0, 0, 51, 67, 0, 0, 52, 67, 0, 0, 53, 67, 0, 0, 54, 67, + 0, 0, 55, 67, 0, 0, 56, 67, 0, 0, 57, 67, 0, 0, 58, 67, 0, 0, 59, 67, 0, 0, + 60, 67, 0, 0, 61, 67, 0, 0, 62, 67, 0, 0, 63, 67, 0, 0, 64, 67, 0, 0, 65, 67, + 0, 0, 66, 67, 0, 0, 67, 67, 0, 0, 68, 67, 0, 0, 69, 67, 0, 0, 70, 67, 0, 0, 71, + 67, 0, 0, 72, 67, 0, 0, 73, 67, 0, 0, 74, 67, 0, 0, 75, 67, 0, 0, 76, 67, 0, + 0, 77, 67, 0, 0, 78, 67, 0, 0, 79, 67, 0, 0, 80, 67, 0, 0, 81, 67, 0, 0, 82, + 67, 0, 0, 83, 67, 0, 0, 84, 67, 0, 0, 85, 67, 0, 0, 86, 67, 0, 0, 
87, 67, 0, + 0, 88, 67, 0, 0, 89, 67, 0, 0, 90, 67, 0, 0, 91, 67, 0, 0, 92, 67, 0, 0, 93, + 67, 0, 0, 94, 67, 0, 0, 95, 67, 0, 0, 96, 67, 0, 0, 97, 67, 0, 0, 98, 67, 0, + 0, 99, 67, 0, 0, 100, 67, 0, 0, 101, 67, 0, 0, 102, 67, 0, 0, 103, 67, 0, 0, + 104, 67, 0, 0, 105, 67, 0, 0, 106, 67, 0, 0, 107, 67, 0, 0, 108, 67, 0, 0, 109, + 67, 0, 0, 110, 67, 0, 0, 111, 67, 0, 0, 112, 67, 0, 0, 113, 67, 0, 0, 114, 67, + 0, 0, 115, 67, 0, 0, 116, 67, 0, 0, 117, 67, 0, 0, 118, 67, 0, 0, 119, 67, 0, + 0, 120, 67, 0, 0, 121, 67, 0, 0, 122, 67, 0, 0, 123, 67, 0, 0, 124, 67, 0, 0, + 125, 67, 0, 0, 126, 67, 0, 0, 127, 67, + }, + fsggml.TensorTypeQ4_K: { + 52, 52, 0, 0, 136, 208, 216, 223, 0, 0, 0, 0, 8, 0, 8, 15, 128, + 128, 129, 129, 146, 146, 147, 147, 164, 164, 165, 165, 166, 182, + 183, 183, 184, 200, 201, 201, 202, 218, 218, 219, 219, 236, 236, + 237, 237, 254, 254, 255, 202, 202, 202, 203, 203, 203, 219, 219, + 219, 220, 220, 220, 220, 220, 236, 237, 237, 237, 237, 237, + 237, 237, 238, 254, 254, 254, 254, 254, 255, 255, 255, 255, 220, + 220, 220, 220, 221, 221, 221, 221, 221, 221, 221, 237, 237, 237, + 238, 238, 238, 238, 238, 238, 238, 238, 238, 254, 254, 255, 255, + 255, 255, 255, 255, 255, 237, 237, 237, 237, 237, 237, 237, 238, + 238, 238, 238, 238, 238, 238, 238, 238, 254, 254, 254, 254, 254, + 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + }, + fsggml.TensorTypeQ2_K: { + 1, 2, 3, 3, 4, 5, 7, 7, 8, 9, 10, 11, 12, 13, 14, 15, 184, 184, + 184, 185, 249, 249, 249, 249, 249, 250, 250, 254, 254, 254, 254, + 255, 253, 253, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, + 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 171, 69, 0, 0, + }, + fsggml.TensorTypeQ5_K: { + 32, 48, 0, 0, 136, 208, 216, 223, 0, 0, 0, 0, 8, 0, 7, 15, 254, + 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, + 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 0, 1, 2, 19, 20, 37, 38, 55, 56, 73, 74, + 91, 92, 109, 110, 127, 112, 128, 129, 146, 147, 164, 165, 182, 183, + 200, 201, 218, 219, 236, 237, 254, 133, 133, 149, 150, 150, 150, + 167, 167, 167, 168, 184, 184, 185, 185, 201, 202, 202, 202, 219, + 219, 219, 219, 236, 236, 236, 237, 253, 253, 254, 254, 254, 255, + 169, 169, 169, 169, 186, 186, 186, 186, 186, 187, 187, 203, 203, + 203, 204, 204, 204, 220, 220, 221, 221, 221, 221, 237, 237, 238, + 238, 238, 238, 254, 255, 255, 203, 203, 203, 204, 204, 204, 204, + 204, 220, 220, 220, 221, 221, 221, 221, 221, 237, 237, 238, 238, + 238, 238, 238, 238, 254, 255, 255, 255, 255, 255, 255, 255, + }, + fsggml.TensorTypeQ6_K: { + 96, 110, 92, 90, 88, 70, 68, 50, 48, 46, 44, 42, 24, 22, 4, 2, 80, + 95, 78, 77, 76, 59, 58, 57, 40, 39, 38, 21, 20, 19, 2, 1, 75, 75, + 74, 57, 57, 56, 55, 39, 38, 37, 21, 20, 20, 19, 2, 2, 72, 55, 55, + 54, 54, 37, 37, 36, 36, 19, 19, 18, 18, 1, 1, 0, 35, 35, 35, 35, + 34, 18, 18, 18, 17, 17, 17, 1, 1, 0, 0, 0, 35, 35, 34, 34, 18, + 18, 18, 17, 17, 17, 17, 1, 0, 0, 0, 0, 35, 35, 35, 19, 19, 18, 18, + 18, 18, 18, 1, 1, 1, 1, 1, 1, 34, 34, 18, 18, 18, 18, 17, 17, 17, + 17, 1, 1, 0, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, + 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 248, 240, 231, 224, 216, 208, 200, 192, 184, 176, + 166, 160, 152, 144, 136, 128, 235, 43, + }, + fsggml.TensorTypeQ3_K: { + 1, 
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 20, 20, 23, 23, 7, 7, 6, 6, 6, 2, + 1, 1, 1, 1, 0, 0, 22, 22, 6, 6, 5, 5, 5, 1, 1, 1, 1, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 238, 204, 170, 136, 102, 68, + 34, 1, 5, 5, 5, 5, 189, 63, + }, + } +) diff --git a/server/routes.go b/server/routes.go index 31acd0d1a..8886073cf 100644 --- a/server/routes.go +++ b/server/routes.go @@ -18,6 +18,7 @@ import ( "os" "os/signal" "path/filepath" + "regexp" "slices" "strings" "syscall" @@ -1169,6 +1170,7 @@ func (s *Server) GenerateRoutes(rc *ollama.Registry) (http.Handler, error) { corsConfig.AllowOrigins = envconfig.AllowedOrigins() r := gin.Default() + r.HandleMethodNotAllowed = true r.Use( cors.New(corsConfig), allowedHostsMiddleware(s.addr), @@ -1512,6 +1514,7 @@ func (s *Server) ChatHandler(c *gin.Context) { if req.Messages[0].Role != "system" && m.System != "" { msgs = append([]api.Message{{Role: "system", Content: m.System}}, msgs...) } + msgs = filterThinkTags(msgs, m) prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, msgs, req.Tools) if err != nil { @@ -1640,3 +1643,23 @@ func handleScheduleError(c *gin.Context, name string, err error) { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) } } + +var thinkTagRegexp = regexp.MustCompile(`(?s).*?(\n)*`) + +func filterThinkTags(msgs []api.Message, m *Model) []api.Message { + if m.Config.ModelFamily == "qwen3" || model.ParseName(m.Name).Model == "deepseek-r1" { + finalUserIndex := -1 + for i, msg := range msgs { + if msg.Role == "user" { + finalUserIndex = i + } + } + + for i, msg := range msgs { + if msg.Role == "assistant" && i < finalUserIndex { + msgs[i].Content = thinkTagRegexp.ReplaceAllString(msg.Content, "") + } + } + } + return msgs +} diff --git a/server/routes_create_test.go b/server/routes_create_test.go index 466dc04f1..3b3d99100 100644 --- a/server/routes_create_test.go +++ b/server/routes_create_test.go @@ -24,7 +24,7 @@ import ( var stream bool = false -func createBinFile(t *testing.T, kv map[string]any, ti []ggml.Tensor) (string, string) { +func createBinFile(t *testing.T, kv map[string]any, ti []*ggml.Tensor) (string, string) { t.Helper() t.Setenv("OLLAMA_MODELS", cmp.Or(os.Getenv("OLLAMA_MODELS"), t.TempDir())) diff --git a/server/routes_generate_test.go b/server/routes_generate_test.go index dd77b574a..6bbf5b112 100644 --- a/server/routes_generate_test.go +++ b/server/routes_generate_test.go @@ -87,7 +87,7 @@ func TestGenerateChat(t *testing.T) { }, } - go s.sched.Run(context.TODO()) + go s.sched.Run(t.Context()) _, digest := createBinFile(t, ggml.KV{ "general.architecture": "llama", @@ -99,7 +99,7 @@ func TestGenerateChat(t *testing.T) { "tokenizer.ggml.tokens": []string{""}, "tokenizer.ggml.scores": []float32{0}, "tokenizer.ggml.token_type": []int32{0}, - }, []ggml.Tensor{ + }, []*ggml.Tensor{ {Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, {Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, {Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, @@ -158,7 +158,7 @@ func TestGenerateChat(t *testing.T) { _, digest := createBinFile(t, ggml.KV{ "general.architecture": "bert", "bert.pooling_type": uint32(0), - }, []ggml.Tensor{}) + }, []*ggml.Tensor{}) w := createRequest(t, s.CreateHandler, api.CreateRequest{ Model: "bert", Files: 
map[string]string{"bert.gguf": digest}, @@ -299,9 +299,6 @@ func TestGenerateChat(t *testing.T) { {Role: "user", Content: "Hello!"}, }, Stream: &stream, - Options: map[string]any{ - "num_ctx": 1024, - }, }) if w.Code != http.StatusOK { @@ -324,9 +321,6 @@ func TestGenerateChat(t *testing.T) { {Role: "user", Content: "Hello!"}, }, Stream: &stream, - Options: map[string]any{ - "num_ctx": 1024, - }, }) if w.Code != http.StatusOK { @@ -350,9 +344,6 @@ func TestGenerateChat(t *testing.T) { {Role: "user", Content: "Help me write tests."}, }, Stream: &stream, - Options: map[string]any{ - "num_ctx": 1024, - }, }) if w.Code != http.StatusOK { @@ -640,7 +631,7 @@ func TestGenerate(t *testing.T) { }, } - go s.sched.Run(context.TODO()) + go s.sched.Run(t.Context()) _, digest := createBinFile(t, ggml.KV{ "general.architecture": "llama", @@ -652,7 +643,7 @@ func TestGenerate(t *testing.T) { "tokenizer.ggml.tokens": []string{""}, "tokenizer.ggml.scores": []float32{0}, "tokenizer.ggml.token_type": []int32{0}, - }, []ggml.Tensor{ + }, []*ggml.Tensor{ {Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, {Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, {Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, @@ -707,7 +698,7 @@ func TestGenerate(t *testing.T) { _, digest := createBinFile(t, ggml.KV{ "general.architecture": "bert", "bert.pooling_type": uint32(0), - }, []ggml.Tensor{}) + }, []*ggml.Tensor{}) w := createRequest(t, s.CreateHandler, api.CreateRequest{ Model: "bert", diff --git a/server/routes_test.go b/server/routes_test.go index e13c4b599..fd63b78be 100644 --- a/server/routes_test.go +++ b/server/routes_test.go @@ -15,6 +15,7 @@ import ( "net/http/httptest" "os" "path/filepath" + "reflect" "sort" "strings" "testing" @@ -473,14 +474,24 @@ func TestRoutes(t *testing.T) { t.Fatalf("failed to read response body: %v", err) } - var retrieveResp api.RetrieveModelResponse - err = json.Unmarshal(body, &retrieveResp) + var m openai.Model + err = json.Unmarshal(body, &m) if err != nil { t.Fatalf("failed to unmarshal response body: %v", err) } - if retrieveResp.Id != "show-model" || retrieveResp.OwnedBy != "library" { - t.Errorf("expected model 'show-model' owned by 'library', got %v", retrieveResp) + if m.Id != "show-model" || m.OwnedBy != "library" { + t.Errorf("expected model 'show-model' owned by 'library', got %v", m) + } + }, + }, + { + Name: "Method Not Allowed", + Method: http.MethodGet, + Path: "/api/show", + Expected: func(t *testing.T, resp *http.Response) { + if resp.StatusCode != 405 { + t.Errorf("expected status code 405, got %d", resp.StatusCode) } }, }, @@ -516,7 +527,7 @@ func TestRoutes(t *testing.T) { for _, tc := range testCases { t.Run(tc.Name, func(t *testing.T) { u := httpSrv.URL + tc.Path - req, err := http.NewRequestWithContext(context.TODO(), tc.Method, u, nil) + req, err := http.NewRequestWithContext(t.Context(), tc.Method, u, nil) if err != nil { t.Fatalf("failed to create request: %v", err) } @@ -746,3 +757,128 @@ func TestNormalize(t *testing.T) { }) } } + +func TestFilterThinkTags(t *testing.T) { + type testCase struct { + msgs []api.Message + want []api.Message + model *Model + } + testCases := []testCase{ + { + msgs: []api.Message{ + {Role: "user", Content: "Hello, world!"}, + {Role: "assistant", Content: "Thinking... 
about the answerabc"}, + {Role: "user", Content: "What is the answer?"}, + }, + want: []api.Message{ + {Role: "user", Content: "Hello, world!"}, + {Role: "assistant", Content: "abc"}, + {Role: "user", Content: "What is the answer?"}, + }, + model: &Model{ + Config: ConfigV2{ + ModelFamily: "qwen3", + }, + }, + }, + // with newlines inside the think tag aned newlines after + { + msgs: []api.Message{ + {Role: "user", Content: "Hello, world!"}, + {Role: "assistant", Content: "Thinking... \n\nabout \nthe answer\n\nabc\ndef"}, + {Role: "user", Content: "What is the answer?"}, + }, + want: []api.Message{ + {Role: "user", Content: "Hello, world!"}, + {Role: "assistant", Content: "abc\ndef"}, + {Role: "user", Content: "What is the answer?"}, + }, + model: &Model{ + Config: ConfigV2{ + ModelFamily: "qwen3", + }, + }, + }, + // should leave thinking tags if it's after the last user message + { + msgs: []api.Message{ + {Role: "user", Content: "Hello, world!"}, + {Role: "assistant", Content: "Thinking...after"}, + {Role: "user", Content: "What is the answer?"}, + {Role: "assistant", Content: "thinking againhjk"}, + {Role: "assistant", Content: "thinking yet againhjk"}, + }, + want: []api.Message{ + {Role: "user", Content: "Hello, world!"}, + {Role: "assistant", Content: "after"}, + {Role: "user", Content: "What is the answer?"}, + {Role: "assistant", Content: "thinking againhjk"}, + {Role: "assistant", Content: "thinking yet againhjk"}, + }, + model: &Model{ + Config: ConfigV2{ + ModelFamily: "qwen3", + }, + }, + }, + { + // shouldn't strip anything because the model family isn't one of the hardcoded ones + msgs: []api.Message{ + {Role: "user", Content: "Hello, world!"}, + {Role: "assistant", Content: "Thinking... about the answerabc"}, + {Role: "user", Content: "What is the answer?"}, + }, + want: []api.Message{ + {Role: "user", Content: "Hello, world!"}, + {Role: "assistant", Content: "Thinking... about the answerabc"}, + {Role: "user", Content: "What is the answer?"}, + }, + model: &Model{ + Config: ConfigV2{ + ModelFamily: "llama3", + }, + }, + }, + { + // deepseek-r1:-prefixed model + msgs: []api.Message{ + {Role: "user", Content: "Hello, world!"}, + {Role: "assistant", Content: "Thinking... 
about the answerabc"}, + {Role: "user", Content: "What is the answer?"}, + }, + want: []api.Message{ + {Role: "user", Content: "Hello, world!"}, + {Role: "assistant", Content: "abc"}, + {Role: "user", Content: "What is the answer?"}, + }, + model: &Model{ + Name: "registry.ollama.ai/library/deepseek-r1:latest", + ShortName: "deepseek-r1:7b", + Config: ConfigV2{}, + }, + }, + } + + for i, tc := range testCases { + filtered := filterThinkTags(tc.msgs, tc.model) + + if !reflect.DeepEqual(filtered, tc.want) { + t.Errorf("messages differ for case %d:", i) + for i := range tc.want { + if i >= len(filtered) { + t.Errorf(" missing message %d: %+v", i, tc.want[i]) + continue + } + if !reflect.DeepEqual(filtered[i], tc.want[i]) { + t.Errorf(" message %d:\n want: %+v\n got: %+v", i, tc.want[i], filtered[i]) + } + } + if len(filtered) > len(tc.want) { + for i := len(tc.want); i < len(filtered); i++ { + t.Errorf(" extra message %d: %+v", i, filtered[i]) + } + } + } + } +} diff --git a/server/sched.go b/server/sched.go index 9c13f6cf5..1a9947677 100644 --- a/server/sched.go +++ b/server/sched.go @@ -81,6 +81,10 @@ func InitScheduler(ctx context.Context) *Scheduler { // context must be canceled to decrement ref count and release the runner func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration *api.Duration) (chan *runnerRef, chan error) { + if opts.NumCtx < 4 { + opts.NumCtx = 4 + } + req := &LlmRequest{ ctx: c, model: model, @@ -110,11 +114,6 @@ func (s *Scheduler) Run(ctx context.Context) { }() } -const ( - defaultContextLength = 4096 - smallGpuContextLength = 2048 -) - func (s *Scheduler) processPending(ctx context.Context) { for { select { @@ -148,6 +147,7 @@ func (s *Scheduler) processPending(ctx context.Context) { s.loadedMu.Unlock() if runner != nil { if runner.needsReload(ctx, pending) { + slog.Debug("reloading", "runner", runner) runnerToExpire = runner } else { // Runner is usable, return it @@ -167,17 +167,6 @@ func (s *Scheduler) processPending(ctx context.Context) { gpus = s.getGpuFn() } - if pending.origNumCtx == -1 { - if len(gpus) == 1 && gpus[0].Library != "cpu" && gpus[0].TotalMemory <= 4096*1024*1024 { - slog.Info("GPU is small, limiting default context window", "num_ctx", smallGpuContextLength) - pending.opts.NumCtx = smallGpuContextLength - pending.origNumCtx = smallGpuContextLength - } else { - pending.opts.NumCtx = defaultContextLength - pending.origNumCtx = defaultContextLength - } - } - if envconfig.MaxRunners() <= 0 { // No user specified MaxRunners, so figure out what automatic setting to use // If all GPUs have reliable free memory reporting, defaultModelsPerGPU * the number of GPUs @@ -294,7 +283,7 @@ func (s *Scheduler) processPending(ctx context.Context) { } // Trigger an expiration to unload once it's done runnerToExpire.refMu.Lock() - slog.Debug("resetting model to expire immediately to make room", "modelPath", runnerToExpire.modelPath, "refCount", runnerToExpire.refCount) + slog.Debug("resetting model to expire immediately to make room", "runner", runnerToExpire, "refCount", runnerToExpire.refCount) if runnerToExpire.expireTimer != nil { runnerToExpire.expireTimer.Stop() runnerToExpire.expireTimer = nil @@ -307,13 +296,13 @@ func (s *Scheduler) processPending(ctx context.Context) { // Wait for the unload to happen // Note: at this point we're queueing up all incoming requests, even if they were for // a different model that's loaded and not scheduled to be removed. 
- slog.Debug("waiting for pending requests to complete and unload to occur", "modelPath", runnerToExpire.modelPath) + slog.Debug("waiting for pending requests to complete and unload to occur", "runner", runnerToExpire) select { case <-ctx.Done(): slog.Debug("shutting down scheduler pending loop") return case <-s.unloadedCh: - slog.Debug("unload completed", "modelPath", runnerToExpire.modelPath) + slog.Debug("unload completed", "runner", runnerToExpire) continue } } @@ -343,16 +332,16 @@ func (s *Scheduler) processCompleted(ctx context.Context) { runner.refCount-- if runner.refCount <= 0 { if runner.sessionDuration <= 0 { - slog.Debug("runner with zero duration has gone idle, expiring to unload", "modelPath", runner.modelPath) + slog.Debug("runner with zero duration has gone idle, expiring to unload", "runner", runner) if runner.expireTimer != nil { runner.expireTimer.Stop() runner.expireTimer = nil } s.expiredCh <- runner } else if runner.expireTimer == nil { - slog.Debug("runner with non-zero duration has gone idle, adding timer", "modelPath", runner.modelPath, "duration", runner.sessionDuration) + slog.Debug("runner with non-zero duration has gone idle, adding timer", "runner", runner, "duration", runner.sessionDuration) runner.expireTimer = time.AfterFunc(runner.sessionDuration, func() { - slog.Debug("timer expired, expiring to unload", "modelPath", runner.modelPath) + slog.Debug("timer expired, expiring to unload", "runner", runner) runner.refMu.Lock() defer runner.refMu.Unlock() if runner.expireTimer != nil { @@ -363,18 +352,18 @@ func (s *Scheduler) processCompleted(ctx context.Context) { }) runner.expiresAt = time.Now().Add(runner.sessionDuration) } else { - slog.Debug("runner with non-zero duration has gone idle, resetting timer", "modelPath", runner.modelPath, "duration", runner.sessionDuration) + slog.Debug("runner with non-zero duration has gone idle, resetting timer", "runner", runner, "duration", runner.sessionDuration) runner.expireTimer.Reset(runner.sessionDuration) runner.expiresAt = time.Now().Add(runner.sessionDuration) } } - slog.Debug("after processing request finished event", "modelPath", runner.modelPath, "refCount", runner.refCount) + slog.Debug("after processing request finished event", "runner", runner, "refCount", runner.refCount) runner.refMu.Unlock() case runner := <-s.expiredCh: - slog.Debug("runner expired event received", "modelPath", runner.modelPath) + slog.Debug("runner expired event received", "runner", runner) runner.refMu.Lock() if runner.refCount > 0 { - slog.Debug("expired event with positive ref count, retrying", "modelPath", runner.modelPath, "refCount", runner.refCount) + slog.Debug("expired event with positive ref count, retrying", "runner", runner, "refCount", runner.refCount) go func(runner *runnerRef) { // We can't unload yet, but want to as soon as the current request completes // So queue up another expired event @@ -386,17 +375,29 @@ func (s *Scheduler) processCompleted(ctx context.Context) { } s.loadedMu.Lock() - slog.Debug("got lock to unload", "modelPath", runner.modelPath) - finished := runner.waitForVRAMRecovery() - runner.unload() - delete(s.loaded, runner.modelPath) - s.loadedMu.Unlock() - slog.Debug("runner released", "modelPath", runner.modelPath) - runner.refMu.Unlock() - - <-finished - slog.Debug("sending an unloaded event", "modelPath", runner.modelPath) - s.unloadedCh <- struct{}{} + slog.Debug("got lock to unload expired event", "runner", runner) + runnerToUnload := s.loaded[runner.modelPath] + if runnerToUnload == nil { + // 
If runnerToUnload is nil, we already processed an event and + // unloaded it. This double unload can happen if the initial + // request is canceled and we're trying to load another model + // that requires this one to be evicted, or the settings change + // and require a reload + s.loadedMu.Unlock() + runner.refMu.Unlock() + slog.Debug("duplicate expired event, ignoring", "runner", runner) + } else { + slog.Debug("starting background wait for VRAM recovery", "runner", runner) + finished := runner.waitForVRAMRecovery() + runner.unload() + delete(s.loaded, runner.modelPath) + s.loadedMu.Unlock() + slog.Debug("runner terminated and removed from list, blocking for VRAM recovery", "runner", runner) + <-finished + runner.refMu.Unlock() + slog.Debug("sending an unloaded event", "runner", runner) + s.unloadedCh <- struct{}{} + } } } } @@ -418,7 +419,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm pending.successCh <- runner go func() { <-pending.ctx.Done() - slog.Debug("context for request finished") + slog.Debug("context for request finished", "runner", runner) finished <- pending }() } @@ -453,12 +454,19 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis estimatedVRAM: llama.EstimatedVRAM(), estimatedTotal: llama.EstimatedTotal(), loading: true, - refCount: 1, + pid: llama.Pid(), } runner.numParallel = numParallel - runner.refMu.Lock() + runner.refMu.Lock() // hold lock until running or aborted s.loadedMu.Lock() + if oldRunner, ok := s.loaded[req.model.ModelPath]; ok { + // Shouldn't happen, but safeguard against leaking a runner + slog.Warn("model was still loaded", "old_runner", oldRunner, "new_runner", runner) + oldRunner.refMu.Lock() + oldRunner.unload() + oldRunner.refMu.Unlock() + } s.loaded[req.model.ModelPath] = runner slog.Info("loaded runners", "count", len(s.loaded)) s.loadedMu.Unlock() @@ -467,13 +475,16 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis defer runner.refMu.Unlock() if err = llama.WaitUntilRunning(req.ctx); err != nil { slog.Error("error loading llama server", "error", err) - runner.refCount-- req.errCh <- err - slog.Debug("triggering expiration for failed load", "model", runner.modelPath) + slog.Debug("triggering expiration for failed load", "runner", runner) s.expiredCh <- runner return } - slog.Debug("finished setting up runner", "model", req.model.ModelPath) + slog.Debug("finished setting up", "runner", runner) + if runner.pid < 0 { + runner.pid = llama.Pid() + } + runner.refCount++ runner.loading = false go func() { <-req.ctx.Done() @@ -491,7 +502,12 @@ func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) { } predMap := map[predKey]uint64{} // Sum up the total predicted usage per GPU for all runners s.loadedMu.Lock() + runners := make([]*runnerRef, 0, len(s.loaded)) for _, r := range s.loaded { + runners = append(runners, r) + } + s.loadedMu.Unlock() + for _, r := range runners { r.refMu.Lock() if r.llama != nil { for _, gpu := range allGpus { @@ -502,7 +518,6 @@ func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) { } r.refMu.Unlock() } - s.loadedMu.Unlock() // Now that we've summed up all the GPU usage predictions across all the loaded runners, update the gpu list for i := range allGpus { @@ -549,12 +564,11 @@ func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus discover.GpuInfoList) // TODO consolidate sched_types.go type runnerRef struct { - refMu sync.Mutex - // refCond sync.Cond // Signaled on transition from 1 -> 0 
refCount + refMu sync.Mutex refCount uint // prevent unloading if > 0 - // unloading bool // set to true when we are trying to unload the runner llama llm.LlamaServer + pid int loading bool // True only during initial load, then false forever gpus discover.GpuInfoList // Recorded at time of provisioning estimatedVRAM uint64 @@ -639,6 +653,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan any { (len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "metal")) || (runtime.GOOS == "windows" && runner.gpus[0].Library != "cuda") { finished <- struct{}{} + slog.Debug("no need to wait for VRAM recovery", "runner", runner) return finished } start := time.Now() @@ -657,7 +672,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan any { for { <-ticker.C if time.Now().After(expiresAt) { - slog.Warn("gpu VRAM usage didn't recover within timeout", "seconds", time.Since(start).Seconds(), "model", runner.modelPath) + slog.Warn("gpu VRAM usage didn't recover within timeout", "seconds", time.Since(start).Seconds(), "runner", runner) finished <- struct{}{} } @@ -670,7 +685,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan any { } // If we're within ~80% of the estimated memory usage recovered, bail out if float32(freeMemoryNow-freeMemoryBefore) > float32(runner.estimatedVRAM)*0.8 { - slog.Debug(fmt.Sprintf("gpu VRAM free memory converged after %0.2f seconds", time.Since(start).Seconds()), "model", runner.modelPath) + slog.Debug(fmt.Sprintf("gpu VRAM free memory converged after %0.2f seconds", time.Since(start).Seconds()), "runner", runner) finished <- struct{}{} return } @@ -679,6 +694,33 @@ func (runner *runnerRef) waitForVRAMRecovery() chan any { return finished } +func (runner *runnerRef) LogValue() slog.Value { + if runner == nil { + return slog.StringValue("nil") + } + attrs := []slog.Attr{} + if runner.model != nil { + attrs = append(attrs, slog.String("name", runner.model.Name)) + } + if len(runner.gpus) > 0 { + attrs = append(attrs, + slog.String("inference", runner.gpus[0].Library), + slog.Int("devices", len(runner.gpus)), + ) + } + attrs = append(attrs, + slog.String("size", format.HumanBytes2(runner.estimatedTotal)), + slog.String("vram", format.HumanBytes2(runner.estimatedVRAM)), + slog.Int("parallel", runner.numParallel), + slog.Int("pid", runner.pid), + slog.String("model", runner.modelPath), + ) + if runner.Options != nil { + attrs = append(attrs, slog.Int("num_ctx", runner.Options.NumCtx)) + } + return slog.GroupValue(attrs...) 
+} + type ByDurationAndName []*runnerRef func (a ByDurationAndName) Len() int { return len(a) } @@ -801,12 +843,12 @@ func (s *Scheduler) findRunnerToUnload() *runnerRef { rc := runner.refCount runner.refMu.Unlock() if rc == 0 { - slog.Debug("found an idle runner to unload") + slog.Debug("found an idle runner to unload", "runner", runner) return runner } } // None appear idle, just wait for the one with the shortest duration - slog.Debug("no idle runners, picking the shortest duration", "count", len(runnerList)) + slog.Debug("no idle runners, picking the shortest duration", "runner_count", len(runnerList), "runner", runnerList[0]) return runnerList[0] } @@ -823,8 +865,8 @@ func (s *Scheduler) unloadAllRunners() { func (s *Scheduler) expireRunner(model *Model) { s.loadedMu.Lock() - defer s.loadedMu.Unlock() runner, ok := s.loaded[model.ModelPath] + s.loadedMu.Unlock() if ok { runner.refMu.Lock() runner.expiresAt = time.Now() diff --git a/server/sched_test.go b/server/sched_test.go index 1b620329c..01fb9a703 100644 --- a/server/sched_test.go +++ b/server/sched_test.go @@ -26,7 +26,7 @@ func TestMain(m *testing.M) { } func TestInitScheduler(t *testing.T) { - ctx, done := context.WithCancel(context.Background()) + ctx, done := context.WithCancel(t.Context()) defer done() s := InitScheduler(ctx) s.loadedMu.Lock() @@ -35,7 +35,7 @@ func TestInitScheduler(t *testing.T) { } func TestLoad(t *testing.T) { - ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond) + ctx, done := context.WithTimeout(t.Context(), 20*time.Millisecond) defer done() s := InitScheduler(ctx) var f *ggml.GGML // value not used in tests @@ -126,7 +126,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est "tokenizer.ggml.tokens": []string{" "}, "tokenizer.ggml.scores": []float32{0}, "tokenizer.ggml.token_type": []int32{0}, - }, []ggml.Tensor{ + }, []*ggml.Tensor{ {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, {Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, })) @@ -148,7 +148,6 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est successCh: make(chan *runnerRef, 1), errCh: make(chan error, 1), } - b.req.opts.NumCtx = 4096 b.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}} return b } @@ -168,7 +167,7 @@ func getCpuFn() discover.GpuInfoList { } func TestRequestsSameModelSameRequest(t *testing.T) { - ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond) + ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond) defer done() s := InitScheduler(ctx) s.getGpuFn = getGpuFn @@ -211,7 +210,7 @@ func TestRequestsSameModelSameRequest(t *testing.T) { } func TestRequestsSimpleReloadSameModel(t *testing.T) { - ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond) + ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond) defer done() s := InitScheduler(ctx) s.getGpuFn = getGpuFn @@ -259,7 +258,7 @@ func TestRequestsSimpleReloadSameModel(t *testing.T) { } func TestRequestsMultipleLoadedModels(t *testing.T) { - ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond) + ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond) defer done() s := InitScheduler(ctx) s.getGpuFn = getGpuFn @@ -356,7 +355,7 @@ func 
TestRequestsMultipleLoadedModels(t *testing.T) { } func TestGetRunner(t *testing.T) { - ctx, done := context.WithTimeout(context.Background(), 3*time.Second) + ctx, done := context.WithTimeout(t.Context(), 3*time.Second) defer done() a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, &api.Duration{Duration: 2 * time.Millisecond}) @@ -409,7 +408,7 @@ func TestGetRunner(t *testing.T) { } func TestExpireRunner(t *testing.T) { - ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond) + ctx, done := context.WithTimeout(t.Context(), 20*time.Millisecond) defer done() s := InitScheduler(ctx) req := &LlmRequest{ @@ -456,7 +455,7 @@ func TestExpireRunner(t *testing.T) { // TODO - add one scenario that triggers the bogus finished event with positive ref count func TestPrematureExpired(t *testing.T) { - ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond) + ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond) defer done() // Same model, same request @@ -503,7 +502,7 @@ func TestPrematureExpired(t *testing.T) { } func TestUseLoadedRunner(t *testing.T) { - ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond) + ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond) req := &LlmRequest{ ctx: ctx, opts: api.DefaultOptions(), @@ -530,7 +529,7 @@ func TestUseLoadedRunner(t *testing.T) { } func TestUpdateFreeSpace(t *testing.T) { - ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond) + ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond) defer done() gpus := discover.GpuInfoList{ { @@ -563,7 +562,7 @@ func TestUpdateFreeSpace(t *testing.T) { } func TestFilterGPUsWithoutLoadingModels(t *testing.T) { - ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond) + ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond) defer done() gpus := discover.GpuInfoList{ { @@ -597,7 +596,7 @@ func TestFilterGPUsWithoutLoadingModels(t *testing.T) { } func TestFindRunnerToUnload(t *testing.T) { - ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond) + ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond) defer done() r1 := &runnerRef{refCount: 1, sessionDuration: 1, numParallel: 1} @@ -617,7 +616,7 @@ func TestFindRunnerToUnload(t *testing.T) { } func TestNeedsReload(t *testing.T) { - ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond) + ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond) defer done() llm := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}} @@ -664,7 +663,7 @@ func TestNeedsReload(t *testing.T) { } func TestUnloadAllRunners(t *testing.T) { - ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond) + ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond) defer done() llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}} @@ -696,7 +695,7 @@ func TestUnload(t *testing.T) { } func TestAlreadyCanceled(t *testing.T) { - ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond) + ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond) defer done() dctx, done2 := context.WithCancel(ctx) done2() @@ -713,7 +712,7 @@ func TestAlreadyCanceled(t *testing.T) { } func TestHomogeneousGPUs(t *testing.T) { - ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond) + ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond) defer done() s := InitScheduler(ctx) @@ -793,3 +792,4 @@ func (s 
*mockLlm) Close() error { func (s *mockLlm) EstimatedVRAM() uint64 { return s.estimatedVRAM } func (s *mockLlm) EstimatedTotal() uint64 { return s.estimatedTotal } func (s *mockLlm) EstimatedVRAMByGPU(gpuid string) uint64 { return s.estimatedVRAMByGPU[gpuid] } +func (s *mockLlm) Pid() int { return -1 } diff --git a/types/syncmap/syncmap.go b/types/syncmap/syncmap.go new file mode 100644 index 000000000..ff21cd999 --- /dev/null +++ b/types/syncmap/syncmap.go @@ -0,0 +1,38 @@ +package syncmap + +import ( + "maps" + "sync" +) + +// SyncMap is a simple, generic thread-safe map implementation. +type SyncMap[K comparable, V any] struct { + mu sync.RWMutex + m map[K]V +} + +func NewSyncMap[K comparable, V any]() *SyncMap[K, V] { + return &SyncMap[K, V]{ + m: make(map[K]V), + } +} + +func (s *SyncMap[K, V]) Load(key K) (V, bool) { + s.mu.RLock() + defer s.mu.RUnlock() + val, ok := s.m[key] + return val, ok +} + +func (s *SyncMap[K, V]) Store(key K, value V) { + s.mu.Lock() + defer s.mu.Unlock() + s.m[key] = value +} + +func (s *SyncMap[K, V]) Items() map[K]V { + s.mu.RLock() + defer s.mu.RUnlock() + // shallow copy map items + return maps.Clone(s.m) +}
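The new `types/syncmap` package guards a plain map with a `sync.RWMutex`, and `Items()` hands back a shallow copy so callers can range over the contents without holding the internal lock. A minimal usage sketch, assuming the repository's `github.com/ollama/ollama` module path; the keys and values below are made up for illustration:

```go
package main

import (
	"fmt"

	"github.com/ollama/ollama/types/syncmap"
)

func main() {
	// Hypothetical example: map model paths to runner PIDs.
	pids := syncmap.NewSyncMap[string, int]()
	pids.Store("registry.ollama.ai/library/deepseek-r1:latest", 12345)

	if pid, ok := pids.Load("registry.ollama.ai/library/deepseek-r1:latest"); ok {
		fmt.Println("pid:", pid)
	}

	// Items returns a copy, so this range does not hold the map's lock.
	for path, pid := range pids.Items() {
		fmt.Println(path, pid)
	}
}
```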
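The `"runner", runner` attributes used throughout sched.go rely on `log/slog` resolving values that implement the `slog.LogValuer` interface, which is what the new `LogValue()` method on `*runnerRef` provides: a single attribute expands into a group of name/size/vram/pid/model fields. A standalone sketch of the same pattern, with an invented `job` type standing in for `*runnerRef`:

```go
package main

import (
	"log/slog"
	"os"
)

// job is a stand-in for *runnerRef; any type with a LogValue() slog.Value
// method satisfies slog.LogValuer and is expanded when it is logged.
type job struct {
	name string
	pid  int
}

func (j *job) LogValue() slog.Value {
	if j == nil {
		return slog.StringValue("nil")
	}
	return slog.GroupValue(
		slog.String("name", j.name),
		slog.Int("pid", j.pid),
	)
}

func main() {
	logger := slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelDebug}))
	// The "job" attribute is rendered as a group, e.g. job.name=example job.pid=12345.
	logger.Debug("reloading", "job", &job{name: "example", pid: 12345})
}
```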