Compare commits
No commits in common. "main" and "v0.6.6-rc2" have entirely different histories.
Comparing main ... v0.6.6-rc2. Hunks below are shown as unified diffs: lines prefixed with "-" come from main (the base), lines prefixed with "+" come from v0.6.6-rc2 (the head), and unprefixed lines are unchanged context.

.github/workflows/release.yaml: 16 changes (vendored)
@@ -432,22 +432,6 @@ jobs:
           docker buildx imagetools inspect ollama/ollama:${{ steps.metadata.outputs.version }}
         working-directory: ${{ runner.temp }}
 
-  # Trigger downstream release process
-  trigger:
-    runs-on: ubuntu-latest
-    environment: release
-    needs: [darwin-build, windows-build, windows-depends]
-    steps:
-      - name: Trigger downstream release process
-        run: |
-          curl -L \
-            -X POST \
-            -H "Accept: application/vnd.github+json" \
-            -H "Authorization: Bearer ${{ secrets.RELEASE_TOKEN }}" \
-            -H "X-GitHub-Api-Version: 2022-11-28" \
-            https://api.github.com/repos/ollama/${{ vars.RELEASE_REPO }}/dispatches \
-            -d "{\"event_type\": \"trigger-workflow\", \"client_payload\": {\"run_id\": \"${GITHUB_RUN_ID}\", \"version\": \"${GITHUB_REF_NAME#v}\"}}"
-
   # Aggregate all the assets and ship a release
   release:
     needs: [darwin-sign, windows-sign, linux-build]
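The trigger job that exists only on main posts a GitHub repository_dispatch event with curl. For reference, a minimal Go sketch of the same call; the helper name and environment variables are stand-ins, and only the endpoint, headers, and payload shape come from the workflow above:

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
	"os"
)

// dispatch mirrors the curl call in the trigger job: it posts a
// repository_dispatch event to repo ("owner/name") using token.
func dispatch(repo, token, runID, version string) error {
	body, err := json.Marshal(map[string]any{
		"event_type": "trigger-workflow",
		"client_payload": map[string]string{
			"run_id":  runID,
			"version": version,
		},
	})
	if err != nil {
		return err
	}

	req, err := http.NewRequest(http.MethodPost,
		fmt.Sprintf("https://api.github.com/repos/%s/dispatches", repo), bytes.NewReader(body))
	if err != nil {
		return err
	}
	req.Header.Set("Accept", "application/vnd.github+json")
	req.Header.Set("Authorization", "Bearer "+token)
	req.Header.Set("X-GitHub-Api-Version", "2022-11-28")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	// GitHub returns 204 No Content when the dispatch is accepted.
	if resp.StatusCode != http.StatusNoContent {
		return fmt.Errorf("dispatch failed: %s", resp.Status)
	}
	return nil
}

func main() {
	if err := dispatch(os.Getenv("RELEASE_REPO"), os.Getenv("RELEASE_TOKEN"),
		os.Getenv("GITHUB_RUN_ID"), os.Getenv("VERSION")); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}
```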

@@ -19,8 +19,8 @@ linters:
     - nolintlint
     - nosprintfhostport
     - staticcheck
+    - tenv
     - unconvert
-    - usetesting
     - wastedassign
     - whitespace
   disable:

@@ -21,16 +21,14 @@
       "name": "CUDA 11",
       "inherits": [ "CUDA" ],
       "cacheVariables": {
-        "CMAKE_CUDA_ARCHITECTURES": "50;52;53;60;61;70;75;80;86",
-        "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets"
+        "CMAKE_CUDA_ARCHITECTURES": "50;52;53;60;61;70;75;80;86"
       }
     },
     {
       "name": "CUDA 12",
       "inherits": [ "CUDA" ],
       "cacheVariables": {
-        "CMAKE_CUDA_ARCHITECTURES": "50;60;61;70;75;80;86;87;89;90;90a;120",
-        "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets"
+        "CMAKE_CUDA_ARCHITECTURES": "50;60;61;70;75;80;86;87;89;90;90a;120"
       }
     },
     {

@@ -1,6 +1,6 @@
 UPSTREAM=https://github.com/ggerganov/llama.cpp.git
 WORKDIR=llama/vendor
-FETCH_HEAD=de4c07f93783a1a96456a44dc16b9db538ee1618
+FETCH_HEAD=71e90e8813f90097701e62f7fce137d96ddf41e2
 
 .PHONY: help
 help:
@@ -15,13 +15,11 @@ help:
 	@echo " make -f $(lastword $(MAKEFILE_LIST)) clean sync"
 
 .PHONY: sync
-sync: llama/build-info.cpp ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal
+sync: llama/build-info.cpp llama/llama.cpp ml/backend/ggml/ggml
 
-llama/build-info.cpp: llama/build-info.cpp.in llama/llama.cpp
-	sed -e 's|@FETCH_HEAD@|$(FETCH_HEAD)|' <$< >$@
-
-ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal: ml/backend/ggml/ggml
-	go generate ./$(@D)
+.PHONY: llama/build-info.cpp
+llama/build-info.cpp: llama/build-info.cpp.in
+	sed -e 's|@FETCH_HEAD@|$(FETCH_HEAD)|' $< > $@
 
 .PHONY: llama/llama.cpp
 llama/llama.cpp: llama/vendor/
@@ -32,13 +30,12 @@ ml/backend/ggml/ggml: llama/vendor/ggml/
 	rsync -arvzc -f "merge $@/.rsync-filter" $< $@
 
 PATCHES=$(wildcard llama/patches/*.patch)
-PATCHED=$(join $(dir $(PATCHES)), $(addsuffix ed, $(addprefix ., $(notdir $(PATCHES)))))
 
 .PHONY: apply-patches
 .NOTPARALLEL:
-apply-patches: $(PATCHED)
+apply-patches: $(addsuffix ed, $(PATCHES))
 
-llama/patches/.%.patched: llama/patches/%.patch
+%.patched: %.patch
 	@if git -c user.name=nobody -c 'user.email=<>' -C $(WORKDIR) am -3 $(realpath $<); then touch $@; else git -C $(WORKDIR) am --abort; exit 1; fi
 
 .PHONY: checkout
@@ -60,4 +57,4 @@ format-patches: llama/patches
 
 .PHONE: clean
 clean: checkout
-	$(RM) llama/patches/.*.patched
+	$(RM) $(addsuffix ed, $(PATCHES))

README.md: 49 changes
@@ -61,8 +61,6 @@ Here are some example models that can be downloaded:
 | QwQ | 32B | 20GB | `ollama run qwq` |
 | DeepSeek-R1 | 7B | 4.7GB | `ollama run deepseek-r1` |
 | DeepSeek-R1 | 671B | 404GB | `ollama run deepseek-r1:671b` |
-| Llama 4 | 109B | 67GB | `ollama run llama4:scout` |
-| Llama 4 | 400B | 245GB | `ollama run llama4:maverick` |
 | Llama 3.3 | 70B | 43GB | `ollama run llama3.3` |
 | Llama 3.2 | 3B | 2.0GB | `ollama run llama3.2` |
 | Llama 3.2 | 1B | 1.3GB | `ollama run llama3.2:1b` |
@@ -79,7 +77,7 @@ Here are some example models that can be downloaded:
 | Code Llama | 7B | 3.8GB | `ollama run codellama` |
 | Llama 2 Uncensored | 7B | 3.8GB | `ollama run llama2-uncensored` |
 | LLaVA | 7B | 4.5GB | `ollama run llava` |
-| Granite-3.3 | 8B | 4.9GB | `ollama run granite3.3` |
+| Granite-3.2 | 8B | 4.9GB | `ollama run granite3.2` |
 
 > [!NOTE]
 > You should have at least 8 GB of RAM available to run the 7B models, 16 GB to run the 13B models, and 32 GB to run the 33B models.
@@ -287,13 +285,13 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Bionic GPT](https://github.com/bionic-gpt/bionic-gpt)
 - [HTML UI](https://github.com/rtcfirefly/ollama-ui)
 - [Saddle](https://github.com/jikkuatwork/saddle)
-- [TagSpaces](https://www.tagspaces.org) (A platform for file-based apps, [utilizing Ollama](https://docs.tagspaces.org/ai/) for the generation of tags and descriptions)
+- [TagSpaces](https://www.tagspaces.org) (A platform for file based apps, [utilizing Ollama](https://docs.tagspaces.org/ai/) for the generation of tags and descriptions)
 - [Chatbot UI](https://github.com/ivanfioravanti/chatbot-ollama)
 - [Chatbot UI v2](https://github.com/mckaywrigley/chatbot-ui)
 - [Typescript UI](https://github.com/ollama-interface/Ollama-Gui?tab=readme-ov-file)
 - [Minimalistic React UI for Ollama Models](https://github.com/richawo/minimal-llm-ui)
 - [Ollamac](https://github.com/kevinhermawan/Ollamac)
 - [big-AGI](https://github.com/enricoros/big-AGI)
 - [Cheshire Cat assistant framework](https://github.com/cheshire-cat-ai/core)
 - [Amica](https://github.com/semperai/amica)
 - [chatd](https://github.com/BruceMacD/chatd)
@@ -314,8 +312,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Ollama Basic Chat: Uses HyperDiv Reactive UI](https://github.com/rapidarchitect/ollama_basic_chat)
 - [Ollama-chats RPG](https://github.com/drazdra/ollama-chats)
 - [IntelliBar](https://intellibar.app/) (AI-powered assistant for macOS)
-- [Jirapt](https://github.com/AliAhmedNada/jirapt) (Jira Integration to generate issues, tasks, epics)
-- [ojira](https://github.com/AliAhmedNada/ojira) (Jira chrome plugin to easily generate descriptions for tasks)
 - [QA-Pilot](https://github.com/reid41/QA-Pilot) (Interactive chat tool that can leverage Ollama models for rapid understanding and navigation of GitHub code repositories)
 - [ChatOllama](https://github.com/sugarforever/chat-ollama) (Open Source Chatbot based on Ollama with Knowledge Bases)
 - [CRAG Ollama Chat](https://github.com/Nagi-ovo/CRAG-Ollama-Chat) (Simple Web Search with Corrective RAG)
@@ -329,14 +325,14 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [RWKV-Runner](https://github.com/josStorer/RWKV-Runner) (RWKV offline LLM deployment tool, also usable as a client for ChatGPT and Ollama)
 - [Ollama Grid Search](https://github.com/dezoito/ollama-grid-search) (app to evaluate and compare models)
 - [Olpaka](https://github.com/Otacon/olpaka) (User-friendly Flutter Web App for Ollama)
-- [Casibase](https://casibase.org) (An open source AI knowledge base and dialogue system combining the latest RAG, SSO, ollama support, and multiple large language models.)
+- [Casibase](https://casibase.org) (An open source AI knowledge base and dialogue system combining the latest RAG, SSO, ollama support and multiple large language models.)
 - [OllamaSpring](https://github.com/CrazyNeil/OllamaSpring) (Ollama Client for macOS)
 - [LLocal.in](https://github.com/kartikm7/llocal) (Easy to use Electron Desktop Client for Ollama)
 - [Shinkai Desktop](https://github.com/dcSpark/shinkai-apps) (Two click install Local AI using Ollama + Files + RAG)
-- [AiLama](https://github.com/zeyoyt/ailama) (A Discord User App that allows you to interact with Ollama anywhere in Discord)
+- [AiLama](https://github.com/zeyoyt/ailama) (A Discord User App that allows you to interact with Ollama anywhere in discord )
 - [Ollama with Google Mesop](https://github.com/rapidarchitect/ollama_mesop/) (Mesop Chat Client implementation with Ollama)
 - [R2R](https://github.com/SciPhi-AI/R2R) (Open-source RAG engine)
-- [Ollama-Kis](https://github.com/elearningshow/ollama-kis) (A simple easy-to-use GUI with sample custom LLM for Drivers Education)
+- [Ollama-Kis](https://github.com/elearningshow/ollama-kis) (A simple easy to use GUI with sample custom LLM for Drivers Education)
 - [OpenGPA](https://opengpa.org) (Open-source offline-first Enterprise Agentic Application)
 - [Painting Droid](https://github.com/mateuszmigas/painting-droid) (Painting app with AI integrations)
 - [Kerlig AI](https://www.kerlig.com/) (AI writing assistant for macOS)
@@ -345,16 +341,16 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [LLMStack](https://github.com/trypromptly/LLMStack) (No-code multi-agent framework to build LLM agents and workflows)
 - [BoltAI for Mac](https://boltai.com) (AI Chat Client for Mac)
 - [Harbor](https://github.com/av/harbor) (Containerized LLM Toolkit with Ollama as default backend)
-- [PyGPT](https://github.com/szczyglis-dev/py-gpt) (AI desktop assistant for Linux, Windows, and Mac)
-- [Alpaca](https://github.com/Jeffser/Alpaca) (An Ollama client application for Linux and macOS made with GTK4 and Adwaita)
+- [PyGPT](https://github.com/szczyglis-dev/py-gpt) (AI desktop assistant for Linux, Windows and Mac)
+- [Alpaca](https://github.com/Jeffser/Alpaca) (An Ollama client application for linux and macos made with GTK4 and Adwaita)
 - [AutoGPT](https://github.com/Significant-Gravitas/AutoGPT/blob/master/docs/content/platform/ollama.md) (AutoGPT Ollama integration)
 - [Go-CREW](https://www.jonathanhecl.com/go-crew/) (Powerful Offline RAG in Golang)
 - [PartCAD](https://github.com/openvmp/partcad/) (CAD model generation with OpenSCAD and CadQuery)
-- [Ollama4j Web UI](https://github.com/ollama4j/ollama4j-web-ui) - Java-based Web UI for Ollama built with Vaadin, Spring Boot, and Ollama4j
+- [Ollama4j Web UI](https://github.com/ollama4j/ollama4j-web-ui) - Java-based Web UI for Ollama built with Vaadin, Spring Boot and Ollama4j
 - [PyOllaMx](https://github.com/kspviswa/pyOllaMx) - macOS application capable of chatting with both Ollama and Apple MLX models.
 - [Cline](https://github.com/cline/cline) - Formerly known as Claude Dev is a VSCode extension for multi-file/whole-repo coding
 - [Cherry Studio](https://github.com/kangfenmao/cherry-studio) (Desktop client with Ollama support)
-- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy-focused LLM chat interface with optional encryption)
+- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption)
 - [Archyve](https://github.com/nickthecook/archyve) (RAG-enabling document library)
 - [crewAI with Mesop](https://github.com/rapidarchitect/ollama-crew-mesop) (Mesop Web Interface to run crewAI with Ollama)
 - [Tkinter-based client](https://github.com/chyok/ollama-gui) (Python tkinter-based Client for Ollama)
@@ -372,7 +368,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [DualMind](https://github.com/tcsenpai/dualmind) (Experimental app allowing two models to talk to each other in the terminal or in a web interface)
 - [ollamarama-matrix](https://github.com/h1ddenpr0cess20/ollamarama-matrix) (Ollama chatbot for the Matrix chat protocol)
 - [ollama-chat-app](https://github.com/anan1213095357/ollama-chat-app) (Flutter-based chat app)
-- [Perfect Memory AI](https://www.perfectmemory.ai/) (Productivity AI assists personalized by what you have seen on your screen, heard, and said in the meetings)
+- [Perfect Memory AI](https://www.perfectmemory.ai/) (Productivity AI assists personalized by what you have seen on your screen, heard and said in the meetings)
 - [Hexabot](https://github.com/hexastack/hexabot) (A conversational AI builder)
 - [Reddit Rate](https://github.com/rapidarchitect/reddit_analyzer) (Search and Rate Reddit topics with a weighted summation)
 - [OpenTalkGpt](https://github.com/adarshM84/OpenTalkGpt) (Chrome Extension to manage open-source models supported by Ollama, create custom models, and chat with models from a user-friendly UI)
@@ -390,7 +386,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [ChibiChat](https://github.com/CosmicEventHorizon/ChibiChat) (Kotlin-based Android app to chat with Ollama and Koboldcpp API endpoints)
 - [LocalLLM](https://github.com/qusaismael/localllm) (Minimal Web-App to run ollama models on it with a GUI)
 - [Ollamazing](https://github.com/buiducnhat/ollamazing) (Web extension to run Ollama models)
-- [OpenDeepResearcher-via-searxng](https://github.com/benhaotang/OpenDeepResearcher-via-searxng) (A Deep Research equivalent endpoint with Ollama support for running locally)
+- [OpenDeepResearcher-via-searxng](https://github.com/benhaotang/OpenDeepResearcher-via-searxng) (A Deep Research equivent endpoint with Ollama support for running locally)
 - [AntSK](https://github.com/AIDotNet/AntSK) (Out-of-the-box & Adaptable RAG Chatbot)
 - [MaxKB](https://github.com/1Panel-dev/MaxKB/) (Ready-to-use & flexible RAG Chatbot)
 - [yla](https://github.com/danielekp/yla) (Web interface to freely interact with your customized models)
@@ -398,13 +394,10 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [1Panel](https://github.com/1Panel-dev/1Panel/) (Web-based Linux Server Management Tool)
 - [AstrBot](https://github.com/Soulter/AstrBot/) (User-friendly LLM-based multi-platform chatbot with a WebUI, supporting RAG, LLM agents, and plugins integration)
 - [Reins](https://github.com/ibrahimcetin/reins) (Easily tweak parameters, customize system prompts per chat, and enhance your AI experiments with reasoning model support.)
-- [Flufy](https://github.com/Aharon-Bensadoun/Flufy) (A beautiful chat interface for interacting with Ollama's API. Built with React, TypeScript, and Material-UI.)
 - [Ellama](https://github.com/zeozeozeo/ellama) (Friendly native app to chat with an Ollama instance)
 - [screenpipe](https://github.com/mediar-ai/screenpipe) Build agents powered by your screen history
 - [Ollamb](https://github.com/hengkysteen/ollamb) (Simple yet rich in features, cross-platform built with Flutter and designed for Ollama. Try the [web demo](https://hengkysteen.github.io/demo/ollamb/).)
 - [Writeopia](https://github.com/Writeopia/Writeopia) (Text editor with integration with Ollama)
-- [AppFlowy](https://github.com/AppFlowy-IO/AppFlowy) (AI collaborative workspace with Ollama, cross-platform and self-hostable)
-- [Lumina](https://github.com/cushydigit/lumina.git) (A lightweight, minimal React.js frontend for interacting with Ollama servers)
 
 ### Cloud
 
@@ -446,7 +439,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [PowershAI](https://github.com/rrg92/powershai) PowerShell module that brings AI to terminal on Windows, including support for Ollama
 - [DeepShell](https://github.com/Abyss-c0re/deepshell) Your self-hosted AI assistant. Interactive Shell, Files and Folders analysis.
 - [orbiton](https://github.com/xyproto/orbiton) Configuration-free text editor and IDE with support for tab completion with Ollama.
-- [orca-cli](https://github.com/molbal/orca-cli) Ollama Registry CLI Application - Browse, pull, and download models from Ollama Registry in your terminal.
+- [orca-cli](https://github.com/molbal/orca-cli) Ollama Registry CLI Application - Browse, pull and download models from Ollama Registry in your terminal.
 - [GGUF-to-Ollama](https://github.com/jonathanhecl/gguf-to-ollama) - Importing GGUF to Ollama made easy (multiplatform)
 
 ### Apple Vision Pro
@@ -474,7 +467,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 
 ### Libraries
 
-- [LangChain](https://python.langchain.com/docs/integrations/chat/ollama/) and [LangChain.js](https://js.langchain.com/docs/integrations/chat/ollama/) with [example](https://js.langchain.com/docs/tutorials/local_rag/)
+- [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/integrations/chat/ollama/) with [example](https://js.langchain.com/docs/tutorials/local_rag/)
 - [Firebase Genkit](https://firebase.google.com/docs/genkit/plugins/ollama)
 - [crewAI](https://github.com/crewAIInc/crewAI)
 - [Yacana](https://remembersoftwares.github.io/yacana/) (User-friendly multi-agent framework for brainstorming and executing predetermined flows with built-in tool integration)
@@ -521,21 +514,20 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Swollama for Swift](https://github.com/marcusziade/Swollama) with [DocC](https://marcusziade.github.io/Swollama/documentation/swollama/)
 - [GoLamify](https://github.com/prasad89/golamify)
 - [Ollama for Haskell](https://github.com/tusharad/ollama-haskell)
-- [multi-llm-ts](https://github.com/nbonamy/multi-llm-ts) (A Typescript/JavaScript library allowing access to different LLM in a unified API)
+- [multi-llm-ts](https://github.com/nbonamy/multi-llm-ts) (A Typescript/JavaScript library allowing access to different LLM in unified API)
 - [LlmTornado](https://github.com/lofcz/llmtornado) (C# library providing a unified interface for major FOSS & Commercial inference APIs)
 - [Ollama for Zig](https://github.com/dravenk/ollama-zig)
 - [Abso](https://github.com/lunary-ai/abso) (OpenAI-compatible TypeScript SDK for any LLM provider)
 - [Nichey](https://github.com/goodreasonai/nichey) is a Python package for generating custom wikis for your research topic
 - [Ollama for D](https://github.com/kassane/ollama-d)
-- [OllamaPlusPlus](https://github.com/HardCodeDev777/OllamaPlusPlus) (Very simple C++ library for Ollama)
 
 ### Mobile
 
-- [SwiftChat](https://github.com/aws-samples/swift-chat) (Lightning-fast Cross-platform AI chat app with native UI for Android, iOS, and iPad)
+- [SwiftChat](https://github.com/aws-samples/swift-chat) (Lightning-fast Cross-platform AI chat app with native UI for Android, iOS and iPad)
 - [Enchanted](https://github.com/AugustDev/enchanted)
 - [Maid](https://github.com/Mobile-Artificial-Intelligence/maid)
 - [Ollama App](https://github.com/JHubi1/ollama-app) (Modern and easy-to-use multi-platform client for Ollama)
-- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy-focused LLM chat interface with optional encryption)
+- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption)
 - [Ollama Android Chat](https://github.com/sunshine0523/OllamaServer) (No need for Termux, start the Ollama service with one click on an Android device)
 - [Reins](https://github.com/ibrahimcetin/reins) (Easily tweak parameters, customize system prompts per chat, and enhance your AI experiments with reasoning model support.)
 
@@ -559,7 +551,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Obsidian Local GPT plugin](https://github.com/pfrankov/obsidian-local-gpt)
 - [Open Interpreter](https://docs.openinterpreter.com/language-model-setup/local-models/ollama)
 - [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama)
-- [Ollama Copilot](https://github.com/bernardo-bruning/ollama-copilot) (Proxy that allows you to use Ollama as a copilot like GitHub Copilot)
+- [Ollama Copilot](https://github.com/bernardo-bruning/ollama-copilot) (Proxy that allows you to use ollama as a copilot like Github copilot)
 - [twinny](https://github.com/rjmacarthy/twinny) (Copilot and Copilot chat alternative using Ollama)
 - [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and Hugging Face)
 - [Page Assist](https://github.com/n4ze3m/page-assist) (Chrome Extension)
@@ -569,8 +561,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation)
 - [ChatGPTBox: All in one browser extension](https://github.com/josStorer/chatGPTBox) with [Integrating Tutorial](https://github.com/josStorer/chatGPTBox/issues/616#issuecomment-1975186467)
 - [Discord AI chat/moderation bot](https://github.com/rapmd73/Companion) Chat/moderation bot written in python. Uses Ollama to create personalities.
-- [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depend on ollama server)
-- [Terraform AWS Ollama & Open WebUI](https://github.com/xuyangbocn/terraform-aws-self-host-llm) (A Terraform module to deploy on AWS a ready-to-use Ollama service, together with its front-end Open WebUI service.)
+- [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depends on ollama server)
+- [Terraform AWS Ollama & Open WebUI](https://github.com/xuyangbocn/terraform-aws-self-host-llm) (A Terraform module to deploy on AWS a ready-to-use Ollama service, together with its front end Open WebUI service.)
 - [node-red-contrib-ollama](https://github.com/jakubburkiewicz/node-red-contrib-ollama)
 - [Local AI Helper](https://github.com/ivostoykov/localAI) (Chrome and Firefox extensions that enable interactions with the active tab and customisable API endpoints. Includes secure storage for user prompts.)
 - [vnc-lm](https://github.com/jake83741/vnc-lm) (Discord bot for messaging with LLMs through Ollama and LiteLLM. Seamlessly move between local and flagship models.)
@@ -584,7 +576,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Simple-Discord-AI](https://github.com/zyphixor/simple-discord-ai)
 - [LLM Telegram Bot](https://github.com/innightwolfsleep/llm_telegram_bot) (telegram bot, primary for RP. Oobabooga-like buttons, [A1111](https://github.com/AUTOMATIC1111/stable-diffusion-webui) API integration e.t.c)
 - [mcp-llm](https://github.com/sammcj/mcp-llm) (MCP Server to allow LLMs to call other LLMs)
-- [UnityCodeLama](https://github.com/HardCodeDev777/UnityCodeLama) (Unity Edtior tool to analyze scripts via Ollama)
 
 ### Supported backends
 

@@ -1,6 +1,7 @@
 package api
 
 import (
+	"context"
 	"encoding/json"
 	"fmt"
 	"net/http"
@@ -136,7 +137,7 @@ func TestClientStream(t *testing.T) {
 	client := NewClient(&url.URL{Scheme: "http", Host: ts.Listener.Addr().String()}, http.DefaultClient)
 
 	var receivedChunks []ChatResponse
-	err := client.stream(t.Context(), http.MethodPost, "/v1/chat", nil, func(chunk []byte) error {
+	err := client.stream(context.Background(), http.MethodPost, "/v1/chat", nil, func(chunk []byte) error {
 		var resp ChatResponse
 		if err := json.Unmarshal(chunk, &resp); err != nil {
 			return fmt.Errorf("failed to unmarshal chunk: %w", err)
@@ -222,7 +223,7 @@ func TestClientDo(t *testing.T) {
 			ID      string `json:"id"`
 			Success bool   `json:"success"`
 		}
-		err := client.do(t.Context(), http.MethodPost, "/v1/messages", nil, &resp)
+		err := client.do(context.Background(), http.MethodPost, "/v1/messages", nil, &resp)
 
 		if tc.wantErr != "" {
 			if err == nil {
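The only difference in these test hunks is the context argument: main passes t.Context(), the per-test context added in Go 1.24 that is canceled when the test finishes, while rc2 passes context.Background(). A small illustrative test using a stand-in HTTP server rather than the ollama client:

```go
package api_test

import (
	"net/http"
	"net/http/httptest"
	"testing"
)

// TestPerTestContext shows the t.Context() idiom: the request below is bound
// to the test's lifetime, so any leaked goroutine holding the context sees
// cancellation once the test returns.
func TestPerTestContext(t *testing.T) {
	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusOK)
	}))
	t.Cleanup(ts.Close)

	req, err := http.NewRequestWithContext(t.Context(), http.MethodGet, ts.URL, nil)
	if err != nil {
		t.Fatal(err)
	}
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		t.Fatal(err)
	}
	resp.Body.Close()
}
```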

api/types.go: 22 changes
@@ -76,7 +76,7 @@ type GenerateRequest struct {
 	// this request.
 	KeepAlive *Duration `json:"keep_alive,omitempty"`
 
-	// Images is an optional list of raw image bytes accompanying this
+	// Images is an optional list of base64-encoded images accompanying this
 	// request, for multimodal models.
 	Images []ImageData `json:"images,omitempty"`
 
@@ -271,6 +271,9 @@ type Options struct {
 	RepeatPenalty    float32  `json:"repeat_penalty,omitempty"`
 	PresencePenalty  float32  `json:"presence_penalty,omitempty"`
 	FrequencyPenalty float32  `json:"frequency_penalty,omitempty"`
+	Mirostat         int      `json:"mirostat,omitempty"`
+	MirostatTau      float32  `json:"mirostat_tau,omitempty"`
+	MirostatEta      float32  `json:"mirostat_eta,omitempty"`
 	Stop             []string `json:"stop,omitempty"`
 }
 
@@ -280,7 +283,12 @@ type Runner struct {
 	NumBatch  int   `json:"num_batch,omitempty"`
 	NumGPU    int   `json:"num_gpu,omitempty"`
 	MainGPU   int   `json:"main_gpu,omitempty"`
+	LowVRAM   bool  `json:"low_vram,omitempty"`
+	F16KV     bool  `json:"f16_kv,omitempty"` // Deprecated: This option is ignored
+	LogitsAll bool  `json:"logits_all,omitempty"`
+	VocabOnly bool  `json:"vocab_only,omitempty"`
 	UseMMap   *bool `json:"use_mmap,omitempty"`
+	UseMLock  bool  `json:"use_mlock,omitempty"`
 	NumThread int   `json:"num_thread,omitempty"`
 }
 
@@ -463,6 +471,13 @@ type ProcessModelResponse struct {
 	SizeVRAM int64 `json:"size_vram"`
 }
 
+type RetrieveModelResponse struct {
+	Id      string `json:"id"`
+	Object  string `json:"object"`
+	Created int64  `json:"created"`
+	OwnedBy string `json:"owned_by"`
+}
+
 type TokenResponse struct {
 	Token string `json:"token"`
 }
@@ -645,6 +660,9 @@ func DefaultOptions() Options {
 		RepeatPenalty:    1.1,
 		PresencePenalty:  0.0,
 		FrequencyPenalty: 0.0,
+		Mirostat:         0,
+		MirostatTau:      5.0,
+		MirostatEta:      0.1,
 		Seed:             -1,
 
 		Runner: Runner{
@@ -653,6 +671,8 @@ func DefaultOptions() Options {
 			NumBatch:  512,
 			NumGPU:    -1, // -1 here indicates that NumGPU should be set dynamically
 			NumThread: 0,  // let the runtime decide
+			LowVRAM:   false,
+			UseMLock:  false,
 			UseMMap:   nil,
 		},
 	}
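On the main side the Mirostat, LowVRAM, F16KV, LogitsAll, VocabOnly, and UseMLock fields are absent from Options/Runner, and GenerateRequest.Images is documented as raw image bytes rather than base64. A brief sketch of how a caller might adjust the remaining options; only the fields visible in the hunks above are touched, and the concrete values are illustrative, not taken from ollama:

```go
package main

import (
	"fmt"

	"github.com/ollama/ollama/api"
)

func main() {
	// Start from the library defaults shown in DefaultOptions() above and
	// override only the sampling and runner fields that appear in the diff.
	opts := api.DefaultOptions()
	opts.RepeatPenalty = 1.3
	opts.PresencePenalty = 0.2
	opts.FrequencyPenalty = 0.2
	opts.Stop = []string{"</s>"}

	// Runner knobs from the same file: batch size and GPU layer selection.
	opts.Runner.NumBatch = 256
	opts.Runner.NumGPU = -1 // -1 means "decide dynamically", per the comment in DefaultOptions

	fmt.Printf("%+v\n", opts)
}
```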

@@ -4,14 +4,20 @@ import (
 	"fmt"
 	"log/slog"
 	"os"
+	"path/filepath"
 	"strconv"
 	"strings"
 
 	"github.com/ollama/ollama/envconfig"
-	"github.com/ollama/ollama/logutil"
 )
 
 func InitLogging() {
+	level := slog.LevelInfo
+
+	if envconfig.Debug() {
+		level = slog.LevelDebug
+	}
+
 	var logFile *os.File
 	var err error
 	// Detect if we're a GUI app on windows, and if not, send logs to console
@@ -27,8 +33,20 @@ func InitLogging() {
 			return
 		}
 	}
+	handler := slog.NewTextHandler(logFile, &slog.HandlerOptions{
+		Level:     level,
+		AddSource: true,
+		ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr {
+			if attr.Key == slog.SourceKey {
+				source := attr.Value.Any().(*slog.Source)
+				source.File = filepath.Base(source.File)
+			}
+			return attr
+		},
+	})
+
+	slog.SetDefault(slog.New(handler))
 
-	slog.SetDefault(logutil.NewLogger(logFile, envconfig.LogLevel()))
 	slog.Info("ollama app started")
 }
 
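The rc2 side wires slog up inline with a ReplaceAttr hook that trims source paths to their base name; main folds the same behaviour into logutil.NewLogger. The inline version is easy to lift into any program. A self-contained sketch writing to stderr instead of the app's log file:

```go
package main

import (
	"log/slog"
	"os"
	"path/filepath"
)

func main() {
	handler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
		Level:     slog.LevelDebug,
		AddSource: true,
		// Shorten "long/path/to/file.go" to "file.go" in every log record.
		ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr {
			if attr.Key == slog.SourceKey {
				source := attr.Value.Any().(*slog.Source)
				source.File = filepath.Base(source.File)
			}
			return attr
		},
	})
	slog.SetDefault(slog.New(handler))
	slog.Info("ollama app started")
}
```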
@@ -78,7 +78,7 @@ func BenchmarkColdStart(b *testing.B) {
 
 	for _, tt := range tests {
 		b.Run(fmt.Sprintf("%s/cold/%s", m, tt.name), func(b *testing.B) {
-			ctx := b.Context()
+			ctx := context.Background()
 
 			// Set number of tokens as our throughput metric
 			b.SetBytes(int64(tt.maxTokens))
@@ -113,7 +113,7 @@ func BenchmarkWarmStart(b *testing.B) {
 
 	for _, tt := range tests {
 		b.Run(fmt.Sprintf("%s/warm/%s", m, tt.name), func(b *testing.B) {
-			ctx := b.Context()
+			ctx := context.Background()
 
 			// Pre-warm the model
 			warmup(client, m, tt.prompt, b)
@@ -140,7 +140,7 @@ func setup(b *testing.B) *api.Client {
 	if err != nil {
 		b.Fatal(err)
 	}
-	if _, err := client.Show(b.Context(), &api.ShowRequest{Model: modelName(b)}); err != nil {
+	if _, err := client.Show(context.Background(), &api.ShowRequest{Model: modelName(b)}); err != nil {
 		b.Fatalf("Model unavailable: %v", err)
 	}
 
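Both benchmarks keep the b.SetBytes call, which is how the token budget becomes a throughput figure in the benchmark output, while main also swaps context.Background() for b.Context() (Go 1.24). A minimal sketch of the pattern with a stand-in workload; none of the names below come from the ollama code, and the file would live in a _test.go file:

```go
package bench

import (
	"testing"
	"time"
)

func BenchmarkGenerate(b *testing.B) {
	const maxTokens = 100
	ctx := b.Context() // tied to the benchmark's lifetime (Go 1.24+)

	// SetBytes counts maxTokens "bytes" per iteration, so the reported
	// bytes-per-second figure stands in for token throughput.
	b.SetBytes(int64(maxTokens))

	for i := 0; i < b.N; i++ {
		select {
		case <-ctx.Done():
			b.Fatal(ctx.Err())
		case <-time.After(time.Microsecond): // stand-in for a generate call
		}
	}
}
```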

cmd/cmd.go: 52 changes
@@ -31,7 +31,6 @@ import (
 	"github.com/olekukonko/tablewriter"
 	"github.com/spf13/cobra"
 	"golang.org/x/crypto/ssh"
-	"golang.org/x/sync/errgroup"
 	"golang.org/x/term"
 
 	"github.com/ollama/ollama/api"
@@ -42,7 +41,6 @@ import (
 	"github.com/ollama/ollama/runner"
 	"github.com/ollama/ollama/server"
 	"github.com/ollama/ollama/types/model"
-	"github.com/ollama/ollama/types/syncmap"
 	"github.com/ollama/ollama/version"
 )
 
@@ -108,7 +106,7 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 	}
 	spinner.Stop()
 
-	req.Model = args[0]
+	req.Name = args[0]
 	quantize, _ := cmd.Flags().GetString("quantize")
 	if quantize != "" {
 		req.Quantize = quantize
@@ -119,54 +117,34 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 		return err
 	}
 
-	var g errgroup.Group
-	g.SetLimit(max(runtime.GOMAXPROCS(0)-1, 1))
-
-	files := syncmap.NewSyncMap[string, string]()
-	for f, digest := range req.Files {
-		g.Go(func() error {
-			if _, err := createBlob(cmd, client, f, digest, p); err != nil {
-				return err
-			}
-
-			// TODO: this is incorrect since the file might be in a subdirectory
-			// instead this should take the path relative to the model directory
-			// but the current implementation does not allow this
-			files.Store(filepath.Base(f), digest)
-			return nil
-		})
-	}
-
-	adapters := syncmap.NewSyncMap[string, string]()
-	for f, digest := range req.Adapters {
-		g.Go(func() error {
-			if _, err := createBlob(cmd, client, f, digest, p); err != nil {
-				return err
-			}
-
-			// TODO: same here
-			adapters.Store(filepath.Base(f), digest)
-			return nil
-		})
-	}
-
-	if err := g.Wait(); err != nil {
-		return err
-	}
-
-	req.Files = files.Items()
-	req.Adapters = adapters.Items()
+	if len(req.Files) > 0 {
+		fileMap := map[string]string{}
+		for f, digest := range req.Files {
+			if _, err := createBlob(cmd, client, f, digest, p); err != nil {
+				return err
+			}
+			fileMap[filepath.Base(f)] = digest
+		}
+		req.Files = fileMap
+	}
+
+	if len(req.Adapters) > 0 {
+		fileMap := map[string]string{}
+		for f, digest := range req.Adapters {
+			if _, err := createBlob(cmd, client, f, digest, p); err != nil {
+				return err
+			}
+			fileMap[filepath.Base(f)] = digest
+		}
+		req.Adapters = fileMap
+	}
 
 	bars := make(map[string]*progress.Bar)
 	fn := func(resp api.ProgressResponse) error {
 		if resp.Digest != "" {
 			bar, ok := bars[resp.Digest]
 			if !ok {
-				msg := resp.Status
-				if msg == "" {
-					msg = fmt.Sprintf("pulling %s...", resp.Digest[7:19])
-				}
-				bar = progress.NewBar(msg, resp.Total, resp.Completed)
+				bar = progress.NewBar(fmt.Sprintf("pulling %s...", resp.Digest[7:19]), resp.Total, resp.Completed)
 				bars[resp.Digest] = bar
 				p.Add(resp.Digest, bar)
 			}
@@ -235,7 +213,7 @@ func createBlob(cmd *cobra.Command, client *api.Client, path string, digest stri
 		}
 	}()
 
-	if err := client.CreateBlob(cmd.Context(), digest, io.TeeReader(bin, &pw)); err != nil {
+	if err = client.CreateBlob(cmd.Context(), digest, io.TeeReader(bin, &pw)); err != nil {
 		return "", err
 	}
 	return digest, nil
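On main, blob creation is fanned out with an errgroup capped near GOMAXPROCS and the digests are collected in a concurrent map, whereas rc2 loops sequentially. A standalone sketch of that fan-out pattern; uploadBlob and its inputs are placeholders, and a mutex-guarded map stands in for ollama's types/syncmap package:

```go
package main

import (
	"fmt"
	"path/filepath"
	"runtime"
	"sync"

	"golang.org/x/sync/errgroup"
)

// uploadBlob is a placeholder for the real createBlob call.
func uploadBlob(path, digest string) error {
	fmt.Println("uploading", path, digest)
	return nil
}

// uploadAll uploads every file concurrently, bounded to roughly one worker
// per CPU, and returns a map keyed by base filename.
func uploadAll(files map[string]string) (map[string]string, error) {
	var g errgroup.Group
	g.SetLimit(max(runtime.GOMAXPROCS(0)-1, 1)) // leave a CPU for the caller

	var mu sync.Mutex
	out := make(map[string]string, len(files))

	for f, digest := range files {
		g.Go(func() error {
			if err := uploadBlob(f, digest); err != nil {
				return err
			}
			mu.Lock()
			out[filepath.Base(f)] = digest
			mu.Unlock()
			return nil
		})
	}
	if err := g.Wait(); err != nil {
		return nil, err
	}
	return out, nil
}

func main() {
	m, err := uploadAll(map[string]string{"model.gguf": "sha256:abc"})
	fmt.Println(m, err)
}
```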

@@ -2,6 +2,7 @@ package cmd
 
 import (
 	"bytes"
+	"context"
 	"encoding/json"
 	"io"
 	"net/http"
@@ -336,7 +337,7 @@ func TestDeleteHandler(t *testing.T) {
 	t.Cleanup(mockServer.Close)
 
 	cmd := &cobra.Command{}
-	cmd.SetContext(t.Context())
+	cmd.SetContext(context.TODO())
 	if err := DeleteHandler(cmd, []string{"test-model"}); err != nil {
 		t.Fatalf("DeleteHandler failed: %v", err)
 	}
@@ -398,6 +399,11 @@ func TestGetModelfileName(t *testing.T) {
 			var expectedFilename string
 
 			if tt.fileExists {
+				tempDir, err := os.MkdirTemp("", "modelfiledir")
+				defer os.RemoveAll(tempDir)
+				if err != nil {
+					t.Fatalf("temp modelfile dir creation failed: %v", err)
+				}
 				var fn string
 				if tt.modelfileName != "" {
 					fn = tt.modelfileName
@@ -405,7 +411,7 @@ func TestGetModelfileName(t *testing.T) {
 					fn = "Modelfile"
 				}
 
-				tempFile, err := os.CreateTemp(t.TempDir(), fn)
+				tempFile, err := os.CreateTemp(tempDir, fn)
 				if err != nil {
 					t.Fatalf("temp modelfile creation failed: %v", err)
 				}
@@ -524,7 +530,7 @@ func TestPushHandler(t *testing.T) {
 
 			cmd := &cobra.Command{}
 			cmd.Flags().Bool("insecure", false, "")
-			cmd.SetContext(t.Context())
+			cmd.SetContext(context.TODO())
 
 			// Redirect stderr to capture progress output
 			oldStderr := os.Stderr
@@ -629,7 +635,7 @@ func TestListHandler(t *testing.T) {
 			t.Setenv("OLLAMA_HOST", mockServer.URL)
 
 			cmd := &cobra.Command{}
-			cmd.SetContext(t.Context())
+			cmd.SetContext(context.TODO())
 
 			// Capture stdout
 			oldStdout := os.Stdout
@@ -684,7 +690,7 @@ func TestCreateHandler(t *testing.T) {
 			return
 		}
 
-		if req.Model != "test-model" {
+		if req.Name != "test-model" {
 			t.Errorf("expected model name 'test-model', got %s", req.Name)
 		}
 
@@ -724,7 +730,7 @@ func TestCreateHandler(t *testing.T) {
 	}))
 	t.Setenv("OLLAMA_HOST", mockServer.URL)
 	t.Cleanup(mockServer.Close)
-	tempFile, err := os.CreateTemp(t.TempDir(), "modelfile")
+	tempFile, err := os.CreateTemp("", "modelfile")
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -744,7 +750,7 @@ func TestCreateHandler(t *testing.T) {
 	}
 
 	cmd.Flags().Bool("insecure", false, "")
-	cmd.SetContext(t.Context())
+	cmd.SetContext(context.TODO())
 
 	// Redirect stderr to capture progress output
 	oldStderr := os.Stderr
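Across these test hunks, main replaces manual os.MkdirTemp / os.RemoveAll bookkeeping with t.TempDir(), which creates a per-test directory and removes it automatically, and replaces context.TODO() with t.Context(). A short sketch of the temp-dir idiom; the file name and contents are arbitrary:

```go
package cmd_test

import (
	"os"
	"path/filepath"
	"testing"
)

func TestWritesModelfile(t *testing.T) {
	// t.TempDir() is cleaned up by the testing framework when the test ends,
	// so there is no defer os.RemoveAll(...) to forget.
	dir := t.TempDir()

	path := filepath.Join(dir, "Modelfile")
	if err := os.WriteFile(path, []byte("FROM llama3.2\n"), 0o644); err != nil {
		t.Fatal(err)
	}

	data, err := os.ReadFile(path)
	if err != nil {
		t.Fatal(err)
	}
	if len(data) == 0 {
		t.Fatal("expected non-empty Modelfile")
	}
}
```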

@@ -44,7 +44,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 	fmt.Fprintln(os.Stderr, "Use \"\"\" to begin a multi-line message.")
 
 	if opts.MultiModal {
-		fmt.Fprintf(os.Stderr, "Use %s to include .jpg, .png, or .webp images.\n", filepath.FromSlash("/path/to/file"))
+		fmt.Fprintf(os.Stderr, "Use %s to include .jpg or .png images.\n", filepath.FromSlash("/path/to/file"))
 	}
 
 	fmt.Fprintln(os.Stderr, "")
@@ -503,7 +503,6 @@ func normalizeFilePath(fp string) string {
 		"\\\\", "\\", // Escaped backslash
 		"\\*", "*", // Escaped asterisk
 		"\\?", "?", // Escaped question mark
-		"\\~", "~", // Escaped tilde
 	).Replace(fp)
 }
 
@@ -511,7 +510,7 @@ func extractFileNames(input string) []string {
 	// Regex to match file paths starting with optional drive letter, / ./ \ or .\ and include escaped or unescaped spaces (\ or %20)
 	// and followed by more characters and a file extension
 	// This will capture non filename strings, but we'll check for file existence to remove mismatches
-	regexPattern := `(?:[a-zA-Z]:)?(?:\./|/|\\)[\S\\ ]+?\.(?i:jpg|jpeg|png|webp)\b`
+	regexPattern := `(?:[a-zA-Z]:)?(?:\./|/|\\)[\S\\ ]+?\.(?i:jpg|jpeg|png)\b`
 	re := regexp.MustCompile(regexPattern)
 
 	return re.FindAllString(input, -1)
@@ -531,8 +530,6 @@ func extractFileData(input string) (string, []api.ImageData, error) {
 			return "", imgs, err
 		}
 		fmt.Fprintf(os.Stderr, "Added image '%s'\n", nfp)
-		input = strings.ReplaceAll(input, "'"+nfp+"'", "")
-		input = strings.ReplaceAll(input, "'"+fp+"'", "")
 		input = strings.ReplaceAll(input, fp, "")
 		imgs = append(imgs, data)
 	}
@@ -553,7 +550,7 @@ func getImageData(filePath string) ([]byte, error) {
 	}
 
 	contentType := http.DetectContentType(buf)
-	allowedTypes := []string{"image/jpeg", "image/jpg", "image/png", "image/webp"}
+	allowedTypes := []string{"image/jpeg", "image/jpg", "image/png"}
 	if !slices.Contains(allowedTypes, contentType) {
 		return nil, fmt.Errorf("invalid image type: %s", contentType)
 	}
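main extends multimodal file detection to WebP in two places: the path-matching regex gains a webp alternative and the sniffed content type list gains image/webp. A self-contained check combining both; the sample paths and function names are made up, only the regex and the allowed-type list come from the hunks above:

```go
package main

import (
	"fmt"
	"net/http"
	"os"
	"regexp"
	"slices"
)

// pathRe mirrors the regex from extractFileNames on main, including the webp extension.
var pathRe = regexp.MustCompile(`(?:[a-zA-Z]:)?(?:\./|/|\\)[\S\\ ]+?\.(?i:jpg|jpeg|png|webp)\b`)

// looksLikeImage sniffs the first bytes of a file and accepts the same
// content types getImageData allows on main.
func looksLikeImage(path string) (bool, error) {
	f, err := os.Open(path)
	if err != nil {
		return false, err
	}
	defer f.Close()

	buf := make([]byte, 512)
	n, _ := f.Read(buf)

	contentType := http.DetectContentType(buf[:n])
	allowed := []string{"image/jpeg", "image/jpg", "image/png", "image/webp"}
	return slices.Contains(allowed, contentType), nil
}

func main() {
	input := `describe ./photos/cat.webp and /tmp/dog.png please`
	for _, p := range pathRe.FindAllString(input, -1) {
		ok, err := looksLikeImage(p)
		fmt.Println(p, ok, err)
	}
}
```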

@@ -1,8 +1,6 @@
 package cmd
 
 import (
-	"os"
-	"path/filepath"
 	"testing"
 
 	"github.com/stretchr/testify/assert"
@@ -12,17 +10,14 @@ func TestExtractFilenames(t *testing.T) {
 	// Unix style paths
 	input := ` some preamble
 ./relative\ path/one.png inbetween1 ./not a valid two.jpg inbetween2 ./1.svg
-/unescaped space /three.jpeg inbetween3 /valid\ path/dir/four.png "./quoted with spaces/five.JPG
-/unescaped space /six.webp inbetween6 /valid\ path/dir/seven.WEBP`
+/unescaped space /three.jpeg inbetween3 /valid\ path/dir/four.png "./quoted with spaces/five.JPG`
 	res := extractFileNames(input)
-	assert.Len(t, res, 7)
+	assert.Len(t, res, 5)
 	assert.Contains(t, res[0], "one.png")
 	assert.Contains(t, res[1], "two.jpg")
 	assert.Contains(t, res[2], "three.jpeg")
 	assert.Contains(t, res[3], "four.png")
 	assert.Contains(t, res[4], "five.JPG")
-	assert.Contains(t, res[5], "six.webp")
-	assert.Contains(t, res[6], "seven.WEBP")
 	assert.NotContains(t, res[4], '"')
 	assert.NotContains(t, res, "inbetween1")
 	assert.NotContains(t, res, "./1.svg")
@@ -33,12 +28,10 @@ func TestExtractFilenames(t *testing.T) {
 /absolute/nospace/three.jpeg inbetween3 /absolute/with space/four.png inbetween4
 ./relative\ path/five.JPG inbetween5 "./relative with/spaces/six.png inbetween6
 d:\path with\spaces\seven.JPEG inbetween7 c:\users\jdoe\eight.png inbetween8
-d:\program files\someplace\nine.png inbetween9 "E:\program files\someplace\ten.PNG
-c:/users/jdoe/eleven.webp inbetween11 c:/program files/someplace/twelve.WebP inbetween12
-d:\path with\spaces\thirteen.WEBP some ending
+d:\program files\someplace\nine.png inbetween9 "E:\program files\someplace\ten.PNG some ending
 `
 	res = extractFileNames(input)
-	assert.Len(t, res, 13)
+	assert.Len(t, res, 10)
 	assert.NotContains(t, res, "inbetween2")
 	assert.Contains(t, res[0], "one.png")
 	assert.Contains(t, res[0], "c:")
@ -56,31 +49,4 @@ d:\path with\spaces\thirteen.WEBP some ending
|
|||||||
assert.Contains(t, res[8], "d:")
|
assert.Contains(t, res[8], "d:")
|
||||||
assert.Contains(t, res[9], "ten.PNG")
|
assert.Contains(t, res[9], "ten.PNG")
|
||||||
assert.Contains(t, res[9], "E:")
|
assert.Contains(t, res[9], "E:")
|
||||||
assert.Contains(t, res[10], "eleven.webp")
|
|
||||||
assert.Contains(t, res[10], "c:")
|
|
||||||
assert.Contains(t, res[11], "twelve.WebP")
|
|
||||||
assert.Contains(t, res[11], "c:")
|
|
||||||
assert.Contains(t, res[12], "thirteen.WEBP")
|
|
||||||
assert.Contains(t, res[12], "d:")
|
|
||||||
}
|
|
||||||
|
|
||||||
// Ensure that file paths wrapped in single quotes are removed with the quotes.
|
|
||||||
func TestExtractFileDataRemovesQuotedFilepath(t *testing.T) {
|
|
||||||
dir := t.TempDir()
|
|
||||||
fp := filepath.Join(dir, "img.jpg")
|
|
||||||
data := make([]byte, 600)
|
|
||||||
copy(data, []byte{
|
|
||||||
0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 'J', 'F', 'I', 'F',
|
|
||||||
0x00, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
|
||||||
0xff, 0xd9,
|
|
||||||
})
|
|
||||||
if err := os.WriteFile(fp, data, 0o600); err != nil {
|
|
||||||
t.Fatalf("failed to write test image: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
input := "before '" + fp + "' after"
|
|
||||||
cleaned, imgs, err := extractFileData(input)
|
|
||||||
assert.NoError(t, err)
|
|
||||||
assert.Len(t, imgs, 1)
|
|
||||||
assert.Equal(t, cleaned, "before after")
|
|
||||||
}
|
}
|
||||||
|
@@ -1,26 +1,25 @@
 package convert

 import (
-"cmp"
 "encoding/json"
 "errors"
 "fmt"
+"io"
 "io/fs"
 "log/slog"
-"os"
-"slices"
 "strings"

 "github.com/ollama/ollama/fs/ggml"
 )

 type ModelParameters struct {
 Architectures []string `json:"architectures"`
 VocabSize uint32 `json:"vocab_size"`
+TextModel TextParameters `json:"text_config"`
+}
+
-TextModel struct {
+type TextParameters struct {
 VocabSize uint32 `json:"vocab_size"`
-} `json:"text_config"`
 }

 type AdapterParameters struct {
@@ -85,17 +84,27 @@ func (ModelParameters) specialTokenTypes() []string {
 }
 }

+func (ModelParameters) writeFile(ws io.WriteSeeker, kv ggml.KV, ts []ggml.Tensor) error {
+return ggml.WriteGGUF(ws, kv, ts)
+}
+
+func (AdapterParameters) writeFile(ws io.WriteSeeker, kv ggml.KV, ts []ggml.Tensor) error {
+return ggml.WriteGGUF(ws, kv, ts)
+}
+
 type ModelConverter interface {
 // KV maps parameters to LLM key-values
 KV(*Tokenizer) ggml.KV
 // Tensors maps input tensors to LLM tensors. Model specific modifications can be done here.
-Tensors([]Tensor) []*ggml.Tensor
+Tensors([]Tensor) []ggml.Tensor
 // Replacements returns a list of string pairs to replace in tensor names.
 // See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
 Replacements() []string

 // specialTokenTypes returns any special token types the model uses
 specialTokenTypes() []string
+// writeFile writes the model to the provided io.WriteSeeker
+writeFile(io.WriteSeeker, ggml.KV, []ggml.Tensor) error
 }

 type moreParser interface {
@@ -106,13 +115,15 @@ type AdapterConverter interface {
 // KV maps parameters to LLM key-values
 KV(ggml.KV) ggml.KV
 // Tensors maps input tensors to LLM tensors. Adapter specific modifications can be done here.
-Tensors([]Tensor) []*ggml.Tensor
+Tensors([]Tensor) []ggml.Tensor
 // Replacements returns a list of string pairs to replace in tensor names.
 // See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
 Replacements() []string

+writeFile(io.WriteSeeker, ggml.KV, []ggml.Tensor) error
 }

-func ConvertAdapter(fsys fs.FS, f *os.File, baseKV ggml.KV) error {
+func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV ggml.KV) error {
 bts, err := fs.ReadFile(fsys, "adapter_config.json")
 if err != nil {
 return err
@@ -147,14 +158,14 @@ func ConvertAdapter(fsys fs.FS, f *os.File, baseKV ggml.KV) error {
 return err
 }

-return writeFile(f, conv.KV(baseKV), conv.Tensors(ts))
+return conv.writeFile(ws, conv.KV(baseKV), conv.Tensors(ts))
 }

 // Convert writes an Ollama compatible model to the provided io.WriteSeeker based on configurations
 // and files it finds in the input path.
 // Supported input model formats include safetensors.
 // Supported input tokenizers files include tokenizer.json (preferred) and tokenizer.model.
-func ConvertModel(fsys fs.FS, f *os.File) error {
+func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
 bts, err := fs.ReadFile(fsys, "config.json")
 if err != nil {
 return err
@@ -173,10 +184,6 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
 switch p.Architectures[0] {
 case "LlamaForCausalLM":
 conv = &llamaModel{}
-case "MllamaForConditionalGeneration":
-conv = &mllamaModel{}
-case "Llama4ForConditionalGeneration":
-conv = &llama4Model{}
 case "Mistral3ForConditionalGeneration":
 conv = &mistral3Model{}
 case "MixtralForCausalLM":
@@ -191,8 +198,6 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
 conv = &phi3Model{}
 case "Qwen2ForCausalLM":
 conv = &qwen2Model{}
-case "Qwen2_5_VLForConditionalGeneration":
-conv = &qwen25VLModel{}
 case "BertModel":
 conv = &bertModel{}
 case "CohereForCausalLM":
@@ -216,22 +221,24 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
 return err
 }

-vocabSize := int(cmp.Or(p.VocabSize, p.TextModel.VocabSize))
+vocabSize := int(p.VocabSize)
+if vocabSize == 0 {
+tVocabSize := int(p.TextModel.VocabSize)
+vocabSize = tVocabSize
+}

 switch {
 case vocabSize == 0:
-slog.Debug("vocabulary size was not explicitly set by the model", "default size", len(t.Vocabulary.Tokens))
+slog.Warn("vocabulary size was not explicitly set by the model", "default size", len(t.Vocabulary.Tokens))
 case vocabSize > len(t.Vocabulary.Tokens):
-slog.Debug("vocabulary is smaller than expected, padding with dummy tokens", "expect", vocabSize, "actual", len(t.Vocabulary.Tokens))
+slog.Warn("vocabulary is smaller than expected, padding with dummy tokens", "expect", vocabSize, "actual", len(t.Vocabulary.Tokens))
 for i := range vocabSize - len(t.Vocabulary.Tokens) {
 t.Vocabulary.Tokens = append(t.Vocabulary.Tokens, fmt.Sprintf("[PAD%d]", i))
 t.Vocabulary.Scores = append(t.Vocabulary.Scores, -1)
 t.Vocabulary.Types = append(t.Vocabulary.Types, tokenTypeUserDefined)
 }
 case vocabSize < len(t.Vocabulary.Tokens):
-slog.Debug("vocabulary is larger than expected", "want", vocabSize, "got", len(t.Vocabulary.Tokens))
-p.VocabSize = uint32(len(t.Vocabulary.Tokens))
-p.TextModel.VocabSize = uint32(len(t.Vocabulary.Tokens))
+return fmt.Errorf("vocabulary is larger than expected '%d' instead of '%d'", len(t.Vocabulary.Tokens), vocabSize)
 default:
 slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens))
 }
@@ -241,13 +248,5 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
 return err
 }

-return writeFile(f, conv.KV(t), conv.Tensors(ts))
-}
-
-func writeFile(f *os.File, kv ggml.KV, ts []*ggml.Tensor) error {
-for i := range ts {
-ts[i].Shape = slices.Clone(ts[i].Shape)
-slices.Reverse(ts[i].Shape)
-}
-return ggml.WriteGGUF(f, kv, ts)
+return conv.writeFile(ws, conv.KV(t), conv.Tensors(ts))
 }
@@ -132,8 +132,8 @@ func (p *bertModel) KV(t *Tokenizer) ggml.KV {
 return kv
 }

-func (p *bertModel) Tensors(ts []Tensor) []*ggml.Tensor {
-var out []*ggml.Tensor
+func (p *bertModel) Tensors(ts []Tensor) []ggml.Tensor {
+var out []ggml.Tensor
 for _, t := range ts {
 if slices.Contains([]string{
 "embeddings.position_ids",
@@ -143,7 +143,7 @@ func (p *bertModel) Tensors(ts []Tensor) []*ggml.Tensor {
 continue
 }

-out = append(out, &ggml.Tensor{
+out = append(out, ggml.Tensor{
 Name: t.Name(),
 Kind: t.Kind(),
 Shape: t.Shape(),
@@ -43,10 +43,10 @@ func (p *commandrModel) KV(t *Tokenizer) ggml.KV {
 return kv
 }

-func (p *commandrModel) Tensors(ts []Tensor) []*ggml.Tensor {
-var out []*ggml.Tensor
+func (p *commandrModel) Tensors(ts []Tensor) []ggml.Tensor {
+var out []ggml.Tensor
 for _, t := range ts {
-out = append(out, &ggml.Tensor{
+out = append(out, ggml.Tensor{
 Name: t.Name(),
 Kind: t.Kind(),
 Shape: t.Shape(),
@@ -42,14 +42,14 @@ func (p *gemmaModel) KV(t *Tokenizer) ggml.KV {
 return kv
 }

-func (p *gemmaModel) Tensors(ts []Tensor) []*ggml.Tensor {
-var out []*ggml.Tensor
+func (p *gemmaModel) Tensors(ts []Tensor) []ggml.Tensor {
+var out []ggml.Tensor
 for _, t := range ts {
 if !strings.HasPrefix(t.Name(), "v.") && strings.HasSuffix(t.Name(), "_norm.weight") {
 t.SetRepacker(p.addOne)
 }

-out = append(out, &ggml.Tensor{
+out = append(out, ggml.Tensor{
 Name: t.Name(),
 Kind: t.Kind(),
 Shape: t.Shape(),
@@ -21,8 +21,8 @@ func (p *gemma2Adapter) KV(baseKV ggml.KV) ggml.KV {
 return kv
 }

-func (p *gemma2Adapter) Tensors(ts []Tensor) []*ggml.Tensor {
-var out []*ggml.Tensor
+func (p *gemma2Adapter) Tensors(ts []Tensor) []ggml.Tensor {
+var out []ggml.Tensor
 for _, t := range ts {
 shape := t.Shape()
 if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
@@ -31,7 +31,7 @@ func (p *gemma2Adapter) Tensors(ts []Tensor) []*ggml.Tensor {
 t.SetRepacker(p.repack)
 }

-out = append(out, &ggml.Tensor{
+out = append(out, ggml.Tensor{
 Name: t.Name(),
 Kind: t.Kind(),
 Shape: t.Shape(),
@@ -42,8 +42,6 @@ type llamaModel struct {
 LayerNormEpsilon float32 `json:"layer_norm_epsilon"`
 NormEpsilon float32 `json:"norm_epsilon"`
 HeadDim uint32 `json:"head_dim"`
-
-skipRepack bool
 }

 var _ ModelConverter = (*llamaModel)(nil)
@@ -72,10 +70,6 @@ func (p *llamaModel) KV(t *Tokenizer) ggml.KV {
 kv["llama.rope.dimension_count"] = p.HiddenSize / headCount
 }

-if p.HeadDim > 0 {
-kv["llama.attention.head_dim"] = p.HeadDim
-}
-
 if p.RopeTheta > 0 {
 kv["llama.rope.freq_base"] = p.RopeTheta
 }
@@ -126,11 +120,11 @@ func (p *llamaModel) KV(t *Tokenizer) ggml.KV {
 return kv
 }

-func (p *llamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
-var out []*ggml.Tensor
+func (p *llamaModel) Tensors(ts []Tensor) []ggml.Tensor {
+var out []ggml.Tensor

 if p.RopeScaling.factors != nil {
-out = append(out, &ggml.Tensor{
+out = append(out, ggml.Tensor{
 Name: "rope_freqs.weight",
 Kind: 0,
 Shape: []uint64{uint64(len(p.RopeScaling.factors))},
@@ -139,13 +133,12 @@ func (p *llamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
 }

 for _, t := range ts {
-if strings.HasSuffix(t.Name(), "attn_q.weight") || strings.HasSuffix(t.Name(), "attn_k.weight") {
-if !p.skipRepack {
+if strings.HasSuffix(t.Name(), "attn_q.weight") ||
+strings.HasSuffix(t.Name(), "attn_k.weight") {
 t.SetRepacker(p.repack)
-}
 }

-out = append(out, &ggml.Tensor{
+out = append(out, ggml.Tensor{
 Name: t.Name(),
 Kind: t.Kind(),
 Shape: t.Shape(),
@ -1,169 +0,0 @@
|
|||||||
package convert
|
|
||||||
|
|
||||||
import (
|
|
||||||
"slices"
|
|
||||||
"strings"
|
|
||||||
|
|
||||||
"github.com/pdevine/tensor"
|
|
||||||
"github.com/pdevine/tensor/native"
|
|
||||||
|
|
||||||
"github.com/ollama/ollama/fs/ggml"
|
|
||||||
)
|
|
||||||
|
|
||||||
type llama4Model struct {
|
|
||||||
ModelParameters
|
|
||||||
TextModel struct {
|
|
||||||
llamaModel
|
|
||||||
NumExpertsPerToken uint32 `json:"num_experts_per_tok"`
|
|
||||||
NumLocalExperts uint32 `json:"num_local_experts"`
|
|
||||||
InterleaveMOELayerStep uint32 `json:"interleave_moe_layer_step"`
|
|
||||||
UseQKNorm bool `json:"use_qk_norm"`
|
|
||||||
IntermediateSizeMLP uint32 `json:"intermediate_size_mlp"`
|
|
||||||
AttentionChunkSize uint32 `json:"attention_chunk_size"`
|
|
||||||
} `json:"text_config"`
|
|
||||||
VisionModel struct {
|
|
||||||
NumHiddenLayers uint32 `json:"num_hidden_layers"`
|
|
||||||
HiddenSize uint32 `json:"hidden_size"`
|
|
||||||
IntermediateSize uint32 `json:"intermediate_size"`
|
|
||||||
NumAttentionHeads uint32 `json:"num_attention_heads"`
|
|
||||||
ImageSize uint32 `json:"image_size"`
|
|
||||||
PatchSize uint32 `json:"patch_size"`
|
|
||||||
RopeTheta float32 `json:"rope_theta"`
|
|
||||||
NormEpsilon float32 `json:"norm_eps"`
|
|
||||||
PixelShuffleRatio float32 `json:"pixel_shuffle_ratio"`
|
|
||||||
} `json:"vision_config"`
|
|
||||||
}
|
|
||||||
|
|
||||||
// KV implements ModelConverter.
|
|
||||||
func (p *llama4Model) KV(t *Tokenizer) ggml.KV {
|
|
||||||
kv := p.ModelParameters.KV(t)
|
|
||||||
kv["general.architecture"] = "llama4"
|
|
||||||
|
|
||||||
for k, v := range p.TextModel.KV(t) {
|
|
||||||
if strings.HasPrefix(k, "llama.") {
|
|
||||||
kv[strings.ReplaceAll(k, "llama.", "llama4.")] = v
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
kv["llama4.feed_forward_length"] = p.TextModel.IntermediateSizeMLP
|
|
||||||
kv["llama4.expert_feed_forward_length"] = p.TextModel.IntermediateSize
|
|
||||||
|
|
||||||
kv["llama4.expert_count"] = p.TextModel.NumLocalExperts
|
|
||||||
kv["llama4.expert_used_count"] = p.TextModel.NumExpertsPerToken
|
|
||||||
kv["llama4.interleave_moe_layer_step"] = p.TextModel.InterleaveMOELayerStep
|
|
||||||
kv["llama4.use_qk_norm"] = p.TextModel.UseQKNorm
|
|
||||||
kv["llama4.attention.chunk_size"] = p.TextModel.AttentionChunkSize
|
|
||||||
|
|
||||||
kv["llama4.vision.block_count"] = p.VisionModel.NumHiddenLayers
|
|
||||||
kv["llama4.vision.embedding_length"] = p.VisionModel.HiddenSize
|
|
||||||
kv["llama4.vision.feed_forward_length"] = p.VisionModel.IntermediateSize
|
|
||||||
kv["llama4.vision.attention.head_count"] = p.VisionModel.NumAttentionHeads
|
|
||||||
kv["llama4.vision.image_size"] = p.VisionModel.ImageSize
|
|
||||||
kv["llama4.vision.patch_size"] = p.VisionModel.PatchSize
|
|
||||||
kv["llama4.vision.rope.freq_base"] = p.VisionModel.RopeTheta
|
|
||||||
kv["llama4.vision.layer_norm_epsilon"] = p.VisionModel.NormEpsilon
|
|
||||||
kv["llama4.vision.pixel_shuffle_ratio"] = p.VisionModel.PixelShuffleRatio
|
|
||||||
return kv
|
|
||||||
}
|
|
||||||
|
|
||||||
// Replacements implements ModelConverter.
|
|
||||||
func (p *llama4Model) Replacements() []string {
|
|
||||||
return append(
|
|
||||||
p.TextModel.Replacements(),
|
|
||||||
"language_model.", "",
|
|
||||||
"vision_model", "v",
|
|
||||||
"multi_modal_projector", "mm",
|
|
||||||
"feed_forward.down_proj", "ffn_down",
|
|
||||||
"feed_forward.up_proj", "ffn_up",
|
|
||||||
"feed_forward.gate_proj", "ffn_gate",
|
|
||||||
"feed_forward.", "ffn_",
|
|
||||||
"shared_expert.down_proj", "down_shexp",
|
|
||||||
"shared_expert.gate_proj", "gate_shexp",
|
|
||||||
"shared_expert.up_proj", "up_shexp",
|
|
||||||
"experts.down_proj", "down_exps.weight",
|
|
||||||
"experts.gate_up_proj", "gate_up_exps.weight",
|
|
||||||
"router", "gate_inp",
|
|
||||||
"patch_embedding.linear", "patch_embedding",
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Tensors implements ModelConverter.
|
|
||||||
func (p *llama4Model) Tensors(ts []Tensor) []*ggml.Tensor {
|
|
||||||
var out []*ggml.Tensor
|
|
||||||
|
|
||||||
var textTensors []Tensor
|
|
||||||
for _, t := range ts {
|
|
||||||
if strings.HasPrefix(t.Name(), "v.") || strings.HasPrefix(t.Name(), "mm.") {
|
|
||||||
out = append(out, &ggml.Tensor{
|
|
||||||
Name: t.Name(),
|
|
||||||
Kind: t.Kind(),
|
|
||||||
Shape: t.Shape(),
|
|
||||||
WriterTo: t,
|
|
||||||
})
|
|
||||||
} else if strings.Contains(t.Name(), "ffn_gate_up_exps") {
|
|
||||||
// gate and up projectors are fused
|
|
||||||
// dims[1], dims[2] must be swapped
|
|
||||||
// [experts, hidden_size, intermediate_size * 2] --> [experts, intermediate_size, hidden_size]
|
|
||||||
halfDim := int(t.Shape()[2]) / 2
|
|
||||||
|
|
||||||
newShape := slices.Clone(t.Shape())
|
|
||||||
newShape[1], newShape[2] = newShape[2]/2, newShape[1]
|
|
||||||
for i, name := range []string{"ffn_gate_exps", "ffn_up_exps"} {
|
|
||||||
// clone tensor since we need separate repackers
|
|
||||||
tt := t.Clone()
|
|
||||||
tt.SetRepacker(p.repack(nil, nil, tensor.S(i*halfDim, (i+1)*halfDim)))
|
|
||||||
out = append(out, &ggml.Tensor{
|
|
||||||
Name: strings.ReplaceAll(tt.Name(), "ffn_gate_up_exps", name),
|
|
||||||
Kind: tt.Kind(),
|
|
||||||
Shape: newShape,
|
|
||||||
WriterTo: tt,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
} else if strings.Contains(t.Name(), "ffn_down_exps") {
|
|
||||||
// dims[1], dims[2] must be swapped
|
|
||||||
// [experts, intermediate_size, hidden_size] --> [experts, hidden_size, intermediate_size]
|
|
||||||
t.SetRepacker(p.repack())
|
|
||||||
newShape := slices.Clone(t.Shape())
|
|
||||||
newShape[1], newShape[2] = newShape[2], newShape[1]
|
|
||||||
out = append(out, &ggml.Tensor{
|
|
||||||
Name: t.Name(),
|
|
||||||
Kind: t.Kind(),
|
|
||||||
Shape: newShape,
|
|
||||||
WriterTo: t,
|
|
||||||
})
|
|
||||||
} else {
|
|
||||||
textTensors = append(textTensors, t)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
p.TextModel.skipRepack = true
|
|
||||||
out = append(out, p.TextModel.Tensors(textTensors)...)
|
|
||||||
return out
|
|
||||||
}
|
|
||||||
|
|
||||||
func (p *llama4Model) repack(slice ...tensor.Slice) Repacker {
|
|
||||||
return func(name string, data []float32, shape []uint64) ([]float32, error) {
|
|
||||||
dims := make([]int, len(shape))
|
|
||||||
for i, dim := range shape {
|
|
||||||
dims[i] = int(dim)
|
|
||||||
}
|
|
||||||
|
|
||||||
var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
|
|
||||||
t, err := t.Slice(slice...)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
if err := t.T(0, 2, 1); err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
t = tensor.Materialize(t)
|
|
||||||
// flatten tensor so it can be return as a vector
|
|
||||||
if err := t.Reshape(t.Shape().TotalSize()); err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
return native.VectorF32(t.(*tensor.Dense))
|
|
||||||
}
|
|
||||||
}
|
|
@@ -29,8 +29,8 @@ func (p *llamaAdapter) KV(baseKV ggml.KV) ggml.KV {
 return kv
 }

-func (p *llamaAdapter) Tensors(ts []Tensor) []*ggml.Tensor {
-var out []*ggml.Tensor
+func (p *llamaAdapter) Tensors(ts []Tensor) []ggml.Tensor {
+var out []ggml.Tensor
 for _, t := range ts {
 shape := t.Shape()
 if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
@@ -41,7 +41,7 @@ func (p *llamaAdapter) Tensors(ts []Tensor) []*ggml.Tensor {
 t.SetRepacker(p.repack)
 }

-out = append(out, &ggml.Tensor{
+out = append(out, ggml.Tensor{
 Name: t.Name(),
 Kind: t.Kind(),
 Shape: shape,
@@ -89,8 +89,8 @@ func (p *mistral3Model) KV(t *Tokenizer) ggml.KV {
 return kv
 }

-func (p *mistral3Model) Tensors(ts []Tensor) []*ggml.Tensor {
-var out []*ggml.Tensor
+func (p *mistral3Model) Tensors(ts []Tensor) []ggml.Tensor {
+var out []ggml.Tensor

 for _, t := range ts {
 if !strings.HasPrefix(t.Name(), "v.") {
@@ -100,7 +100,7 @@ func (p *mistral3Model) Tensors(ts []Tensor) []*ggml.Tensor {
 }
 }

-out = append(out, &ggml.Tensor{
+out = append(out, ggml.Tensor{
 Name: t.Name(),
 Kind: t.Kind(),
 Shape: t.Shape(),
@@ -29,7 +29,7 @@ func (p *mixtralModel) KV(t *Tokenizer) ggml.KV {
 return kv
 }

-func (p *mixtralModel) Tensors(ts []Tensor) []*ggml.Tensor {
+func (p *mixtralModel) Tensors(ts []Tensor) []ggml.Tensor {
 oldnew := []string{
 "model.layers", "blk",
 "w1", "ffn_gate_exps",
@@ -56,10 +56,10 @@ func (p *mixtralModel) Tensors(ts []Tensor) []*ggml.Tensor {
 return true
 })

-var out []*ggml.Tensor
+var out []ggml.Tensor
 for n, e := range experts {
 // TODO(mxyng): sanity check experts
-out = append(out, &ggml.Tensor{
+out = append(out, ggml.Tensor{
 Name: n,
 Kind: e[0].Kind(),
 Shape: append([]uint64{uint64(len(e))}, e[0].Shape()...),
@ -1,160 +0,0 @@
|
|||||||
package convert
|
|
||||||
|
|
||||||
import (
|
|
||||||
"strings"
|
|
||||||
|
|
||||||
"github.com/ollama/ollama/fs/ggml"
|
|
||||||
"github.com/pdevine/tensor"
|
|
||||||
"github.com/pdevine/tensor/native"
|
|
||||||
)
|
|
||||||
|
|
||||||
type mllamaModel struct {
|
|
||||||
ModelParameters
|
|
||||||
TextModel struct {
|
|
||||||
llamaModel
|
|
||||||
|
|
||||||
CrossAttentionLayers []int32 `json:"cross_attention_layers"`
|
|
||||||
} `json:"text_config"`
|
|
||||||
VisionModel struct {
|
|
||||||
NumHiddenLayers uint32 `json:"num_hidden_layers"`
|
|
||||||
NumGlobalLayers uint32 `json:"num_global_layers"`
|
|
||||||
IntermediateLayersIndices []int32 `json:"intermediate_layers_indices"`
|
|
||||||
|
|
||||||
HiddenSize uint32 `json:"hidden_size"`
|
|
||||||
IntermediateSize uint32 `json:"intermediate_size"`
|
|
||||||
|
|
||||||
AttentionHeads uint32 `json:"attention_heads"`
|
|
||||||
|
|
||||||
ImageSize uint32 `json:"image_size"`
|
|
||||||
PatchSize uint32 `json:"patch_size"`
|
|
||||||
NumChannels uint32 `json:"num_channels"`
|
|
||||||
MaxNumTiles uint32 `json:"max_num_tiles"`
|
|
||||||
NormEpsilon float32 `json:"norm_eps"`
|
|
||||||
RopeTheta float32 `json:"rope.freq_base"`
|
|
||||||
} `json:"vision_config"`
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m *mllamaModel) KV(t *Tokenizer) ggml.KV {
|
|
||||||
kv := m.ModelParameters.KV(t)
|
|
||||||
kv["general.architecture"] = "mllama"
|
|
||||||
|
|
||||||
for k, v := range m.TextModel.KV(t) {
|
|
||||||
if strings.HasPrefix(k, "llama.") {
|
|
||||||
kv[strings.ReplaceAll(k, "llama.", "mllama.")] = v
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
kv["mllama.attention.cross_attention_layers"] = m.TextModel.CrossAttentionLayers
|
|
||||||
|
|
||||||
kv["mllama.vision.block_count"] = m.VisionModel.NumHiddenLayers
|
|
||||||
kv["mllama.vision.global.block_count"] = m.VisionModel.NumGlobalLayers
|
|
||||||
kv["mllama.vision.intermediate_layers_indices"] = m.VisionModel.IntermediateLayersIndices
|
|
||||||
|
|
||||||
kv["mllama.vision.embedding_length"] = m.VisionModel.HiddenSize
|
|
||||||
kv["mllama.vision.feed_forward_length"] = m.VisionModel.IntermediateSize
|
|
||||||
|
|
||||||
kv["mllama.vision.attention.head_count"] = m.VisionModel.AttentionHeads
|
|
||||||
kv["mllama.vision.attention.layer_norm_epsilon"] = m.VisionModel.NormEpsilon
|
|
||||||
|
|
||||||
kv["mllama.vision.image_size"] = m.VisionModel.ImageSize
|
|
||||||
kv["mllama.vision.patch_size"] = m.VisionModel.PatchSize
|
|
||||||
kv["mllama.vision.max_num_tiles"] = m.VisionModel.MaxNumTiles
|
|
||||||
kv["mllama.vision.num_channels"] = m.VisionModel.NumChannels
|
|
||||||
|
|
||||||
return kv
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m *mllamaModel) Replacements() []string {
|
|
||||||
return append(
|
|
||||||
m.TextModel.Replacements(),
|
|
||||||
"language_model.", "",
|
|
||||||
"gate_attn", "attn_gate",
|
|
||||||
"gate_ffn", "ffn_gate",
|
|
||||||
"cross_attn.", "cross_attn_",
|
|
||||||
"vision_model", "v",
|
|
||||||
"class_embedding", "class_embd",
|
|
||||||
"patch_embedding", "patch_embd",
|
|
||||||
"gated_positional_embedding.tile_embedding", "tile_position_embd",
|
|
||||||
"gated_positional_embedding.embedding", "position_embd.weight",
|
|
||||||
"gated_positional_embedding", "position_embd",
|
|
||||||
"embedding.weight", "weight",
|
|
||||||
"pre_tile_positional_embedding", "pre_tile_position_embd",
|
|
||||||
"post_tile_positional_embedding", "post_tile_position_embd",
|
|
||||||
"layernorm_pre", "pre_ln",
|
|
||||||
"layernorm_post", "post_ln",
|
|
||||||
"global_transformer.layers", "global.blk",
|
|
||||||
"transformer.layers", "blk",
|
|
||||||
"mlp.fc1", "ffn_up",
|
|
||||||
"mlp.fc2", "ffn_down",
|
|
||||||
"multi_modal_projector", "mm.0",
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m *mllamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
|
|
||||||
var out []*ggml.Tensor
|
|
||||||
var text []Tensor
|
|
||||||
for _, t := range ts {
|
|
||||||
if t.Name() == "v.position_embd.gate" {
|
|
||||||
for _, name := range []string{"v.position_embd.gate", "v.tile_position_embd.gate"} {
|
|
||||||
tt := t.Clone()
|
|
||||||
tt.SetRepacker(m.repack(name))
|
|
||||||
out = append(out, &ggml.Tensor{
|
|
||||||
Name: name,
|
|
||||||
Kind: t.Kind(),
|
|
||||||
Shape: t.Shape(),
|
|
||||||
WriterTo: tt,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
} else if t.Name() == "v.pre_tile_position_embd.gate" || t.Name() == "v.post_tile_position_embd.gate" {
|
|
||||||
t.SetRepacker(m.repack(t.Name()))
|
|
||||||
out = append(out, &ggml.Tensor{
|
|
||||||
Name: t.Name(),
|
|
||||||
Kind: t.Kind(),
|
|
||||||
Shape: t.Shape(),
|
|
||||||
WriterTo: t,
|
|
||||||
})
|
|
||||||
} else if strings.HasPrefix(t.Name(), "v.") || strings.HasPrefix(t.Name(), "mm.") {
|
|
||||||
out = append(out, &ggml.Tensor{
|
|
||||||
Name: t.Name(),
|
|
||||||
Kind: t.Kind(),
|
|
||||||
Shape: t.Shape(),
|
|
||||||
WriterTo: t,
|
|
||||||
})
|
|
||||||
} else {
|
|
||||||
text = append(text, t)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return append(out, m.TextModel.Tensors(text)...)
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m *mllamaModel) repack(name string) Repacker {
|
|
||||||
return func(_ string, data []float32, shape []uint64) (_ []float32, err error) {
|
|
||||||
dims := make([]int, len(shape))
|
|
||||||
for i, dim := range shape {
|
|
||||||
dims[i] = int(dim)
|
|
||||||
}
|
|
||||||
|
|
||||||
var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
|
|
||||||
|
|
||||||
t, err = tensor.Tanh(t)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
if name == "v.position_embd.gate" {
|
|
||||||
t, err = tensor.Sub(float32(1), t)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
t = tensor.Materialize(t)
|
|
||||||
// flatten tensor so it can be return as a vector
|
|
||||||
if err := t.Reshape(t.Shape().TotalSize()); err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
return native.VectorF32(t.(*tensor.Dense))
|
|
||||||
}
|
|
||||||
}
|
|
@@ -68,19 +68,19 @@ func (p *phi3Model) KV(t *Tokenizer) ggml.KV {
 return kv
 }

-func (p *phi3Model) Tensors(ts []Tensor) []*ggml.Tensor {
+func (p *phi3Model) Tensors(ts []Tensor) []ggml.Tensor {
 var addRopeFactors sync.Once

-out := make([]*ggml.Tensor, 0, len(ts)+2)
+out := make([]ggml.Tensor, 0, len(ts)+2)
 for _, t := range ts {
 if strings.HasPrefix(t.Name(), "blk.0.") {
 addRopeFactors.Do(func() {
-out = append(out, &ggml.Tensor{
+out = append(out, ggml.Tensor{
 Name: "rope_factors_long.weight",
 Kind: 0,
 Shape: []uint64{uint64(len(p.RopeScaling.LongFactor))},
 WriterTo: p.RopeScaling.LongFactor,
-}, &ggml.Tensor{
+}, ggml.Tensor{
 Name: "rope_factors_short.weight",
 Kind: 0,
 Shape: []uint64{uint64(len(p.RopeScaling.ShortFactor))},
@@ -89,7 +89,7 @@ func (p *phi3Model) Tensors(ts []Tensor) []*ggml.Tensor {
 })
 }

-out = append(out, &ggml.Tensor{
+out = append(out, ggml.Tensor{
 Name: t.Name(),
 Kind: t.Kind(),
 Shape: t.Shape(),
@@ -15,7 +15,6 @@ type qwen2Model struct {
 Type string `json:"type"`
 Factor ropeFactor `json:"factor"`
 OriginalMaxPositionEmbeddings uint32 `json:"original_max_position_embeddings"`
-MropeSection []int32 `json:"mrope_section"`
 } `json:"rope_scaling"`
 RMSNormEPS float32 `json:"rms_norm_eps"`
 }
@@ -40,18 +39,16 @@ func (q *qwen2Model) KV(t *Tokenizer) ggml.KV {
 case "yarn":
 kv["qwen2.rope.scaling.type"] = q.RopeScaling.Type
 kv["qwen2.rope.scaling.factor"] = q.RopeScaling.Factor
-case "mrope", "default":
-kv["qwen2.rope.mrope_section"] = q.RopeScaling.MropeSection
 default:
 panic("unknown rope scaling type")
 }
 return kv
 }

-func (q *qwen2Model) Tensors(ts []Tensor) []*ggml.Tensor {
-var out []*ggml.Tensor
+func (q *qwen2Model) Tensors(ts []Tensor) []ggml.Tensor {
+var out []ggml.Tensor
 for _, t := range ts {
-out = append(out, &ggml.Tensor{
+out = append(out, ggml.Tensor{
 Name: t.Name(),
 Kind: t.Kind(),
 Shape: t.Shape(),
@ -1,102 +0,0 @@
|
|||||||
package convert
|
|
||||||
|
|
||||||
import (
|
|
||||||
"cmp"
|
|
||||||
"slices"
|
|
||||||
"strings"
|
|
||||||
|
|
||||||
"github.com/ollama/ollama/fs/ggml"
|
|
||||||
)
|
|
||||||
|
|
||||||
type qwen25VLModel struct {
|
|
||||||
qwen2Model
|
|
||||||
|
|
||||||
VisionModel struct {
|
|
||||||
Depth uint32 `json:"depth"`
|
|
||||||
HiddenSize uint32 `json:"hidden_size"`
|
|
||||||
NumHeads uint32 `json:"num_heads"`
|
|
||||||
InChannels uint32 `json:"in_chans"`
|
|
||||||
PatchSize uint32 `json:"patch_size"`
|
|
||||||
SpatialMergeSize uint32 `json:"spatial_merge_size"`
|
|
||||||
SpatialPatchSize uint32 `json:"spatial_patch_size"`
|
|
||||||
WindowSize uint32 `json:"window_size"`
|
|
||||||
RMSNormEps float32 `json:"layer_norm_epsilon"`
|
|
||||||
RopeTheta float32 `json:"rope_theta"`
|
|
||||||
FullAttentionBlocks []int32 `json:"fullatt_block_indexes"`
|
|
||||||
TemporalPatchSize uint32 `json:"temporal_patch_size"`
|
|
||||||
} `json:"vision_config"`
|
|
||||||
}
|
|
||||||
|
|
||||||
var _ ModelConverter = (*qwen25VLModel)(nil)
|
|
||||||
|
|
||||||
func (q *qwen25VLModel) KV(t *Tokenizer) ggml.KV {
|
|
||||||
kv := q.ModelParameters.KV(t)
|
|
||||||
kv["general.architecture"] = "qwen25vl"
|
|
||||||
|
|
||||||
for k, v := range q.qwen2Model.KV(t) {
|
|
||||||
if strings.HasPrefix(k, "qwen2.") {
|
|
||||||
kv[strings.Replace(k, "qwen2.", "qwen25vl.", 1)] = v
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if q.VisionModel.FullAttentionBlocks == nil {
|
|
||||||
kv["qwen25vl.vision.fullatt_block_indexes"] = []int32{7, 15, 23, 31}
|
|
||||||
}
|
|
||||||
|
|
||||||
kv["qwen25vl.vision.block_count"] = cmp.Or(q.VisionModel.Depth, 32)
|
|
||||||
kv["qwen25vl.vision.embedding_length"] = q.VisionModel.HiddenSize
|
|
||||||
kv["qwen25vl.vision.attention.head_count"] = cmp.Or(q.VisionModel.NumHeads, 16)
|
|
||||||
kv["qwen25vl.vision.num_channels"] = q.VisionModel.InChannels
|
|
||||||
kv["qwen25vl.vision.patch_size"] = cmp.Or(q.VisionModel.PatchSize, 14)
|
|
||||||
kv["qwen25vl.vision.spatial_merge_size"] = cmp.Or(q.VisionModel.SpatialMergeSize, 2)
|
|
||||||
kv["qwen25vl.vision.spatial_patch_size"] = q.VisionModel.SpatialPatchSize
|
|
||||||
kv["qwen25vl.vision.window_size"] = cmp.Or(q.VisionModel.WindowSize, 112)
|
|
||||||
kv["qwen25vl.vision.attention.layer_norm_epsilon"] = cmp.Or(q.VisionModel.RMSNormEps, 1e-6)
|
|
||||||
kv["qwen25vl.vision.rope.freq_base"] = cmp.Or(q.VisionModel.RopeTheta, 1e4)
|
|
||||||
kv["qwen25vl.vision.fullatt_block_indexes"] = q.VisionModel.FullAttentionBlocks
|
|
||||||
kv["qwen25vl.vision.temporal_patch_size"] = cmp.Or(q.VisionModel.TemporalPatchSize, 2)
|
|
||||||
|
|
||||||
return kv
|
|
||||||
}
|
|
||||||
|
|
||||||
func (q *qwen25VLModel) Tensors(ts []Tensor) []*ggml.Tensor {
|
|
||||||
var out []*ggml.Tensor
|
|
||||||
|
|
||||||
for _, t := range ts {
|
|
||||||
if strings.Contains(t.Name(), "patch_embed.proj") {
|
|
||||||
for t := range splitDim(t, 2,
|
|
||||||
strings.NewReplacer("patch_embed.proj", "patch_embd_0"),
|
|
||||||
strings.NewReplacer("patch_embed.proj", "patch_embd_1"),
|
|
||||||
) {
|
|
||||||
t.Shape = slices.DeleteFunc(t.Shape, func(i uint64) bool { return i == 1 })
|
|
||||||
out = append(out, t)
|
|
||||||
}
|
|
||||||
} else if strings.Contains(t.Name(), "attn.qkv") {
|
|
||||||
out = append(out, slices.Collect(splitDim(t, 0,
|
|
||||||
strings.NewReplacer("attn.qkv", "attn_q"),
|
|
||||||
strings.NewReplacer("attn.qkv", "attn_k"),
|
|
||||||
strings.NewReplacer("attn.qkv", "attn_v"),
|
|
||||||
))...)
|
|
||||||
} else {
|
|
||||||
out = append(out, &ggml.Tensor{
|
|
||||||
Name: t.Name(),
|
|
||||||
Kind: t.Kind(),
|
|
||||||
Shape: t.Shape(),
|
|
||||||
WriterTo: t,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return out
|
|
||||||
}
|
|
||||||
|
|
||||||
func (p *qwen25VLModel) Replacements() []string {
|
|
||||||
return append(
|
|
||||||
p.qwen2Model.Replacements(),
|
|
||||||
"visual", "v",
|
|
||||||
"blocks", "blk",
|
|
||||||
"attn.proj", "attn_out",
|
|
||||||
"norm1", "ln1",
|
|
||||||
"norm2", "ln2",
|
|
||||||
)
|
|
||||||
}
|
|
@@ -11,6 +11,7 @@ import (
 "io"
 "io/fs"
 "log/slog"
+"math"
 "os"
 "path/filepath"
 "slices"
@@ -47,7 +48,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, ggml.KV, ggml.Tensors) {
 }
 t.Cleanup(func() { r.Close() })

-m, _, err := ggml.Decode(r, -1)
+m, _, err := ggml.Decode(r, math.MaxInt)
 if err != nil {
 t.Fatal(err)
 }
@@ -130,7 +131,6 @@ func TestConvertModel(t *testing.T) {
 if err != nil {
 t.Fatal(err)
 }
-defer expectFile.Close()

 var expect map[string]string
 if err := json.NewDecoder(expectFile).Decode(&expect); err != nil {
@@ -332,7 +332,7 @@ func TestConvertAdapter(t *testing.T) {
 }
 defer r.Close()

-m, _, err := ggml.Decode(r, -1)
+m, _, err := ggml.Decode(r, math.MaxInt)
 if err != nil {
 t.Fatal(err)
 }
58	convert/fs.go	Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
package convert
|
||||||
|
|
||||||
|
import (
|
||||||
|
"archive/zip"
|
||||||
|
"errors"
|
||||||
|
"io"
|
||||||
|
"io/fs"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
)
|
||||||
|
|
||||||
|
type ZipReader struct {
|
||||||
|
r *zip.Reader
|
||||||
|
p string
|
||||||
|
|
||||||
|
// limit is the maximum size of a file that can be read directly
|
||||||
|
// from the zip archive. Files larger than this size will be extracted
|
||||||
|
limit int64
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewZipReader(r *zip.Reader, p string, limit int64) fs.FS {
|
||||||
|
return &ZipReader{r, p, limit}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (z *ZipReader) Open(name string) (fs.File, error) {
|
||||||
|
r, err := z.r.Open(name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer r.Close()
|
||||||
|
|
||||||
|
if fi, err := r.Stat(); err != nil {
|
||||||
|
return nil, err
|
||||||
|
} else if fi.Size() < z.limit {
|
||||||
|
return r, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
if !filepath.IsLocal(name) {
|
||||||
|
return nil, zip.ErrInsecurePath
|
||||||
|
}
|
||||||
|
|
||||||
|
n := filepath.Join(z.p, name)
|
||||||
|
if _, err := os.Stat(n); errors.Is(err, os.ErrNotExist) {
|
||||||
|
w, err := os.Create(n)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer w.Close()
|
||||||
|
|
||||||
|
if _, err := io.Copy(w, r); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
} else if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return os.Open(n)
|
||||||
|
}
|
@@ -11,15 +11,14 @@ type Tensor interface {
 Name() string
 Shape() []uint64
 Kind() uint32
-SetRepacker(Repacker)
+SetRepacker(repacker)
 WriteTo(io.Writer) (int64, error)
-Clone() Tensor
 }

 type tensorBase struct {
 name string
 shape []uint64
-repacker Repacker
+repacker
 }

 func (t tensorBase) Name() string {
@@ -37,11 +36,7 @@ const (

 func (t tensorBase) Kind() uint32 {
 if strings.HasSuffix(t.name, ".ffn_gate_inp.weight") ||
-t.name == "token_types.weight" ||
-t.name == "v.positional_embedding_vlm" ||
-t.name == "v.tile_position_embd.weight" ||
-t.name == "v.pre_tile_position_embd.weight" ||
-t.name == "v.post_tile_position_embd.weight" {
+t.name == "token_types.weight" {
 // these tensors are always F32
 return 0
 }
@@ -56,11 +51,11 @@ func (t tensorBase) Kind() uint32 {
 }
 }

-func (t *tensorBase) SetRepacker(fn Repacker) {
+func (t *tensorBase) SetRepacker(fn repacker) {
 t.repacker = fn
 }

-type Repacker func(string, []float32, []uint64) ([]float32, error)
+type repacker func(string, []float32, []uint64) ([]float32, error)

 func parseTensors(fsys fs.FS, replacer *strings.Replacer) ([]Tensor, error) {
 patterns := []struct {
@@ -94,21 +94,6 @@ type safetensor struct {
 *tensorBase
 }

-func (st safetensor) Clone() Tensor {
-return &safetensor{
-fs: st.fs,
-path: st.path,
-dtype: st.dtype,
-offset: st.offset,
-size: st.size,
-tensorBase: &tensorBase{
-name: st.name,
-repacker: st.repacker,
-shape: slices.Clone(st.shape),
-},
-}
-}
-
 func (st safetensor) WriteTo(w io.Writer) (int64, error) {
 f, err := st.fs.Open(st.path)
 if err != nil {
@@ -43,17 +43,6 @@ type torch struct {
 *tensorBase
 }

-func (t torch) Clone() Tensor {
-return torch{
-storage: t.storage,
-tensorBase: &tensorBase{
-name: t.name,
-shape: t.shape,
-repacker: t.repacker,
-},
-}
-}
-
 func (pt torch) WriteTo(w io.Writer) (int64, error) {
 return 0, nil
 }
@ -1,56 +0,0 @@
|
|||||||
package convert
|
|
||||||
|
|
||||||
import (
|
|
||||||
"iter"
|
|
||||||
"slices"
|
|
||||||
"strings"
|
|
||||||
|
|
||||||
"github.com/ollama/ollama/fs/ggml"
|
|
||||||
"github.com/pdevine/tensor"
|
|
||||||
"github.com/pdevine/tensor/native"
|
|
||||||
)
|
|
||||||
|
|
||||||
// splitDim splits a tensor along a specified dimension into multiple tensors. The dimension
|
|
||||||
// is split evenly based on the number of replacers provided.
|
|
||||||
func splitDim(t Tensor, dim int, replacers ...*strings.Replacer) iter.Seq[*ggml.Tensor] {
|
|
||||||
return func(yield func(*ggml.Tensor) bool) {
|
|
||||||
for i, replacer := range replacers {
|
|
||||||
shape := slices.Clone(t.Shape())
|
|
||||||
shape[dim] = shape[dim] / uint64(len(replacers))
|
|
||||||
|
|
||||||
slice := slices.Repeat([]tensor.Slice{nil}, len(shape))
|
|
||||||
slice[dim] = tensor.S(i*int(shape[dim]), (i+1)*int(shape[dim]))
|
|
||||||
|
|
||||||
tt := t.Clone()
|
|
||||||
tt.SetRepacker(func(_ string, data []float32, shape []uint64) ([]float32, error) {
|
|
||||||
dims := make([]int, len(shape))
|
|
||||||
for i := range shape {
|
|
||||||
dims[i] = int(shape[i])
|
|
||||||
}
|
|
||||||
|
|
||||||
var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
|
|
||||||
t, err := t.Slice(slice...)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
t = tensor.Materialize(t)
|
|
||||||
// flatten tensor so it can be written as a vector
|
|
||||||
if err := t.Reshape(t.Shape().TotalSize()); err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
return native.VectorF32(t.(*tensor.Dense))
|
|
||||||
})
|
|
||||||
|
|
||||||
if !yield(&ggml.Tensor{
|
|
||||||
Name: replacer.Replace(t.Name()),
|
|
||||||
Kind: t.Kind(),
|
|
||||||
Shape: shape,
|
|
||||||
WriterTo: tt,
|
|
||||||
}) {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
@@ -670,7 +670,7 @@ func loadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string, e
 }

 func getVerboseState() C.uint16_t {
-if envconfig.LogLevel() < slog.LevelInfo {
+if envconfig.Debug() {
 return C.uint16_t(1)
 }
 return C.uint16_t(0)
@@ -27,14 +27,12 @@

 #endif

-#ifndef LOG
 #define LOG(verbose, ...) \
 do { \
 if (verbose) { \
 fprintf(stderr, __VA_ARGS__); \
 } \
 } while (0)
-#endif

 #ifdef __cplusplus
 extern "C" {
@@ -1,7 +1,6 @@
 #ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?

 #include <string.h>
-#include <inttypes.h>
 #include "gpu_info_cudart.h"

 void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
@@ -59,7 +58,7 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
 LOG(resp->ch.verbose, "cudaSetDevice err: %d\n", ret);
 UNLOAD_LIBRARY(resp->ch.handle);
 resp->ch.handle = NULL;
-if (ret == CUDART_ERROR_INSUFFICIENT_DRIVER) {
+if (ret == CUDA_ERROR_INSUFFICIENT_DRIVER) {
 resp->err = strdup("your nvidia driver is too old or missing. If you have a CUDA GPU please upgrade to run ollama");
 return;
 }
@@ -169,9 +168,9 @@ void cudart_bootstrap(cudart_handle_t h, int i, mem_info_t *resp) {
 resp->free = memInfo.free;
 resp->used = memInfo.used;

-LOG(h.verbose, "[%s] CUDA totalMem %" PRId64 "\n", resp->gpu_id, resp->total);
-LOG(h.verbose, "[%s] CUDA freeMem %" PRId64 "\n", resp->gpu_id, resp->free);
-LOG(h.verbose, "[%s] CUDA usedMem %" PRId64 "\n", resp->gpu_id, resp->used);
+LOG(h.verbose, "[%s] CUDA totalMem %lu\n", resp->gpu_id, resp->total);
+LOG(h.verbose, "[%s] CUDA freeMem %lu\n", resp->gpu_id, resp->free);
+LOG(h.verbose, "[%s] CUDA usedMem %lu\n", resp->gpu_id, resp->used);
 LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);
 }

@@ -181,4 +180,4 @@ void cudart_release(cudart_handle_t h) {
 h.handle = NULL;
 }

 #endif // __APPLE__
@@ -1,7 +1,6 @@
 #ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?

 #include <string.h>
-#include <inttypes.h>
 #include "gpu_info_nvcuda.h"

 void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
@@ -194,8 +193,8 @@ void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) {
 resp->total = memInfo.total;
 resp->free = memInfo.free;

-LOG(h.verbose, "[%s] CUDA totalMem %" PRId64 "mb\n", resp->gpu_id, resp->total / 1024 / 1024);
-LOG(h.verbose, "[%s] CUDA freeMem %" PRId64 "mb\n", resp->gpu_id, resp->free / 1024 / 1024);
+LOG(h.verbose, "[%s] CUDA totalMem %lu mb\n", resp->gpu_id, resp->total / 1024 / 1024);
+LOG(h.verbose, "[%s] CUDA freeMem %lu mb\n", resp->gpu_id, resp->free / 1024 / 1024);
 LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);


@@ -248,4 +247,4 @@ void nvcuda_release(nvcuda_handle_t h) {
 h.handle = NULL;
 }

 #endif // __APPLE__
73
docs/api.md
73
docs/api.md
@ -19,7 +19,7 @@
|
|||||||
|
|
||||||
### Model names
|
### Model names
|
||||||
|
|
||||||
Model names follow a `model:tag` format, where `model` can have an optional namespace such as `example/model`. Some examples are `orca-mini:3b-q8_0` and `llama3:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.
|
Model names follow a `model:tag` format, where `model` can have an optional namespace such as `example/model`. Some examples are `orca-mini:3b-q4_1` and `llama3:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.
|
||||||
|
|
||||||
### Durations
|
### Durations
|
||||||
|
|
||||||
@ -394,6 +394,9 @@ curl http://localhost:11434/api/generate -d '{
      "repeat_penalty": 1.2,
      "presence_penalty": 1.5,
      "frequency_penalty": 1.0,
+     "mirostat": 1,
+     "mirostat_tau": 0.8,
+     "mirostat_eta": 0.6,
      "penalize_newline": true,
      "stop": ["\n", "user:"],
      "numa": false,
@ -401,7 +404,10 @@ curl http://localhost:11434/api/generate -d '{
      "num_batch": 2,
      "num_gpu": 1,
      "main_gpu": 0,
+     "low_vram": false,
+     "vocab_only": false,
      "use_mmap": true,
+     "use_mlock": false,
      "num_thread": 8
    }
  }'
@ -952,8 +958,19 @@ If you are creating a model from a safetensors directory or from a GGUF file, yo

| Type | Recommended |
| --- | :-: |
+| q2_K | |
+| q3_K_L | |
+| q3_K_M | |
+| q3_K_S | |
+| q4_0 | |
+| q4_1 | |
| q4_K_M | * |
| q4_K_S | |
+| q5_0 | |
+| q5_1 | |
+| q5_K_M | |
+| q5_K_S | |
+| q6_K | |
| q8_0 | * |

### Examples
@ -998,8 +1015,8 @@ Quantize a non-quantized model.

```shell
curl http://localhost:11434/api/create -d '{
-  "model": "llama3.2:quantized",
-  "from": "llama3.2:3b-instruct-fp16",
+  "model": "llama3.1:quantized",
+  "from": "llama3.1:8b-instruct-fp16",
  "quantize": "q4_K_M"
}'
```
@ -1009,14 +1026,12 @@ curl http://localhost:11434/api/create -d '{

A stream of JSON objects is returned:

```json
-{"status":"quantizing F16 model to Q4_K_M","digest":"0","total":6433687776,"completed":12302}
+{"status":"quantizing F16 model to Q4_K_M"}
-{"status":"quantizing F16 model to Q4_K_M","digest":"0","total":6433687776,"completed":6433687552}
+{"status":"creating new layer sha256:667b0c1932bc6ffc593ed1d03f895bf2dc8dc6df21db3042284a6f4416b06a29"}
-{"status":"verifying conversion"}
+{"status":"using existing layer sha256:11ce4ee3e170f6adebac9a991c22e22ab3f8530e154ee669954c4bc73061c258"}
-{"status":"creating new layer sha256:fb7f4f211b89c6c4928ff4ddb73db9f9c0cfca3e000c3e40d6cf27ddc6ca72eb"}
+{"status":"using existing layer sha256:0ba8f0e314b4264dfd19df045cde9d4c394a52474bf92ed6a3de22a4ca31a177"}
-{"status":"using existing layer sha256:966de95ca8a62200913e3f8bfbf84c8494536f1b94b49166851e76644e966396"}
-{"status":"using existing layer sha256:fcc5a6bec9daf9b561a68827b67ab6088e1dba9d1fa2a50d7bbcc8384e0a265d"}
-{"status":"using existing layer sha256:a70ff7e570d97baaf4e62ac6e6ad9975e04caa6d900d3742d37698494479e0cd"}
{"status":"using existing layer sha256:56bb8bd477a519ffa694fc449c2413c6f0e1d3b1c88fa7e3c9d88d3ae49d4dcb"}
+{"status":"creating new layer sha256:455f34728c9b5dd3376378bfb809ee166c145b0b4c1f1a6feca069055066ef9a"}
{"status":"writing manifest"}
{"status":"success"}
```
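Both branches return the create status as newline-delimited JSON. A hedged Go sketch of a client that reads such a stream; the endpoint, request fields, and the `status` field are taken from the example above, while the model names and the minimal error handling are illustrative only:

```go
package main

import (
	"bufio"
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	body := []byte(`{"model": "llama3.2:quantized", "from": "llama3.2:3b-instruct-fp16", "quantize": "q4_K_M"}`)
	resp, err := http.Post("http://localhost:11434/api/create", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// Each line of the response is one JSON object with at least a "status"
	// field, as in the example stream shown above.
	scanner := bufio.NewScanner(resp.Body)
	for scanner.Scan() {
		var msg struct {
			Status string `json:"status"`
		}
		if err := json.Unmarshal(scanner.Bytes(), &msg); err != nil {
			panic(err)
		}
		fmt.Println(msg.Status)
	}
}
```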
@ -1154,37 +1169,29 @@ A single JSON object will be returned.
{
  "models": [
    {
-     "name": "deepseek-r1:latest",
-     "model": "deepseek-r1:latest",
-     "modified_at": "2025-05-10T08:06:48.639712648-07:00",
-     "size": 4683075271,
-     "digest": "0a8c266910232fd3291e71e5ba1e058cc5af9d411192cf88b6d30e92b6e73163",
+     "name": "codellama:13b",
+     "modified_at": "2023-11-04T14:56:49.277302595-07:00",
+     "size": 7365960935,
+     "digest": "9f438cb9cd581fc025612d27f7c1a6669ff83a8bb0ed86c94fcf4c5440555697",
      "details": {
-       "parent_model": "",
        "format": "gguf",
-       "family": "qwen2",
-       "families": [
-         "qwen2"
-       ],
-       "parameter_size": "7.6B",
-       "quantization_level": "Q4_K_M"
+       "family": "llama",
+       "families": null,
+       "parameter_size": "13B",
+       "quantization_level": "Q4_0"
      }
    },
    {
-     "name": "llama3.2:latest",
-     "model": "llama3.2:latest",
-     "modified_at": "2025-05-04T17:37:44.706015396-07:00",
-     "size": 2019393189,
-     "digest": "a80c4f17acd55265feec403c7aef86be0c25983ab279d83f3bcd3abbcb5b8b72",
+     "name": "llama3:latest",
+     "modified_at": "2023-12-07T09:32:18.757212583-08:00",
+     "size": 3825819519,
+     "digest": "fe938a131f40e6f6d40083c9f0f430a515233eb2edaa6d72eb85c50d64f2300e",
      "details": {
-       "parent_model": "",
        "format": "gguf",
        "family": "llama",
-       "families": [
-         "llama"
-       ],
-       "parameter_size": "3.2B",
-       "quantization_level": "Q4_K_M"
+       "families": null,
+       "parameter_size": "7B",
+       "quantization_level": "Q4_0"
      }
    }
  ]
@ -20,7 +20,7 @@ Please refer to the [GPU docs](./gpu.md).

## How can I specify the context window size?

-By default, Ollama uses a context window size of 4096 tokens.
+By default, Ollama uses a context window size of 2048 tokens.

This can be overridden with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use:

@ -150,6 +150,9 @@ PARAMETER <parameter> <parametervalue>

| Parameter | Description | Value Type | Example Usage |
| -------------- | ----------- | ---------- | -------------------- |
+| mirostat | Enable Mirostat sampling for controlling perplexity. (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) | int | mirostat 0 |
+| mirostat_eta | Influences how quickly the algorithm responds to feedback from the generated text. A lower learning rate will result in slower adjustments, while a higher learning rate will make the algorithm more responsive. (Default: 0.1) | float | mirostat_eta 0.1 |
+| mirostat_tau | Controls the balance between coherence and diversity of the output. A lower value will result in more focused and coherent text. (Default: 5.0) | float | mirostat_tau 5.0 |
| num_ctx | Sets the size of the context window used to generate the next token. (Default: 2048) | int | num_ctx 4096 |
| repeat_last_n | Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx) | int | repeat_last_n 64 |
| repeat_penalty | Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1) | float | repeat_penalty 1.1 |
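As a hedged illustration, the defaults listed in the table can be assembled into the `options` object accepted by the generate endpoint shown earlier in this diff; the values below are copied from the table, and the model name is a placeholder:

```go
package main

import (
	"encoding/json"
	"fmt"
)

func main() {
	// Default values taken from the parameter table above; the surrounding
	// request shape follows the /api/generate example in docs/api.md.
	options := map[string]any{
		"mirostat":       0,
		"mirostat_eta":   0.1,
		"mirostat_tau":   5.0,
		"num_ctx":        2048,
		"repeat_last_n":  64,
		"repeat_penalty": 1.1,
	}
	request := map[string]any{
		"model":   "llama3", // placeholder model name
		"prompt":  "Why is the sky blue?",
		"options": options,
	}
	b, _ := json.MarshalIndent(request, "", "  ")
	fmt.Println(string(b))
}
```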
@ -149,22 +149,9 @@ func Bool(k string) func() bool {
	}
}

-// LogLevel returns the log level for the application.
-// Values are 0 or false INFO (Default), 1 or true DEBUG, 2 TRACE
-func LogLevel() slog.Level {
-	level := slog.LevelInfo
-	if s := Var("OLLAMA_DEBUG"); s != "" {
-		if b, _ := strconv.ParseBool(s); b {
-			level = slog.LevelDebug
-		} else if i, _ := strconv.ParseInt(s, 10, 64); i != 0 {
-			level = slog.Level(i * -4)
-		}
-	}
-
-	return level
-}
-
var (
+	// Debug enabled additional debug information.
+	Debug = Bool("OLLAMA_DEBUG")
	// FlashAttention enables the experimental flash attention feature.
	FlashAttention = Bool("OLLAMA_FLASH_ATTENTION")
	// KvCacheType is the quantization type for the K/V cache.
@ -182,7 +169,7 @@ var (
	// Enable the new Ollama engine
	NewEngine = Bool("OLLAMA_NEW_ENGINE")
	// ContextLength sets the default context length
-	ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 4096)
+	ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 2048)
)

func String(s string) func() string {

@ -222,6 +209,8 @@ var (
	MaxRunners = Uint("OLLAMA_MAX_LOADED_MODELS", 0)
	// MaxQueue sets the maximum number of queued requests. MaxQueue can be configured via the OLLAMA_MAX_QUEUE environment variable.
	MaxQueue = Uint("OLLAMA_MAX_QUEUE", 512)
+	// MaxVRAM sets a maximum VRAM override in bytes. MaxVRAM can be configured via the OLLAMA_MAX_VRAM environment variable.
+	MaxVRAM = Uint("OLLAMA_MAX_VRAM", 0)
)

func Uint64(key string, defaultValue uint64) func() uint64 {

@ -249,7 +238,7 @@ type EnvVar struct {

func AsMap() map[string]EnvVar {
	ret := map[string]EnvVar{
-		"OLLAMA_DEBUG": {"OLLAMA_DEBUG", LogLevel(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
+		"OLLAMA_DEBUG": {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
		"OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"},
		"OLLAMA_KV_CACHE_TYPE": {"OLLAMA_KV_CACHE_TYPE", KvCacheType(), "Quantization type for the K/V cache (default: f16)"},
		"OLLAMA_GPU_OVERHEAD": {"OLLAMA_GPU_OVERHEAD", GpuOverhead(), "Reserve a portion of VRAM per GPU (bytes)"},

@ -266,7 +255,7 @@ func AsMap() map[string]EnvVar {
		"OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", AllowedOrigins(), "A comma separated list of allowed origins"},
		"OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
		"OLLAMA_MULTIUSER_CACHE": {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
-		"OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 4096)"},
+		"OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 2048)"},
		"OLLAMA_NEW_ENGINE": {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"},

	// Informational
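The `LogLevel` helper present only on `main` maps `OLLAMA_DEBUG` onto an `slog.Level`. A minimal sketch, assuming `os.Getenv` in place of the package's `Var` helper, of how such a level is typically wired into the standard-library logger:

```go
package main

import (
	"log/slog"
	"os"
	"strconv"
)

// logLevel mirrors the mapping shown above: empty or false -> INFO,
// true or 1 -> DEBUG, larger integers -> more verbose (slog levels step by 4).
func logLevel() slog.Level {
	level := slog.LevelInfo
	if s := os.Getenv("OLLAMA_DEBUG"); s != "" {
		if b, _ := strconv.ParseBool(s); b {
			level = slog.LevelDebug
		} else if i, _ := strconv.ParseInt(s, 10, 64); i != 0 {
			level = slog.Level(i * -4)
		}
	}
	return level
}

func main() {
	handler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: logLevel()})
	slog.SetDefault(slog.New(handler))
	slog.Debug("only printed when OLLAMA_DEBUG enables debug logging")
}
```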
@ -1,13 +1,11 @@
|
|||||||
package envconfig
|
package envconfig
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"log/slog"
|
|
||||||
"math"
|
"math"
|
||||||
"testing"
|
"testing"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/google/go-cmp/cmp"
|
"github.com/google/go-cmp/cmp"
|
||||||
"github.com/ollama/ollama/logutil"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestHost(t *testing.T) {
|
func TestHost(t *testing.T) {
|
||||||
@ -281,8 +279,8 @@ func TestVar(t *testing.T) {
|
|||||||
|
|
||||||
func TestContextLength(t *testing.T) {
|
func TestContextLength(t *testing.T) {
|
||||||
cases := map[string]uint{
|
cases := map[string]uint{
|
||||||
"": 4096,
|
"": 2048,
|
||||||
"2048": 2048,
|
"4096": 4096,
|
||||||
}
|
}
|
||||||
|
|
||||||
for k, v := range cases {
|
for k, v := range cases {
|
||||||
@ -294,34 +292,3 @@ func TestContextLength(t *testing.T) {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestLogLevel(t *testing.T) {
|
|
||||||
cases := map[string]slog.Level{
|
|
||||||
// Default to INFO
|
|
||||||
"": slog.LevelInfo,
|
|
||||||
"false": slog.LevelInfo,
|
|
||||||
"f": slog.LevelInfo,
|
|
||||||
"0": slog.LevelInfo,
|
|
||||||
|
|
||||||
// True values enable Debug
|
|
||||||
"true": slog.LevelDebug,
|
|
||||||
"t": slog.LevelDebug,
|
|
||||||
|
|
||||||
// Positive values increase verbosity
|
|
||||||
"1": slog.LevelDebug,
|
|
||||||
"2": logutil.LevelTrace,
|
|
||||||
|
|
||||||
// Negative values decrease verbosity
|
|
||||||
"-1": slog.LevelWarn,
|
|
||||||
"-2": slog.LevelError,
|
|
||||||
}
|
|
||||||
|
|
||||||
for k, v := range cases {
|
|
||||||
t.Run(k, func(t *testing.T) {
|
|
||||||
t.Setenv("OLLAMA_DEBUG", k)
|
|
||||||
if i := LogLevel(); i != v {
|
|
||||||
t.Errorf("%s: expected %d, got %d", k, v, i)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
@ -8,6 +8,6 @@ type Config interface {
|
|||||||
Bool(string, ...bool) bool
|
Bool(string, ...bool) bool
|
||||||
|
|
||||||
Strings(string, ...[]string) []string
|
Strings(string, ...[]string) []string
|
||||||
Ints(string, ...[]int32) []int32
|
Uints(string, ...[]uint32) []uint32
|
||||||
Floats(string, ...[]float32) []float32
|
Floats(string, ...[]float32) []float32
|
||||||
}
|
}
|
||||||
|
176  fs/ggml/ggml.go
@ -6,7 +6,6 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"math"
|
|
||||||
"slices"
|
"slices"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
@ -34,15 +33,15 @@ func (kv KV) Kind() string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (kv KV) ParameterCount() uint64 {
|
func (kv KV) ParameterCount() uint64 {
|
||||||
return keyValue(kv, "general.parameter_count", uint64(0))
|
return keyValue[uint64](kv, "general.parameter_count")
|
||||||
}
|
}
|
||||||
|
|
||||||
func (kv KV) FileType() FileType {
|
func (kv KV) FileType() fileType {
|
||||||
if t := kv.Uint("general.file_type"); t > 0 {
|
if t := kv.Uint("general.file_type"); t > 0 {
|
||||||
return FileType(t)
|
return fileType(t)
|
||||||
}
|
}
|
||||||
|
|
||||||
return FileTypeUnknown
|
return fileTypeUnknown
|
||||||
}
|
}
|
||||||
|
|
||||||
func (kv KV) BlockCount() uint64 {
|
func (kv KV) BlockCount() uint64 {
|
||||||
@ -106,44 +105,42 @@ func (kv KV) Bool(key string, defaultValue ...bool) bool {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (kv KV) Strings(key string, defaultValue ...[]string) []string {
|
func (kv KV) Strings(key string, defaultValue ...[]string) []string {
|
||||||
return keyValue(kv, key, &array[string]{values: append(defaultValue, []string(nil))[0]}).values
|
r := keyValue(kv, key, &array{})
|
||||||
}
|
s := make([]string, r.size)
|
||||||
|
for i := range r.size {
|
||||||
|
s[i] = r.values[i].(string)
|
||||||
|
}
|
||||||
|
|
||||||
func (kv KV) Ints(key string, defaultValue ...[]int32) []int32 {
|
return s
|
||||||
return keyValue(kv, key, &array[int32]{values: append(defaultValue, []int32(nil))[0]}).values
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
|
func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
|
||||||
return keyValue(kv, key, &array[uint32]{values: append(defaultValue, []uint32(nil))[0]}).values
|
r := keyValue(kv, key, &array{})
|
||||||
|
s := make([]uint32, r.size)
|
||||||
|
for i := range r.size {
|
||||||
|
s[i] = uint32(r.values[i].(int32))
|
||||||
|
}
|
||||||
|
|
||||||
|
return s
|
||||||
}
|
}
|
||||||
|
|
||||||
func (kv KV) Floats(key string, defaultValue ...[]float32) []float32 {
|
func (kv KV) Floats(key string, defaultValue ...[]float32) []float32 {
|
||||||
return keyValue(kv, key, &array[float32]{values: append(defaultValue, []float32(nil))[0]}).values
|
r := keyValue(kv, key, &array{})
|
||||||
|
s := make([]float32, r.size)
|
||||||
|
for i := range r.size {
|
||||||
|
s[i] = float32(r.values[i].(float32))
|
||||||
|
}
|
||||||
|
return s
|
||||||
}
|
}
|
||||||
|
|
||||||
func (kv KV) OllamaEngineRequired() bool {
|
func (kv KV) OllamaEngineRequired() bool {
|
||||||
return slices.Contains([]string{
|
return slices.Contains([]string{
|
||||||
"gemma3",
|
"gemma3",
|
||||||
"mistral3",
|
"mistral3",
|
||||||
"llama4",
|
|
||||||
"mllama",
|
|
||||||
"qwen25vl",
|
|
||||||
}, kv.Architecture())
|
}, kv.Architecture())
|
||||||
}
|
}
|
||||||
|
|
||||||
type valueTypes interface {
|
func keyValue[T string | uint32 | uint64 | float32 | *array | bool](kv KV, key string, defaultValue ...T) T {
|
||||||
uint8 | int8 | uint16 | int16 |
|
|
||||||
uint32 | int32 | uint64 | int64 |
|
|
||||||
string | float32 | float64 | bool
|
|
||||||
}
|
|
||||||
|
|
||||||
type arrayValueTypes interface {
|
|
||||||
*array[uint8] | *array[int8] | *array[uint16] | *array[int16] |
|
|
||||||
*array[uint32] | *array[int32] | *array[uint64] | *array[int64] |
|
|
||||||
*array[string] | *array[float32] | *array[float64] | *array[bool]
|
|
||||||
}
|
|
||||||
|
|
||||||
func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ...T) T {
|
|
||||||
if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
|
if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
|
||||||
key = kv.Architecture() + "." + key
|
key = kv.Architecture() + "." + key
|
||||||
}
|
}
|
||||||
@ -152,7 +149,7 @@ func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ..
|
|||||||
return val.(T)
|
return val.(T)
|
||||||
}
|
}
|
||||||
|
|
||||||
slog.Debug("key not found", "key", key, "default", defaultValue[0])
|
slog.Warn("key not found", "key", key, "default", defaultValue[0])
|
||||||
return defaultValue[0]
|
return defaultValue[0]
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -229,11 +226,7 @@ func (t Tensor) block() (n int) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (t Tensor) blockSize() uint64 {
|
func (t Tensor) blockSize() uint64 {
|
||||||
return (TensorType)(t.Kind).BlockSize()
|
switch t.Kind {
|
||||||
}
|
|
||||||
|
|
||||||
func (t TensorType) BlockSize() uint64 {
|
|
||||||
switch t {
|
|
||||||
case
|
case
|
||||||
0, // F32
|
0, // F32
|
||||||
1, // F16
|
1, // F16
|
||||||
@ -259,77 +252,73 @@ func (t TensorType) BlockSize() uint64 {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (t Tensor) typeSize() uint64 {
|
func (t Tensor) typeSize() uint64 {
|
||||||
return TensorType(t.Kind).TypeSize()
|
blockSize := t.blockSize()
|
||||||
}
|
|
||||||
|
|
||||||
func (t TensorType) TypeSize() uint64 {
|
switch t.Kind {
|
||||||
blockSize := t.BlockSize()
|
case 0: // FP32
|
||||||
|
|
||||||
switch t {
|
|
||||||
case TensorTypeF32:
|
|
||||||
return 4
|
return 4
|
||||||
case TensorTypeF16:
|
case 1: // FP16
|
||||||
return 2
|
return 2
|
||||||
case TensorTypeQ4_0:
|
case 2: // Q4_0
|
||||||
return 2 + blockSize/2
|
return 2 + blockSize/2
|
||||||
case TensorTypeQ4_1:
|
case 3: // Q4_1
|
||||||
return 2 + 2 + blockSize/2
|
return 2 + 2 + blockSize/2
|
||||||
case TensorTypeQ5_0:
|
case 6: // Q5_0
|
||||||
return 2 + 4 + blockSize/2
|
return 2 + 4 + blockSize/2
|
||||||
case TensorTypeQ5_1:
|
case 7: // Q5_1
|
||||||
return 2 + 2 + 4 + blockSize/2
|
return 2 + 2 + 4 + blockSize/2
|
||||||
case TensorTypeQ8_0:
|
case 8: // Q8_0
|
||||||
return 2 + blockSize
|
return 2 + blockSize
|
||||||
case TensorTypeQ8_1:
|
case 9: // Q8_1
|
||||||
return 2 + 2 + blockSize
|
return 2 + 2 + blockSize
|
||||||
case TensorTypeQ2_K:
|
case 10: // Q2_K
|
||||||
return blockSize/16 + blockSize/4 + 2 + 2
|
return blockSize/16 + blockSize/4 + 2 + 2
|
||||||
case TensorTypeQ3_K:
|
case 11: // Q3_K
|
||||||
return blockSize/8 + blockSize/4 + 12 + 2
|
return blockSize/8 + blockSize/4 + 12 + 2
|
||||||
case TensorTypeQ4_K:
|
case 12: // Q4_K
|
||||||
return 2 + 2 + 12 + blockSize/2
|
return 2 + 2 + 12 + blockSize/2
|
||||||
case TensorTypeQ5_K:
|
case 13: // Q5_K
|
||||||
return 2 + 2 + 12 + blockSize/8 + blockSize/2
|
return 2 + 2 + 12 + blockSize/8 + blockSize/2
|
||||||
case TensorTypeQ6_K:
|
case 14: // Q6_K
|
||||||
return blockSize/2 + blockSize/4 + blockSize/16 + 2
|
return blockSize/2 + blockSize/4 + blockSize/16 + 2
|
||||||
case TensorTypeQ8_K:
|
case 15: // Q8_K
|
||||||
return 4 + blockSize + 2*blockSize/16
|
return 4 + blockSize + 2*blockSize/16
|
||||||
case tensorTypeIQ2_XXS:
|
case 16: // IQ2_XXS
|
||||||
return 2 + 2*blockSize/8
|
return 2 + 2*blockSize/8
|
||||||
case tensorTypeIQ2_XS:
|
case 17: // IQ2_XS
|
||||||
return 2 + 2*blockSize/8 + blockSize/32
|
return 2 + 2*blockSize/8 + blockSize/32
|
||||||
case tensorTypeIQ3_XXS:
|
case 18: // IQ3_XXS
|
||||||
return 2 + blockSize/4 + blockSize/8
|
return 2 + blockSize/4 + blockSize/8
|
||||||
case tensorTypeIQ1_S:
|
case 19: // IQ1_S
|
||||||
return 2 + blockSize/8 + blockSize/16
|
return 2 + blockSize/8 + blockSize/16
|
||||||
case tensorTypeIQ4_NL:
|
case 20: // IQ4_NL
|
||||||
return 2 + blockSize/2
|
return 2 + blockSize/2
|
||||||
case tensorTypeIQ3_S:
|
case 21: // IQ3_S
|
||||||
return 2 + blockSize/4 + blockSize/8 + blockSize/32 + 4
|
return 2 + blockSize/4 + blockSize/8 + blockSize/32 + 4
|
||||||
case tensorTypeIQ2_S:
|
case 22: // IQ2_S
|
||||||
return 2 + blockSize/4 + blockSize/16
|
return 2 + blockSize/4 + blockSize/16
|
||||||
case tensorTypeIQ4_XS:
|
case 23: // IQ4_XS
|
||||||
return 2 + 2 + blockSize/2 + blockSize/64
|
return 2 + 2 + blockSize/2 + blockSize/64
|
||||||
case TensorTypeI8:
|
case 24: // I8
|
||||||
return 1
|
return 1
|
||||||
case TensorTypeI16:
|
case 25: // I16
|
||||||
return 2
|
return 2
|
||||||
case TensorTypeI32:
|
case 26: // I32
|
||||||
return 4
|
return 4
|
||||||
case TensorTypeI64:
|
case 27: // I64
|
||||||
return 8
|
return 8
|
||||||
case TensorTypeF64:
|
case 28: // F64
|
||||||
return 8
|
return 8
|
||||||
case tensorTypeIQ1_M:
|
case 29: // IQ1_M
|
||||||
return blockSize/8 + blockSize/16 + blockSize/32
|
return blockSize/8 + blockSize/16 + blockSize/32
|
||||||
case TensorTypeBF16:
|
case 30: // BF16
|
||||||
return 2
|
return 2
|
||||||
default:
|
default:
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
-func (t Tensor) Elements() uint64 {
+func (t Tensor) parameters() uint64 {
	var count uint64 = 1
	for _, n := range t.Shape {
		count *= n

@ -338,11 +327,11 @@ func (t Tensor) Elements() uint64 {
	}
}

func (t Tensor) Size() uint64 {
-	return t.Elements() * t.typeSize() / t.blockSize()
+	return t.parameters() * t.typeSize() / t.blockSize()
}

func (t Tensor) Type() string {
-	return TensorType(t.Kind).String()
+	return fileType(t.Kind).String()
}

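A small worked example of the `Size` formula (elements × type size ÷ block size), assuming the conventional GGML block size of 32 for Q4_0 and the `2 + blockSize/2` type size from the switch shown earlier in this file:

```go
package main

import "fmt"

func main() {
	// Illustrative only: a Q4_0 tensor of shape [4096, 4096], assuming a
	// 32-element block and the 2 + 32/2 = 18-byte block layout above.
	var elements uint64 = 4096 * 4096
	var blockSize uint64 = 32
	var typeSize uint64 = 2 + blockSize/2 // 18 bytes per 32-element block

	size := elements * typeSize / blockSize
	fmt.Println(size) // 9437184 bytes (9 MiB)
}
```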
type container interface {
|
type container interface {
|
||||||
@ -386,8 +375,13 @@ func DetectContentType(b []byte) string {
// Decode decodes a GGML model from the given reader.
//
// It collects array values for arrays with a size less than or equal to
-// maxArraySize. If the maxArraySize is negative, all arrays are collected.
+// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
+// the maxArraySize is negative, all arrays are collected.
func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
+	if maxArraySize == 0 {
+		maxArraySize = 1024
+	}
+
	rs = bufioutil.NewBufferedSeeker(rs, 32<<10)

	var magic uint32
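A hedged usage sketch for `Decode`, based only on the signature and methods visible in this diff; the import path is assumed from the repository layout and `model.gguf` is a placeholder file name:

```go
package main

import (
	"fmt"
	"os"

	"github.com/ollama/ollama/fs/ggml" // assumed import path
)

func main() {
	f, err := os.Open("model.gguf") // placeholder file name
	if err != nil {
		panic(err)
	}
	defer f.Close()

	// Collect array values with up to 1024 entries; a negative maxArraySize
	// would collect all arrays, per the doc comment above.
	g, _, err := ggml.Decode(f, 1024)
	if err != nil {
		panic(err)
	}

	fmt.Println("architecture:", g.KV().Architecture())
	fmt.Println("parameters:", g.KV().ParameterCount())
}
```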
@ -426,7 +420,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
|
|||||||
embedding := f.KV().EmbeddingLength()
|
embedding := f.KV().EmbeddingLength()
|
||||||
heads := f.KV().HeadCount()
|
heads := f.KV().HeadCount()
|
||||||
headsKV := f.KV().HeadCountKV()
|
headsKV := f.KV().HeadCountKV()
|
||||||
vocab := uint64(f.KV()["tokenizer.ggml.tokens"].(*array[string]).size)
|
vocab := uint64(f.KV()["tokenizer.ggml.tokens"].(*array).size)
|
||||||
|
|
||||||
embeddingHeads := f.KV().EmbeddingHeadCount()
|
embeddingHeads := f.KV().EmbeddingHeadCount()
|
||||||
embeddingHeadsK := f.KV().EmbeddingHeadCountK()
|
embeddingHeadsK := f.KV().EmbeddingHeadCountK()
|
||||||
@ -441,7 +435,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
|
|||||||
}
|
}
|
||||||
|
|
||||||
switch f.KV().Architecture() {
|
switch f.KV().Architecture() {
|
||||||
case "llama", "llama4":
|
case "llama":
|
||||||
fullOffload = max(
|
fullOffload = max(
|
||||||
4*batch*(1+4*embedding+context*(1+heads)),
|
4*batch*(1+4*embedding+context*(1+heads)),
|
||||||
4*batch*(embedding+vocab),
|
4*batch*(embedding+vocab),
|
||||||
@ -455,7 +449,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
|
|||||||
|
|
||||||
if ffnGateExpsWeight, ok := layers["blk.0"]["ffn_gate_exps.weight"]; ok {
|
if ffnGateExpsWeight, ok := layers["blk.0"]["ffn_gate_exps.weight"]; ok {
|
||||||
// mixtral 8x22b
|
// mixtral 8x22b
|
||||||
ff := uint64(f.KV().Uint("feed_forward_length"))
|
ff := uint64(f.KV()["llama.feed_forward_length"].(uint32))
|
||||||
partialOffload = max(
|
partialOffload = max(
|
||||||
3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
|
3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
|
||||||
4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
|
4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
|
||||||
@ -472,9 +466,9 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
|
|||||||
case "mllama":
|
case "mllama":
|
||||||
var visionTokens, tiles uint64 = 1601, 4
|
var visionTokens, tiles uint64 = 1601, 4
|
||||||
|
|
||||||
crossAttentionLayers := f.KV().Ints("attention.cross_attention_layers")
|
crossAttentionLayers := f.KV().Uints("attention.cross_attention_layers")
|
||||||
for i := range kv {
|
for i := range kv {
|
||||||
if slices.Contains(crossAttentionLayers, int32(i)) {
|
if slices.Contains(crossAttentionLayers, uint32(i)) {
|
||||||
kv[i] = headsKV * (embeddingHeadsK + embeddingHeadsV) *
|
kv[i] = headsKV * (embeddingHeadsK + embeddingHeadsV) *
|
||||||
4 * // sizeof(float32)
|
4 * // sizeof(float32)
|
||||||
visionTokens *
|
visionTokens *
|
||||||
@ -491,7 +485,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
|
|||||||
var ropeFreqsCount uint64
|
var ropeFreqsCount uint64
|
||||||
if ropeFreqs, ok := f.Tensors().GroupLayers()["rope_freqs"]; ok {
|
if ropeFreqs, ok := f.Tensors().GroupLayers()["rope_freqs"]; ok {
|
||||||
if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
|
if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
|
||||||
ropeFreqsCount = ropeFreqsWeights.Elements()
|
ropeFreqsCount = ropeFreqsWeights.parameters()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -651,32 +645,6 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
|
|||||||
graphSize = 4 * (imageSize*imageSize*numChannels +
|
graphSize = 4 * (imageSize*imageSize*numChannels +
|
||||||
embeddingLength*patchSize +
|
embeddingLength*patchSize +
|
||||||
numPatches*numPatches*headCount)
|
numPatches*numPatches*headCount)
|
||||||
case "qwen25vl":
|
|
||||||
maxPixels := uint64(llm.KV().Uint("vision.max_pixels", 28*28*1280))
|
|
||||||
mergeSize := uint64(llm.KV().Uint("vision.spatial_merge_size", 2))
|
|
||||||
temporalPatchSize := uint64(2)
|
|
||||||
|
|
||||||
// Calculate max possible patches based on max_pixels
|
|
||||||
maxHeight := uint64(math.Sqrt(float64(maxPixels)))
|
|
||||||
maxWidth := maxPixels / maxHeight
|
|
||||||
maxGridHeight := maxHeight / patchSize
|
|
||||||
maxGridWidth := maxWidth / patchSize
|
|
||||||
// Account for merged patches (2x2 grid)
|
|
||||||
numPatches := (maxGridHeight * maxGridWidth) / (mergeSize * mergeSize)
|
|
||||||
|
|
||||||
// Calculate graph size based on typical operations in ProcessImage and createPatches
|
|
||||||
graphSize = 4 * (maxPixels*numChannels + // Original image storage
|
|
||||||
// Normalized pixels
|
|
||||||
maxPixels*numChannels +
|
|
||||||
// Patches storage (numPatches * channels * temporalPatchSize * patchSize^2)
|
|
||||||
numPatches*numChannels*temporalPatchSize*patchSize*patchSize +
|
|
||||||
// Self-attention calculations (similar to other architectures)
|
|
||||||
numPatches*numPatches*headCount +
|
|
||||||
// Additional buffer for processing
|
|
||||||
embeddingLength*numPatches)
|
|
||||||
case "llama4":
|
|
||||||
// vision graph is computed independently in the same schedule
|
|
||||||
// and is negligible compared to the worst case text graph
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return weights, graphSize
|
return weights, graphSize
|
||||||
|
@ -2,7 +2,6 @@ package ggml
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"maps"
|
"maps"
|
||||||
"math"
|
|
||||||
"slices"
|
"slices"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
@ -211,61 +210,3 @@ func TestTensorTypes(t *testing.T) {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestKeyValue(t *testing.T) {
|
|
||||||
kv := KV{
|
|
||||||
"general.architecture": "test",
|
|
||||||
"test.strings": &array[string]{size: 3, values: []string{"a", "b", "c"}},
|
|
||||||
"test.float32s": &array[float32]{size: 3, values: []float32{1.0, 2.0, 3.0}},
|
|
||||||
"test.int32s": &array[int32]{size: 3, values: []int32{1, 2, 3}},
|
|
||||||
"test.uint32s": &array[uint32]{size: 3, values: []uint32{1, 2, 3}},
|
|
||||||
}
|
|
||||||
|
|
||||||
if diff := cmp.Diff(kv.Strings("strings"), []string{"a", "b", "c"}); diff != "" {
|
|
||||||
t.Errorf("unexpected strings (-got +want):\n%s", diff)
|
|
||||||
}
|
|
||||||
|
|
||||||
if diff := cmp.Diff(kv.Strings("nonexistent.strings"), []string(nil)); diff != "" {
|
|
||||||
t.Errorf("unexpected strings (-got +want):\n%s", diff)
|
|
||||||
}
|
|
||||||
|
|
||||||
if diff := cmp.Diff(kv.Strings("default.strings", []string{"ollama"}), []string{"ollama"}); diff != "" {
|
|
||||||
t.Errorf("unexpected strings (-got +want):\n%s", diff)
|
|
||||||
}
|
|
||||||
|
|
||||||
if diff := cmp.Diff(kv.Floats("float32s"), []float32{1.0, 2.0, 3.0}); diff != "" {
|
|
||||||
t.Errorf("unexpected float32s (-got +want):\n%s", diff)
|
|
||||||
}
|
|
||||||
|
|
||||||
if diff := cmp.Diff(kv.Floats("nonexistent.float32s"), []float32(nil)); diff != "" {
|
|
||||||
t.Errorf("unexpected float32s (-got +want):\n%s", diff)
|
|
||||||
}
|
|
||||||
|
|
||||||
if diff := cmp.Diff(kv.Floats("default.float32s", []float32{math.MaxFloat32}), []float32{math.MaxFloat32}); diff != "" {
|
|
||||||
t.Errorf("unexpected float32s (-got +want):\n%s", diff)
|
|
||||||
}
|
|
||||||
|
|
||||||
if diff := cmp.Diff(kv.Ints("int32s"), []int32{1, 2, 3}); diff != "" {
|
|
||||||
t.Errorf("unexpected int8s (-got +want):\n%s", diff)
|
|
||||||
}
|
|
||||||
|
|
||||||
if diff := cmp.Diff(kv.Ints("nonexistent.int32s"), []int32(nil)); diff != "" {
|
|
||||||
t.Errorf("unexpected int8s (-got +want):\n%s", diff)
|
|
||||||
}
|
|
||||||
|
|
||||||
if diff := cmp.Diff(kv.Ints("default.int32s", []int32{math.MaxInt32}), []int32{math.MaxInt32}); diff != "" {
|
|
||||||
t.Errorf("unexpected int8s (-got +want):\n%s", diff)
|
|
||||||
}
|
|
||||||
|
|
||||||
if diff := cmp.Diff(kv.Uints("uint32s"), []uint32{1, 2, 3}); diff != "" {
|
|
||||||
t.Errorf("unexpected uint8s (-got +want):\n%s", diff)
|
|
||||||
}
|
|
||||||
|
|
||||||
if diff := cmp.Diff(kv.Uints("nonexistent.uint32s"), []uint32(nil)); diff != "" {
|
|
||||||
t.Errorf("unexpected uint8s (-got +want):\n%s", diff)
|
|
||||||
}
|
|
||||||
|
|
||||||
if diff := cmp.Diff(kv.Uints("default.uint32s", []uint32{math.MaxUint32}), []uint32{math.MaxUint32}); diff != "" {
|
|
||||||
t.Errorf("unexpected uint8s (-got +want):\n%s", diff)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
309  fs/ggml/gguf.go
@ -9,12 +9,8 @@ import (
|
|||||||
"io"
|
"io"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"maps"
|
"maps"
|
||||||
"os"
|
|
||||||
"runtime"
|
|
||||||
"slices"
|
"slices"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"golang.org/x/sync/errgroup"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
type containerGGUF struct {
|
type containerGGUF struct {
|
||||||
@ -40,6 +36,10 @@ type containerGGUF struct {
|
|||||||
maxArraySize int
|
maxArraySize int
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (c *containerGGUF) canCollectArray(size int) bool {
|
||||||
|
return c.maxArraySize < 0 || size <= c.maxArraySize
|
||||||
|
}
|
||||||
|
|
||||||
func (c *containerGGUF) Name() string {
|
func (c *containerGGUF) Name() string {
|
||||||
return "gguf"
|
return "gguf"
|
||||||
}
|
}
|
||||||
@ -229,7 +229,7 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
llm.tensors = append(llm.tensors, &tensor)
|
llm.tensors = append(llm.tensors, &tensor)
|
||||||
llm.parameters += tensor.Elements()
|
llm.parameters += tensor.parameters()
|
||||||
}
|
}
|
||||||
|
|
||||||
// patch KV with parameter count
|
// patch KV with parameter count
|
||||||
@ -295,23 +295,6 @@ func readGGUFV1String(llm *gguf, r io.Reader) (string, error) {
|
|||||||
return b.String(), nil
|
return b.String(), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readGGUFV1StringsData(llm *gguf, r io.Reader, a *array[string]) (any, error) {
|
|
||||||
for i := range a.size {
|
|
||||||
if a.values != nil {
|
|
||||||
e, err := readGGUFV1String(llm, r)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
a.values[i] = e
|
|
||||||
} else {
|
|
||||||
discardGGUFString(llm, r)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return a, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func discardGGUFString(llm *gguf, r io.Reader) error {
|
func discardGGUFString(llm *gguf, r io.Reader) error {
|
||||||
buf := llm.scratch[:8]
|
buf := llm.scratch[:8]
|
||||||
_, err := io.ReadFull(r, buf)
|
_, err := io.ReadFull(r, buf)
|
||||||
@ -369,44 +352,78 @@ func writeGGUFString(w io.Writer, s string) error {
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
func readGGUFStringsData(llm *gguf, r io.Reader, a *array[string]) (any, error) {
|
type array struct {
|
||||||
for i := range a.size {
|
size int
|
||||||
if a.values != nil {
|
values []any
|
||||||
e, err := readGGUFString(llm, r)
|
}
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
|
func (a *array) MarshalJSON() ([]byte, error) {
|
||||||
|
return json.Marshal(a.values)
|
||||||
|
}
|
||||||
|
|
||||||
|
func readGGUFV1Array(llm *gguf, r io.Reader) (*array, error) {
|
||||||
|
t, err := readGGUF[uint32](llm, r)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
n, err := readGGUF[uint32](llm, r)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
a := &array{size: int(n)}
|
||||||
|
if llm.canCollectArray(int(n)) {
|
||||||
|
a.values = make([]any, 0, int(n))
|
||||||
|
}
|
||||||
|
|
||||||
|
for i := range n {
|
||||||
|
var e any
|
||||||
|
switch t {
|
||||||
|
case ggufTypeUint8:
|
||||||
|
e, err = readGGUF[uint8](llm, r)
|
||||||
|
case ggufTypeInt8:
|
||||||
|
e, err = readGGUF[int8](llm, r)
|
||||||
|
case ggufTypeUint16:
|
||||||
|
e, err = readGGUF[uint16](llm, r)
|
||||||
|
case ggufTypeInt16:
|
||||||
|
e, err = readGGUF[int16](llm, r)
|
||||||
|
case ggufTypeUint32:
|
||||||
|
e, err = readGGUF[uint32](llm, r)
|
||||||
|
case ggufTypeInt32:
|
||||||
|
e, err = readGGUF[int32](llm, r)
|
||||||
|
case ggufTypeUint64:
|
||||||
|
e, err = readGGUF[uint64](llm, r)
|
||||||
|
case ggufTypeInt64:
|
||||||
|
e, err = readGGUF[int64](llm, r)
|
||||||
|
case ggufTypeFloat32:
|
||||||
|
e, err = readGGUF[float32](llm, r)
|
||||||
|
case ggufTypeFloat64:
|
||||||
|
e, err = readGGUF[float64](llm, r)
|
||||||
|
case ggufTypeBool:
|
||||||
|
e, err = readGGUF[bool](llm, r)
|
||||||
|
case ggufTypeString:
|
||||||
|
e, err = readGGUFV1String(llm, r)
|
||||||
|
default:
|
||||||
|
return nil, fmt.Errorf("invalid array type: %d", t)
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
if a.values != nil {
|
||||||
a.values[i] = e
|
a.values[i] = e
|
||||||
} else {
|
|
||||||
discardGGUFString(llm, r)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return a, nil
|
return a, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
type array[T any] struct {
|
func readGGUFArray(llm *gguf, r io.Reader) (*array, error) {
|
||||||
// size is the actual size of the array
|
if llm.Version == 1 {
|
||||||
size int
|
return readGGUFV1Array(llm, r)
|
||||||
|
|
||||||
// values is the array of values. this is nil if the array is larger than configured maxSize
|
|
||||||
values []T
|
|
||||||
}
|
|
||||||
|
|
||||||
func (a *array[T]) MarshalJSON() ([]byte, error) {
|
|
||||||
return json.Marshal(a.values)
|
|
||||||
}
|
|
||||||
|
|
||||||
func newArray[T any](size, maxSize int) *array[T] {
|
|
||||||
a := array[T]{size: size}
|
|
||||||
if maxSize < 0 || size <= maxSize {
|
|
||||||
a.values = make([]T, size)
|
|
||||||
}
|
}
|
||||||
return &a
|
|
||||||
}
|
|
||||||
|
|
||||||
func readGGUFArray(llm *gguf, r io.Reader) (any, error) {
|
|
||||||
t, err := readGGUF[uint32](llm, r)
|
t, err := readGGUF[uint32](llm, r)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
@ -417,55 +434,45 @@ func readGGUFArray(llm *gguf, r io.Reader) (any, error) {
|
|||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
switch t {
|
a := &array{size: int(n)}
|
||||||
case ggufTypeUint8:
|
if llm.canCollectArray(int(n)) {
|
||||||
a := newArray[uint8](int(n), llm.maxArraySize)
|
a.values = make([]any, int(n))
|
||||||
return readGGUFArrayData(llm, r, a)
|
|
||||||
case ggufTypeInt8:
|
|
||||||
a := newArray[int8](int(n), llm.maxArraySize)
|
|
||||||
return readGGUFArrayData(llm, r, a)
|
|
||||||
case ggufTypeUint16:
|
|
||||||
a := newArray[uint16](int(n), llm.maxArraySize)
|
|
||||||
return readGGUFArrayData(llm, r, a)
|
|
||||||
case ggufTypeInt16:
|
|
||||||
a := newArray[int16](int(n), llm.maxArraySize)
|
|
||||||
return readGGUFArrayData(llm, r, a)
|
|
||||||
case ggufTypeUint32:
|
|
||||||
a := newArray[uint32](int(n), llm.maxArraySize)
|
|
||||||
return readGGUFArrayData(llm, r, a)
|
|
||||||
case ggufTypeInt32:
|
|
||||||
a := newArray[int32](int(n), llm.maxArraySize)
|
|
||||||
return readGGUFArrayData(llm, r, a)
|
|
||||||
case ggufTypeUint64:
|
|
||||||
a := newArray[uint64](int(n), llm.maxArraySize)
|
|
||||||
return readGGUFArrayData(llm, r, a)
|
|
||||||
case ggufTypeInt64:
|
|
||||||
a := newArray[int64](int(n), llm.maxArraySize)
|
|
||||||
return readGGUFArrayData(llm, r, a)
|
|
||||||
case ggufTypeFloat32:
|
|
||||||
a := newArray[float32](int(n), llm.maxArraySize)
|
|
||||||
return readGGUFArrayData(llm, r, a)
|
|
||||||
case ggufTypeFloat64:
|
|
||||||
a := newArray[float64](int(n), llm.maxArraySize)
|
|
||||||
return readGGUFArrayData(llm, r, a)
|
|
||||||
case ggufTypeBool:
|
|
||||||
a := newArray[bool](int(n), llm.maxArraySize)
|
|
||||||
return readGGUFArrayData(llm, r, a)
|
|
||||||
case ggufTypeString:
|
|
||||||
a := newArray[string](int(n), llm.maxArraySize)
|
|
||||||
if llm.Version == 1 {
|
|
||||||
return readGGUFV1StringsData(llm, r, a)
|
|
||||||
}
|
|
||||||
|
|
||||||
return readGGUFStringsData(llm, r, a)
|
|
||||||
default:
|
|
||||||
return nil, fmt.Errorf("invalid array type: %d", t)
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
func readGGUFArrayData[T any](llm *gguf, r io.Reader, a *array[T]) (any, error) {
|
for i := range n {
|
||||||
for i := range a.size {
|
var e any
|
||||||
e, err := readGGUF[T](llm, r)
|
switch t {
|
||||||
|
case ggufTypeUint8:
|
||||||
|
e, err = readGGUF[uint8](llm, r)
|
||||||
|
case ggufTypeInt8:
|
||||||
|
e, err = readGGUF[int8](llm, r)
|
||||||
|
case ggufTypeUint16:
|
||||||
|
e, err = readGGUF[uint16](llm, r)
|
||||||
|
case ggufTypeInt16:
|
||||||
|
e, err = readGGUF[int16](llm, r)
|
||||||
|
case ggufTypeUint32:
|
||||||
|
e, err = readGGUF[uint32](llm, r)
|
||||||
|
case ggufTypeInt32:
|
||||||
|
e, err = readGGUF[int32](llm, r)
|
||||||
|
case ggufTypeUint64:
|
||||||
|
e, err = readGGUF[uint64](llm, r)
|
||||||
|
case ggufTypeInt64:
|
||||||
|
e, err = readGGUF[int64](llm, r)
|
||||||
|
case ggufTypeFloat32:
|
||||||
|
e, err = readGGUF[float32](llm, r)
|
||||||
|
case ggufTypeFloat64:
|
||||||
|
e, err = readGGUF[float64](llm, r)
|
||||||
|
case ggufTypeBool:
|
||||||
|
e, err = readGGUF[bool](llm, r)
|
||||||
|
case ggufTypeString:
|
||||||
|
if a.values != nil {
|
||||||
|
e, err = readGGUFString(llm, r)
|
||||||
|
} else {
|
||||||
|
err = discardGGUFString(llm, r)
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
return nil, fmt.Errorf("invalid array type: %d", t)
|
||||||
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
@ -492,38 +499,25 @@ func writeGGUFArray[S ~[]E, E any](w io.Writer, t uint32, s S) error {
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
if t == ggufTypeString {
|
|
||||||
for _, e := range any(s).([]string) {
|
|
||||||
if err := binary.Write(w, binary.LittleEndian, uint64(len(e))); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
if err := binary.Write(w, binary.LittleEndian, []byte(e)); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
return binary.Write(w, binary.LittleEndian, s)
|
return binary.Write(w, binary.LittleEndian, s)
|
||||||
}
|
}
|
||||||
|
|
||||||
func WriteGGUF(f *os.File, kv KV, ts []*Tensor) error {
|
func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error {
|
||||||
alignment := kv.Uint("general.alignment", 32)
|
alignment := kv.Uint("general.alignment", 32)
|
||||||
|
|
||||||
if err := binary.Write(f, binary.LittleEndian, []byte("GGUF")); err != nil {
|
if err := binary.Write(ws, binary.LittleEndian, []byte("GGUF")); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := binary.Write(f, binary.LittleEndian, uint32(3)); err != nil {
|
if err := binary.Write(ws, binary.LittleEndian, uint32(3)); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := binary.Write(f, binary.LittleEndian, uint64(len(ts))); err != nil {
|
if err := binary.Write(ws, binary.LittleEndian, uint64(len(ts))); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := binary.Write(f, binary.LittleEndian, uint64(len(kv))); err != nil {
|
if err := binary.Write(ws, binary.LittleEndian, uint64(len(kv))); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -531,12 +525,12 @@ func WriteGGUF(f *os.File, kv KV, ts []*Tensor) error {
|
|||||||
slices.Sort(keys)
|
slices.Sort(keys)
|
||||||
|
|
||||||
for _, key := range keys {
|
for _, key := range keys {
|
||||||
if err := ggufWriteKV(f, key, kv[key]); err != nil {
|
if err := ggufWriteKV(ws, key, kv[key]); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
slices.SortStableFunc(ts, func(a, b *Tensor) int {
|
slices.SortStableFunc(ts, func(a, b Tensor) int {
|
||||||
if i, j := a.block(), b.block(); i < 0 && j > 0 {
|
if i, j := a.block(), b.block(); i < 0 && j > 0 {
|
||||||
return 1
|
return 1
|
||||||
} else if i > 0 && j < 0 {
|
} else if i > 0 && j < 0 {
|
||||||
@ -547,34 +541,21 @@ func WriteGGUF(f *os.File, kv KV, ts []*Tensor) error {
|
|||||||
})
|
})
|
||||||
|
|
||||||
var s uint64
|
var s uint64
|
||||||
for i := range ts {
|
for _, t := range ts {
|
||||||
ts[i].Offset = s
|
t.Offset = s + uint64(ggufPadding(int64(s), int64(alignment)))
|
||||||
if err := ggufWriteTensorInfo(f, ts[i]); err != nil {
|
if err := ggufWriteTensorInfo(ws, t); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
s += ts[i].Size()
|
s += t.Size()
|
||||||
s += uint64(ggufPadding(int64(s), int64(alignment)))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
offset, err := f.Seek(0, io.SeekCurrent)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
offset += ggufPadding(offset, int64(alignment))
|
|
||||||
|
|
||||||
var g errgroup.Group
|
|
||||||
g.SetLimit(runtime.GOMAXPROCS(0))
|
|
||||||
// TODO consider reducing if tensors size * gomaxprocs is larger than free memory
|
|
||||||
for _, t := range ts {
|
for _, t := range ts {
|
||||||
t := t
|
if err := ggufWriteTensor(ws, t, int64(alignment)); err != nil {
|
||||||
w := io.NewOffsetWriter(f, offset+int64(t.Offset))
|
|
||||||
g.Go(func() error {
|
|
||||||
_, err := t.WriteTo(w)
|
|
||||||
return err
|
return err
|
||||||
})
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return g.Wait()
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func ggufWriteKV(ws io.WriteSeeker, k string, v any) error {
|
func ggufWriteKV(ws io.WriteSeeker, k string, v any) error {
|
||||||
@ -589,10 +570,8 @@ func ggufWriteKV(ws io.WriteSeeker, k string, v any) error {
|
|||||||
|
|
||||||
var err error
|
var err error
|
||||||
switch v := v.(type) {
|
switch v := v.(type) {
|
||||||
case uint32, FileType:
|
case uint32:
|
||||||
err = writeGGUF(ws, ggufTypeUint32, v)
|
err = writeGGUF(ws, ggufTypeUint32, v)
|
||||||
case uint64:
|
|
||||||
err = writeGGUF(ws, ggufTypeUint64, v)
|
|
||||||
case float32:
|
case float32:
|
||||||
err = writeGGUF(ws, ggufTypeFloat32, v)
|
err = writeGGUF(ws, ggufTypeFloat32, v)
|
||||||
case bool:
|
case bool:
|
||||||
@ -601,20 +580,32 @@ func ggufWriteKV(ws io.WriteSeeker, k string, v any) error {
|
|||||||
err = writeGGUFString(ws, v)
|
err = writeGGUFString(ws, v)
|
||||||
case []int32:
|
case []int32:
|
||||||
err = writeGGUFArray(ws, ggufTypeInt32, v)
|
err = writeGGUFArray(ws, ggufTypeInt32, v)
|
||||||
case *array[int32]:
|
|
||||||
err = writeGGUFArray(ws, ggufTypeInt32, v.values)
|
|
||||||
case []uint32:
|
case []uint32:
|
||||||
err = writeGGUFArray(ws, ggufTypeUint32, v)
|
err = writeGGUFArray(ws, ggufTypeUint32, v)
|
||||||
case *array[uint32]:
|
|
||||||
err = writeGGUFArray(ws, ggufTypeUint32, v.values)
|
|
||||||
case []float32:
|
case []float32:
|
||||||
err = writeGGUFArray(ws, ggufTypeFloat32, v)
|
err = writeGGUFArray(ws, ggufTypeFloat32, v)
|
||||||
case *array[float32]:
|
|
||||||
err = writeGGUFArray(ws, ggufTypeFloat32, v.values)
|
|
||||||
case []string:
|
case []string:
|
||||||
err = writeGGUFArray(ws, ggufTypeString, v)
|
if err := binary.Write(ws, binary.LittleEndian, ggufTypeArray); err != nil {
|
||||||
case *array[string]:
|
return err
|
||||||
err = writeGGUFArray(ws, ggufTypeString, v.values)
|
}
|
||||||
|
|
||||||
|
if err := binary.Write(ws, binary.LittleEndian, ggufTypeString); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := binary.Write(ws, binary.LittleEndian, uint64(len(v))); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, e := range v {
|
||||||
|
if err := binary.Write(ws, binary.LittleEndian, uint64(len(e))); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := binary.Write(ws, binary.LittleEndian, []byte(e)); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
default:
|
default:
|
||||||
return fmt.Errorf("improper type for '%s'", k)
|
return fmt.Errorf("improper type for '%s'", k)
|
||||||
}
|
}
|
||||||
@ -622,7 +613,7 @@ func ggufWriteKV(ws io.WriteSeeker, k string, v any) error {
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
func ggufWriteTensorInfo(ws io.WriteSeeker, t *Tensor) error {
|
func ggufWriteTensorInfo(ws io.WriteSeeker, t Tensor) error {
|
||||||
slog.Debug(t.Name, "kind", t.Kind, "shape", t.Shape, "offset", t.Offset)
|
slog.Debug(t.Name, "kind", t.Kind, "shape", t.Shape, "offset", t.Offset)
|
||||||
if err := binary.Write(ws, binary.LittleEndian, uint64(len(t.Name))); err != nil {
|
if err := binary.Write(ws, binary.LittleEndian, uint64(len(t.Name))); err != nil {
|
||||||
return err
|
return err
|
||||||
@ -636,8 +627,8 @@ func ggufWriteTensorInfo(ws io.WriteSeeker, t *Tensor) error {
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, n := range t.Shape {
|
for i := range len(t.Shape) {
|
||||||
if err := binary.Write(ws, binary.LittleEndian, n); err != nil {
|
if err := binary.Write(ws, binary.LittleEndian, t.Shape[len(t.Shape)-i-1]); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -649,6 +640,20 @@ func ggufWriteTensorInfo(ws io.WriteSeeker, t *Tensor) error {
|
|||||||
return binary.Write(ws, binary.LittleEndian, t.Offset)
|
return binary.Write(ws, binary.LittleEndian, t.Offset)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func ggufWriteTensor(ws io.WriteSeeker, t Tensor, alignment int64) error {
|
||||||
|
offset, err := ws.Seek(0, io.SeekCurrent)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := binary.Write(ws, binary.LittleEndian, bytes.Repeat([]byte{0}, int(ggufPadding(offset, alignment)))); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
_, err = t.WriteTo(ws)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
func ggufPadding(offset, align int64) int64 {
	return (align - offset%align) % align
}

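`ggufPadding` returns the number of zero bytes needed to reach the next aligned offset. A quick illustrative check of the formula with sample values:

```go
package main

import "fmt"

func ggufPadding(offset, align int64) int64 {
	return (align - offset%align) % align
}

func main() {
	fmt.Println(ggufPadding(336, 32)) // 16: pads 336 up to 352
	fmt.Println(ggufPadding(352, 32)) // 0: already aligned
	fmt.Println(ggufPadding(1, 16))   // 15
}
```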
@ -1,63 +0,0 @@
|
|||||||
package ggml
|
|
||||||
|
|
||||||
import (
|
|
||||||
"bytes"
|
|
||||||
"os"
|
|
||||||
"slices"
|
|
||||||
"testing"
|
|
||||||
|
|
||||||
"github.com/google/go-cmp/cmp"
|
|
||||||
)
|
|
||||||
|
|
||||||
func TestWriteGGUF(t *testing.T) {
|
|
||||||
w, err := os.CreateTemp(t.TempDir(), "*.bin")
|
|
||||||
if err != nil {
|
|
||||||
t.Fatal(err)
|
|
||||||
}
|
|
||||||
defer w.Close()
|
|
||||||
|
|
||||||
if err := WriteGGUF(w, KV{
|
|
||||||
"general.alignment": uint32(16),
|
|
||||||
}, []*Tensor{
|
|
||||||
{Name: "test.0", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
|
|
||||||
{Name: "test.1", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
|
|
||||||
{Name: "test.2", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
|
|
||||||
{Name: "test.3", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
|
|
||||||
{Name: "test.4", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
|
|
||||||
{Name: "test.5", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
|
|
||||||
}); err != nil {
|
|
||||||
t.Fatal(err)
|
|
||||||
}
|
|
||||||
|
|
||||||
r, err := os.Open(w.Name())
|
|
||||||
if err != nil {
|
|
||||||
t.Fatal(err)
|
|
||||||
}
|
|
||||||
defer r.Close()
|
|
||||||
|
|
||||||
ff, _, err := Decode(r, 0)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatal(err)
|
|
||||||
}
|
|
||||||
|
|
||||||
if diff := cmp.Diff(ff.KV(), KV{
|
|
||||||
"general.alignment": uint32(16),
|
|
||||||
"general.parameter_count": uint64(36),
|
|
||||||
}); diff != "" {
|
|
||||||
t.Errorf("Mismatch (-want +got):\n%s", diff)
|
|
||||||
}
|
|
||||||
|
|
||||||
if diff := cmp.Diff(ff.Tensors(), Tensors{
|
|
||||||
Offset: 336,
|
|
||||||
items: []*Tensor{
|
|
||||||
{Name: "test.0", Offset: 0, Shape: []uint64{2, 3}},
|
|
||||||
{Name: "test.1", Offset: 32, Shape: []uint64{2, 3}},
|
|
||||||
{Name: "test.2", Offset: 64, Shape: []uint64{2, 3}},
|
|
||||||
{Name: "test.3", Offset: 96, Shape: []uint64{2, 3}},
|
|
||||||
{Name: "test.4", Offset: 128, Shape: []uint64{2, 3}},
|
|
||||||
{Name: "test.5", Offset: 160, Shape: []uint64{2, 3}},
|
|
||||||
},
|
|
||||||
}, cmp.AllowUnexported(Tensors{})); diff != "" {
|
|
||||||
t.Errorf("Mismatch (-want +got):\n%s", diff)
|
|
||||||
}
|
|
||||||
}
|
|
343  fs/ggml/type.go
@ -1,31 +1,26 @@
|
|||||||
package ggml
|
package ggml
|
||||||
|
|
||||||
import (
|
import "fmt"
|
||||||
"fmt"
|
|
||||||
"log/slog"
|
|
||||||
"strings"
|
|
||||||
)
|
|
||||||
|
|
||||||
// FileType is the Go equivalent to llama_ftype used for gguf file typing
|
type fileType uint32
|
||||||
type FileType uint32
|
|
||||||
|
|
||||||
const (
|
const (
|
||||||
FileTypeF32 FileType = iota
|
fileTypeF32 fileType = iota
|
||||||
FileTypeF16
|
fileTypeF16
|
||||||
fileTypeQ4_0
|
fileTypeQ4_0
|
||||||
fileTypeQ4_1
|
fileTypeQ4_1
|
||||||
fileTypeQ4_1_F16 // unused by GGML
|
fileTypeQ4_1_F16
|
||||||
fileTypeQ4_2 // unused by GGML
|
fileTypeQ4_2 // unused
|
||||||
fileTypeQ4_3 // unused by GGML
|
fileTypeQ4_3 // unused
|
||||||
FileTypeQ8_0
|
fileTypeQ8_0
|
||||||
fileTypeQ5_0
|
fileTypeQ5_0
|
||||||
fileTypeQ5_1
|
fileTypeQ5_1
|
||||||
fileTypeQ2_K
|
fileTypeQ2_K
|
||||||
fileTypeQ3_K_S
|
fileTypeQ3_K_S
|
||||||
fileTypeQ3_K_M
|
fileTypeQ3_K_M
|
||||||
fileTypeQ3_K_L
|
fileTypeQ3_K_L
|
||||||
FileTypeQ4_K_S
|
fileTypeQ4_K_S
|
||||||
FileTypeQ4_K_M
|
fileTypeQ4_K_M
|
||||||
fileTypeQ5_K_S
|
fileTypeQ5_K_S
|
||||||
fileTypeQ5_K_M
|
fileTypeQ5_K_M
|
||||||
fileTypeQ6_K
|
fileTypeQ6_K
|
||||||
@@ -42,62 +37,93 @@ const (
fileTypeIQ2_M
fileTypeIQ4_XS
fileTypeIQ1_M
-FileTypeBF16
-fileTypeQ4_0_4_4 // unused by GGML
-fileTypeQ4_0_4_8 // unused by GGML
-fileTypeQ4_0_8_8 // unused by GGML
-fileTypeTQ1_0
-fileTypeTQ2_0
-
-FileTypeUnknown = 1024
+fileTypeBF16
+fileTypeUnknown
)

-// ParseFileType parses the provided GGUF file type
-// Only Ollama supported types are considered valid
-func ParseFileType(s string) (FileType, error) {
+func ParseFileType(s string) (fileType, error) {
switch s {
case "F32":
-return FileTypeF32, nil
+return fileTypeF32, nil
case "F16":
-return FileTypeF16, nil
+return fileTypeF16, nil
+case "Q4_0":
+return fileTypeQ4_0, nil
+case "Q4_1":
+return fileTypeQ4_1, nil
+case "Q4_1_F16":
+return fileTypeQ4_1_F16, nil
case "Q8_0":
-return FileTypeQ8_0, nil
+return fileTypeQ8_0, nil
+case "Q5_0":
+return fileTypeQ5_0, nil
+case "Q5_1":
+return fileTypeQ5_1, nil
+case "Q2_K":
+return fileTypeQ2_K, nil
+case "Q3_K_S":
+return fileTypeQ3_K_S, nil
+case "Q3_K_M":
+return fileTypeQ3_K_M, nil
+case "Q3_K_L":
+return fileTypeQ3_K_L, nil
case "Q4_K_S":
-return FileTypeQ4_K_S, nil
-case "Q4_K_M", "Q4_K":
-return FileTypeQ4_K_M, nil
+return fileTypeQ4_K_S, nil
+case "Q4_K_M":
+return fileTypeQ4_K_M, nil
+case "Q5_K_S":
+return fileTypeQ5_K_S, nil
+case "Q5_K_M":
+return fileTypeQ5_K_M, nil
+case "Q6_K":
+return fileTypeQ6_K, nil
+case "IQ2_XXS":
+return fileTypeIQ2_XXS, nil
+case "IQ2_XS":
+return fileTypeIQ2_XS, nil
+case "Q2_K_S":
+return fileTypeQ2_K_S, nil
+case "IQ3_XS":
+return fileTypeIQ3_XS, nil
+case "IQ3_XXS":
+return fileTypeIQ3_XXS, nil
+case "IQ1_S":
+return fileTypeIQ1_S, nil
+case "IQ4_NL":
+return fileTypeIQ4_NL, nil
+case "IQ3_S":
+return fileTypeIQ3_S, nil
+case "IQ3_M":
+return fileTypeIQ3_M, nil
+case "IQ2_S":
+return fileTypeIQ2_S, nil
+case "IQ2_M":
+return fileTypeIQ2_M, nil
+case "IQ4_XS":
+return fileTypeIQ4_XS, nil
+case "IQ1_M":
+return fileTypeIQ1_M, nil
case "BF16":
-return FileTypeBF16, nil
+return fileTypeBF16, nil
default:
-supportedFileTypes := []FileType{
-FileTypeF32,
-FileTypeF16,
-FileTypeQ4_K_S,
-FileTypeQ4_K_M,
-FileTypeQ8_0,
-// fsggml.FileTypeBF16, // TODO
-}
-strs := make([]string, len(supportedFileTypes))
-for i := range supportedFileTypes {
-strs[i] = supportedFileTypes[i].String()
-}
-
-return FileTypeUnknown, fmt.Errorf("unsupported quantization type %s - supported types are %s", s, strings.Join(strs, ", "))
+return fileTypeUnknown, fmt.Errorf("unknown fileType: %s", s)
}
}

-func (t FileType) String() string {
-// Note: this routine will return a broader set of file types for existing models
+func (t fileType) String() string {
switch t {
-case FileTypeF32:
+case fileTypeF32:
return "F32"
-case FileTypeF16:
+case fileTypeF16:
return "F16"
case fileTypeQ4_0:
return "Q4_0"
case fileTypeQ4_1:
return "Q4_1"
-case FileTypeQ8_0:
+case fileTypeQ4_1_F16:
+return "Q4_1_F16"
+case fileTypeQ8_0:
return "Q8_0"
case fileTypeQ5_0:
return "Q5_0"
@@ -111,9 +137,9 @@ func (t FileType) String() string {
return "Q3_K_M"
case fileTypeQ3_K_L:
return "Q3_K_L"
-case FileTypeQ4_K_S:
+case fileTypeQ4_K_S:
return "Q4_K_S"
-case FileTypeQ4_K_M:
+case fileTypeQ4_K_M:
return "Q4_K_M"
case fileTypeQ5_K_S:
return "Q5_K_S"
@@ -121,198 +147,39 @@ func (t FileType) String() string {
return "Q5_K_M"
case fileTypeQ6_K:
return "Q6_K"
+case fileTypeIQ2_XXS:
+return "IQ2_XXS"
+case fileTypeIQ2_XS:
+return "IQ2_XS"
case fileTypeQ2_K_S:
return "Q2_K_S"
-case FileTypeBF16:
+case fileTypeIQ3_XS:
+return "IQ3_XS"
+case fileTypeIQ3_XXS:
+return "IQ3_XXS"
+case fileTypeIQ1_S:
+return "IQ1_S"
+case fileTypeIQ4_NL:
+return "IQ4_NL"
+case fileTypeIQ3_S:
+return "IQ3_S"
+case fileTypeIQ3_M:
+return "IQ3_M"
+case fileTypeIQ2_S:
+return "IQ2_S"
+case fileTypeIQ4_XS:
+return "IQ4_XS"
+case fileTypeIQ2_M:
+return "IQ2_M"
+case fileTypeIQ1_M:
+return "IQ1_M"
+case fileTypeBF16:
return "BF16"
default:
return "unknown"
}
}

-func (t FileType) Value() uint32 {
+func (t fileType) Value() uint32 {
return uint32(t)
}
-
-func (ftype FileType) ToTensorType() TensorType {
-switch ftype {
-case FileTypeF32:
-return TensorTypeF32
-case FileTypeF16:
-return TensorTypeF16
-case fileTypeQ4_0:
-return TensorTypeQ4_0
-case fileTypeQ4_1:
-return TensorTypeQ4_1
-case FileTypeQ8_0:
-return TensorTypeQ8_0
-case fileTypeQ5_0:
-return TensorTypeQ5_0
-case fileTypeQ5_1:
-return TensorTypeQ5_1
-case fileTypeQ2_K:
-return TensorTypeQ2_K
-case fileTypeQ3_K_S:
-return TensorTypeQ3_K
-case fileTypeQ3_K_M:
-return TensorTypeQ3_K
-case fileTypeQ3_K_L:
-return TensorTypeQ3_K
-case FileTypeQ4_K_S:
-return TensorTypeQ4_K
-case FileTypeQ4_K_M:
-return TensorTypeQ4_K
-case fileTypeQ5_K_S:
-return TensorTypeQ5_K
-case fileTypeQ5_K_M:
-return TensorTypeQ5_K
-case fileTypeQ6_K:
-return TensorTypeQ6_K
-case fileTypeQ2_K_S:
-return TensorTypeQ2_K
-case FileTypeBF16:
-return TensorTypeBF16
-default:
-slog.Warn("unsupported file type", "type", ftype)
-return 0 // F32
-}
-}
-
-// TensorType is equivalent to ggml_type for individual tensor types
-// Note: these are not the same as FileType
-type TensorType uint32
-
-const (
-TensorTypeF32 TensorType = iota
-TensorTypeF16
-TensorTypeQ4_0
-TensorTypeQ4_1
-tensorTypeQ4_2 // unused by GGML
-tensorTypeQ4_3 // unused by GGML
-TensorTypeQ5_0
-TensorTypeQ5_1
-TensorTypeQ8_0
-TensorTypeQ8_1
-TensorTypeQ2_K
-TensorTypeQ3_K
-TensorTypeQ4_K
-TensorTypeQ5_K
-TensorTypeQ6_K
-TensorTypeQ8_K
-tensorTypeIQ2_XXS // not supported by ollama
-tensorTypeIQ2_XS // not supported by ollama
-tensorTypeIQ3_XXS // not supported by ollama
-tensorTypeIQ1_S // not supported by ollama
-tensorTypeIQ4_NL // not supported by ollama
-tensorTypeIQ3_S // not supported by ollama
-tensorTypeIQ2_S // not supported by ollama
-tensorTypeIQ4_XS // not supported by ollama
-TensorTypeI8
-TensorTypeI16
-TensorTypeI32
-TensorTypeI64
-TensorTypeF64
-tensorTypeIQ1_M // not supported by ollama
-TensorTypeBF16
-tensorTypeQ4_0_4_4 // unused by GGML
-tensorTypeQ4_0_4_8 // unused by GGML
-tensorTypeQ4_0_8_8 // unused by GGML
-tensorTypeTQ1_0 // not supported by ollama
-tensorTypeTQ2_0 // not supported by ollama
-tensorTypeIQ4_NL_4_4 // unused by GGML
-tensorTypeIQ4_NL_4_8 // unused by GGML
-tensorTypeIQ4_NL_8_8 // unused by GGML
-)
-
-// ParseFileType parses the provided GGUF file type
-// Only Ollama supported types are considered valid
-func ParseTensorType(s string) (TensorType, error) {
-switch s {
-case "F32":
-return TensorTypeF32, nil
-case "F16":
-return TensorTypeF16, nil
-case "Q4_0":
-return TensorTypeQ4_0, nil
-case "Q4_1":
-return TensorTypeQ4_1, nil
-case "Q5_0":
-return TensorTypeQ5_0, nil
-case "Q5_1":
-return TensorTypeQ5_1, nil
-case "Q8_0":
-return TensorTypeQ8_0, nil
-case "Q8_1":
-return TensorTypeQ8_1, nil
-case "Q2_K":
-return TensorTypeQ2_K, nil
-case "Q3_K":
-return TensorTypeQ3_K, nil
-case "Q4_K":
-return TensorTypeQ4_K, nil
-case "Q5_K":
-return TensorTypeQ5_K, nil
-case "Q6_K":
-return TensorTypeQ6_K, nil
-case "Q8_K":
-return TensorTypeQ8_K, nil
-case "F64":
-return TensorTypeF64, nil
-case "BF16":
-return TensorTypeBF16, nil
-default:
-return 0, fmt.Errorf("unsupported quantization type %s", s)
-}
-}
-
-func (t TensorType) IsQuantized() bool {
-switch t {
-case TensorTypeF32, TensorTypeF16, TensorTypeBF16:
-return false
-default:
-return true
-}
-}
-
-func (t TensorType) RowSize(ne uint64) uint64 {
-return t.TypeSize() * ne / t.BlockSize()
-}
-
-func (t TensorType) String() string {
-switch t {
-case TensorTypeF32:
-return "F32"
-case TensorTypeF16:
-return "F16"
-case TensorTypeQ4_0:
-return "Q4_0"
-case TensorTypeQ4_1:
-return "Q4_1"
-case TensorTypeQ5_0:
-return "Q5_0"
-case TensorTypeQ5_1:
-return "Q5_1"
-case TensorTypeQ8_0:
-return "Q8_0"
-case TensorTypeQ8_1:
-return "Q8_1"
-case TensorTypeQ2_K:
-return "Q2_K"
-case TensorTypeQ3_K:
-return "Q3_K"
-case TensorTypeQ4_K:
-return "Q4_K"
-case TensorTypeQ5_K:
-return "Q5_K"
-case TensorTypeQ6_K:
-return "Q6_K"
-case TensorTypeQ8_K:
-return "Q8_K"
-case TensorTypeF64:
-return "F64"
-case TensorTypeBF16:
-return "BF16"
-default:
-return "unknown"
-}
-}
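Editorial note, not part of the diff: on the main side this file exports the FileType and TensorType APIs shown above. A small sketch of how they compose, assuming the package lives at the import path github.com/ollama/ollama/fs/ggml; everything else here is illustrative only:

package main

import (
    "fmt"

    fsggml "github.com/ollama/ollama/fs/ggml" // assumed import path for the package in this diff
)

func main() {
    // ParseFileType accepts only the quantization names Ollama supports;
    // anything else returns FileTypeUnknown plus an error listing the valid names.
    ft, err := fsggml.ParseFileType("Q4_K_M")
    if err != nil {
        panic(err)
    }

    // Each file type maps onto the tensor type used for the bulk of the model's tensors.
    tt := ft.ToTensorType()
    fmt.Println(ft.String(), "->", tt.String(), "quantized:", tt.IsQuantized()) // Q4_K_M -> Q4_K quantized: true
}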
12 go.mod
@@ -11,7 +11,7 @@ require (
github.com/spf13/cobra v1.7.0
github.com/stretchr/testify v1.9.0
github.com/x448/float16 v0.8.4
-golang.org/x/sync v0.12.0
+golang.org/x/sync v0.11.0
)

require (
@@ -70,12 +70,12 @@ require (
github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
github.com/ugorji/go/codec v1.2.12 // indirect
golang.org/x/arch v0.8.0 // indirect
-golang.org/x/crypto v0.36.0
+golang.org/x/crypto v0.33.0
golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa
-golang.org/x/net v0.38.0 // indirect
+golang.org/x/net v0.35.0 // indirect
-golang.org/x/sys v0.31.0
+golang.org/x/sys v0.30.0
-golang.org/x/term v0.30.0
+golang.org/x/term v0.29.0
-golang.org/x/text v0.23.0
+golang.org/x/text v0.22.0
google.golang.org/protobuf v1.34.1
gopkg.in/yaml.v3 v3.0.1 // indirect
)
24 go.sum
@@ -214,8 +214,8 @@ golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACk
golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
-golang.org/x/crypto v0.36.0 h1:AnAEvhDddvBdpY+uR+MyHmuZzzNqXSe/GvuDeob5L34=
-golang.org/x/crypto v0.36.0/go.mod h1:Y4J0ReaxCR1IMaabaSMugxJES1EpwhBHhv2bDHklZvc=
+golang.org/x/crypto v0.33.0 h1:IOBPskki6Lysi0lo9qQvbxiQ+FvsCC/YWOecCHAixus=
+golang.org/x/crypto v0.33.0/go.mod h1:bVdXmD7IV/4GdElGPozy6U7lWdRXA4qyRVGJV57uQ5M=
golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
@@ -257,8 +257,8 @@ golang.org/x/net v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81R
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM=
golang.org/x/net v0.0.0-20210614182718-04defd469f4e/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
-golang.org/x/net v0.38.0 h1:vRMAPTMaeGqVhG5QyLJHqNDwecKTomGeqbnfZyKlBI8=
-golang.org/x/net v0.38.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8=
+golang.org/x/net v0.35.0 h1:T5GQRQb2y08kTAByq9L4/bz8cipCdA8FbRTXewonqY8=
+golang.org/x/net v0.35.0/go.mod h1:EglIi67kWsHKlRzzVMUD93VMSWGFOMSZgxFjparz1Qk=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
@@ -268,8 +268,8 @@ golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJ
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
-golang.org/x/sync v0.12.0 h1:MHc5BpPuC30uJk597Ri8TV3CNZcTLu6B6z4lJy+g6Jw=
-golang.org/x/sync v0.12.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
+golang.org/x/sync v0.11.0 h1:GGz8+XQP4FvTTrjZPzNKTMFtSXH80RAzG+5ghFPgK9w=
+golang.org/x/sync v0.11.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
@@ -285,17 +285,17 @@ golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBc
golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.31.0 h1:ioabZlmFYtWhL+TRYpcnNlLwhyxaM9kWTDEmfnprqik=
-golang.org/x/sys v0.31.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
+golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc=
+golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
-golang.org/x/term v0.30.0 h1:PQ39fJZ+mfadBm0y5WlL4vlM7Sx1Hgf13sMIY2+QS9Y=
-golang.org/x/term v0.30.0/go.mod h1:NYYFdzHoI5wRh/h5tDMdMqCqPJZEuNqVR5xJLd/n67g=
+golang.org/x/term v0.29.0 h1:L6pJp37ocefwRRtYPKSWOWzOtWSxVajvz2ldH/xi3iU=
+golang.org/x/term v0.29.0/go.mod h1:6bl4lRlvVuDgSf3179VpIxBF0o10JUpXWOnI7nErv7s=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
-golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY=
-golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4=
+golang.org/x/text v0.22.0 h1:bofq7m3/HAFvbF51jz3Q9wLg3jkvSPuiZu/pD1XwgtM=
+golang.org/x/text v0.22.0/go.mod h1:YRoo4H8PVmsu+E3Ou7cqLVH8oXWIHVoX0jqUWALQhfY=
golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
@@ -34,15 +34,13 @@ func cosineSimilarity[V float32 | float64](v1, v2 []V) V {
func TestAllMiniLMEmbeddings(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
-client, _, cleanup := InitServerConnection(ctx, t)
-defer cleanup()

req := api.EmbeddingRequest{
Model: "all-minilm",
Prompt: "why is the sky blue?",
}

-res, err := embeddingTestHelper(ctx, client, t, req)
+res, err := embeddingTestHelper(ctx, t, req)

if err != nil {
t.Fatalf("error: %v", err)
@@ -64,15 +62,13 @@ func TestAllMiniLMEmbeddings(t *testing.T) {
func TestAllMiniLMEmbed(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
-client, _, cleanup := InitServerConnection(ctx, t)
-defer cleanup()

req := api.EmbedRequest{
Model: "all-minilm",
Input: "why is the sky blue?",
}

-res, err := embedTestHelper(ctx, client, t, req)
+res, err := embedTestHelper(ctx, t, req)

if err != nil {
t.Fatalf("error: %v", err)
@@ -102,15 +98,13 @@ func TestAllMiniLMEmbed(t *testing.T) {
func TestAllMiniLMBatchEmbed(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
-client, _, cleanup := InitServerConnection(ctx, t)
-defer cleanup()

req := api.EmbedRequest{
Model: "all-minilm",
Input: []string{"why is the sky blue?", "why is the grass green?"},
}

-res, err := embedTestHelper(ctx, client, t, req)
+res, err := embedTestHelper(ctx, t, req)

if err != nil {
t.Fatalf("error: %v", err)
@@ -150,8 +144,6 @@ func TestAllMiniLMBatchEmbed(t *testing.T) {
func TestAllMiniLMEmbedTruncate(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
-client, _, cleanup := InitServerConnection(ctx, t)
-defer cleanup()

truncTrue, truncFalse := true, false

@@ -190,7 +182,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
res := make(map[string]*api.EmbedResponse)

for _, req := range reqs {
-response, err := embedTestHelper(ctx, client, t, req.Request)
+response, err := embedTestHelper(ctx, t, req.Request)
if err != nil {
t.Fatalf("error: %v", err)
}
@@ -206,7 +198,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
}

// check that truncate set to false returns an error if context length is exceeded
-_, err := embedTestHelper(ctx, client, t, api.EmbedRequest{
+_, err := embedTestHelper(ctx, t, api.EmbedRequest{
Model: "all-minilm",
Input: "why is the sky blue?",
Truncate: &truncFalse,
@@ -218,7 +210,9 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
}
}

-func embeddingTestHelper(ctx context.Context, client *api.Client, t *testing.T, req api.EmbeddingRequest) (*api.EmbeddingResponse, error) {
+func embeddingTestHelper(ctx context.Context, t *testing.T, req api.EmbeddingRequest) (*api.EmbeddingResponse, error) {
+client, _, cleanup := InitServerConnection(ctx, t)
+defer cleanup()
if err := PullIfMissing(ctx, client, req.Model); err != nil {
t.Fatalf("failed to pull model %s: %v", req.Model, err)
}
@@ -232,7 +226,9 @@ func embeddingTestHelper(ctx context.Context, client *api.Client, t *testing.T,
return response, nil
}

-func embedTestHelper(ctx context.Context, client *api.Client, t *testing.T, req api.EmbedRequest) (*api.EmbedResponse, error) {
+func embedTestHelper(ctx context.Context, t *testing.T, req api.EmbedRequest) (*api.EmbedResponse, error) {
+client, _, cleanup := InitServerConnection(ctx, t)
+defer cleanup()
if err := PullIfMissing(ctx, client, req.Model); err != nil {
t.Fatalf("failed to pull model %s: %v", req.Model, err)
}
@@ -48,6 +48,17 @@ var (
}
)

+func getTimeouts(t *testing.T) (soft time.Duration, hard time.Duration) {
+deadline, hasDeadline := t.Deadline()
+if !hasDeadline {
+return 8 * time.Minute, 10 * time.Minute
+} else if deadline.Compare(time.Now().Add(2*time.Minute)) <= 0 {
+t.Skip("too little time")
+return time.Duration(0), time.Duration(0)
+}
+return -time.Since(deadline.Add(-2 * time.Minute)), -time.Since(deadline.Add(-20 * time.Second))
+}
+
func TestModelsGenerate(t *testing.T) {
softTimeout, hardTimeout := getTimeouts(t)
slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout)
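Editorial note, not part of the diff: getTimeouts derives its budgets from `go test -timeout` via t.Deadline(). With no deadline it returns 8 and 10 minutes; with under about two minutes remaining it skips the test; otherwise it returns the time left until two minutes (soft) and twenty seconds (hard) before the deadline. A hypothetical test, assuming it sits in the same package as getTimeouts, showing the intended split between the two values:

package integration

import (
    "context"
    "testing"
    "time"
)

// TestTimeoutsSketch is illustrative only; it is not part of either compared tree.
func TestTimeoutsSketch(t *testing.T) {
    softTimeout, hardTimeout := getTimeouts(t)

    // The hard budget bounds the whole test through its context...
    ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
    defer cancel()

    started := time.Now()
    for i := 0; i < 10; i++ {
        // ...while the soft budget stops new work from starting late in the run.
        if time.Since(started) > softTimeout {
            t.Skip("soft timeout reached; skipping remaining iterations")
        }
        select {
        case <-ctx.Done():
            t.Fatal("hard timeout reached while work was still in flight")
        case <-time.After(10 * time.Millisecond): // stand-in for real work against the server
        }
    }
}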
@@ -1,130 +0,0 @@
-//go:build integration && models
-
-package integration
-
-import (
-"bytes"
-"context"
-"fmt"
-"log/slog"
-"strings"
-"testing"
-"time"
-
-"github.com/ollama/ollama/api"
-)
-
-func TestQuantization(t *testing.T) {
-sourceModels := []string{
-"qwen2.5:0.5b-instruct-fp16",
-}
-quantizations := []string{
-"Q8_0",
-"Q4_K_S",
-"Q4_K_M",
-"Q4_K",
-}
-softTimeout, hardTimeout := getTimeouts(t)
-started := time.Now()
-slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout)
-ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
-defer cancel()
-client, _, cleanup := InitServerConnection(ctx, t)
-defer cleanup()
-
-for _, base := range sourceModels {
-if err := PullIfMissing(ctx, client, base); err != nil {
-t.Fatalf("pull failed %s", err)
-}
-for _, quant := range quantizations {
-newName := fmt.Sprintf("%s__%s", base, quant)
-t.Run(newName, func(t *testing.T) {
-if time.Now().Sub(started) > softTimeout {
-t.Skip("skipping remaining tests to avoid excessive runtime")
-}
-req := &api.CreateRequest{
-Model: newName,
-Quantization: quant,
-From: base,
-}
-fn := func(resp api.ProgressResponse) error {
-// fmt.Print(".")
-return nil
-}
-t.Logf("quantizing: %s -> %s", base, quant)
-if err := client.Create(ctx, req, fn); err != nil {
-t.Fatalf("create failed %s", err)
-}
-defer func() {
-req := &api.DeleteRequest{
-Model: newName,
-}
-t.Logf("deleting: %s -> %s", base, quant)
-if err := client.Delete(ctx, req); err != nil {
-t.Logf("failed to clean up %s: %s", req.Model, err)
-}
-}()
-// Check metadata on the model
-resp, err := client.Show(ctx, &api.ShowRequest{Name: newName})
-if err != nil {
-t.Fatalf("unable to show model: %s", err)
-}
-if !strings.Contains(resp.Details.QuantizationLevel, quant) {
-t.Fatalf("unexpected quantization for %s:\ngot: %s", newName, resp.Details.QuantizationLevel)
-}
-
-stream := true
-genReq := api.GenerateRequest{
-Model: newName,
-Prompt: "why is the sky blue?",
-KeepAlive: &api.Duration{Duration: 3 * time.Second},
-Options: map[string]any{
-"seed": 42,
-"temperature": 0.0,
-},
-Stream: &stream,
-}
-t.Logf("verifying: %s -> %s", base, quant)
-
-// Some smaller quantizations can cause models to have poor quality
-// or get stuck in repetition loops, so we stop as soon as we have any matches
-anyResp := []string{"rayleigh", "scattering", "day", "sun", "moon", "color", "nitrogen", "oxygen"}
-reqCtx, reqCancel := context.WithCancel(ctx)
-atLeastOne := false
-var buf bytes.Buffer
-genfn := func(response api.GenerateResponse) error {
-buf.Write([]byte(response.Response))
-fullResp := strings.ToLower(buf.String())
-for _, resp := range anyResp {
-if strings.Contains(fullResp, resp) {
-atLeastOne = true
-t.Log(fullResp)
-reqCancel()
-break
-}
-}
-return nil
-}
-
-done := make(chan int)
-var genErr error
-go func() {
-genErr = client.Generate(reqCtx, &genReq, genfn)
-done <- 0
-}()
-
-select {
-case <-done:
-if genErr != nil && !atLeastOne {
-t.Fatalf("failed with %s request prompt %s ", genReq.Model, genReq.Prompt)
-}
-case <-ctx.Done():
-t.Error("outer test context done while waiting for generate")
-}
-
-t.Logf("passed")
-
-})
-}
-}
-}
@@ -217,7 +217,6 @@ func InitServerConnection(ctx context.Context, t *testing.T) (*api.Client, strin
slog.Error("failed to open server log", "logfile", lifecycle.ServerLogFile, "error", err)
return
}
-defer fp.Close()
data, err := io.ReadAll(fp)
if err != nil {
slog.Error("failed to read server log", "logfile", lifecycle.ServerLogFile, "error", err)
@@ -359,14 +358,3 @@ func skipUnderMinVRAM(t *testing.T, gb uint64) {
}
}
}
-
-func getTimeouts(t *testing.T) (soft time.Duration, hard time.Duration) {
-deadline, hasDeadline := t.Deadline()
-if !hasDeadline {
-return 8 * time.Minute, 10 * time.Minute
-} else if deadline.Compare(time.Now().Add(2*time.Minute)) <= 0 {
-t.Skip("too little time")
-return time.Duration(0), time.Duration(0)
-}
-return -time.Since(deadline.Add(-2 * time.Minute)), -time.Since(deadline.Add(-20 * time.Second))
-}
@@ -21,7 +21,6 @@ type shiftFn func(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, e
type Causal struct {
DType ml.DType
windowSize int32
-chunkSize int32

opts CausalOptions

@@ -98,17 +97,6 @@ func NewSWACache(windowSize int32, shift shiftFn) *Causal {
}
}
-
-func NewChunkedAttentionCache(chunkSize int32, shift shiftFn) *Causal {
-return &Causal{
-windowSize: math.MaxInt32,
-chunkSize: chunkSize,
-shiftFn: shift,
-ctxs: make(map[int]ml.Context),
-keys: make(map[int]ml.Tensor),
-values: make(map[int]ml.Tensor),
-}
-}

func (c *Causal) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity, maxBatch int) {
if c.config == nil {
var config ml.CacheConfig
@@ -239,7 +227,7 @@ func (c *Causal) findStartLoc() (int, error) {
}
}

-return 0, fmt.Errorf("%w (cache: %v batch: %v)", ErrKvCacheFull, len(c.cells), c.curBatchSize)
+return 0, fmt.Errorf("%w (length: %v)", ErrKvCacheFull, len(c.cells))
}

func (c *Causal) updateSlidingWindow() {
@@ -312,7 +300,6 @@ func (c *Causal) buildMask(ctx ml.Context) (ml.Tensor, error) {
for j := c.curCellRange.min; j <= c.curCellRange.max; j++ {
if !slices.Contains(c.cells[j].sequences, c.curSequences[i]) ||
(enabled && c.cells[j].pos > c.curPositions[i]) ||
-c.chunkSize > 0 && c.cells[j].pos < c.curPositions[i]-c.curPositions[i]%c.chunkSize ||
c.cells[j].pos < c.curPositions[i]-c.windowSize {
mask[i*length+(j-c.curCellRange.min)] = float32(math.Inf(-1))
}
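Editorial note, not part of the diff: the mask condition present only on the main side implements chunked attention, where a query at position p may attend only to cached positions inside its own chunk of size chunkSize, on top of the usual causal and sliding-window limits. A standalone sketch of just that predicate, with hypothetical names (it is not the cache implementation):

package main

import "fmt"

// visible reports whether a cached position q may be attended to by a query at
// position p under a causal mask restricted to chunks of size chunkSize.
// It mirrors the masking condition shown above.
func visible(q, p, chunkSize int32) bool {
    if q > p { // causal: no peeking at future positions
        return false
    }
    if chunkSize > 0 && q < p-p%chunkSize { // before the start of p's chunk
        return false
    }
    return true
}

func main() {
    // With chunkSize 2, position 3 sees positions 2 and 3 but not 0 or 1,
    // matching the final row of the FirstBatch mask in TestChunkedAttention below.
    for q := int32(0); q <= 3; q++ {
        fmt.Printf("q=%d visible to p=3: %v\n", q, visible(q, 3, 2))
    }
}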
@@ -86,64 +86,6 @@ func TestSWA(t *testing.T) {
testCache(t, backend, cache, tests)
}
-
-func TestChunkedAttention(t *testing.T) {
-cache := NewChunkedAttentionCache(2, nil)
-defer cache.Close()
-
-var b testBackend
-cache.Init(&b, ml.DTypeF16, 1, 16, 16)
-
-x := float32(math.Inf(-1))
-
-testCache(
-t, &b, cache,
-[]testCase{
-{
-name: "FirstBatch",
-in: []float32{1, 2, 3, 4},
-inShape: []int{1, 1, 4},
-seqs: []int{0, 0, 0, 0},
-pos: []int32{0, 1, 2, 3},
-expected: []float32{1, 2, 3, 4},
-expectedShape: []int{1, 1, 4},
-expectedMask: []float32{
-0, x, x, x,
-0, 0, x, x,
-x, x, 0, x,
-x, x, 0, 0,
-},
-},
-{
-name: "SecondBatch",
-in: []float32{5, 6, 7},
-inShape: []int{1, 1, 3},
-seqs: []int{0, 0, 0},
-pos: []int32{4, 5, 6},
-expected: []float32{1, 2, 3, 4, 5, 6, 7},
-expectedShape: []int{1, 1, 7},
-expectedMask: []float32{
-x, x, x, x, 0, x, x,
-x, x, x, x, 0, 0, x,
-x, x, x, x, x, x, 0,
-},
-},
-{
-name: "ThirdBatch",
-in: []float32{8, 9},
-inShape: []int{1, 1, 2},
-seqs: []int{0, 0},
-pos: []int32{7, 8},
-expected: []float32{1, 2, 3, 4, 5, 6, 7, 8, 9},
-expectedShape: []int{1, 1, 9},
-expectedMask: []float32{
-x, x, x, x, x, x, 0, 0, x,
-x, x, x, x, x, x, x, x, 0,
-},
-},
-},
-)
-}

func TestSequences(t *testing.T) {
backend := &testBackend{}
cache := NewCausalCache(nil)
@@ -351,16 +293,8 @@ func testCache(t *testing.T, backend ml.Backend, cache Cache, tests []testCase)

context.Forward(out, mask).Compute(out, mask)

-if !slices.Equal(out.Floats(), test.expected) {
-t.Errorf("TestCache: have %v; want %v", out.Floats(), test.expected)
-}
-
-if !slices.Equal(out.Shape(), test.expectedShape) {
-t.Errorf("TestCache: has shape %v; want %v", out.Shape(), test.expectedShape)
-}
-
-if !slices.Equal(mask.Floats(), test.expectedMask) {
-t.Errorf("TestCache: have mask: have %v want %v", mask.Floats(), test.expectedMask)
+if !slices.Equal(out.Floats(), test.expected) || !slices.Equal(out.Shape(), test.expectedShape) || !slices.Equal(mask.Floats(), test.expectedMask) {
+t.Errorf("TestCache: have %v (shape %v); want %v (shape %v); mask: have %v (shape %v) want %v", out.Floats(), out.Shape(), test.expected, test.expectedShape, mask.Floats(), mask.Shape(), test.expectedMask)
}
})
}
@@ -490,17 +424,6 @@ func (c *testContext) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
return out, nil
}
-
-func (c *testContext) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
-s := make([]float32, 0, int((stop-start)/step))
-for i := start; i < stop; i += step {
-s = append(s, i)
-}
-
-out, _ := c.FromFloatSlice(s, len(s))
-out.(*testTensor).dtype = dtype
-return out
-}

func (c *testContext) Input() ml.Context { return c }
func (c *testContext) Layer(int) ml.Context { return c }
2 llama/build-info.cpp generated vendored
@@ -1,4 +1,4 @@
int LLAMA_BUILD_NUMBER = 0;
-char const *LLAMA_COMMIT = "de4c07f93783a1a96456a44dc16b9db538ee1618";
+char const *LLAMA_COMMIT = "71e90e8813f90097701e62f7fce137d96ddf41e2";
char const *LLAMA_COMPILER = "";
char const *LLAMA_BUILD_TARGET = "";
@@ -10,11 +10,11 @@ include common/stb_image.*
include include/
include include/llama.*
include include/llama-*.*
-include tools/
-include tools/mtmd/
-include tools/mtmd/clip.*
-include tools/mtmd/clip-impl.*
-include tools/mtmd/llava.*
+include examples/
+include examples/llava/
+include examples/llava/clip.*
+include examples/llava/clip-impl.*
+include examples/llava/llava.*
include src/
include src/llama.*
include src/llama-*.*
19 llama/llama.cpp/common/common.cpp vendored
@@ -1096,6 +1096,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
cparams.n_threads = params.cpuparams.n_threads;
cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
+cparams.logits_all = params.logits_all;
cparams.embeddings = params.embedding;
cparams.rope_scaling_type = params.rope_scaling_type;
cparams.rope_freq_base = params.rope_freq_base;
@@ -1113,7 +1114,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
cparams.offload_kqv = !params.no_kv_offload;
cparams.flash_attn = params.flash_attn;
cparams.no_perf = params.no_perf;
-cparams.op_offload = !params.no_op_offload;

if (params.reranking) {
cparams.embeddings = true;
@@ -1565,20 +1565,3 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c

return result;
}
-
-ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride) {
-const int64_t ne_datapoint = llama_n_ctx(ctx);
-const int64_t ndata = (tokens.size() - ne_datapoint - 1) / stride;
-ggml_opt_dataset_t result = ggml_opt_dataset_init(
-GGML_TYPE_I32, GGML_TYPE_I32, ne_datapoint, ne_datapoint, ndata, /*ndata_shard =*/ 1);
-
-llama_token * data = (llama_token *) ggml_opt_dataset_data(result)->data;
-llama_token * labels = (llama_token *) ggml_opt_dataset_labels(result)->data;
-
-for (int64_t idata = 0; idata < ndata; ++idata) {
-memcpy(data + idata*ne_datapoint, tokens.data() + idata*stride + 0, ne_datapoint*sizeof(llama_token));
-memcpy(labels + idata*ne_datapoint, tokens.data() + idata*stride + 1, ne_datapoint*sizeof(llama_token));
-}
-
-return result;
-}
20 llama/llama.cpp/common/common.h vendored
@@ -66,6 +66,7 @@ enum llama_example {
LLAMA_EXAMPLE_COMMON,
LLAMA_EXAMPLE_SPECULATIVE,
LLAMA_EXAMPLE_MAIN,
+LLAMA_EXAMPLE_INFILL,
LLAMA_EXAMPLE_EMBEDDING,
LLAMA_EXAMPLE_PERPLEXITY,
LLAMA_EXAMPLE_RETRIEVAL,
@@ -95,7 +96,6 @@ enum common_sampler_type {
COMMON_SAMPLER_TYPE_XTC = 8,
COMMON_SAMPLER_TYPE_INFILL = 9,
COMMON_SAMPLER_TYPE_PENALTIES = 10,
-COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
};

// dimensionality reduction methods, used by cvector-generator
@@ -161,7 +161,6 @@ struct common_params_sampling {
std::vector<enum common_sampler_type> samplers = {
COMMON_SAMPLER_TYPE_PENALTIES,
COMMON_SAMPLER_TYPE_DRY,
-COMMON_SAMPLER_TYPE_TOP_N_SIGMA,
COMMON_SAMPLER_TYPE_TOP_K,
COMMON_SAMPLER_TYPE_TYPICAL_P,
COMMON_SAMPLER_TYPE_TOP_P,
@@ -324,6 +323,7 @@ struct common_params {
bool ctx_shift = true; // context shift on inifinite text generation

bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
+bool logits_all = false; // return logits for all tokens in the batch
bool use_mmap = true; // use mmap for faster loads
bool use_mlock = false; // use mlock to keep model in memory
bool verbose_prompt = false; // print prompt tokens before generation
@@ -332,7 +332,6 @@ struct common_params {
bool no_kv_offload = false; // disable KV offloading
bool warmup = true; // warmup run
bool check_tensors = false; // validate tensor data
-bool no_op_offload = false; // globally disable offload host tensor operations to device

bool single_turn = false; // single turn chat conversation

@@ -341,10 +340,8 @@ struct common_params {

common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;

-// multimodal models (see tools/mtmd)
+// multimodal models (see examples/llava)
struct common_params_model mmproj;
-bool mmproj_use_gpu = true; // use GPU for multimodal model
-bool no_mmproj = false; // explicitly disable multimodal model
std::vector<std::string> image; // path to image file(s)

// embedding
@@ -410,14 +407,13 @@ struct common_params {

bool process_output = false; // collect data for the output tensor
bool compute_ppl = true; // whether to compute perplexity
-bool parse_special = false; // whether to parse special tokens during imatrix tokenization

// cvector-generator params
int n_pca_batch = 100;
int n_pca_iterations = 1000;
dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
-std::string cvector_positive_file = "tools/cvector-generator/positive.txt";
-std::string cvector_negative_file = "tools/cvector-generator/negative.txt";
+std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
+std::string cvector_negative_file = "examples/cvector-generator/negative.txt";

bool spm_infill = false; // suffix/prefix/middle pattern for infill

@@ -666,9 +662,3 @@ const char * const LLM_KV_SPLIT_COUNT = "split.count";
const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";

}
-
-//
-// training utils
-//
-
-ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);
@@ -16,9 +16,6 @@ using json = nlohmann::ordered_json;
static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
auto has_max = max_items != std::numeric_limits<int>::max();

-if (max_items == 0) {
-return "";
-}
if (min_items == 0 && max_items == 1) {
return item_rule + "?";
}
107
llama/llama.cpp/common/sampling.cpp
vendored
107
llama/llama.cpp/common/sampling.cpp
vendored
@ -1,7 +1,6 @@
|
|||||||
#include "sampling.h"
|
#include "sampling.h"
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "log.h"
|
|
||||||
|
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
@ -230,48 +229,51 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
|
|||||||
params.logit_bias.data()));
|
params.logit_bias.data()));
|
||||||
|
|
||||||
if (params.mirostat == 0) {
|
if (params.mirostat == 0) {
|
||||||
for (const auto & cnstr : params.samplers) {
|
if (params.top_n_sigma >= 0) {
|
||||||
switch (cnstr) {
|
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
|
||||||
case COMMON_SAMPLER_TYPE_DRY:
|
llama_sampler_chain_add(result->chain, llama_sampler_init_temp (params.temp));
|
||||||
{
|
llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
|
||||||
std::vector<const char *> c_breakers;
|
} else {
|
||||||
c_breakers.reserve(params.dry_sequence_breakers.size());
|
for (const auto & cnstr : params.samplers) {
|
||||||
for (const auto & str : params.dry_sequence_breakers) {
|
switch (cnstr) {
|
||||||
c_breakers.push_back(str.c_str());
|
case COMMON_SAMPLER_TYPE_DRY:
|
||||||
}
|
{
|
||||||
|
std::vector<const char *> c_breakers;
|
||||||
|
c_breakers.reserve(params.dry_sequence_breakers.size());
|
||||||
|
for (const auto & str : params.dry_sequence_breakers) {
|
||||||
|
c_breakers.push_back(str.c_str());
|
||||||
|
}
|
||||||
|
|
||||||
llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
|
llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case COMMON_SAMPLER_TYPE_TOP_K:
|
case COMMON_SAMPLER_TYPE_TOP_K:
|
||||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
|
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
|
||||||
break;
|
break;
|
||||||
case COMMON_SAMPLER_TYPE_TOP_P:
|
case COMMON_SAMPLER_TYPE_TOP_P:
|
||||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
|
llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
|
||||||
break;
|
break;
|
||||||
case COMMON_SAMPLER_TYPE_TOP_N_SIGMA:
|
case COMMON_SAMPLER_TYPE_MIN_P:
|
||||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
|
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
|
||||||
break;
|
break;
|
||||||
case COMMON_SAMPLER_TYPE_MIN_P:
|
case COMMON_SAMPLER_TYPE_XTC:
|
||||||
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
|
llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
|
||||||
break;
|
break;
|
||||||
case COMMON_SAMPLER_TYPE_XTC:
|
case COMMON_SAMPLER_TYPE_TYPICAL_P:
|
||||||
llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
|
llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
|
||||||
break;
|
break;
|
||||||
case COMMON_SAMPLER_TYPE_TYPICAL_P:
|
case COMMON_SAMPLER_TYPE_TEMPERATURE:
|
||||||
llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
|
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
|
||||||
break;
|
break;
|
||||||
case COMMON_SAMPLER_TYPE_TEMPERATURE:
|
case COMMON_SAMPLER_TYPE_INFILL:
|
||||||
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
|
llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab));
|
||||||
break;
|
break;
|
||||||
case COMMON_SAMPLER_TYPE_INFILL:
|
case COMMON_SAMPLER_TYPE_PENALTIES:
|
||||||
llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab));
|
llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
|
||||||
break;
|
break;
|
||||||
case COMMON_SAMPLER_TYPE_PENALTIES:
|
default:
|
||||||
llama_sampler_chain_add(result->chain, llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
|
GGML_ASSERT(false && "unknown sampler type");
|
||||||
break;
|
}
|
||||||
default:
|
|
||||||
GGML_ASSERT(false && "unknown sampler type");
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
|
llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
|
||||||
@ -473,7 +475,6 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
|
|||||||
case COMMON_SAMPLER_TYPE_TOP_K: return 'k';
|
case COMMON_SAMPLER_TYPE_TOP_K: return 'k';
|
||||||
case COMMON_SAMPLER_TYPE_TYPICAL_P: return 'y';
|
case COMMON_SAMPLER_TYPE_TYPICAL_P: return 'y';
|
||||||
case COMMON_SAMPLER_TYPE_TOP_P: return 'p';
|
case COMMON_SAMPLER_TYPE_TOP_P: return 'p';
|
||||||
case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return 's';
|
|
||||||
case COMMON_SAMPLER_TYPE_MIN_P: return 'm';
|
case COMMON_SAMPLER_TYPE_MIN_P: return 'm';
|
||||||
case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
|
case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
|
||||||
case COMMON_SAMPLER_TYPE_XTC: return 'x';
|
case COMMON_SAMPLER_TYPE_XTC: return 'x';
|
||||||
@ -489,7 +490,6 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
         case COMMON_SAMPLER_TYPE_TOP_K:       return "top_k";
         case COMMON_SAMPLER_TYPE_TYPICAL_P:   return "typ_p";
         case COMMON_SAMPLER_TYPE_TOP_P:       return "top_p";
-        case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return "top_n_sigma";
         case COMMON_SAMPLER_TYPE_MIN_P:       return "min_p";
         case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
         case COMMON_SAMPLER_TYPE_XTC:         return "xtc";
@ -504,7 +504,6 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
        { "dry",         COMMON_SAMPLER_TYPE_DRY },
        { "top_k",       COMMON_SAMPLER_TYPE_TOP_K },
        { "top_p",       COMMON_SAMPLER_TYPE_TOP_P },
-       { "top_n_sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
        { "typ_p",       COMMON_SAMPLER_TYPE_TYPICAL_P },
        { "min_p",       COMMON_SAMPLER_TYPE_MIN_P },
        { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
@ -518,7 +517,6 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
    std::unordered_map<std::string, common_sampler_type> sampler_alt_name_map {
        { "top-k",       COMMON_SAMPLER_TYPE_TOP_K },
        { "top-p",       COMMON_SAMPLER_TYPE_TOP_P },
-       { "top-n-sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
        { "nucleus",     COMMON_SAMPLER_TYPE_TOP_P },
        { "typical-p",   COMMON_SAMPLER_TYPE_TYPICAL_P },
        { "typical",     COMMON_SAMPLER_TYPE_TYPICAL_P },
@ -535,16 +533,14 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
        auto sampler = sampler_canonical_name_map.find(name);
        if (sampler != sampler_canonical_name_map.end()) {
            samplers.push_back(sampler->second);
-           continue;
-       }
-       if (allow_alt_names) {
-           sampler = sampler_alt_name_map.find(name);
-           if (sampler != sampler_alt_name_map.end()) {
-               samplers.push_back(sampler->second);
-               continue;
+       } else {
+           if (allow_alt_names) {
+               sampler = sampler_alt_name_map.find(name);
+               if (sampler != sampler_alt_name_map.end()) {
+                   samplers.push_back(sampler->second);
+               }
            }
        }
-       LOG_WRN("%s: unable to match sampler by name '%s'\n", __func__, name.c_str());
    }

    return samplers;
@ -556,7 +552,6 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K),       COMMON_SAMPLER_TYPE_TOP_K },
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P),   COMMON_SAMPLER_TYPE_TYPICAL_P },
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P),       COMMON_SAMPLER_TYPE_TOP_P },
-       { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_N_SIGMA), COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P),       COMMON_SAMPLER_TYPE_MIN_P },
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC),         COMMON_SAMPLER_TYPE_XTC },
@ -571,8 +566,6 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
        const auto sampler = sampler_name_map.find(c);
        if (sampler != sampler_name_map.end()) {
            samplers.push_back(sampler->second);
-       } else {
-           LOG_WRN("%s: unable to match sampler by char '%c'\n", __func__, c);
        }
    }

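For orientation, the sampler setup above chains samplers onto a llama_sampler_chain. A minimal sketch using the llama.cpp sampler API (llama_sampler_chain_init and llama_sampler_chain_default_params come from the same header as the functions named in these hunks; the numeric arguments are illustrative placeholders, not values from either branch):

// Sketch only: building a sampler chain with the API referenced above.
llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));            // placeholder k
llama_sampler_chain_add(chain, llama_sampler_init_min_p(0.05f, 1));      // placeholder min_p / min_keep
llama_sampler_chain_add(chain, llama_sampler_init_temp_ext(0.8f, 0.0f, 1.0f));
llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED)); // final distribution sampler, as above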
@ -2,6 +2,8 @@
 #include "gguf.h"
 #include "clip.h"

+#include "clip.h"
+
 #include <climits>
 #include <cstdarg>
 #include <string>
@ -15,35 +17,40 @@
 #define KEY_FTYPE               "general.file_type"
 #define KEY_NAME                "general.name"
 #define KEY_DESCRIPTION         "general.description"
+#define KEY_HAS_TEXT_ENC        "clip.has_text_encoder"
+#define KEY_HAS_VIS_ENC         "clip.has_vision_encoder"
+#define KEY_HAS_LLAVA_PROJ      "clip.has_llava_projector"
+#define KEY_HAS_MINICPMV_PROJ   "clip.has_minicpmv_projector"
+#define KEY_HAS_GLM_PROJ        "clip.has_glm_projector"
 #define KEY_MINICPMV_VERSION    "clip.minicpmv_version"
+#define KEY_HAS_QWEN2VL_MERGER  "clip.has_qwen2vl_merger"
 #define KEY_USE_GELU            "clip.use_gelu"
 #define KEY_USE_SILU            "clip.use_silu"
-#define KEY_N_EMBD              "clip.vision.embedding_length"
-#define KEY_N_FF                "clip.vision.feed_forward_length"
-#define KEY_N_BLOCK             "clip.vision.block_count"
-#define KEY_N_HEAD              "clip.vision.attention.head_count"
-#define KEY_LAYER_NORM_EPS      "clip.vision.attention.layer_norm_epsilon"
-#define KEY_PROJ_DIM            "clip.vision.projection_dim"
+#define KEY_N_EMBD              "clip.%s.embedding_length"
+#define KEY_N_FF                "clip.%s.feed_forward_length"
+#define KEY_N_BLOCK             "clip.%s.block_count"
+#define KEY_N_HEAD              "clip.%s.attention.head_count"
+#define KEY_LAYER_NORM_EPS      "clip.%s.attention.layer_norm_epsilon"
+#define KEY_PROJ_DIM            "clip.%s.projection_dim"
+#define KEY_TOKENS              "tokenizer.ggml.tokens"
+#define KEY_N_POSITIONS         "clip.text.context_length"
 #define KEY_IMAGE_SIZE          "clip.vision.image_size"
 #define KEY_PATCH_SIZE          "clip.vision.patch_size"
 #define KEY_IMAGE_MEAN          "clip.vision.image_mean"
 #define KEY_IMAGE_STD           "clip.vision.image_std"
-#define KEY_FEATURE_LAYER       "clip.vision.feature_layer"
-#define KEY_PROJ_SCALE_FACTOR   "clip.vision.projector.scale_factor"
 #define KEY_PROJ_TYPE           "clip.projector_type"
-#define KEY_SPATIAL_MERGE_SIZE  "clip.vision.spatial_merge_size"
+#define KEY_FEATURE_LAYER       "clip.vision.feature_layer"

 #define KEY_MM_PATCH_MERGE_TYPE   "clip.vision.mm_patch_merge_type"
 #define KEY_IMAGE_GRID_PINPOINTS  "clip.vision.image_grid_pinpoints"
 #define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
-#define KEY_WIN_ATTN_PATTERN      "clip.vision.n_wa_pattern"
-#define KEY_ATTN_WINDOW_SIZE      "clip.vision.window_size"

 //
 // tensor name constants
 //

+#define TN_TOKEN_EMBD        "%s.token_embd.weight"
 #define TN_POS_EMBD          "%s.position_embd.weight"
 #define TN_CLASS_EMBD        "v.class_embd"
 #define TN_PATCH_EMBD        "v.patch_embd.weight"  // not rename tensor with ".0" postfix for backwrad compat
@ -53,31 +60,21 @@
 #define TN_ATTN_Q            "%s.blk.%d.attn_q.%s"
 #define TN_ATTN_V            "%s.blk.%d.attn_v.%s"
 #define TN_ATTN_OUTPUT       "%s.blk.%d.attn_out.%s"
-#define TN_ATTN_K_NORM       "%s.blk.%d.attn_k_norm.%s"
-#define TN_ATTN_Q_NORM       "%s.blk.%d.attn_q_norm.%s"
 #define TN_FFN_DOWN          "%s.blk.%d.ffn_down.%s"
-#define TN_FFN_GATE          "%s.blk.%d.ffn_gate.%s"
 #define TN_FFN_UP            "%s.blk.%d.ffn_up.%s"
-#define TN_FFN_GATE          "%s.blk.%d.ffn_gate.%s"
-#define TN_LN_1              "%s.blk.%d.ln1.%s" // layer norm
-#define TN_LN_2              "%s.blk.%d.ln2.%s" // layer norm
-#define TN_LS_1              "%s.blk.%d.ls1.%s" // layer scale
-#define TN_LS_2              "%s.blk.%d.ls2.%s" // layer scale
+#define TN_LN_1              "%s.blk.%d.ln1.%s"
+#define TN_LN_2              "%s.blk.%d.ln2.%s"
 #define TN_LN_PRE            "%s.pre_ln.%s"
 #define TN_LN_POST           "%s.post_ln.%s"
+#define TN_TEXT_PROJ         "text_projection.weight"
+#define TN_VIS_PROJ          "visual_projection.weight"
 #define TN_LLAVA_PROJ        "mm.%d.%s"
 #define TN_MVLM_PROJ_MLP     "mm.model.mlp.%d.%s"
 #define TN_MVLM_PROJ_BLOCK   "mm.model.mb_block.%d.block.%d.%s"
 #define TN_MVLM_PROJ_PEG     "mm.model.peg.%d.%s"
 #define TN_IMAGE_NEWLINE     "model.image_newline"
-#define TN_MM_INP_NORM       "mm.input_norm.weight"
 #define TN_MM_INP_PROJ       "mm.input_projection.weight" // gemma3
 #define TN_MM_SOFT_EMB_N     "mm.soft_emb_norm.weight" // gemma3
-#define TN_MM_PROJECTOR      "mm.model.fc.weight" // idefics3
-#define TN_MM_PATCH_MERGER   "mm.patch_merger.weight" // mistral small 3.1
-#define TN_TOK_IMG_BREAK     "v.token_embd.img_break" // pixtral
-#define TN_TOK_GLM_BOI       "adapter.boi" // glm-edge (these embeddings are not in text model)
-#define TN_TOK_GLM_EOI       "adapter.eoi" // glm-edge (these embeddings are not in text model)

 // mimicpmv
 #define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
@ -93,23 +90,18 @@
 #define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s"
 #define TN_GLM_ADAPTER_GATE     "adapter.linear.gate.%s"
 #define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"
-
-// align x to upper multiple of n
-#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
+#define TN_GLM_BOI_W            "adapter.boi"
+#define TN_GLM_EOI_W            "adapter.eoi"

 enum projector_type {
     PROJECTOR_TYPE_MLP,
     PROJECTOR_TYPE_MLP_NORM,
     PROJECTOR_TYPE_LDP,
     PROJECTOR_TYPE_LDPV2,
-    PROJECTOR_TYPE_MINICPMV,
+    PROJECTOR_TYPE_RESAMPLER,
     PROJECTOR_TYPE_GLM_EDGE,
-    PROJECTOR_TYPE_QWEN2VL,
+    PROJECTOR_TYPE_MERGER,
     PROJECTOR_TYPE_GEMMA3,
-    PROJECTOR_TYPE_IDEFICS3,
-    PROJECTOR_TYPE_PIXTRAL,
-    PROJECTOR_TYPE_QWEN25VL,
-    PROJECTOR_TYPE_INTERNVL,
     PROJECTOR_TYPE_UNKNOWN,
 };

@ -117,14 +109,10 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_MLP,       "mlp" },
     { PROJECTOR_TYPE_LDP,       "ldp" },
     { PROJECTOR_TYPE_LDPV2,     "ldpv2"},
-    { PROJECTOR_TYPE_MINICPMV,  "resampler"},
+    { PROJECTOR_TYPE_RESAMPLER, "resampler"},
     { PROJECTOR_TYPE_GLM_EDGE,  "adapter"},
-    { PROJECTOR_TYPE_QWEN2VL,   "qwen2vl_merger"},
-    { PROJECTOR_TYPE_QWEN25VL,  "qwen2.5vl_merger"},
+    { PROJECTOR_TYPE_MERGER,    "qwen2vl_merger"},
     { PROJECTOR_TYPE_GEMMA3,    "gemma3"},
-    { PROJECTOR_TYPE_IDEFICS3,  "idefics3"},
-    { PROJECTOR_TYPE_PIXTRAL,   "pixtral"},
-    { PROJECTOR_TYPE_INTERNVL,  "internvl"},
 };

 static projector_type clip_projector_type_from_string(const std::string & str) {
@ -239,15 +227,6 @@ struct clip_image_u8_batch {

 struct clip_image_f32_batch {
     std::vector<clip_image_f32_ptr> entries;
-
-    clip_image_f32_batch clone() const {
-        clip_image_f32_batch new_batch;
-        new_batch.entries.reserve(entries.size());
-        for (const auto & entry : entries) {
-            new_batch.entries.emplace_back(new clip_image_f32(*entry));
-        }
-        return new_batch;
-    }
 };

 //
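The PROJECTOR_TYPE_NAMES table above is what clip_projector_type_from_string searches. A rough sketch of that lookup, assuming only the map and enum shown in these hunks rather than the vendored implementation:

// Sketch of the name-to-enum lookup; not the vendored function body.
static projector_type projector_type_from_string_sketch(const std::string & name) {
    for (const auto & kv : PROJECTOR_TYPE_NAMES) {
        if (kv.second == name) {
            return kv.first;
        }
    }
    return PROJECTOR_TYPE_UNKNOWN;
}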
llama/llama.cpp/examples/llava/clip.cpp vendored (new file, 2921 lines; diff suppressed because it is too large)
@ -47,7 +47,7 @@ CLIP_API struct clip_ctx * clip_init(const char * fname, struct clip_context_par
 CLIP_API void clip_free(struct clip_ctx * ctx);

 CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
-CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h);
+CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w);

 CLIP_API int32_t clip_get_image_size (const struct clip_ctx * ctx);
 CLIP_API int32_t clip_get_patch_size (const struct clip_ctx * ctx);
@ -59,29 +59,18 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
 CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
 CLIP_API size_t get_clip_image_grid_size(const struct clip_ctx * ctx);

-GGML_DEPRECATED(CLIP_API int clip_n_patches(const struct clip_ctx * ctx),
-    "use clip_n_output_tokens instead");
-GGML_DEPRECATED(CLIP_API int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img),
-    "use clip_n_output_tokens instead");
-
-CLIP_API int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img);
-
-// for M-RoPE, this will be the number of token positions in X and Y directions
-// for other models, X will be the total number of tokens and Y will be 1
-CLIP_API int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img);
-CLIP_API int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img);
-
-// this should be equal to the embedding dimension of the text model
-CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
+CLIP_API int clip_n_patches        (const struct clip_ctx * ctx);
+CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img);
+CLIP_API int clip_n_mmproj_embd    (const struct clip_ctx * ctx);

 CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
 CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
 CLIP_API struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip);

-CLIP_API struct clip_image_size * clip_image_size_init(void);
-CLIP_API struct clip_image_u8  * clip_image_u8_init (void);
-CLIP_API struct clip_image_f32 * clip_image_f32_init(void);
-CLIP_API struct clip_image_f32_batch * clip_image_f32_batch_init(void); // only used by libllava
+CLIP_API struct clip_image_size * clip_image_size_init();
+CLIP_API struct clip_image_u8  * clip_image_u8_init ();
+CLIP_API struct clip_image_f32 * clip_image_f32_init();
+CLIP_API struct clip_image_f32_batch * clip_image_f32_batch_init(); // only used by libllava

 // nx, ny are the output image dimensions
 CLIP_API unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny);
@ -125,6 +114,8 @@ CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
 CLIP_API bool clip_is_llava(const struct clip_ctx * ctx);
 CLIP_API bool clip_is_gemma3(const struct clip_ctx * ctx);

+CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx);
+
 CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
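The practical consequence of the API difference above is how callers size the image-embedding buffer: the token count times the projection width. A fragment sketching that sizing, where ctx and img are assumed clip_ctx and clip_image_f32 pointers; the token count comes from clip_n_patches on the v0.6.6-rc2 side and from clip_n_output_tokens(ctx, img) on main:

// Sketch only: sizing the buffer passed to clip_image_encode.
int n_tok    = clip_n_patches(ctx);            // main equivalent: clip_n_output_tokens(ctx, img)
int n_mmproj = clip_n_mmproj_embd(ctx);
float * image_embd = (float *) malloc((size_t) n_tok * n_mmproj * sizeof(float)); // same size as clip_embd_nbytes(ctx)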
@ -2,7 +2,6 @@
 #include "llava.h"

 #include "llama.h"
-#include "ggml-cpp.h"

 #include <algorithm>
 #include <cerrno>
@ -113,7 +112,7 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<
 }

 // Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
-static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out, clip_image_f32 * img_input) {
+static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
     struct {
         struct ggml_context * ctx;
     } model;
@ -176,7 +175,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>

     model.ctx = ggml_init(params);

-    struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_output_tokens(ctx_clip, img_input), num_images - 1); // example: 4096 x 576 x 4
+    struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
     // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
     // fill it with the image embeddings, ignoring the base
     for (size_t i = 1; i < num_images; i++) {
@ -210,17 +209,13 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
     struct ggml_tensor *flatten = ggml_view_2d(model.ctx, permuted_cont, clip_n_mmproj_embd(ctx_clip), num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, size_ele * clip_n_mmproj_embd(ctx_clip), 0);
     // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
     ggml_build_forward_expand(gf, flatten);
-
-    ggml_backend_ptr backend { ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr) };
-    GGML_ASSERT(backend != nullptr && "failed to initialize CPU backend");
-    ggml_backend_graph_compute(backend.get(), gf);
+    ggml_graph_compute_with_ctx(model.ctx, gf, 1);

     struct ggml_tensor* result = ggml_graph_node(gf, -1);

     memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
     // append without newline tokens (default behavior in llava_arch when not using unpad ):
-    memcpy(image_embd_out + clip_n_output_tokens(ctx_clip, img_input) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
-    *n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_output_tokens(ctx_clip, img_input));
+    memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
+    *n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_patches(ctx_clip));

     // Debug: Test single segments
     // Current findings: sending base image, sending a segment embedding all works similar to python
@ -318,7 +313,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
                 image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
                 image_embd_v[i],
                 clip_embd_nbytes_by_img(ctx_clip, nx, ny));
-            n_img_pos_out += clip_n_output_tokens(ctx_clip, img_res);
+            n_img_pos_out += clip_n_patches_by_img(ctx_clip, img_res);
         }
         *n_img_pos = n_img_pos_out;
         for (size_t i = 0; i < image_embd_v.size(); i++) {
@ -347,8 +342,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
     }
     else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
         // flat / default llava-1.5 type embedding
+        *n_img_pos = clip_n_patches(ctx_clip);
         clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
-        *n_img_pos = clip_n_output_tokens(ctx_clip, img_res);
         bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096
         if (!encoded) {
             LOG_ERR("Unable to encode image\n");
@ -386,8 +381,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
     struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size);

     int n_img_pos_out;
-    clip_image_f32 * img_input = clip_image_f32_get_img(img_res_v.get(), 0);
-    clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out, img_input);
+    clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out);
     *n_img_pos = n_img_pos_out;

     for (size_t i = 0; i < image_embd_v.size(); i++) {
@ -462,7 +456,7 @@ struct llava_embd_batch {
     std::vector<llama_seq_id *> seq_ids;
     std::vector<int8_t>         logits;
     llama_batch batch;
-    llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+    llava_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
         pos     .resize(n_tokens);
         n_seq_id.resize(n_tokens);
         seq_ids .resize(n_tokens + 1);
@ -474,6 +468,7 @@ struct llava_embd_batch {
             /*n_tokens       =*/ n_tokens,
             /*tokens         =*/ nullptr,
             /*embd           =*/ embd,
+            /*n_embd         =*/ n_embd,
             /*pos            =*/ pos.data(),
             /*n_seq_id       =*/ n_seq_id.data(),
             /*seq_id         =*/ seq_ids.data(),
@ -497,7 +492,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
             n_eval = n_batch;
         }
         float * embd = image_embed->embed+i*n_embd;
-        llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0);
+        llava_embd_batch llava_batch = llava_embd_batch(embd, n_embd, n_eval, *n_past, 0);
         if (llama_decode(ctx_llama, llava_batch.batch)) {
             LOG_ERR("%s : failed to eval\n", __func__);
             return false;
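The call-site change above can be summarized as follows; this fragment reuses the surrounding variables of llava_eval_image_embed and shows the v0.6.6-rc2 signature, which passes the embedding width explicitly:

// rc2 side: the batch helper receives n_embd alongside the pointer to the embeddings.
float * embd = image_embed->embed + i * n_embd;
llava_embd_batch llava_batch = llava_embd_batch(embd, n_embd, n_eval, *n_past, 0);
if (llama_decode(ctx_llama, llava_batch.batch)) {
    LOG_ERR("%s : failed to eval\n", __func__);
    return false;
}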
@ -1,4 +1,4 @@
-package mtmd
+package llava

 // #cgo CXXFLAGS: -std=c++11
 // #cgo CPPFLAGS: -I${SRCDIR}/../../include -I${SRCDIR}/../../common
69 llama/llama.cpp/include/llama.h vendored
@ -4,7 +4,6 @@
 #include "ggml.h"
 #include "ggml-cpu.h"
 #include "ggml-backend.h"
-#include "ggml-opt.h"

 #include <stddef.h>
 #include <stdint.h>
@ -112,8 +111,6 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_TRILLION   = 31,
         LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
         LLAMA_VOCAB_PRE_TYPE_LLAMA4     = 33,
-        LLAMA_VOCAB_PRE_TYPE_PIXTRAL    = 34,
-        LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
     };

     enum llama_rope_type {
@ -258,6 +255,7 @@ extern "C" {

         llama_token  *  token;
         float        *  embd;
+        int32_t         n_embd;
         llama_pos    *  pos;
         int32_t      *  n_seq_id;
         llama_seq_id ** seq_id;
@ -353,18 +351,20 @@ extern "C" {
         enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
         enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]

+        // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
+        // TODO: move at the end of the struct
+        bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
+        bool embeddings;  // if true, extract embeddings (together with logits)
+        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
+        bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
+        bool no_perf;     // whether to measure performance timings
+        bool cross_attn;  // whether to use cross attention
+
         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
         // currently works only with CPU execution
         ggml_abort_callback abort_callback;
         void *              abort_callback_data;
-
-        // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
-        bool embeddings;  // if true, extract embeddings (together with logits)
-        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
-        bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
-        bool no_perf;     // whether to measure performance timings
-        bool op_offload;  // whether to offload host tensor operations to device
     };

     // model quantization parameters
@ -446,10 +446,6 @@ extern "C" {
                              size_t n_paths,
              struct llama_model_params params);

-    LLAMA_API void llama_model_save_to_file(
-            const struct llama_model * model,
-                        const char * path_model);
-
     DEPRECATED(LLAMA_API void llama_free_model(struct llama_model * model),
             "use llama_model_free instead");

@ -464,6 +460,10 @@ extern "C" {
                      struct llama_context_params params),
             "use llama_init_from_model instead");

+    // TODO (jmorganca): this should most likely be passed in as part of a batch
+    // and not set on the context for all batches.
+    LLAMA_API void llama_set_cross_attention(struct llama_context * ctx, bool cross_attn_state);
+
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);

@ -929,19 +929,14 @@ extern "C" {
     // Frees a batch of tokens allocated with llama_batch_init()
     LLAMA_API void llama_batch_free(struct llama_batch batch);

-    // Process a batch of tokens.
-    // In contrast to llama_decode() - this call does not use KV cache.
-    // For encode-decoder contexts, processes the batch using the encoder.
-    // Can store the encoder output internally for later use by the decoder's cross-attention layers.
+    // Processes a batch of tokens with the ecoder part of the encoder-decoder model.
+    // Stores the encoder output internally for later use by the decoder cross-attention layers.
     // 0 - success
     // < 0 - error. the KV cache state is restored to the state before this call
     LLAMA_API int32_t llama_encode(
             struct llama_context * ctx,
               struct llama_batch   batch);

-    // Process a batch of tokens.
-    // Requires KV cache.
-    // For encode-decoder contexts, processes the batch using the decoder.
     // Positive return values does not mean a fatal error, but rather a warning.
     // 0 - success
     // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
@ -1242,7 +1237,6 @@ extern "C" {
                 "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");

     /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    /// Setting k <= 0 makes this a noop
     LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);

     /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
@ -1438,37 +1432,6 @@ extern "C" {
     LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
     LLAMA_API void llama_perf_sampler_reset(      struct llama_sampler * chain);

-    //
-    // training
-    //
-
-    // function that returns whether or not a given tensor contains trainable parameters
-    typedef bool (*llama_opt_param_filter)(const struct ggml_tensor * tensor, void * userdata);
-
-    // always returns true
-    LLAMA_API bool llama_opt_param_filter_all(const struct ggml_tensor * tensor, void * userdata);
-
-    struct llama_opt_params {
-        uint32_t n_ctx_train; // assumed context size post training, use context size specified in llama_context if 0
-
-        llama_opt_param_filter param_filter; // callback for determining which tensors contain trainable parameters
-        void * param_filter_ud;              // userdata for determining which tensors contain trainable parameters
-
-        ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
-        void * get_opt_pars_ud;                     // userdata for calculating optimizer parameters
-    };
-
-    LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params);
-
-    LLAMA_API void llama_opt_epoch(
-            struct llama_context    * lctx,
-            ggml_opt_dataset_t        dataset,
-            ggml_opt_result_t         result_train,
-            ggml_opt_result_t         result_eval,
-            int64_t                   idata_split,
-            ggml_opt_epoch_callback   callback_train,
-            ggml_opt_epoch_callback   callback_eval);
-
 #ifdef __cplusplus
 }
 #endif
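llama_set_cross_attention, declared on the v0.6.6-rc2 side above, is a per-context switch rather than a batch flag. A sketch of how a caller might use it around decoding image-conditioned tokens; ctx is an assumed llama_context pointer and the batch variable is assumed to already be prepared:

// Sketch only, based on the declaration added in the hunk above.
llama_set_cross_attention(ctx, true);   // enable cross attention before decoding the image-conditioned batch
// llama_decode(ctx, batch_with_image_embeddings);
llama_set_cross_attention(ctx, false);  // back to text-only decoding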
6 llama/llama.cpp/src/llama-adapter.cpp vendored
@ -253,9 +253,6 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
     std::vector<ggml_backend_buffer_type_t> buft_extra;
     {
         auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-        if (!cpu_dev) {
-            throw std::runtime_error(format("%s: no CPU backend found", __func__));
-        }
         auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);

         auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
@ -294,9 +291,6 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
                 LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));

                 auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-                if (!cpu_dev) {
-                    throw std::runtime_error(format("%s: no CPU backend found", __func__));
-                }
                 buft = ggml_backend_dev_buffer_type(cpu_dev);

                 break;
104 llama/llama.cpp/src/llama-arch.cpp vendored
@ -6,6 +6,7 @@

 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_LLAMA,            "llama"            },
+    { LLM_ARCH_MLLAMA,           "mllama"           },
     { LLM_ARCH_LLAMA4,           "llama4"           },
     { LLM_ARCH_DECI,             "deci"             },
     { LLM_ARCH_FALCON,           "falcon"           },
@ -19,7 +20,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_REFACT,           "refact"           },
     { LLM_ARCH_BERT,             "bert"             },
     { LLM_ARCH_NOMIC_BERT,       "nomic-bert"       },
-    { LLM_ARCH_NOMIC_BERT_MOE,   "nomic-bert-moe"   },
     { LLM_ARCH_JINA_BERT_V2,     "jina-bert-v2"     },
     { LLM_ARCH_BLOOM,            "bloom"            },
     { LLM_ARCH_STABLELM,         "stablelm"         },
@ -73,6 +73,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
     { LLM_ARCH_PLM,              "plm"              },
     { LLM_ARCH_BAILINGMOE,       "bailingmoe"       },
+    { LLM_ARCH_MISTRAL3,         "mistral3"         },
     { LLM_ARCH_UNKNOWN,          "(unknown)"        },
 };

@ -108,7 +109,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_WEIGHTS_SCALE,              "%s.expert_weights_scale"              },
     { LLM_KV_EXPERT_WEIGHTS_NORM,               "%s.expert_weights_norm"               },
     { LLM_KV_EXPERT_GATING_FUNC,                "%s.expert_gating_func"                },
-    { LLM_KV_MOE_EVERY_N_LAYERS,                "%s.moe_every_n_layers"                },
     { LLM_KV_POOLING_TYPE,                      "%s.pooling_type"                      },
     { LLM_KV_LOGIT_SCALE,                       "%s.logit_scale"                       },
     { LLM_KV_DECODER_START_TOKEN_ID,            "%s.decoder_start_token_id"            },
@ -144,8 +144,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_SLIDING_WINDOW,          "%s.attention.sliding_window"          },
     { LLM_KV_ATTENTION_SCALE,                   "%s.attention.scale"                   },
     { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,   "%s.attention.block_skip_connection"   },
-    { LLM_KV_ATTENTION_KEY_LENGTH_MLA,          "%s.attention.key_length_mla"          },
-    { LLM_KV_ATTENTION_VALUE_LENGTH_MLA,        "%s.attention.value_length_mla"        },
+    { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,  "%s.attention.cross_attention_layers"  },

     { LLM_KV_ROPE_DIMENSION_COUNT,              "%s.rope.dimension_count"              },
     { LLM_KV_ROPE_DIMENSION_SECTIONS,           "%s.rope.dimension_sections"           },
@ -273,6 +272,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP_SHEXP,    "blk.%d.ffn_up_shexp" },
         },
     },
+    {
+        LLM_ARCH_MLLAMA,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,           "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,          "output_norm" },
+            { LLM_TENSOR_OUTPUT,               "output" },
+            { LLM_TENSOR_ROPE_FREQS,           "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,            "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,               "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,               "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,               "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,             "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD,        "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_GATE_INP,         "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM,             "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,             "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,             "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,               "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_EXP,         "blk.%d.ffn_gate.%d" },
+            { LLM_TENSOR_FFN_DOWN_EXP,         "blk.%d.ffn_down.%d" },
+            { LLM_TENSOR_FFN_UP_EXP,           "blk.%d.ffn_up.%d" },
+            { LLM_TENSOR_FFN_GATE_EXPS,        "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,        "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,          "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_CROSS_ATTN_K_NORM,    "blk.%d.cross_attn_k_norm" },
+            { LLM_TENSOR_CROSS_ATTN_K_PROJ,    "blk.%d.cross_attn_k_proj" },
+            { LLM_TENSOR_CROSS_ATTN_O_PROJ,    "blk.%d.cross_attn_o_proj" },
+            { LLM_TENSOR_CROSS_ATTN_Q_NORM,    "blk.%d.cross_attn_q_norm" },
+            { LLM_TENSOR_CROSS_ATTN_Q_PROJ,    "blk.%d.cross_attn_q_proj" },
+            { LLM_TENSOR_CROSS_ATTN_V_PROJ,    "blk.%d.cross_attn_v_proj" },
+            { LLM_TENSOR_CROSS_ATTN_ATTN_GATE, "blk.%d.cross_attn_attn_gate" },
+            { LLM_TENSOR_CROSS_ATTN_MLP_GATE,  "blk.%d.cross_attn_mlp_gate" },
+        },
+    },
     {
         LLM_ARCH_DECI,
         {
@ -476,24 +509,6 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
-    {
-        LLM_ARCH_NOMIC_BERT_MOE,
-        {
-            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
-            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
-            { LLM_TENSOR_TOKEN_TYPES,     "token_types" },
-            { LLM_TENSOR_ATTN_OUT_NORM,   "blk.%d.attn_output_norm" },
-            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
-            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
-            { LLM_TENSOR_LAYER_OUT_NORM,  "blk.%d.layer_output_norm" },
-            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
-            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
-            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
-            { LLM_TENSOR_FFN_GATE_INP,    "blk.%d.ffn_gate_inp" },
-            { LLM_TENSOR_FFN_DOWN_EXPS,   "blk.%d.ffn_down_exps" },
-            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
-        },
-    },
     {
         LLM_ARCH_JINA_BERT_V2,
         {
@ -1127,8 +1142,6 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_ATTN_Q_B,        "blk.%d.attn_q_b" },
             { LLM_TENSOR_ATTN_KV_A_MQA,   "blk.%d.attn_kv_a_mqa" },
             { LLM_TENSOR_ATTN_KV_B,       "blk.%d.attn_kv_b" },
-            { LLM_TENSOR_ATTN_K_B,        "blk.%d.attn_k_b" },
-            { LLM_TENSOR_ATTN_V_B,        "blk.%d.attn_v_b" },
             { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
             { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
             { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
@ -1570,6 +1583,22 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP_SHEXP,    "blk.%d.ffn_up_shexp" },
         },
     },
+    {
+        LLM_ARCH_MISTRAL3,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+        }
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@ -1607,8 +1636,23 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_ATTN_Q_B,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_KV_A_MQA,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_KV_B,                  {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_K_B,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_V_B,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_DEC_ATTN_Q,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_DEC_ATTN_K,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_Q,                     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_K,                     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_V,                     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_QKV,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_OUT,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_FFN_GATE,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_FFN_DOWN,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_FFN_UP,                     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_FFN_DOWN_SHEXP,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_FFN_GATE_SHEXP,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_FFN_UP_SHEXP,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_Q_A,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_Q_B,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_KV_A_MQA,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_KV_B,                  {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_Q,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_K,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_V,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
@ -1701,6 +1745,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     // this tensor is loaded for T5, but never used
     {LLM_TENSOR_DEC_CROSS_ATTN_REL_B,       {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
     {LLM_TENSOR_BSKCN_TV,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_CROSS_ATTN_K_NORM,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_CROSS_ATTN_K_PROJ,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CROSS_ATTN_O_PROJ,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CROSS_ATTN_Q_NORM,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_CROSS_ATTN_Q_PROJ,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CROSS_ATTN_V_PROJ,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CROSS_ATTN_ATTN_GATE,       {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_CROSS_ATTN_MLP_GATE,        {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_CONV1D,                     {LLM_TENSOR_LAYER_INPUT,     GGML_OP_IM2COL}},
     {LLM_TENSOR_POS_NET_NORM,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_POS_NET_NORM1,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
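The %d in the cross-attention tensor name patterns above is the block (layer) index, so resolving a concrete tensor name is a simple format step; for example, layer 3's K projection:

// Illustrative only; the pattern string is taken from the table above.
char name[64];
snprintf(name, sizeof(name), "blk.%d.cross_attn_k_proj", 3); // -> "blk.3.cross_attn_k_proj"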
17 llama/llama.cpp/src/llama-arch.h vendored
@ -11,6 +11,7 @@
 enum llm_arch {
     LLM_ARCH_LLAMA,
     LLM_ARCH_LLAMA4,
+    LLM_ARCH_MLLAMA,
     LLM_ARCH_DECI,
     LLM_ARCH_FALCON,
     LLM_ARCH_BAICHUAN,
@ -23,7 +24,6 @@ enum llm_arch {
     LLM_ARCH_REFACT,
     LLM_ARCH_BERT,
     LLM_ARCH_NOMIC_BERT,
-    LLM_ARCH_NOMIC_BERT_MOE,
     LLM_ARCH_JINA_BERT_V2,
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
@ -75,6 +75,7 @@ enum llm_arch {
     LLM_ARCH_CHAMELEON,
     LLM_ARCH_SOLAR,
     LLM_ARCH_WAVTOKENIZER_DEC,
+    LLM_ARCH_MISTRAL3,
     LLM_ARCH_PLM,
     LLM_ARCH_BAILINGMOE,
     LLM_ARCH_UNKNOWN,
@ -112,7 +113,6 @@ enum llm_kv {
     LLM_KV_EXPERT_WEIGHTS_SCALE,
     LLM_KV_EXPERT_WEIGHTS_NORM,
     LLM_KV_EXPERT_GATING_FUNC,
-    LLM_KV_MOE_EVERY_N_LAYERS,
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
     LLM_KV_DECODER_START_TOKEN_ID,
@ -148,8 +148,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_SLIDING_WINDOW,
     LLM_KV_ATTENTION_SCALE,
     LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
-    LLM_KV_ATTENTION_KEY_LENGTH_MLA,
-    LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
+    LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,

     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_DIMENSION_SECTIONS,
@ -312,8 +311,6 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_Q_B,
     LLM_TENSOR_ATTN_KV_A_MQA,
     LLM_TENSOR_ATTN_KV_B,
-    LLM_TENSOR_ATTN_K_B,
-    LLM_TENSOR_ATTN_V_B,
     LLM_TENSOR_ATTN_Q_A_NORM,
     LLM_TENSOR_ATTN_KV_A_NORM,
     LLM_TENSOR_ATTN_SUB_NORM,
@ -349,6 +346,14 @@ enum llm_tensor {
     LLM_TENSOR_CLS,
     LLM_TENSOR_CLS_OUT,
     LLM_TENSOR_BSKCN_TV,
+    LLM_TENSOR_CROSS_ATTN_K_NORM,
+    LLM_TENSOR_CROSS_ATTN_K_PROJ,
+    LLM_TENSOR_CROSS_ATTN_O_PROJ,
+    LLM_TENSOR_CROSS_ATTN_Q_NORM,
+    LLM_TENSOR_CROSS_ATTN_Q_PROJ,
+    LLM_TENSOR_CROSS_ATTN_V_PROJ,
+    LLM_TENSOR_CROSS_ATTN_ATTN_GATE,
+    LLM_TENSOR_CROSS_ATTN_MLP_GATE,
     LLM_TENSOR_CONV1D,
     LLM_TENSOR_CONVNEXT_DW,
     LLM_TENSOR_CONVNEXT_NORM,
9 llama/llama.cpp/src/llama-batch.cpp vendored
@ -189,7 +189,7 @@ llama_ubatch llama_sbatch::split_seq(size_t n_ubatch) {
|
|||||||
return ubatch;
|
return ubatch;
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
|
void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
|
||||||
GGML_ASSERT(batch.n_tokens >= 0);
|
GGML_ASSERT(batch.n_tokens >= 0);
|
||||||
this->batch = &batch;
|
this->batch = &batch;
|
||||||
this->n_embd = n_embd;
|
this->n_embd = n_embd;
|
||||||
@ -203,7 +203,6 @@ llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple
|
|||||||
for (size_t i = 0; i < n_tokens; ++i) {
|
for (size_t i = 0; i < n_tokens; ++i) {
|
||||||
ids[i] = i;
|
ids[i] = i;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (simple_split) {
|
if (simple_split) {
|
||||||
seq.resize(1);
|
seq.resize(1);
|
||||||
llama_sbatch_seq & s = seq[0];
|
llama_sbatch_seq & s = seq[0];
|
||||||
@ -213,7 +212,6 @@ llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple
|
|||||||
s.length = n_tokens;
|
s.length = n_tokens;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::sort(ids.begin(), ids.end(),
|
std::sort(ids.begin(), ids.end(),
|
||||||
[&batch](size_t a, size_t b) {
|
[&batch](size_t a, size_t b) {
|
||||||
int32_t n_seq_a = batch.n_seq_id ? batch.n_seq_id[a] : 1;
|
int32_t n_seq_a = batch.n_seq_id ? batch.n_seq_id[a] : 1;
|
||||||
@ -241,7 +239,6 @@ llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple
|
|||||||
return n_seq_a > n_seq_b;
|
return n_seq_a > n_seq_b;
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
|
|
||||||
// init seq
|
// init seq
|
||||||
llama_sbatch_seq * last_seq = nullptr;
|
llama_sbatch_seq * last_seq = nullptr;
|
||||||
|
|
||||||
@ -265,7 +262,6 @@ llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple
|
|||||||
seq.push_back(new_seq);
|
seq.push_back(new_seq);
|
||||||
last_seq = &seq.back();
|
last_seq = &seq.back();
|
||||||
}
|
}
|
||||||
|
|
||||||
// keep shared prompts first at the end, then sort by length descending.
|
// keep shared prompts first at the end, then sort by length descending.
|
||||||
std::sort(seq.begin(), seq.end(),
|
std::sort(seq.begin(), seq.end(),
|
||||||
[](llama_sbatch_seq & a, llama_sbatch_seq & b) {
|
[](llama_sbatch_seq & a, llama_sbatch_seq & b) {
|
||||||
@ -320,6 +316,7 @@ struct llama_batch llama_batch_get_one(
|
|||||||
/*n_tokens =*/ n_tokens,
|
/*n_tokens =*/ n_tokens,
|
||||||
/*tokens =*/ tokens,
|
/*tokens =*/ tokens,
|
||||||
/*embd =*/ nullptr,
|
/*embd =*/ nullptr,
|
||||||
|
/*n_embd =*/ 0,
|
||||||
/*pos =*/ nullptr,
|
/*pos =*/ nullptr,
|
||||||
/*n_seq_id =*/ nullptr,
|
/*n_seq_id =*/ nullptr,
|
||||||
/*seq_id =*/ nullptr,
|
/*seq_id =*/ nullptr,
|
||||||
@ -332,6 +329,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
|
|||||||
/*n_tokens =*/ 0,
|
/*n_tokens =*/ 0,
|
||||||
/*tokens =*/ nullptr,
|
/*tokens =*/ nullptr,
|
||||||
/*embd =*/ nullptr,
|
/*embd =*/ nullptr,
|
||||||
|
/*n_embd =*/ 0,
|
||||||
/*pos =*/ nullptr,
|
/*pos =*/ nullptr,
|
||||||
/*n_seq_id =*/ nullptr,
|
/*n_seq_id =*/ nullptr,
|
||||||
/*seq_id =*/ nullptr,
|
/*seq_id =*/ nullptr,
|
||||||
@ -340,6 +338,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
|
|||||||
|
|
||||||
if (embd) {
|
if (embd) {
|
||||||
batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
|
batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
|
||||||
|
batch.n_embd = embd;
|
||||||
} else {
|
} else {
|
||||||
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
|
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
|
||||||
}
|
}
|
||||||
3 llama/llama.cpp/src/llama-batch.h vendored
@@ -70,8 +70,7 @@ struct llama_sbatch {
 // sequence-wise split
 llama_ubatch split_seq(size_t n_ubatch);

-llama_sbatch() = default;
-llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
+void from_batch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
 };

 // temporary allocate memory for the input batch if needed
55 llama/llama.cpp/src/llama-chat.cpp vendored
@@ -35,7 +35,6 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
 { "mistral-v3", LLM_CHAT_TEMPLATE_MISTRAL_V3 },
 { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
 { "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 },
-{ "mistral-v7-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN },
 { "phi3", LLM_CHAT_TEMPLATE_PHI_3 },
 { "phi4", LLM_CHAT_TEMPLATE_PHI_4 },
 { "falcon3", LLM_CHAT_TEMPLATE_FALCON_3 },
@@ -51,8 +50,8 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
 { "deepseek3", LLM_CHAT_TEMPLATE_DEEPSEEK_3 },
 { "command-r", LLM_CHAT_TEMPLATE_COMMAND_R },
 { "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 },
-{ "chatglm3", LLM_CHAT_TEMPLATE_CHATGLM_3 },
-{ "chatglm4", LLM_CHAT_TEMPLATE_CHATGLM_4 },
+{ "chatglm3", LLM_CHAT_TEMPLATE_CHATGML_3 },
+{ "chatglm4", LLM_CHAT_TEMPLATE_CHATGML_4 },
 { "glmedge", LLM_CHAT_TEMPLATE_GLMEDGE },
 { "minicpm", LLM_CHAT_TEMPLATE_MINICPM },
 { "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
@@ -63,7 +62,6 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
 { "yandex", LLM_CHAT_TEMPLATE_YANDEX },
 { "bailing", LLM_CHAT_TEMPLATE_BAILING },
 { "llama4", LLM_CHAT_TEMPLATE_LLAMA4 },
-{ "smolvlm", LLM_CHAT_TEMPLATE_SMOLVLM },
 };

 llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -83,9 +81,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
 if (tmpl_contains("<|im_start|>")) {
 return tmpl_contains("<|im_sep|>")
 ? LLM_CHAT_TEMPLATE_PHI_4
-: tmpl_contains("<end_of_utterance>")
-? LLM_CHAT_TEMPLATE_SMOLVLM // SmolVLM uses <|im_start|> as BOS, but it is NOT chatml
-: LLM_CHAT_TEMPLATE_CHATML;
+: LLM_CHAT_TEMPLATE_CHATML;
 } else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) {
 if (tmpl_contains("[SYSTEM_PROMPT]")) {
 return LLM_CHAT_TEMPLATE_MISTRAL_V7;
@@ -123,12 +119,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
 }
 } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
 return LLM_CHAT_TEMPLATE_PHI_3;
-} else if (tmpl_contains("[gMASK]<sop>")) {
-return LLM_CHAT_TEMPLATE_CHATGLM_4;
 } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
 return tmpl_contains("</s>") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE;
-} else if (tmpl_contains("<|{{ item['role'] }}|>") && tmpl_contains("<|begin_of_image|>")) {
-return LLM_CHAT_TEMPLATE_GLMEDGE;
 } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
 return LLM_CHAT_TEMPLATE_ZEPHYR;
 } else if (tmpl_contains("bos_token + message['role']")) {
@@ -157,7 +149,9 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
 return LLM_CHAT_TEMPLATE_LLAMA_3;
 } else if (tmpl_contains("[gMASK]sop")) {
 // chatglm3-6b
-return LLM_CHAT_TEMPLATE_CHATGLM_3;
+return LLM_CHAT_TEMPLATE_CHATGML_3;
+} else if (tmpl_contains("[gMASK]<sop>")) {
+return LLM_CHAT_TEMPLATE_CHATGML_4;
 } else if (tmpl_contains(LU8("<用户>"))) {
 // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
 return LLM_CHAT_TEMPLATE_MINICPM;
@@ -203,20 +197,19 @@ int32_t llm_chat_apply_template(
 if (add_ass) {
 ss << "<|im_start|>assistant\n";
 }
-} else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7 || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN) {
+} else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7) {
 // Official mistral 'v7' template
 // See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7
-// https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503#basic-instruct-template-v7-tekken
-const char * trailing_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7 ? " " : "";
 for (auto message : chat) {
 std::string role(message->role);
 std::string content(message->content);
 if (role == "system") {
-ss << "[SYSTEM_PROMPT]" << trailing_space << content << "[/SYSTEM_PROMPT]";
+ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]";
 } else if (role == "user") {
-ss << "[INST]" << trailing_space << content << "[/INST]";
-} else {
-ss << trailing_space << content << "</s>";
+ss << "[INST] " << content << "[/INST]";
+}
+else {
+ss << " " << content << "</s>";
 }
 }
 } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1
@@ -439,7 +432,7 @@ int32_t llm_chat_apply_template(
 if (add_ass) {
 ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
 }
-} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_3) {
+} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) {
 // chatglm3-6b
 ss << "[gMASK]" << "sop";
 for (auto message : chat) {
@@ -449,14 +442,14 @@ int32_t llm_chat_apply_template(
 if (add_ass) {
 ss << "<|assistant|>";
 }
-} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4) {
+} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) {
 ss << "[gMASK]" << "<sop>";
 for (auto message : chat) {
 std::string role(message->role);
 ss << "<|" << role << "|>" << "\n" << message->content;
 }
 if (add_ass) {
-ss << "<|assistant|>\n";
+ss << "<|assistant|>";
 }
 } else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
 for (auto message : chat) {
@@ -627,23 +620,7 @@ int32_t llm_chat_apply_template(
 if (add_ass) {
 ss << "<|header_start|>assistant<|header_end|>\n\n";
 }
-} else if (tmpl == LLM_CHAT_TEMPLATE_SMOLVLM) {
-// SmolVLM
-ss << "<|im_start|>"; // uses <|im_start|> as BOS, but the actual content is NOT chatml
-for (auto message : chat) {
-std::string role(message->role);
-if (role == "system") {
-ss << message->content << "\n\n";
-} else if (role == "user") {
-ss << "User: " << message->content << "<end_of_utterance>\n";
-} else {
-ss << "Assistant: " << message->content << "<end_of_utterance>\n";
-}
-}
-if (add_ass) {
-ss << "Assistant:";
-}
-} else {
+} else {
 // template not supported
 return -1;
 }
6 llama/llama.cpp/src/llama-chat.h vendored
@@ -14,7 +14,6 @@ enum llm_chat_template {
 LLM_CHAT_TEMPLATE_MISTRAL_V3,
 LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
 LLM_CHAT_TEMPLATE_MISTRAL_V7,
-LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN,
 LLM_CHAT_TEMPLATE_PHI_3,
 LLM_CHAT_TEMPLATE_PHI_4,
 LLM_CHAT_TEMPLATE_FALCON_3,
@@ -30,8 +29,8 @@ enum llm_chat_template {
 LLM_CHAT_TEMPLATE_DEEPSEEK_3,
 LLM_CHAT_TEMPLATE_COMMAND_R,
 LLM_CHAT_TEMPLATE_LLAMA_3,
-LLM_CHAT_TEMPLATE_CHATGLM_3,
-LLM_CHAT_TEMPLATE_CHATGLM_4,
+LLM_CHAT_TEMPLATE_CHATGML_3,
+LLM_CHAT_TEMPLATE_CHATGML_4,
 LLM_CHAT_TEMPLATE_GLMEDGE,
 LLM_CHAT_TEMPLATE_MINICPM,
 LLM_CHAT_TEMPLATE_EXAONE_3,
@@ -42,7 +41,6 @@ enum llm_chat_template {
 LLM_CHAT_TEMPLATE_YANDEX,
 LLM_CHAT_TEMPLATE_BAILING,
 LLM_CHAT_TEMPLATE_LLAMA4,
-LLM_CHAT_TEMPLATE_SMOLVLM,
 LLM_CHAT_TEMPLATE_UNKNOWN,
 };

925 llama/llama.cpp/src/llama-context.cpp vendored
(File diff suppressed because it is too large)
79 llama/llama.cpp/src/llama-context.h vendored
@@ -8,7 +8,6 @@
 #include "llama-kv-cache.h"

 #include "ggml-cpp.h"
-#include "ggml-opt.h"

 #include <map>
 #include <vector>
@@ -29,12 +28,7 @@ struct llama_context {

 void synchronize();

 const llama_model & get_model() const;
-const llama_cparams & get_cparams() const;
-
-ggml_backend_sched_t get_sched() const;
-
-ggml_context * get_ctx_compute() const;

 uint32_t n_ctx() const;
 uint32_t n_ctx_per_seq() const;
@@ -72,6 +66,7 @@ struct llama_context {
 void set_embeddings (bool value);
 void set_causal_attn(bool value);
 void set_warmup(bool value);
+void set_cross_attn(bool value);

 void set_adapter_lora(
 llama_adapter_lora * adapter,
@@ -135,32 +130,6 @@ struct llama_context {
 llama_perf_context_data perf_get_data() const;
 void perf_reset();

-//
-// training
-//
-
-void opt_init(struct llama_model * model, struct llama_opt_params lopt_params);
-
-void opt_epoch(
-ggml_opt_dataset_t dataset,
-ggml_opt_result_t result_train,
-ggml_opt_result_t result_eval,
-int64_t idata_split,
-ggml_opt_epoch_callback callback_train,
-ggml_opt_epoch_callback callback_eval);
-
-void opt_epoch_iter(
-ggml_opt_dataset_t dataset,
-ggml_opt_result_t result,
-const std::vector<llama_token> & tokens,
-const std::vector<llama_token> & labels_sparse,
-llama_batch & batch,
-ggml_opt_epoch_callback callback,
-bool train,
-int64_t idata_in_loop,
-int64_t ndata_in_loop,
-int64_t t_loop_start);
-
 private:
 //
 // output
@@ -170,30 +139,51 @@ private:
 // Returns max number of outputs for which space was reserved.
 int32_t output_reserve(int32_t n_outputs);

+// make the outputs have the same order they had in the user-provided batch
+// TODO: maybe remove this
+void output_reorder();
+
 //
 // graph
 //

-public:
 int32_t graph_max_nodes() const;

 // zero-out inputs and create the ctx_compute for the compute graph
 ggml_cgraph * graph_init();

-// returns the result of ggml_backend_sched_graph_compute_async execution
-ggml_status graph_compute(
-ggml_cgraph * gf,
-bool batched);
-
-private:
 llm_graph_result_ptr graph_build(
 ggml_context * ctx,
 ggml_cgraph * gf,
 const llama_ubatch & ubatch,
 llm_graph_type gtype);

+// returns the result of ggml_backend_sched_graph_compute_async execution
+ggml_status graph_compute(
+ggml_cgraph * gf,
+bool batched);
+
 llm_graph_cb graph_get_cb() const;

+// used by kv_self_update()
+ggml_tensor * build_rope_shift(
+ggml_context * ctx0,
+ggml_tensor * cur,
+ggml_tensor * shift,
+ggml_tensor * factors,
+float freq_base,
+float freq_scale,
+ggml_backend_buffer * bbuf) const;
+
+llm_graph_result_ptr build_kv_self_shift(
+ggml_context * ctx0,
+ggml_cgraph * gf) const;
+
+llm_graph_result_ptr build_kv_self_defrag(
+ggml_context * ctx0,
+ggml_cgraph * gf,
+const std::vector<struct llama_kv_defrag_move> & moves) const;
+
 // TODO: read/write lora adapters and cvec
 size_t state_write_data(llama_io_write_i & io);
 size_t state_read_data (llama_io_read_i & io);
@@ -210,10 +200,14 @@ private:
 llama_cparams cparams;
 llama_adapter_cvec cvec;
 llama_adapter_loras loras;
+llama_sbatch sbatch;

 llama_cross cross; // TODO: tmp for handling cross-attention - need something better probably

-std::unique_ptr<llama_memory_i> memory;
+std::unique_ptr<llama_kv_cache_unified> kv_self;

+// TODO: remove
+bool logits_all = false;
+
 // decode output (2-dimensional array: [n_outputs][n_vocab])
 size_t logits_size = 0; // capacity (of floats) for logits
@@ -240,9 +234,6 @@ private:

 ggml_context_ptr ctx_compute;

-// training
-ggml_opt_context_t opt_ctx = nullptr;
-
 ggml_threadpool_t threadpool = nullptr;
 ggml_threadpool_t threadpool_batch = nullptr;

2 llama/llama.cpp/src/llama-cparams.h vendored
@@ -29,8 +29,8 @@ struct llama_cparams {
 bool offload_kqv;
 bool flash_attn;
 bool no_perf;
+bool cross_attn;
 bool warmup;
-bool op_offload;

 enum llama_pooling_type pooling_type;

49 llama/llama.cpp/src/llama-grammar.cpp vendored
@@ -907,7 +907,6 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(

 struct llama_grammar * llama_grammar_init_impl(
 const struct llama_vocab * vocab,
-const struct ollama_vocab * ollama_vocab,
 const llama_grammar_element ** rules,
 size_t n_rules,
 size_t start_rule_index) {
@@ -963,7 +962,6 @@ struct llama_grammar * llama_grammar_init_impl(
 // then the pointers would be invalidated when the local vec_rules goes out of scope.
 return new llama_grammar {
 vocab,
-ollama_vocab,
 std::move(vec_rules),
 std::move(stacks),
 /* .partial_utf8 = */ {},
@@ -977,7 +975,6 @@ struct llama_grammar * llama_grammar_init_impl(

 struct llama_grammar * llama_grammar_init_impl(
 const struct llama_vocab * vocab,
-const struct ollama_vocab * ollama_vocab,
 const char * grammar_str,
 const char * grammar_root,
 bool lazy,
@@ -1070,7 +1067,6 @@ struct llama_grammar * llama_grammar_init_impl(
 // then the pointers would be invalidated when the local vec_rules goes out of scope.
 return new llama_grammar {
 vocab,
-ollama_vocab,
 std::move(vec_rules),
 std::move(stacks),
 /* .partial_utf8 = */ {},
@@ -1093,7 +1089,6 @@ void llama_grammar_free_impl(struct llama_grammar * grammar) {
 struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & grammar) {
 auto * result = new llama_grammar {
 grammar.vocab,
-grammar.o_vocab,
 grammar.rules,
 grammar.stacks,
 grammar.partial_utf8,
@@ -1121,6 +1116,7 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
 }

 void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_data_array * cur_p) {
+GGML_ASSERT(grammar.vocab != nullptr);

 if (grammar.awaiting_trigger) {
 return;
@@ -1142,13 +1138,9 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_

 for (size_t i = 0; i < cur_p->size; ++i) {
 const llama_token id = cur_p->data[i].id;
-const std::string piece = grammar.o_vocab ?
-grammar.o_vocab->token_to_piece(id) :
-grammar.vocab->token_to_piece(id);
+const std::string & piece = grammar.vocab->token_to_piece(id);

-const bool is_eog = grammar.o_vocab ? grammar.o_vocab->is_eog(id) : grammar.vocab->is_eog(id);
-
-if (is_eog) {
+if (grammar.vocab->is_eog(id)) {
 if (!allow_eog) {
 cur_p->data[i].logit = -INFINITY;
 }
@@ -1167,10 +1159,9 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
 }

 void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token) {
+GGML_ASSERT(grammar.vocab != nullptr);

-const std::string piece = grammar.o_vocab ?
-grammar.o_vocab->token_to_piece(token) :
-grammar.vocab->token_to_piece(token);
+const auto & piece = grammar.vocab->token_to_piece(token);

 if (grammar.awaiting_trigger) {
 if (std::find(grammar.trigger_tokens.begin(), grammar.trigger_tokens.end(), token) != grammar.trigger_tokens.end()) {
@@ -1200,14 +1191,13 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
 }
 }

-const bool is_eog = grammar.o_vocab ? grammar.o_vocab->is_eog(token) : grammar.vocab->is_eog(token);
-if (is_eog) {
+if (grammar.vocab->is_eog(token)) {
 for (const auto & stack : grammar.stacks) {
 if (stack.empty()) {
 return;
 }
 }
-GGML_ABORT("grammar error: end of grammar token received but grammar stack is not empty");
+GGML_ABORT("fatal error");
 }

 llama_grammar_accept_str(grammar, piece);
@@ -1227,28 +1217,3 @@ void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string
 throw std::runtime_error("Unexpected empty grammar stack after accepting piece: " + piece);
 }
 }
-
-
-const std::string & ollama_vocab::token_to_piece(const uint32_t token) const {
-try {
-return token_to_piece_map.at(token);
-} catch (const std::out_of_range&) {
-throw std::runtime_error("Token not found in vocabulary: " + std::to_string(token));
-}
-}
-
-void ollama_vocab::add_token_pieces(const uint32_t* tokens, size_t n_tokens, const char** pieces) {
-for (size_t i = 0; i < n_tokens; i++) {
-token_to_piece_map[tokens[i]] = pieces[i];
-}
-}
-
-bool ollama_vocab::is_eog(const uint32_t token) const {
-return special_eog_ids.count(token) > 0;
-}
-
-void ollama_vocab::set_eog_tokens(const uint32_t* tokens, size_t n_tokens) {
-for (size_t i = 0; i < n_tokens; i++) {
-special_eog_ids.insert(tokens[i]);
-}
-}
14 llama/llama.cpp/src/llama-grammar.h vendored
@@ -6,19 +6,8 @@
 #include <regex>
 #include <string>
 #include <vector>
-#include <set>

 struct llama_vocab;
-struct ollama_vocab {
-std::map<uint32_t, std::string> token_to_piece_map;
-std::set<uint32_t> special_eog_ids;
-
-const std::string & token_to_piece(const uint32_t token) const;
-void add_token_pieces(const uint32_t* tokens, size_t n_tokens, const char** pieces);
-void set_eog_tokens(const uint32_t* tokens, size_t n_tokens);
-bool is_eog(const uint32_t token) const;
-
-};

 // grammar element type
 enum llama_gretype {
@@ -125,7 +114,6 @@ struct llama_grammar_trigger_pattern {
 struct llama_grammar {
 // note: allow null vocab for testing (not great)
 const llama_vocab * vocab;
-const ollama_vocab * o_vocab;

 const llama_grammar_rules rules; // TODO: shared ptr
 llama_grammar_stacks stacks;
@@ -153,14 +141,12 @@ struct llama_grammar {
 // note: needed for tests (not great)
 struct llama_grammar * llama_grammar_init_impl(
 const struct llama_vocab * vocab,
-const struct ollama_vocab * ollama_vocab,
 const llama_grammar_element ** rules,
 size_t n_rules,
 size_t start_rule_index);

 struct llama_grammar * llama_grammar_init_impl(
 const struct llama_vocab * vocab,
-const struct ollama_vocab * ollama_vocab,
 const char * grammar_str,
 const char * grammar_root,
 bool lazy,
170 llama/llama.cpp/src/llama-graph.cpp vendored
@@ -55,21 +55,7 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
 if (ubatch->pos && pos) {
 const int64_t n_tokens = ubatch->n_tokens;

-if (ubatch->token && n_pos_per_embd == 4) {
-// in case we're using M-RoPE with text tokens, convert the 1D positions to 4D
-// the 3 first dims are the same, and 4th dim is all 0
-std::vector<llama_pos> pos_data(n_tokens*n_pos_per_embd);
-// copy the first dimension
-for (int i = 0; i < n_tokens; ++i) {
-pos_data[ i] = ubatch->pos[i];
-pos_data[ n_tokens + i] = ubatch->pos[i];
-pos_data[2 * n_tokens + i] = ubatch->pos[i];
-pos_data[3 * n_tokens + i] = 0; // 4th dim is 0
-}
-ggml_backend_tensor_set(pos, pos_data.data(), 0, pos_data.size()*ggml_element_size(pos));
-} else {
-ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_embd*ggml_element_size(pos));
-}
+ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*ggml_element_size(pos));
 }
 }

@@ -85,7 +71,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
 ) * f_attn_temp_scale + 1.0;
 }

-ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*ggml_element_size(attn_scale));
+ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*n_pos_per_token*ggml_element_size(attn_scale));
 }
 }

@@ -284,7 +270,24 @@ void llm_graph_input_s_copy::set_input(const llama_ubatch * ubatch) {

 // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
 for (uint32_t i = 0; i < n_kv; ++i) {
-data[i] = kv_self->s_copy(i);
+const uint32_t cell_id = i + kv_self->head;
+
+//////////////////////////////////////////////
+// TODO: this should not mutate the KV cache !
+llama_kv_cell & kv_cell = const_cast<class llama_kv_cache_unified *>(kv_self)->cells[i];
+
+// prevent out-of-bound sources
+if (kv_cell.src < 0 || (uint32_t) kv_cell.src >= kv_self->size) {
+kv_cell.src = cell_id;
+}
+
+data[i] = kv_cell.src;
+
+// TODO: do not mutate the KV cache
+// ensure copy only happens once
+if (kv_cell.src != (int32_t) cell_id) {
+kv_cell.src = cell_id;
+}
 }
 }
 }
@@ -300,7 +303,18 @@ void llm_graph_input_s_mask::set_input(const llama_ubatch * ubatch) {

 // clear unused states
 for (int i = 0; i < n_kv; ++i) {
-data[i] = kv_self->s_mask(i);
+const uint32_t cell_id = i + kv_self->head;
+
+//////////////////////////////////////////////
+// TODO: this should not mutate the KV cache !
+llama_kv_cell & kv_cell = const_cast<class llama_kv_cache_unified *>(kv_self)->cells[i];
+
+data[i] = (float) (kv_cell.src >= 0);
+
+// only clear once
+if (kv_cell.src < 0) {
+kv_cell.src = cell_id;
+}
 }
 }
 }
@@ -532,6 +546,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
 }
 }

+void llm_graph_input_cross_attn_state::set_input(const llama_ubatch * ubatch) {
+if (ubatch->embd) {
+ggml_backend_tensor_set(cross_attn_state, ubatch->embd, 0, ggml_nbytes(cross_attn_state));
+}
+}
+
 //
 // llm_graph_context
 //
@@ -578,7 +598,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
 res (std::make_unique<llm_graph_result>()) {
 }

-int64_t llm_graph_context::n_pos_per_embd() const {
+int64_t llm_graph_context::n_pos_per_token() const {
 return arch == LLM_ARCH_QWEN2VL ? 4 : 1;
 }

@@ -782,17 +802,13 @@ ggml_tensor * llm_graph_context::build_ffn(
 } break;
 }

-if (gate && type_gate == LLM_FFN_PAR) {
+if (type_gate == LLM_FFN_PAR) {
 cur = ggml_mul(ctx0, cur, tmp);
 cb(cur, "ffn_gate_par", il);
 }

 if (down) {
 cur = build_lora_mm(down, cur);
-if (arch == LLM_ARCH_GLM4) {
-// GLM4 seems to have numerical issues with half-precision accumulators
-ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
-}
 }

 if (down_b) {
@@ -900,35 +916,28 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
 ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
 cb(up, "ffn_moe_up", il);

-ggml_tensor * experts = nullptr;
-if (gate_exps) {
-cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
-cb(cur, "ffn_moe_gate", il);
-} else {
-cur = up;
-}
+ggml_tensor * gate = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+cb(gate, "ffn_moe_gate", il);

 switch (type_op) {
 case LLM_FFN_SILU:
 {
-cur = ggml_silu(ctx0, cur);
-cb(cur, "ffn_moe_silu", il);
+gate = ggml_silu(ctx0, gate);
+cb(gate, "ffn_moe_silu", il);
 } break;
 case LLM_FFN_GELU:
 {
-cur = ggml_gelu(ctx0, cur);
-cb(cur, "ffn_moe_gelu", il);
+gate = ggml_gelu(ctx0, gate);
+cb(gate, "ffn_moe_gelu", il);
 } break;
 default:
 GGML_ABORT("fatal error");
 }

-if (gate_exps) {
-cur = ggml_mul(ctx0, cur, up); // [n_ff, n_expert_used, n_tokens]
-cb(cur, "ffn_moe_gate_par", il);
-}
+ggml_tensor * par = ggml_mul(ctx0, up, gate); // [n_ff, n_expert_used, n_tokens]
+cb(par, "ffn_moe_gate_par", il);

-experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
+ggml_tensor * experts = build_lora_mm_id(down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
 cb(experts, "ffn_moe_down", il);

 if (!weight_before_ffn) {
@@ -971,7 +980,6 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
 inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
 //cb(inp->tokens, "inp_tokens", -1);
 ggml_set_input(inp->tokens);
-res->t_tokens = inp->tokens;

 cur = ggml_get_rows(ctx0, tok_embd, inp->tokens);

@@ -1012,11 +1020,11 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
 }

 ggml_tensor * llm_graph_context::build_inp_pos() const {
-auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_embd());
+auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_token());

 auto & cur = inp->pos;

-cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_embd());
+cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token());
 ggml_set_input(cur);

 res->add_input(std::move(inp));
@@ -1025,12 +1033,11 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
 }

 ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
-auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
+auto inp = std::make_unique<llm_graph_input_attn_temp>(n_pos_per_token(), hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);

 auto & cur = inp->attn_scale;

-// this need to be 1x1xN for broadcasting
-cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens);
+cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens*n_pos_per_token());
 ggml_set_input(cur);

 res->add_input(std::move(inp));
@@ -1078,7 +1085,7 @@ ggml_tensor * llm_graph_context::build_inp_cls() const {
 }

 ggml_tensor * llm_graph_context::build_inp_s_copy() const {
-const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
+const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);

 auto inp = std::make_unique<llm_graph_input_s_copy>(kv_self);

@@ -1095,7 +1102,7 @@ ggml_tensor * llm_graph_context::build_inp_s_copy() const {
 }

 ggml_tensor * llm_graph_context::build_inp_s_mask() const {
-const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
+const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);

 auto inp = std::make_unique<llm_graph_input_s_mask>(kv_self);

@@ -1187,7 +1194,6 @@ ggml_tensor * llm_graph_context::build_attn_mha(
 ggml_tensor * v,
 ggml_tensor * kq_b,
 ggml_tensor * kq_mask,
-ggml_tensor * v_mla,
 bool v_trans,
 float kq_scale) const {
 //const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
@@ -1199,6 +1205,8 @@ ggml_tensor * llm_graph_context::build_attn_mha(
 //const auto & n_embd_head_k = hparams.n_embd_head_k;
 //const auto & n_embd_head_v = hparams.n_embd_head_v;

+const auto n_embd_head_v = v_trans ? v->ne[1] : v->ne[0];
+
 const auto n_tokens = q->ne[1];
 const auto n_head = q->ne[2];
 const auto n_kv = k->ne[1];
@@ -1227,23 +1235,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(

 ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);

-if (v_mla) {
-#if 0
-// v_mla can be applied as a matrix-vector multiplication with broadcasting across dimension 3 == n_tokens.
-// However, the code is optimized for dimensions 0 and 1 being large, so this is ineffient.
-cur = ggml_reshape_4d(ctx0, cur, v_mla->ne[0], 1, n_head, n_tokens);
-cur = ggml_mul_mat(ctx0, v_mla, cur);
-#else
-// It's preferable to do the calculation as a matrix-matrix multiplication with n_tokens in dimension 1.
-// The permutations are noops and only change how the tensor data is interpreted.
-cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
-cur = ggml_mul_mat(ctx0, v_mla, cur);
-cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
-cur = ggml_cont(ctx0, cur); // Needed because ggml_reshape_2d expects contiguous inputs.
-#endif
-}
-
-cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens);
+cur = ggml_reshape_2d(ctx0, cur, n_embd_head_v*n_head, n_tokens);
 } else {
 ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);

@@ -1281,14 +1273,9 @@ ggml_tensor * llm_graph_context::build_attn_mha(

 ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);

-// for MLA with the absorption optimization, we need to "decompress" from MQA back to MHA
-if (v_mla) {
-kqv = ggml_mul_mat(ctx0, v_mla, kqv);
-}
-
-cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
+ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);

-cur = ggml_cont_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens);
+cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens);

 if (!cparams.offload_kqv) {
 // all nodes between the KV store and the attention output are run on the CPU
@@ -1323,7 +1310,6 @@ ggml_tensor * llm_graph_context::build_attn(
 ggml_tensor * k_cur,
 ggml_tensor * v_cur,
 ggml_tensor * kq_b,
-ggml_tensor * v_mla,
 float kq_scale,
 int il) const {
 GGML_UNUSED(n_tokens);
@@ -1345,7 +1331,7 @@ ggml_tensor * llm_graph_context::build_attn(
 ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3);
 //cb(k, "v", il);

-ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, false, kq_scale);
+ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, false, kq_scale);

 cb(cur, "kqv_out", il);

@@ -1399,7 +1385,6 @@ ggml_tensor * llm_graph_context::build_attn(
 ggml_tensor * k_cur,
 ggml_tensor * v_cur,
 ggml_tensor * kq_b,
-ggml_tensor * v_mla,
 float kq_scale,
 int il) const {
 // these nodes are added to the graph together so that they are not reordered
@@ -1420,6 +1405,8 @@ ggml_tensor * llm_graph_context::build_attn(

 // store to KV cache
 {
+GGML_ASSERT(!kv_self->recurrent);
+
 const auto kv_head = kv_self->head;

 GGML_ASSERT(kv_self->size == n_ctx);
@@ -1483,7 +1470,7 @@ ggml_tensor * llm_graph_context::build_attn(
 ggml_element_size(kv_self->v_l[il])*n_ctx*n_embd_head_v,
 0);

-ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, v_trans, kq_scale);
+ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_trans, kq_scale);
 cb(cur, "kqv_out", il);

 if (wo) {
@@ -1514,6 +1501,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
 return (llm_graph_input_attn_cross *) res->add_input(std::move(inp));
 }

+ggml_tensor * llm_graph_context::build_inp_cross_attn_state() const {
+const int64_t n_embd = hparams.n_embd;
+
+auto inp = std::make_unique<llm_graph_input_cross_attn_state>();
+
+ggml_tensor * cur = nullptr;
+
+inp->cross_attn_state = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd, 1601, 4);
+ggml_set_input(inp->cross_attn_state);
+
+cur = inp->cross_attn_state;
+
+cb(cur, "inp_cross_attn_state", -1);
+
+res->add_input(std::move(inp));
+
+return cur;
+}
+
 ggml_tensor * llm_graph_context::build_attn(
 llm_graph_input_attn_cross * inp,
 ggml_cgraph * gf,
@@ -1523,7 +1529,6 @@ ggml_tensor * llm_graph_context::build_attn(
 ggml_tensor * k_cur,
 ggml_tensor * v_cur,
 ggml_tensor * kq_b,
-ggml_tensor * v_mla,
 float kq_scale,
 int il) const {
 // these nodes are added to the graph together so that they are not reordered
@@ -1543,7 +1548,7 @@ ggml_tensor * llm_graph_context::build_attn(
 ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3);
 //cb(k, "v", il);

-ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, false, kq_scale);
+ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, false, kq_scale);

 cb(cur, "kqv_out", il);

@@ -1569,7 +1574,7 @@ ggml_tensor * llm_graph_context::build_copy_mask_state(
 ggml_tensor * state_mask,
 int32_t n_state,
 int32_t n_seqs) const {
-const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
+const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);

 const auto n_kv = kv_self->n;
 const auto kv_head = kv_self->head;
@@ -1601,7 +1606,7 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(
 ggml_tensor * state_mask,
 const llama_ubatch & ubatch,
 int il) const {
-const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
+const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);

 const auto token_shift_count = hparams.token_shift_count;

@@ -1622,7 +1627,7 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_store(
 ggml_tensor * token_shift,
 const llama_ubatch & ubatch,
 int il) const {
-const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
+const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);

 const auto token_shift_count = hparams.token_shift_count;
 const auto n_embd = hparams.n_embd;
@@ -1712,3 +1717,4 @@ void llm_graph_context::build_pooling(

 ggml_build_forward_expand(gf, cur);
 }
+
|
54
llama/llama.cpp/src/llama-graph.h
vendored
54
llama/llama.cpp/src/llama-graph.h
vendored
@@ -19,7 +19,6 @@ struct llama_cparams;

class llama_memory_i;
class llama_kv_cache_unified;
-class llama_kv_cache_recurrent;

// certain models (typically multi-modal) can produce different types of graphs
enum llm_graph_type {
@@ -87,31 +86,34 @@ public:

    ggml_tensor * tokens = nullptr; // I32 [n_batch]
    ggml_tensor * embd = nullptr;   // F32 [n_embd, n_batch]
+    ggml_tensor * cross_attn_state; // F32 [4, n_embd, 1061]
};

class llm_graph_input_pos : public llm_graph_input_i {
public:
-    llm_graph_input_pos(int64_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {}
+    llm_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {}
    virtual ~llm_graph_input_pos() = default;

    void set_input(const llama_ubatch * ubatch) override;

    ggml_tensor * pos = nullptr; // I32 [n_batch]

-    const int64_t n_pos_per_embd = 1;
+    const int64_t n_pos_per_token = 1;
};

// temperature tuning, used by llama4
class llm_graph_input_attn_temp : public llm_graph_input_i {
public:
-    llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
-        : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
+    llm_graph_input_attn_temp(int64_t n_pos_per_token, uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
+        : n_pos_per_token(n_pos_per_token), n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
    virtual ~llm_graph_input_attn_temp() = default;

    void set_input(const llama_ubatch * ubatch) override;

    ggml_tensor * attn_scale = nullptr; // F32 [n_batch]

+    const int64_t n_pos_per_token = 1;
+
    const uint32_t n_attn_temp_floor_scale;
    const float f_attn_temp_scale;
};
@@ -187,26 +189,26 @@ public:

class llm_graph_input_s_copy : public llm_graph_input_i {
public:
-    llm_graph_input_s_copy(const llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {}
+    llm_graph_input_s_copy(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {}
    virtual ~llm_graph_input_s_copy() = default;

    void set_input(const llama_ubatch * ubatch) override;

    ggml_tensor * s_copy; // I32 [kv_size]

-    const llama_kv_cache_recurrent * kv_self;
+    const llama_kv_cache_unified * kv_self;
};

class llm_graph_input_s_mask : public llm_graph_input_i {
public:
-    llm_graph_input_s_mask(const llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {}
+    llm_graph_input_s_mask(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {}
    virtual ~llm_graph_input_s_mask() = default;

    void set_input(const llama_ubatch * ubatch) override;

    ggml_tensor * s_mask; // F32 [1, n_kv]

-    const llama_kv_cache_recurrent * kv_self;
+    const llama_kv_cache_unified * kv_self;
};

class llm_graph_input_cross_embd : public llm_graph_input_i {
@@ -284,6 +286,16 @@ public:
    const llama_cross * cross = nullptr;
};

+class llm_graph_input_cross_attn_state : public llm_graph_input_i {
+public:
+    llm_graph_input_cross_attn_state() = default;
+    virtual ~llm_graph_input_cross_attn_state() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
+    ggml_tensor * cross_attn_state; // F32 [4, n_embd, 1061]
+};
+
//
// llm_graph_result
//
@@ -298,7 +310,6 @@ class llm_graph_result_i {
public:
    virtual ~llm_graph_result_i() = default;

-    virtual ggml_tensor * get_tokens() = 0;
    virtual ggml_tensor * get_logits() = 0;
    virtual ggml_tensor * get_embd() = 0;
    virtual ggml_tensor * get_embd_pooled() = 0;
@@ -313,7 +324,6 @@ class llm_graph_result : public llm_graph_result_i {
public:
    virtual ~llm_graph_result() = default;

-    ggml_tensor * get_tokens() override { return t_tokens; }
    ggml_tensor * get_logits() override { return t_logits; }
    ggml_tensor * get_embd() override { return t_embd; }
    ggml_tensor * get_embd_pooled() override { return t_embd_pooled; }
@@ -330,7 +340,6 @@ public:
    }

    // important graph nodes
-    ggml_tensor * t_tokens = nullptr;
    ggml_tensor * t_logits = nullptr;
    ggml_tensor * t_embd = nullptr;
    ggml_tensor * t_embd_pooled = nullptr;
@@ -354,8 +363,8 @@ struct llm_graph_params {
    const llama_cparams & cparams;
    const llama_ubatch & ubatch;

-    ggml_backend_sched_t sched;
-    ggml_backend_t backend_cpu;
+    ggml_backend_sched * sched;
+    ggml_backend * backend_cpu;

    const llama_adapter_cvec * cvec;
    const llama_adapter_loras * loras;
@@ -406,9 +415,9 @@ struct llm_graph_context {

    ggml_context * ctx0 = nullptr;

-    ggml_backend_sched_t sched;
+    ggml_backend_sched * sched;

-    ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?
+    ggml_backend * backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?

    const llama_adapter_cvec * cvec;
    const llama_adapter_loras * loras;
@@ -421,7 +430,7 @@ struct llm_graph_context {

    llm_graph_context(const llm_graph_params & params);

-    int64_t n_pos_per_embd() const;
+    int64_t n_pos_per_token() const;

    void cb(ggml_tensor * cur, const char * name, int il) const;

@@ -495,6 +504,7 @@ struct llm_graph_context {
    ggml_tensor * build_inp_cls() const;
    ggml_tensor * build_inp_s_copy() const;
    ggml_tensor * build_inp_s_mask() const;
+    ggml_tensor * build_inp_cross_attn_state() const;

    ggml_tensor * build_inp_cross_embd() const;
    ggml_tensor * build_inp_pos_bucket_enc() const;
@@ -507,12 +517,11 @@ struct llm_graph_context {

    ggml_tensor * build_attn_mha(
            ggml_cgraph * gf,
            ggml_tensor * q, // [n_embd_head_q, n_tokens, n_head_q]
            ggml_tensor * k, // [n_embd_head_k, n_tokens, n_head_k]
            ggml_tensor * v, // [n_embd_head_v, n_tokens, n_head_v] (v_trans == false)
            ggml_tensor * kq_b,
            ggml_tensor * kq_mask,
-            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
            bool v_trans,
            float kq_scale) const;

@@ -527,7 +536,6 @@ struct llm_graph_context {
            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
            ggml_tensor * kq_b,
-            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
            float kq_scale,
            int il) const;

@@ -542,7 +550,6 @@ struct llm_graph_context {
            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
            ggml_tensor * kq_b,
-            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
            float kq_scale,
            int il) const;

@@ -557,7 +564,6 @@ struct llm_graph_context {
            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
            ggml_tensor * kq_b,
-            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
            float kq_scale,
            int il) const;

4 llama/llama.cpp/src/llama-hparams.cpp vendored
@@ -85,3 +85,7 @@ bool llama_hparams::is_swa(uint32_t il) const {

    GGML_ABORT("fatal error");
}
+
+bool llama_hparams::cross_attention_layers(uint32_t il) const {
+    return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
+}
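A side note, not part of the upstream diff: the helper added above is a plain membership test over hparams.cross_attn_layers. A hypothetical caller that builds layers differently depending on the result might look like the sketch below; build_cross_attn_layer and build_self_attn_layer are invented placeholder names, not functions from this repository.

// Hedged sketch: per-layer dispatch on the new helper. Only
// cross_attention_layers(il) comes from the diff above; the two
// build_* calls are placeholders.
for (uint32_t il = 0; il < hparams.n_layer; ++il) {
    if (hparams.cross_attention_layers(il)) {
        build_cross_attn_layer(il); // layer index listed in hparams.cross_attn_layers
    } else {
        build_self_attn_layer(il);
    }
}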
12 llama/llama.cpp/src/llama-hparams.h vendored
@@ -2,6 +2,8 @@

#include "llama.h"

+#include <algorithm>
+
#include <array>

// bump if necessary
@@ -42,10 +44,7 @@ struct llama_hparams {
    uint32_t n_expert = 0;
    uint32_t n_expert_used = 0;
    uint32_t n_rel_attn_bkts = 0;
+    uint32_t n_vocab = 0;
-    // note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
-    uint32_t n_embd_head_k_mla = 0;
-    uint32_t n_embd_head_v_mla = 0;

    // for WavTokenizer
    struct llama_hparams_posnet posnet;
@@ -56,6 +55,7 @@ struct llama_hparams {
    std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;

    std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr = {};
+    std::array<uint32_t, LLAMA_MAX_LAYERS> cross_attn_layers;

    uint32_t n_layer_dense_lead = 0;
    uint32_t n_lora_q = 0;
@@ -68,7 +68,6 @@ struct llama_hparams {
    float expert_weights_scale = 0.0;
    bool expert_weights_norm = false;
    uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
-    uint32_t moe_every_n_layers = 0;

    float f_norm_eps;
    float f_norm_rms_eps;
@@ -159,6 +158,9 @@ struct llama_hparams {
    // Block skip connection
    bool n_bskcn(uint32_t n, uint32_t il) const;

+    // cross attention layers
+    bool cross_attention_layers(uint32_t il) const;
+
    bool is_swa(uint32_t il) const;
};

1826 llama/llama.cpp/src/llama-kv-cache.cpp vendored
File diff suppressed because it is too large.
367 llama/llama.cpp/src/llama-kv-cache.h vendored
@@ -2,72 +2,32 @@

#include "llama.h"
#include "llama-io.h"
-#include "llama-graph.h"
#include "llama-memory.h"

#include "ggml-cpp.h"

+#include <functional>
#include <set>
#include <vector>

struct llama_cparams;
struct llama_hparams;
struct llama_ubatch;
-struct llama_sbatch;
-struct llama_model;
-struct llama_context;

struct llama_kv_cache : public llama_memory_i {
-    virtual ~llama_kv_cache() = default;
+    using llama_memory_i::llama_memory_i;

-    // call if batch processing fails - restores the cache state
-    virtual void restore() = 0;
+    virtual void restore() = 0; // call if batch processing fails - restores the cache state
+    virtual void commit() = 0; // call after successful batch processing - clears any pending state

-    // call after successful batch processing - clears any pending state
-    virtual void commit() = 0;
+    virtual int32_t get_n_tokens() const = 0;
+    virtual int32_t get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache

-    // process any pending defrag/shift/etc. operations
-    // optionally call once before processing a new batch
-    virtual bool update(llama_context & lctx) = 0;
+    virtual bool get_can_shift() const = 0;

-    // schedule a defrag if the fragmentation threshold is exceeded. otherwise, do nothing
-    virtual void defrag_sched(float thold) = 0;
-
-    // simulate full cache, used for allocating worst-case compute buffers
-    virtual void set_full() = 0;
-
-    //
-    // batch processing
-    //
-
-    virtual llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) = 0;
-
-    // different KV caches require different batch splitting strategies
-    virtual llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const = 0;
-
-    // find an empty slot of size "n_tokens" in the cache
-    virtual bool find_slot(const llama_ubatch & batch) = 0;
-
-    // getters
-    virtual int32_t get_n_tokens() const = 0;
-    virtual int32_t get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache
-    virtual llama_pos get_pos_max() const = 0;
-    virtual bool get_can_shift() const = 0;

    bool get_can_edit() const override { return get_can_shift(); }
-
-    //
-    // state write/read
-    //
-
-    virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const = 0;
-    virtual void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) = 0;
};

-//
-// llama_kv_cache_guard
-//
-
struct llama_kv_cache_guard {
    llama_kv_cache_guard(llama_kv_cache * kv) : kv(kv) {}

@@ -82,7 +42,7 @@ struct llama_kv_cache_guard {
private:
    llama_kv_cache * kv;
};

// block of KV slots to move when defragging
struct llama_kv_defrag_move {
    uint32_t src;
@@ -90,50 +50,65 @@ struct llama_kv_defrag_move {
    uint32_t len;
};

-//
-// llama_kv_cache_unified
-//
+struct llama_kv_cell {
+    llama_pos pos = -1;
+    llama_pos delta = 0;
+    int32_t src = -1; // used by recurrent state models to copy states
+    int32_t tail = -1;
+
+    std::set<llama_seq_id> seq_id;
+
+    bool has_seq_id(const llama_seq_id & id) const {
+        return seq_id.find(id) != seq_id.end();
+    }
+
+    bool is_empty() const {
+        return seq_id.empty();
+    }
+
+    bool is_same_seq(const llama_kv_cell & other) const {
+        return seq_id == other.seq_id;
+    }
+};
+
+// ring-buffer of cached KV data
+// TODO: pimpl
// TODO: add notion of max sequences
class llama_kv_cache_unified : public llama_kv_cache {
public:
-    struct kv_cell {
-        llama_pos pos = -1;
-        llama_pos delta = 0;
-
-        std::set<llama_seq_id> seq_id;
-
-        bool has_seq_id(const llama_seq_id & id) const {
-            return seq_id.find(id) != seq_id.end();
-        }
-
-        bool is_empty() const {
-            return seq_id.empty();
-        }
-
-        bool is_same_seq(const kv_cell & other) const {
-            return seq_id == other.seq_id;
-        }
+    // can be used to query data from the model if needed
+    struct callbacks {
+        std::function<ggml_tensor * (uint32_t n_ctx_per_seq, int il)> get_rope_factors;
    };

-    static uint32_t get_padding(const llama_cparams & cparams);
-
    llama_kv_cache_unified(
-            const llama_model & model,
+            const llama_hparams & hparams,
+            callbacks cbs);
+
+    virtual ~llama_kv_cache_unified() = default;
+
+    // TODO: become constructor
+    bool init(
+            const llama_model & model, // TODO: do not reference the model
+            const llama_cparams & cparams,
            ggml_type type_k,
            ggml_type type_v,
-            bool v_trans,
-            bool offload,
            uint32_t kv_size,
-            uint32_t padding);
+            bool offload);

-    ~llama_kv_cache_unified() = default;
+    int32_t get_n_tokens() const override;
+    int32_t get_used_cells() const override;

-    //
-    // llama_memory_i
-    //
+    size_t total_size() const;
-    // llama_memory_i
+    // TODO: better data structures to reduce the cost of this operation
+    llama_pos pos_max() const;

    void clear() override;
+    void defrag() override;
+
+    virtual void restore() override;
+    virtual void commit() override;

    bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
    void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
@@ -143,76 +118,25 @@ public:

    llama_pos seq_pos_max(llama_seq_id seq_id) const override;

-    //
-    // llama_kv_cache
-    //
+    bool get_can_shift() const override;

-    void restore() override;
-    void commit() override;
-
-    bool update(llama_context & ctx) override;
-
-    void defrag_sched(float thold) override;
-
-    void set_full() override;
-
-    llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override;
-
-    llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override;
-
+    // find an empty slot of size "n_tokens" in the cache
    // updates the cache head
    // Note: On success, it's important that cache.head points
    // to the first cell of the slot.
-    bool find_slot(const llama_ubatch & batch) override;
+    bool find_slot(const llama_ubatch & batch);

-    int32_t get_n_tokens() const override;
-    int32_t get_used_cells() const override;
+    // TODO: maybe not needed
+    uint32_t get_padding(const llama_cparams & cparams) const;

-    // TODO: better data structures to reduce the cost of this operation
-    llama_pos get_pos_max() const override;
+    // find how many cells are currently in use
+    uint32_t cell_max() const;

-    bool get_can_shift() const override;
+    size_t size_k_bytes() const;
+    size_t size_v_bytes() const;

-    // state write/load
-
-    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
-    void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
-
-    // Note: The value of head isn't only used to optimize searching
-    // for a free KV slot. llama_decode_impl also uses it, so it
-    // cannot be freely changed after a slot has been allocated.
-    uint32_t head = 0;
-    uint32_t size = 0;
-    uint32_t used = 0; // used cells (i.e. at least one seq_id)
-
-    // computed before each graph build
-    uint32_t n = 0;
-
-    std::vector<kv_cell> cells;
-
-    std::vector<ggml_tensor *> k_l; // per layer
-    std::vector<ggml_tensor *> v_l;
-
-private:
-    const llama_model & model;
-    const llama_hparams & hparams;
-
-    bool has_shift = false;
-    bool do_defrag = false;
-
-    bool v_trans = true; // the value tensor is transposed
-    bool can_shift = false;
-
-    // required padding
-    uint32_t padding = 1;
-
-    ggml_type type_k = GGML_TYPE_F16;
-    ggml_type type_v = GGML_TYPE_F16;
-
-    std::vector<ggml_context_ptr> ctxs;
-    std::vector<ggml_backend_buffer_ptr> bufs;
-
    // defrag

    struct {
        std::vector<llama_kv_defrag_move> moves;
    } defrag_info;
@@ -221,6 +145,7 @@ private:
    bool defrag_prepare(int32_t n_max_nodes);

    // commit/restore cache

    struct slot_range {
        uint32_t c0 = 0; // note: these are cell indices, not sequence positions
        uint32_t c1 = 0;
@@ -231,125 +156,25 @@ private:
        std::vector<slot_range> ranges;
    } pending;

-    // find how many cells are currently in use
-    uint32_t cell_max() const;
-
-    size_t total_size() const;
-
-    size_t size_k_bytes() const;
-    size_t size_v_bytes() const;
-
-    ggml_tensor * build_rope_shift(
-            const llama_cparams & cparams,
-            ggml_context * ctx,
-            ggml_tensor * cur,
-            ggml_tensor * shift,
-            ggml_tensor * factors,
-            float freq_base,
-            float freq_scale) const;
-
-    llm_graph_result_ptr build_graph_shift(
-            const llama_cparams & cparams,
-            ggml_context * ctx,
-            ggml_cgraph * gf) const;
-
-    llm_graph_result_ptr build_graph_defrag(
-            const llama_cparams & cparams,
-            ggml_context * ctx,
-            ggml_cgraph * gf,
-            const std::vector<llama_kv_defrag_move> & moves) const;
-
-    void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
-    void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
-
-    bool state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
-    bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
-};
-
-//
-// llama_kv_cache_recurrent
-//
-
-class llama_kv_cache_recurrent : public llama_kv_cache {
-public:
-    struct kv_cell {
-        llama_pos pos = -1;
-        int32_t src = -1; // used to copy states
-        int32_t tail = -1;
-
-        std::set<llama_seq_id> seq_id;
-
-        bool has_seq_id(const llama_seq_id & id) const {
-            return seq_id.find(id) != seq_id.end();
-        }
-
-        bool is_empty() const {
-            return seq_id.empty();
-        }
-
-        bool is_same_seq(const kv_cell & other) const {
-            return seq_id == other.seq_id;
-        }
-    };
-
-    llama_kv_cache_recurrent(
-            const llama_model & model,
-            ggml_type type_k,
-            ggml_type type_v,
-            bool offload,
-            uint32_t kv_size);
-
-    ~llama_kv_cache_recurrent() = default;
-
-    //
-    // llama_memory_i
-    //
-
-    void clear() override;
-
-    bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
-    void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
-    void seq_keep(llama_seq_id seq_id) override;
-    void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) override;
-    void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override;
-
-    llama_pos seq_pos_max(llama_seq_id seq_id) const override;
-
-    //
-    // llama_kv_cache
-    //
-
-    void restore() override;
-    void commit() override;
-
-    bool update(llama_context & lctx) override;
-
-    void defrag_sched(float thold) override;
-
-    void set_full() override;
-
-    llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override;
-
-    llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override;
-
-    bool find_slot(const llama_ubatch & batch) override;
-
-    int32_t get_n_tokens() const override;
-    int32_t get_used_cells() const override;
-
-    // TODO: better data structures to reduce the cost of this operation
-    llama_pos get_pos_max() const override;
-
-    bool get_can_shift() const override;
-
-    // TODO: temporary methods - they are not really const as they do const_cast<>, fix this
-    int32_t s_copy(int i) const;
-    float s_mask(int i) const;
-
    // state write/load

-    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
-    void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const;
+    void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1);
+
+    // members
+
+    const llama_hparams & hparams;
+
+    callbacks cbs;
+
+    bool has_shift = false;
+    bool do_defrag = false;
+
+    // TODO: remove this and implement llama_kv_cache_recurrent instead
+    bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token
+
+    bool v_trans = true; // the value tensor is transposed
+    bool can_shift = false;

    // Note: The value of head isn't only used to optimize searching
    // for a free KV slot. llama_decode_impl also uses it, so it
@@ -361,41 +186,18 @@ public:
    // computed before each graph build
    uint32_t n = 0;

-    std::vector<kv_cell> cells;
+    std::vector<llama_kv_cell> cells;

    std::vector<ggml_tensor *> k_l; // per layer
    std::vector<ggml_tensor *> v_l;

private:
-    //const llama_model & model;
-    const llama_hparams & hparams;
-
-    // commit/restore cache
-    // TODO: rework for recurrent cache
-    struct slot_range {
-        uint32_t c0 = 0; // note: these are cell indices, not sequence positions
-        uint32_t c1 = 0;
-    };
-
-    // pending cell updates that are not yet committed
-    struct {
-        std::vector<slot_range> ranges;
-    } pending;
-
    ggml_type type_k = GGML_TYPE_F16;
    ggml_type type_v = GGML_TYPE_F16;

    std::vector<ggml_context_ptr> ctxs;
    std::vector<ggml_backend_buffer_ptr> bufs;

-    // find how many cells are currently in use
-    uint32_t cell_max() const;
-
-    size_t total_size() const;
-
-    size_t size_k_bytes() const;
-    size_t size_v_bytes() const;
-
    void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
    void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;

@@ -403,6 +205,11 @@ private:
    bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
};

+// TODO: temporary reusing llama_kv_cache_unified -- implement recurrent cache and simplify llama_kv_cache_unified
+//class llama_kv_cache_recurrent : public llama_kv_cache_unified {
+//public:
+//    using llama_kv_cache_unified::llama_kv_cache_unified;
+//};
+
//
// kv cache view
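For orientation only (an editorial sketch, not part of the diff): both versions of the header document the same contract on llama_kv_cache — call restore() when a batch fails and commit() after it succeeds. A minimal hypothetical caller is shown below; decode_one_batch() is an invented placeholder for whatever actually consumes the ubatch.

// Hedged sketch of the commit/restore contract documented above.
// decode_one_batch() is a placeholder, not a real llama.cpp function.
bool process_batch(llama_kv_cache & kv, const llama_ubatch & ubatch) {
    if (!decode_one_batch(kv, ubatch)) {
        kv.restore(); // batch failed: roll back pending cache state
        return false;
    }
    kv.commit();      // batch succeeded: clear pending state
    return true;
}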
12 llama/llama.cpp/src/llama-memory.h vendored
@@ -2,22 +2,12 @@

#include "llama.h"

-struct llama_memory_params {
-    // kv cache
-    ggml_type type_k;
-    ggml_type type_v;
-
-    // parameters for other types of memory
-    // ...
-};
-
// general concept of LLM memory
// the KV cache is a type of LLM memory, but there can be other types
class llama_memory_i {
public:
-    virtual ~llama_memory_i() = default;
-
    virtual void clear() = 0;
+    virtual void defrag() = 0;

    virtual bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) = 0;
    virtual void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) = 0;
26 llama/llama.cpp/src/llama-model-loader.cpp vendored
@@ -301,12 +301,12 @@ namespace GGUFMeta {
        GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);

        switch (arr_info.gt) {
-            case GGUF_TYPE_UINT32:
-            case GGUF_TYPE_INT32:   GGML_ASSERT((std::is_same<T, int32_t>::value) ||
-                                                (std::is_same<T, uint32_t>::value)); break;
-            case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
+            case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
+            case GGUF_TYPE_INT32:   GGML_ASSERT(
+                                        (std::is_same<T, int32_t>::value) ||
+                                        (std::is_same<T, uint32_t>::value)); break;
            default:
-                throw std::runtime_error(format("%s is not a float32/uint32/int32 array", key.c_str()));
+                throw std::runtime_error(format("%s is not a float32, int32 array", key.c_str()));
        }

        result.resize(arr_info.length);
@@ -315,6 +315,8 @@ namespace GGUFMeta {
        return true;
    }

+    template bool llama_model_loader::get_arr<std::array<unsigned int, 512>>(enum llm_kv kid, std::array<unsigned int, 512>& result, bool required);
+
    template<typename T, size_t N_MAX>
    bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
        const int kid = gguf_find_key(meta.get(), key.c_str());
@@ -330,12 +332,12 @@ namespace GGUFMeta {
        GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);

        switch (arr_info.gt) {
-            case GGUF_TYPE_UINT32:
-            case GGUF_TYPE_INT32:   GGML_ASSERT((std::is_same<T, int32_t>::value) ||
-                                                (std::is_same<T, uint32_t>::value)); break;
-            case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
+            case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
+            case GGUF_TYPE_INT32:   GGML_ASSERT(
+                                        (std::is_same<T, int32_t>::value) ||
+                                        (std::is_same<T, uint32_t>::value)); break;
            default:
-                throw std::runtime_error(format("%s is not a float32/uint32/int32 array", key.c_str()));
+                throw std::runtime_error(format("%s is not a float32, int32 array", key.c_str()));
        }

        if (arr_info.length > N_MAX) {
@@ -824,10 +826,6 @@ void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps
    mmaps_used.reserve(files.size());
    for (const auto & file : files) {
        auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
-        if (!reg) {
-            throw std::runtime_error(format("%s: no CPU backend found", __func__));
-        }
-
        auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
        std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa_fn());
        mmaps_used.emplace_back(mapping->size(), 0);
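An aside, not part of the diff: both variants of get_arr shown above follow the same pattern — look up the GGUF key, assert that the stored element type matches the template parameter, then resize and fill the result. A self-contained sketch of the type-check step, using invented stand-in types rather than the real llama.cpp/GGUF API:

// Hedged sketch of the type-checked array read pattern above.
// gguf_elem is an invented stand-in, not a real GGUF type.
#include <cstdint>
#include <stdexcept>
#include <string>
#include <type_traits>

enum class gguf_elem { u32, i32, f32 };

template <typename T>
void check_elem_type(gguf_elem elem, const std::string & key) {
    const bool ok =
        (elem == gguf_elem::f32 && std::is_same<T, float>::value)   ||
        (elem == gguf_elem::i32 && std::is_same<T, int32_t>::value) ||
        (elem == gguf_elem::u32 && std::is_same<T, uint32_t>::value);
    if (!ok) {
        throw std::runtime_error(key + " is not a float32/uint32/int32 array");
    }
}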
281 llama/llama.cpp/src/llama-model-saver.cpp vendored
@@ -1,281 +0,0 @@
-#include "llama-model-saver.h"
-
-#include "gguf.h"
-
-#include "llama.h"
-#include "llama-hparams.h"
-#include "llama-model.h"
-#include "llama-vocab.h"
-
-#include <string>
-
-llama_model_saver::llama_model_saver(const struct llama_model & model) : model(model), llm_kv(model.arch) {
-    gguf_ctx = gguf_init_empty();
-}
-
-llama_model_saver::~llama_model_saver() {
-    gguf_free(gguf_ctx);
-}
-
-void llama_model_saver::add_kv(const enum llm_kv key, const uint32_t value) {
-    gguf_set_val_u32(gguf_ctx, llm_kv(key).c_str(), value);
-}
-
-void llama_model_saver::add_kv(const enum llm_kv key, const int32_t value) {
-    gguf_set_val_i32(gguf_ctx, llm_kv(key).c_str(), value);
-}
-
-void llama_model_saver::add_kv(const enum llm_kv key, const float value) {
-    gguf_set_val_f32(gguf_ctx, llm_kv(key).c_str(), value);
-}
-
-void llama_model_saver::add_kv(const enum llm_kv key, const bool value) {
-    gguf_set_val_bool(gguf_ctx, llm_kv(key).c_str(), value);
-}
-
-void llama_model_saver::add_kv(const enum llm_kv key, const char * value) {
-    gguf_set_val_str(gguf_ctx, llm_kv(key).c_str(), value);
-}
-
-[[noreturn]]
-void llama_model_saver::add_kv(const enum llm_kv key, const char value) {
-    GGML_UNUSED(key);
-    GGML_UNUSED(value);
-    GGML_ABORT("fatal error"); // this should never be called, only needed to make the template below compile
-}
-
-template <typename Container>
-void llama_model_saver::add_kv(const enum llm_kv key, const Container & value, const bool per_layer) {
-    const size_t n_values = per_layer ? size_t(model.hparams.n_layer) : value.size();
-    GGML_ASSERT(n_values <= value.size());
-
-    if (n_values == 0) {
-        return;
-    }
-
-    if (per_layer) {
-        bool all_values_the_same = true;
-        for (size_t i = 1; i < n_values; ++i) {
-            if (value[i] != value[0]) {
-                all_values_the_same = false;
-                break;
-            }
-        }
-        if (all_values_the_same) {
-            add_kv(key, value[0]);
-            return;
-        }
-    }
-
-    if (std::is_same<typename Container::value_type, uint8_t>::value) {
-        gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_UINT8, value.data(), n_values);
-    } else if (std::is_same<typename Container::value_type, int8_t>::value) {
-        gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_INT8, value.data(), n_values);
-    } else if (std::is_same<typename Container::value_type, uint32_t>::value) {
-        gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_UINT32, value.data(), n_values);
-    } else if (std::is_same<typename Container::value_type, int32_t>::value) {
-        gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_INT32, value.data(), n_values);
-    } else if (std::is_same<typename Container::value_type, float>::value) {
-        gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_FLOAT32, value.data(), n_values);
-    } else if (std::is_same<Container, std::string>::value) {
-        gguf_set_val_str(gguf_ctx, llm_kv(key).c_str(), reinterpret_cast<const char *>(value.data()));
-    } else {
-        GGML_ABORT("fatal error");
-    }
-}
-
-void llama_model_saver::add_kv(const enum llm_kv key, const std::vector<std::string> & value) {
-    std::vector<const char *> tmp(value.size());
-    for (size_t i = 0; i < value.size(); ++i) {
-        tmp[i] = value[i].c_str();
-    }
-    gguf_set_arr_str(gguf_ctx, llm_kv(key).c_str(), tmp.data(), tmp.size());
-}
-
-void llama_model_saver::add_tensor(const struct ggml_tensor * tensor) {
-    if (!tensor) {
-        return;
-    }
-    if (gguf_find_tensor(gguf_ctx, tensor->name) >= 0) {
-        GGML_ASSERT(std::string(tensor->name) == "rope_freqs.weight"); // FIXME
-        return;
-    }
-    gguf_add_tensor(gguf_ctx, tensor);
-}
-
-void llama_model_saver::add_kv_from_model() {
-    const llama_hparams & hparams = model.hparams;
-    const llama_vocab & vocab = model.vocab;
-
-    const int32_t n_vocab = vocab.n_tokens();
-    std::vector<std::string> tokens(n_vocab);
-    std::vector<float> scores(n_vocab);
-    std::vector<int32_t> token_types(n_vocab);
-
-    for (int32_t id = 0; id < n_vocab; ++id) {
-        const llama_vocab::token_data & token_data = vocab.get_token_data(id);
-
-        tokens[id] = token_data.text;
-        scores[id] = token_data.score;
-
-        switch(token_data.attr) {
-            case LLAMA_TOKEN_ATTR_UNKNOWN:      token_types[id] = LLAMA_TOKEN_TYPE_UNKNOWN;      break;
-            case LLAMA_TOKEN_ATTR_UNUSED:       token_types[id] = LLAMA_TOKEN_TYPE_UNUSED;       break;
-            case LLAMA_TOKEN_ATTR_NORMAL:       token_types[id] = LLAMA_TOKEN_TYPE_NORMAL;       break;
-            case LLAMA_TOKEN_ATTR_CONTROL:      token_types[id] = LLAMA_TOKEN_TYPE_CONTROL;      break;
-            case LLAMA_TOKEN_ATTR_USER_DEFINED: token_types[id] = LLAMA_TOKEN_TYPE_USER_DEFINED; break;
-            case LLAMA_TOKEN_ATTR_BYTE:         token_types[id] = LLAMA_TOKEN_TYPE_BYTE;         break;
-            case LLAMA_TOKEN_ATTR_UNDEFINED:
-            default:                            token_types[id] = LLAMA_TOKEN_TYPE_UNDEFINED;    break;
-        }
-    }
-
-    // add_kv(LLM_KV_GENERAL_TYPE, ???);
-    add_kv(LLM_KV_GENERAL_ARCHITECTURE, model.arch_name());
-    // add_kv(LLM_KV_GENERAL_QUANTIZATION_VERSION, ???);
-    // add_kv(LLM_KV_GENERAL_ALIGNMENT, ???);
-    add_kv(LLM_KV_GENERAL_NAME, model.name);
-    // add_kv(LLM_KV_GENERAL_AUTHOR, ???);
-    // add_kv(LLM_KV_GENERAL_VERSION, ???);
-    // add_kv(LLM_KV_GENERAL_URL, ???);
-    // add_kv(LLM_KV_GENERAL_DESCRIPTION, ???);
-    // add_kv(LLM_KV_GENERAL_LICENSE, ???);
-    // add_kv(LLM_KV_GENERAL_SOURCE_URL, ???);
-    // add_kv(LLM_KV_GENERAL_SOURCE_HF_REPO, ???);
-
-    add_kv(LLM_KV_VOCAB_SIZE, vocab.n_tokens());
-    add_kv(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
-    add_kv(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
-    add_kv(LLM_KV_BLOCK_COUNT, hparams.n_layer);
-    add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
-    add_kv(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, true);
-    add_kv(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
-    add_kv(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
-    add_kv(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
-    // add_kv(LLM_KV_TENSOR_DATA_LAYOUT, ???);
-    add_kv(LLM_KV_EXPERT_COUNT, hparams.n_expert);
-    add_kv(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
-    add_kv(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
-    add_kv(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
-    add_kv(LLM_KV_POOLING_TYPE, uint32_t(hparams.pooling_type));
-    add_kv(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
-    add_kv(LLM_KV_DECODER_START_TOKEN_ID, hparams.dec_start_token_id);
-    add_kv(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping);
-    add_kv(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping);
-    add_kv(LLM_KV_SWIN_NORM, hparams.swin_norm);
-    add_kv(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers);
-    add_kv(LLM_KV_TIME_MIX_EXTRA_DIM, hparams.time_mix_extra_dim);
-    add_kv(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim);
-    add_kv(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
-    add_kv(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
-
-    add_kv(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, true);
-    add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, true);
-    add_kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
-    add_kv(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
-    add_kv(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k);
-    add_kv(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v);
-    add_kv(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-    add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-    add_kv(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
-    add_kv(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
-    add_kv(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
-    add_kv(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
-    add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
-    add_kv(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
-
-    const float rope_scaling_factor = hparams.rope_freq_scale_train == 1.0f ? 0.0f : 1.0f/hparams.rope_freq_scale_train;
-
-    add_kv(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot);
-    add_kv(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train);
-    // add_kv(LLM_KV_ROPE_SCALE_LINEAR, rope_scaling_factor); // old name
-    add_kv(LLM_KV_ROPE_SCALING_TYPE, llama_rope_scaling_type_name(hparams.rope_scaling_type_train));
-    add_kv(LLM_KV_ROPE_SCALING_FACTOR, rope_scaling_factor);
-    add_kv(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor);
-    add_kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn);
-    add_kv(LLM_KV_ROPE_SCALING_FINETUNED, hparams.rope_finetuned);
-    add_kv(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
-
-    // TODO: implement split file support
-    // add_kv(LLM_KV_SPLIT_NO, ???);
-    // add_kv(LLM_KV_SPLIT_COUNT, ???);
-    // add_kv(LLM_KV_SPLIT_TENSORS_COUNT, ???);
-
-    add_kv(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
-    add_kv(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
-    add_kv(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
-    add_kv(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
-    add_kv(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms);
-
-    add_kv(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
-
-    add_kv(LLM_KV_TOKENIZER_MODEL, vocab.get_tokenizer_model());
-    add_kv(LLM_KV_TOKENIZER_PRE, vocab.get_tokenizer_pre());
-    add_kv(LLM_KV_TOKENIZER_LIST, tokens);
-    add_kv(LLM_KV_TOKENIZER_TOKEN_TYPE, token_types);
-    add_kv(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, vocab.n_token_types());
-    add_kv(LLM_KV_TOKENIZER_SCORES, scores);
-    add_kv(LLM_KV_TOKENIZER_MERGES, vocab.get_bpe_merges());
-    // FIXME llama_token is type i32 but when reading in a GGUF file u32 is expected, not an issue for writing though
-    add_kv(LLM_KV_TOKENIZER_BOS_ID, uint32_t(vocab.token_bos()));
-    add_kv(LLM_KV_TOKENIZER_EOS_ID, uint32_t(vocab.token_eos()));
-    add_kv(LLM_KV_TOKENIZER_EOT_ID, uint32_t(vocab.token_eot()));
-    add_kv(LLM_KV_TOKENIZER_EOM_ID, uint32_t(vocab.token_eom()));
-    add_kv(LLM_KV_TOKENIZER_UNK_ID, uint32_t(vocab.token_unk()));
-    add_kv(LLM_KV_TOKENIZER_SEP_ID, uint32_t(vocab.token_sep()));
-    add_kv(LLM_KV_TOKENIZER_PAD_ID, uint32_t(vocab.token_pad()));
-    // add_kv(LLM_KV_TOKENIZER_CLS_ID, uint32_t(vocab.token_bos())); // deprecated
-    // add_kv(LLM_KV_TOKENIZER_MASK_ID, ???);
-    add_kv(LLM_KV_TOKENIZER_ADD_BOS, vocab.get_add_bos());
-    add_kv(LLM_KV_TOKENIZER_ADD_EOS, vocab.get_add_eos());
-    add_kv(LLM_KV_TOKENIZER_ADD_PREFIX, vocab.get_add_space_prefix());
-    add_kv(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, vocab.get_remove_extra_whitespaces());
-    add_kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, vocab.get_precompiled_charsmap());
-    // add_kv(LLM_KV_TOKENIZER_HF_JSON, ???);
-    // add_kv(LLM_KV_TOKENIZER_RWKV, ???);
-    add_kv(LLM_KV_TOKENIZER_FIM_PRE_ID, uint32_t(vocab.token_fim_pre()));
-    add_kv(LLM_KV_TOKENIZER_FIM_SUF_ID, uint32_t(vocab.token_fim_suf()));
-    add_kv(LLM_KV_TOKENIZER_FIM_MID_ID, uint32_t(vocab.token_fim_mid()));
-    add_kv(LLM_KV_TOKENIZER_FIM_PAD_ID, uint32_t(vocab.token_fim_pad()));
-    add_kv(LLM_KV_TOKENIZER_FIM_REP_ID, uint32_t(vocab.token_fim_rep()));
-    add_kv(LLM_KV_TOKENIZER_FIM_SEP_ID, uint32_t(vocab.token_fim_sep()));
-
-    // TODO: implement LoRA support
-    // add_kv(LLM_KV_ADAPTER_TYPE, ???);
-    // add_kv(LLM_KV_ADAPTER_LORA_ALPHA, ???);
-
-    // deprecated
-    // add_kv(LLM_KV_TOKENIZER_PREFIX_ID, ???);
-    // add_kv(LLM_KV_TOKENIZER_SUFFIX_ID, ???);
-    // add_kv(LLM_KV_TOKENIZER_MIDDLE_ID, ???);
-}
-
-void llama_model_saver::add_tensors_from_model() {
-    if (std::string(model.output->name) != std::string(model.tok_embd->name)) {
-        add_tensor(model.tok_embd); // some models use the same tensor for tok_embd and output
-    }
-    add_tensor(model.type_embd);
-    add_tensor(model.pos_embd);
-    add_tensor(model.tok_norm);
-    add_tensor(model.tok_norm_b);
-    add_tensor(model.output_norm);
-    add_tensor(model.output_norm_b);
-    add_tensor(model.output);
-    add_tensor(model.output_b);
-    add_tensor(model.output_norm_enc);
-    add_tensor(model.cls);
-    add_tensor(model.cls_b);
-    add_tensor(model.cls_out);
-    add_tensor(model.cls_out_b);
-
-    for (const struct llama_layer & layer : model.layers) {
-        for (size_t i = 0; i < sizeof(layer)/sizeof(struct ggml_tensor *); ++i) {
-            add_tensor(reinterpret_cast<const struct ggml_tensor * const *>(&layer)[i]);
-        }
-    }
-}
-
-void llama_model_saver::save(const std::string & path_model) {
-    gguf_write_to_file(gguf_ctx, path_model.c_str(), false);
-}
-
37 llama/llama.cpp/src/llama-model-saver.h vendored
@@ -1,37 +0,0 @@
-#pragma once
-
-#include "llama.h"
-#include "llama-arch.h"
-
-#include <vector>
-
-struct llama_model_saver {
-    struct gguf_context * gguf_ctx = nullptr;
-    const struct llama_model & model;
-    const struct LLM_KV llm_kv;
-
-    llama_model_saver(const struct llama_model & model);
-    ~llama_model_saver();
-
-    void add_kv(enum llm_kv key, uint32_t value);
-    void add_kv(enum llm_kv key, int32_t value);
-    void add_kv(enum llm_kv key, float value);
-    void add_kv(enum llm_kv key, bool value);
-    void add_kv(enum llm_kv key, const char * value);
-
-    [[noreturn]]
-    void add_kv(enum llm_kv key, char value); // needed to make the template below compile
-
-    template <typename Container>
-    void add_kv(enum llm_kv key, const Container & value, bool per_layer = false);
-
-    void add_kv(enum llm_kv key, const std::vector<std::string> & value);
-
-    void add_tensor(const struct ggml_tensor * tensor);
-
-    void add_kv_from_model();
-
-    void add_tensors_from_model();
-
-    void save(const std::string & path_model);
-};
864 llama/llama.cpp/src/llama-model.cpp vendored
File diff suppressed because it is too large.
31 llama/llama.cpp/src/llama-model.h vendored
@@ -11,6 +11,7 @@
#include <string>
#include <unordered_map>
#include <vector>
+#include <stdexcept>

struct llama_cparams;
struct llama_ubatch;
@@ -36,17 +37,14 @@ enum llm_type {
    LLM_TYPE_335M,
    LLM_TYPE_410M,
    LLM_TYPE_450M,
-    LLM_TYPE_475M,
    LLM_TYPE_770M,
    LLM_TYPE_780M,
    LLM_TYPE_0_5B,
-    LLM_TYPE_0_6B,
    LLM_TYPE_1B,
    LLM_TYPE_1_3B,
    LLM_TYPE_1_4B,
    LLM_TYPE_1_5B,
    LLM_TYPE_1_6B,
-    LLM_TYPE_1_7B,
    LLM_TYPE_1_8B,
    LLM_TYPE_2B,
    LLM_TYPE_2_8B,
@@ -66,7 +64,6 @@ enum llm_type {
    LLM_TYPE_16B,
    LLM_TYPE_20B,
    LLM_TYPE_22B,
-    LLM_TYPE_27B,
    LLM_TYPE_30B,
    LLM_TYPE_32B,
    LLM_TYPE_34B,
@@ -74,10 +71,9 @@ enum llm_type {
    LLM_TYPE_40B,
    LLM_TYPE_65B,
    LLM_TYPE_70B,
+    LLM_TYPE_90B,
    LLM_TYPE_236B,
-    LLM_TYPE_290B,
    LLM_TYPE_314B,
-    LLM_TYPE_405B,
    LLM_TYPE_671B,
    LLM_TYPE_SMALL,
    LLM_TYPE_MEDIUM,
@@ -91,14 +87,12 @@ enum llm_type {
    LLM_TYPE_16x3_8B,
    LLM_TYPE_10B_128x3_66B,
    LLM_TYPE_57B_A14B,
+    LLM_TYPE_27B,
+    LLM_TYPE_290B,
    LLM_TYPE_17B_16E, // llama4 Scout
    LLM_TYPE_17B_128E, // llama4 Maverick
-    LLM_TYPE_30B_A3B,
-    LLM_TYPE_235B_A22B,
};

-std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type);
-
struct llama_layer_posnet {
    // resnet
    struct ggml_tensor * norm1 = nullptr;
@@ -180,8 +174,6 @@ struct llama_layer {
    struct ggml_tensor * wq_b = nullptr;
    struct ggml_tensor * wkv_a_mqa = nullptr;
    struct ggml_tensor * wkv_b = nullptr;
-    struct ggml_tensor * wk_b = nullptr;
-    struct ggml_tensor * wv_b = nullptr;
    struct ggml_tensor * wq_cross = nullptr;
    struct ggml_tensor * wk_cross = nullptr;
    struct ggml_tensor * wv_cross = nullptr;
@@ -318,6 +310,16 @@ struct llama_layer {

    struct ggml_tensor * bskcn_tv = nullptr;

+    // cross attention
+    struct ggml_tensor * cross_attn_k_norm = nullptr;
+    struct ggml_tensor * cross_attn_k_proj = nullptr;
+    struct ggml_tensor * cross_attn_o_proj = nullptr;
+    struct ggml_tensor * cross_attn_q_norm = nullptr;
+    struct ggml_tensor * cross_attn_q_proj = nullptr;
+    struct ggml_tensor * cross_attn_v_proj = nullptr;
+    struct ggml_tensor * cross_attn_attn_gate = nullptr;
+    struct ggml_tensor * cross_attn_mlp_gate = nullptr;
+
    struct llama_layer_posnet posnet;

    struct llama_layer_convnext convnext;
@@ -401,11 +403,8 @@ struct llama_model {

    const struct ggml_tensor * get_tensor(const char * name) const;

-    ggml_tensor * get_rope_factors(uint32_t n_ctx_per_seq, int il) const;
-
-    // note: can mutate `cparams`
    // TODO: move this to new llm_arch_model_i interface
-    llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const;
+    llama_memory_i * create_memory() const; // TODO: params

    // TODO: move this to new llm_arch_model_i interface
    llm_graph_result_ptr build_graph(
12
llama/llama.cpp/src/llama-quant.cpp
vendored
@@ -519,7 +519,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         nthread = std::thread::hardware_concurrency();
     }
 
-    // mmap consistently increases speed on Linux, and also increases speed on Windows with
+    // mmap consistently increases speed Linux, and also increases speed on Windows with
     // hot cache. It may cause a slowdown on macOS, possibly related to free memory.
 #if defined(__linux__) || defined(_WIN32)
     constexpr bool use_mmap = true;
@@ -529,7 +529,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 
     llama_model_kv_override * kv_overrides = nullptr;
     if (params->kv_overrides) {
-        auto * v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
+        auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
         kv_overrides = v->data();
     }
 
@@ -639,7 +639,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         if (llama_model_has_encoder(&model)) {
             n_attn_layer *= 3;
         }
-        GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
+        if (qs.n_attention_wv != n_attn_layer) {
+            LLAMA_LOG_WARN("%s: n_attention_wv is unexpected, expected: %d, found: %d\n", __func__, n_attn_layer, qs.n_attention_wv);
+        }
     }
 
     size_t total_size_org = 0;
@@ -742,6 +744,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         // This used to be a regex, but <regex> has an extreme cost to compile times.
         bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
 
+        // don't quantize vision stuff
+        quantize &= name.find("v.") == std::string::npos;
+        quantize &= name.find("mm.") == std::string::npos;
+
         // quantize only 2D and 3D tensors (experts)
         quantize &= (ggml_n_dims(tensor) >= 2);
 
31
llama/llama.cpp/src/llama-sampling.cpp
vendored
@@ -232,7 +232,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
     // }
 
     if (k <= 0) {
-        return;
+        k = cur_p->size;
     }
 
     k = std::min(k, (int) cur_p->size);
@@ -298,7 +298,6 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
         }
         cur_p->sorted = true;
     }
 
     cur_p->size = k;
 }
@@ -1466,7 +1465,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
         trigger_patterns_c.push_back(trigger_pattern.pattern.c_str());
     }
 
-    auto * grammar_new = llama_grammar_init_impl(ctx->grammar->vocab, nullptr, ctx->grammar_str.c_str(), ctx->grammar_root.c_str(),
+    auto * grammar_new = llama_grammar_init_impl(ctx->grammar->vocab, ctx->grammar_str.c_str(), ctx->grammar_root.c_str(),
         ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(),
         ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size());
 
@@ -1548,7 +1547,7 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
         /* .vocab = */ vocab,
         /* .grammar_str = */ grammar_str,
         /* .grammar_root = */ grammar_root,
-        /* .grammar = */ llama_grammar_init_impl(vocab, nullptr, grammar_str, grammar_root, lazy, trigger_patterns, num_trigger_patterns, trigger_tokens, num_trigger_tokens),
+        /* .grammar = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_patterns, num_trigger_patterns, trigger_tokens, num_trigger_tokens),
     };
     if (!ctx->grammar) {
         delete ctx;
@@ -1750,35 +1749,23 @@ static const char * llama_sampler_top_n_sigma_name(const struct llama_sampler *
 static void llama_sampler_top_n_sigma_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     const auto * ctx = (llama_sampler_top_n_sigma *) smpl->ctx;
 
-    if (ctx->n <= 0.0f || cur_p->size <= 1) {
-        return;
-    }
-
     // find max logit and calculate mean
     float max = cur_p->data[0].logit;
     float logits_sum = 0;
-    size_t valid_count = 0;
     for (size_t i = 0; i < cur_p->size; ++i) {
-        // Only count non-negative infinity values
-        if (cur_p->data[i].logit != -INFINITY) {
-            if (cur_p->data[i].logit > max) {
-                max = cur_p->data[i].logit;
-            }
-            logits_sum += cur_p->data[i].logit;
-            valid_count++;
+        if (cur_p->data[i].logit > max) {
+            max = cur_p->data[i].logit;
         }
+        logits_sum += cur_p->data[i].logit;
     }
-    float mean = valid_count > 0 ? logits_sum/valid_count : 0;
+    float mean = logits_sum/cur_p->size;
 
     // calculate standard deviation
     float acc = 0;
     for (size_t i = 0; i < cur_p->size; ++i) {
-        // Skip -infinity in std calculation
-        if (cur_p->data[i].logit != -INFINITY) {
-            acc += pow(cur_p->data[i].logit - mean, 2);
-        }
+        acc += pow(cur_p->data[i].logit - mean, 2);
     }
-    float std = valid_count > 0 ? sqrt(acc/valid_count) : 0;
+    float std = sqrt(acc/cur_p->size);
 
     //apply mask
     for (size_t i = 0; i < cur_p->size; ++i) {
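The top-n-sigma hunk above is where main starts skipping -INFINITY logits when computing the mean and standard deviation. The standalone Go sketch below illustrates that masked computation with made-up logit values; it is an illustration only, not the vendored C++.

package main

import (
    "fmt"
    "math"
)

// maskedMeanStd returns the mean and standard deviation of the finite logits only,
// ignoring entries that earlier samplers have masked out with -Inf.
func maskedMeanStd(logits []float64) (mean, std float64) {
    var sum float64
    var n int
    for _, l := range logits {
        if !math.IsInf(l, -1) {
            sum += l
            n++
        }
    }
    if n == 0 {
        return 0, 0
    }
    mean = sum / float64(n)
    var acc float64
    for _, l := range logits {
        if !math.IsInf(l, -1) {
            acc += (l - mean) * (l - mean)
        }
    }
    std = math.Sqrt(acc / float64(n))
    return mean, std
}

func main() {
    logits := []float64{2.0, 1.0, math.Inf(-1), 3.0}
    mean, std := maskedMeanStd(logits)
    // With the -Inf entry skipped: mean = 2.0, std ≈ 0.816.
    // Summing the -Inf entry and dividing by len(logits), as the v0.6.6-rc2 code does,
    // would drive the mean to -Inf and the deviation to NaN.
    fmt.Printf("mean=%.3f std=%.3f\n", mean, std)
}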
52
llama/llama.cpp/src/llama-vocab.cpp
vendored
@@ -1,7 +1,5 @@
 #include "llama-vocab.h"
 
-#include "ggml.h"
-#include "gguf.h"
 #include "llama-impl.h"
 #include "llama-model-loader.h"
 
@@ -417,13 +415,6 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                 "'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
             };
             break;
-        case LLAMA_VOCAB_PRE_TYPE_SEED_CODER:
-            regex_exprs = {
-                // original regex from tokenizer.json
-                // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\r\n]+|\\s*[\r\n]+|\\s+(?!\\S)|\\s+"
-                "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
-            };
-            break;
         default:
             // default regex for BPE tokenization pre-processing
             regex_exprs = {
@@ -1236,9 +1227,6 @@ struct fragment_buffer_variant {
 struct llama_vocab::impl {
     uint32_t n_token_types = 0; // for BERT-style token types
 
-    std::string tokenizer_model;
-    std::string tokenizer_pre;
-
     enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
     enum llama_vocab_pre_type pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
 
@@ -1374,6 +1362,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
 
     // determine vocab type
     {
+        std::string tokenizer_model;
+        std::string tokenizer_pre;
+
         ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
         ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
 
@@ -1468,8 +1459,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
 
         const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
         if (precompiled_charsmap_keyidx != -1) {
-            const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);
-            const size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx);
+            size_t n_precompiled_charsmap = gguf_get_arr_data_n(ctx, precompiled_charsmap_keyidx);
             const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
             precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
 #ifdef IS_BIG_ENDIAN
@@ -1507,8 +1497,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "llama3" ||
                 tokenizer_pre == "llama-v3" ||
                 tokenizer_pre == "llama-bpe"||
-                tokenizer_pre == "falcon3" ||
-                tokenizer_pre == "pixtral") {
+                tokenizer_pre == "falcon3") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
                 ignore_merges = true;
                 add_bos = true;
@@ -1635,10 +1624,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "bailingmoe") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
                 clean_spaces = false;
-            } else if (
-                tokenizer_pre == "seed-coder") {
-                pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER;
-                clean_spaces = false;
             } else {
                 LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
                 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
@@ -1848,7 +1833,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             if (false
                 || t.first == "<|fim_prefix|>" // Qwen
                 || t.first == "<fim-prefix>"
-                || t.first == "<fim_prefix>" // Granite
                 || t.first == "<|fim▁begin|>" // DeepSeek
                 || t.first == "<PRE>"
                 || t.first == "▁<PRE>" // CodeLlama
@@ -1867,7 +1851,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             if (false
                 || t.first == "<|fim_suffix|>" // Qwen
                 || t.first == "<fim-suffix>"
-                || t.first == "<fim_suffix>" // Granite
                 || t.first == "<|fim▁hole|>" // DeepSeek
                 || t.first == "<SUF>"
                 || t.first == "▁<SUF>" // CodeLlama
@@ -1886,7 +1869,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             if (false
                 || t.first == "<|fim_middle|>" // Qwen
                 || t.first == "<fim-middle>"
-                || t.first == "<fim_middle>" // Granite
                 || t.first == "<|fim▁end|>" // DeepSeek
                 || t.first == "<MID>"
                 || t.first == "▁<MID>" // CodeLlama
@@ -1905,7 +1887,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             if (false
                 || t.first == "<|fim_pad|>" // Qwen
                 || t.first == "<fim-pad>"
-                || t.first == "<fim_pad>" // Granite
                 || t.first == "<PAD>"
             ) {
                 special_fim_pad_id = t.second;
@@ -1924,7 +1905,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 || t.first == "<|repo_name|>"
                 || t.first == "<fim-repo>"
                 || t.first == "<REPO>"
-                || t.first == "<reponame>" // Granite
             ) {
                 special_fim_rep_id = t.second;
                 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2784,14 +2764,6 @@ void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
     pimpl->load(ml, kv);
 }
 
-std::string llama_vocab::get_tokenizer_model() const {
-    return pimpl->tokenizer_model;
-}
-
-std::string llama_vocab::get_tokenizer_pre() const {
-    return pimpl->tokenizer_pre;
-}
-
 enum llama_vocab_type llama_vocab::get_type() const {
     return pimpl->type;
 }
@@ -3014,20 +2986,6 @@ int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string
     return it->second;
 }
 
-std::vector<std::string> llama_vocab::get_bpe_merges() const {
-    std::vector<std::string> result(pimpl->bpe_ranks.size());
-
-    for (const auto & pair : pimpl->bpe_ranks) {
-        result[pair.second] = pair.first.first + " " + pair.first.second;
-    }
-
-    return result;
-}
-
-std::vector<char> llama_vocab::get_precompiled_charsmap() const {
-    return pimpl->precompiled_charsmap;
-}
-
 int32_t llama_vocab::tokenize(
         const char * text,
         int32_t text_len,
6
llama/llama.cpp/src/llama-vocab.h
vendored
@@ -21,9 +21,6 @@ struct llama_vocab {
 
     void load(llama_model_loader & ml, const LLM_KV & kv);
 
-    std::string get_tokenizer_model() const;
-    std::string get_tokenizer_pre() const;
-
     enum llama_vocab_type get_type() const;
     enum llama_vocab_pre_type get_pre_type() const;
 
@@ -83,9 +80,6 @@ struct llama_vocab {
     int max_token_len() const;
 
     int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
-    std::vector<std::string> get_bpe_merges() const;
-
-    std::vector<char> get_precompiled_charsmap() const;
 
     int32_t tokenize(
         const char * text,
9
llama/llama.cpp/src/llama.cpp
vendored
@@ -4,7 +4,6 @@
 #include "llama-mmap.h"
 #include "llama-vocab.h"
 #include "llama-model-loader.h"
-#include "llama-model-saver.h"
 #include "llama-model.h"
 
 #include "ggml.h"
@@ -254,13 +253,6 @@ struct llama_model * llama_model_load_from_splits(
     return llama_model_load_from_file_impl(splits.front(), splits, params);
 }
 
-void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
-    llama_model_saver ms(*model);
-    ms.add_kv_from_model();
-    ms.add_tensors_from_model();
-    ms.save(path_model);
-}
-
 //
 // chat templates
 //
@@ -346,4 +338,3 @@ const char * llama_print_system_info(void) {
 
     return s.c_str();
 }
-
3841
llama/llama.cpp/tools/mtmd/clip.cpp
vendored
File diff suppressed because it is too large
201
llama/llama.go
@@ -2,11 +2,10 @@ package llama
 
 /*
 #cgo CFLAGS: -std=c11
-#cgo windows CFLAGS: -Wno-dll-attribute-on-redeclaration
 #cgo CXXFLAGS: -std=c++17
 #cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/include
 #cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/common
-#cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/tools/mtmd
+#cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/examples/llava
 #cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/src
 #cgo CPPFLAGS: -I${SRCDIR}/../ml/backend/ggml/ggml/include
 
@@ -17,6 +16,7 @@ package llama
 #include "llava.h"
 #include "gguf.h"
 
+#include "mllama.h"
 #include "sampling_ext.h"
 
 extern bool llamaProgressCallback(float progress, void *user_data);
@@ -35,12 +35,11 @@ import (
     "runtime/cgo"
     "slices"
     "strings"
-    "sync"
     "unsafe"
 
     _ "github.com/ollama/ollama/llama/llama.cpp/common"
+    _ "github.com/ollama/ollama/llama/llama.cpp/examples/llava"
     _ "github.com/ollama/ollama/llama/llama.cpp/src"
-    _ "github.com/ollama/ollama/llama/llama.cpp/tools/mtmd"
     ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
 )
 
@@ -198,6 +197,7 @@ type ModelParams struct {
     NumGpuLayers int
     MainGpu int
     UseMmap bool
+    UseMlock bool
     TensorSplit []float32
     Progress func(float32)
     VocabOnly bool
@@ -216,6 +216,7 @@ func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) {
     cparams.n_gpu_layers = C.int(params.NumGpuLayers)
     cparams.main_gpu = C.int32_t(params.MainGpu)
     cparams.use_mmap = C.bool(params.UseMmap)
+    cparams.use_mlock = C.bool(params.UseMlock)
     cparams.vocab_only = C.bool(params.VocabOnly)
 
     if len(params.TensorSplit) > 0 {
@@ -248,6 +249,20 @@ func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) {
     return &m, nil
 }
 
+func LoadVocabFromFile(path string) (*Vocab, error) {
+    mp := C.CString(path)
+    defer C.free(unsafe.Pointer(mp))
+    v := Vocab{c: C.llama_load_vocab_from_file(mp)}
+    if v.c == nil {
+        return nil, fmt.Errorf("unable to load vocab: %s", path)
+    }
+    return &v, nil
+}
+
+func FreeVocab(vocab *Vocab) {
+    C.llama_free_vocab(vocab.c)
+}
+
 func FreeModel(model *Model) {
     C.llama_model_free(model.c)
 }
@@ -296,6 +311,10 @@ func (m *Model) ApplyLoraFromFile(context *Context, loraPath string, scale float
     return nil
 }
 
+type Vocab struct {
+    c *C.struct_llama_vocab
+}
+
 func (m *Model) Vocab() *C.struct_llama_vocab {
     return C.llama_model_get_vocab(m.c)
 }
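A minimal usage sketch for the vocab helpers that exist only on the v0.6.6-rc2 side of this diff (the Vocab wrapper above plus LoadVocabFromFile and FreeVocab from the earlier hunk); the model path is a placeholder.

package main

import (
    "log"

    "github.com/ollama/ollama/llama"
)

func main() {
    // Placeholder path; LoadVocabFromFile wraps llama_load_vocab_from_file on the rc2 side.
    vocab, err := llama.LoadVocabFromFile("/models/example.gguf")
    if err != nil {
        log.Fatal(err)
    }
    // FreeVocab releases the C-side vocabulary when done.
    defer llama.FreeVocab(vocab)
}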
@@ -459,6 +478,24 @@ func (m *Model) NEmbd() int {
     return int(C.llama_model_n_embd(m.c))
 }
 
+func Quantize(infile, outfile string, ftype uint32) error {
+    cinfile := C.CString(infile)
+    defer C.free(unsafe.Pointer(cinfile))
+
+    coutfile := C.CString(outfile)
+    defer C.free(unsafe.Pointer(coutfile))
+
+    params := C.llama_model_quantize_default_params()
+    params.nthread = -1
+    params.ftype = ftype
+
+    if rc := C.llama_model_quantize(cinfile, coutfile, &params); rc != 0 {
+        return fmt.Errorf("llama_model_quantize: %d", rc)
+    }
+
+    return nil
+}
+
 // vision processing
 type ClipContext struct {
     c *C.struct_clip_ctx
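A short sketch of calling the rc2-only Quantize helper above. Both paths are placeholders, and the ftype value is assumed to correspond to llama.cpp's llama_ftype enum (15 for Q4_K_M); check llama.h before relying on it.

package main

import (
    "log"

    "github.com/ollama/ollama/llama"
)

func main() {
    // Quantize an F16 GGUF to Q4_K_M; both file names are placeholders for this sketch.
    if err := llama.Quantize("model-f16.gguf", "model-q4_k_m.gguf", 15); err != nil {
        log.Fatal(err)
    }
}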
@@ -509,6 +546,63 @@ func (c *ClipContext) NewEmbed(llamaContext *Context, data []byte) ([][]float32,
     return embed, nil
 }
 
+type MllamaContext struct {
+    c *C.struct_mllama_ctx
+}
+
+func NewMllamaContext(llamaContext *Context, modelPath string) (*MllamaContext, error) {
+    mp := C.CString(modelPath)
+    defer C.free(unsafe.Pointer(mp))
+    c := C.mllama_model_load(mp, 1)
+    if c == nil {
+        return nil, fmt.Errorf("unable to load mllama model: %v", modelPath)
+    }
+
+    projEmbedSize := int(C.mllama_n_embd(c))
+    modelEmbedSize := llamaContext.Model().NEmbd()
+    if projEmbedSize != modelEmbedSize {
+        return nil, fmt.Errorf("projector embedding size (%d) does not match model (%d)", projEmbedSize, modelEmbedSize)
+    }
+
+    return &MllamaContext{c: c}, nil
+}
+
+func (m *MllamaContext) Free() {
+    C.mllama_free(m.c)
+}
+
+func (m *MllamaContext) NewEmbed(llamaContext *Context, data []byte, aspectRatioId int) ([][]float32, error) {
+    img := C.mllama_image_init()
+    defer C.mllama_image_free(img)
+
+    ok := bool(C.mllama_image_load_from_data(unsafe.Pointer(&data[0]), C.int(len(data)), 560, 560, 3, 4, C.int(aspectRatioId), img))
+    if !ok {
+        return nil, errors.New("unable to load mllama image data")
+    }
+
+    rows := make([]float32, m.EmbedSize(llamaContext))
+    ok = bool(C.mllama_image_encode(m.c, C.int(llamaContext.numThreads), img, (*C.float)(unsafe.Pointer(&rows[0]))))
+    if !ok {
+        return nil, errors.New("unable to make mllama embedding from image")
+    }
+
+    embed := make([][]float32, 1)
+    embed[0] = rows
+
+    return embed, nil
+}
+
+func (m *MllamaContext) EmbedSize(llamaContext *Context) int {
+    numTokens := int(C.mllama_n_positions(m.c) * C.mllama_n_tiles(m.c))
+    numEmbed := llamaContext.Model().NEmbd()
+
+    return numTokens * numEmbed
+}
+
+func (c *Context) SetCrossAttention(state bool) {
+    C.llama_set_cross_attention(c.c, C.bool(state))
+}
+
 func (c *Context) Synchronize() {
     C.llama_synchronize(c.c)
 }
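The mllama block above exists only on the v0.6.6-rc2 side and is the cross-attention vision path that main has since dropped. The sketch below shows one plausible way the pieces fit together; the projector path and aspect-ratio ID are placeholders, and lc is assumed to be an already initialised *llama.Context.

package llamaexample

import "github.com/ollama/ollama/llama"

// embedImage sketches the rc2-side mllama flow: load the projector, switch the text
// context into cross-attention mode, then encode the image bytes into embeddings.
func embedImage(lc *llama.Context, imageBytes []byte) ([][]float32, error) {
    mctx, err := llama.NewMllamaContext(lc, "/models/mmproj.gguf") // placeholder path
    if err != nil {
        return nil, err
    }
    defer mctx.Free()

    lc.SetCrossAttention(true)

    // Aspect-ratio ID 1 is a placeholder; a real caller derives it from preprocessing.
    return mctx.NewEmbed(lc, imageBytes, 1)
}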
@@ -529,6 +623,9 @@ type SamplingParams struct {
     PenaltyRepeat float32
     PenaltyFreq float32
     PenaltyPresent float32
+    Mirostat int
+    MirostatTau float32
+    MirostatEta float32
     PenalizeNl bool
     Seed uint32
     Grammar string
@@ -545,6 +642,9 @@ func NewSamplingContext(model *Model, params SamplingParams) (*SamplingContext,
     cparams.penalty_repeat = C.float(params.PenaltyRepeat)
     cparams.penalty_freq = C.float(params.PenaltyFreq)
     cparams.penalty_present = C.float(params.PenaltyFreq)
+    cparams.mirostat = C.int32_t(params.Mirostat)
+    cparams.mirostat_tau = C.float(params.MirostatTau)
+    cparams.mirostat_eta = C.float(params.MirostatEta)
     cparams.seed = C.uint32_t(params.Seed)
 
     grammar := C.CString(params.Grammar)
@@ -579,8 +679,8 @@ func SchemaToGrammar(schema []byte) []byte {
     cStr := C.CString(string(schema))
     defer C.free(unsafe.Pointer(cStr))
 
-    // Allocate buffer for grammar based on schema length but with upper bound
-    maxLen := min(1024*1024, len(schema)*4)
+    // Allocate buffer for grammar output with reasonable size
+    const maxLen = 32768 // 32KB
     buf := make([]byte, maxLen)
 
     // Call C function to convert schema to grammar
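The SchemaToGrammar hunk above changes only the output-buffer sizing: main scales the buffer with the schema and caps it at 1 MiB, while v0.6.6-rc2 always allocated a fixed 32 KiB. A tiny standalone illustration of the two sizing rules, with a made-up schema length:

package main

import "fmt"

func main() {
    schemaLen := 10 * 1024 // e.g. a 10 KiB JSON schema (illustrative)

    // main: grow with the schema, capped at 1 MiB.
    mainLen := min(1024*1024, schemaLen*4)

    // v0.6.6-rc2: fixed ceiling regardless of schema size.
    const rc2Len = 32768

    fmt.Println(mainLen, rc2Len) // 40960 32768
}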
@@ -592,65 +692,35 @@ func SchemaToGrammar(schema []byte) []byte {
     return buf[:n]
 }
 
+type Sampler struct {
+    c *C.struct_llama_sampler
+}
+
+func NewGrammarSampler(vocab *Vocab, grammar string) *Sampler {
+    cGrammar := C.CString(grammar)
+    cRoot := C.CString("root")
+    defer C.free(unsafe.Pointer(cGrammar))
+    defer C.free(unsafe.Pointer(cRoot))
+
+    sampler := &Sampler{c: C.llama_sampler_init_grammar(vocab.c, cGrammar, cRoot)}
+
+    return sampler
+}
+
+func (s *Sampler) Accept(token int32) {
+    C.llama_sampler_accept(s.c, C.llama_token(token))
+}
+
 type TokenData struct {
-    ID int32
+    Id int32
     Logit float32
 }
 
-type Grammar struct {
-    c *C.struct_llama_grammar
-    mu sync.Mutex
-}
-
-func NewGrammar(grammar string, vocabIds []uint32, vocabValues []string, eogTokens []uint32) *Grammar {
-    cGrammar := C.CString(grammar)
-    defer C.free(unsafe.Pointer(cGrammar))
-
-    cTokens := make([]C.uint32_t, len(vocabIds))
-    for i, token := range vocabIds {
-        cTokens[i] = C.uint32_t(token)
-    }
-
-    cPieces := make([]*C.char, len(vocabValues))
-    for i, piece := range vocabValues {
-        cPieces[i] = C.CString(piece)
-        defer C.free(unsafe.Pointer(cPieces[i]))
-    }
-
-    cEogTokens := make([]C.uint32_t, len(eogTokens))
-    for i, token := range eogTokens {
-        cEogTokens[i] = C.uint32_t(token)
-    }
-
-    g := C.grammar_init(cGrammar, (*C.uint32_t)(unsafe.Pointer(&cTokens[0])), C.size_t(len(cTokens)), (**C.char)(unsafe.Pointer(&cPieces[0])), (*C.uint32_t)(unsafe.Pointer(&cEogTokens[0])), C.size_t(len(cEogTokens)))
-    if g == nil {
-        return nil
-    }
-
-    return &Grammar{c: g}
-}
-
-func (g *Grammar) Free() {
-    g.mu.Lock()
-    defer g.mu.Unlock()
-    if g.c != nil {
-        C.grammar_free(g.c)
-        g.c = nil
-    }
-}
-
-func (g *Grammar) Apply(tokens []TokenData) {
-    g.mu.Lock()
-    defer g.mu.Unlock()
-
-    if g.c == nil {
-        return
-    }
-
+func (s *Sampler) Apply(tokens []TokenData) {
     tds := make([]C.struct_llama_token_data, len(tokens))
     for i, token := range tokens {
         tds[i] = C.struct_llama_token_data{
-            id: C.int32_t(token.ID),
+            id: C.int32_t(token.Id),
             logit: C.float(token.Logit),
             p: C.float(0.0),
         }
@@ -661,24 +731,13 @@ func (g *Grammar) Apply(tokens []TokenData) {
         selected: C.int64_t(-1),
         sorted: C.bool(false),
     }
 
     var pinner runtime.Pinner
     pinner.Pin(&tds[0])
     defer pinner.Unpin()
 
-    C.grammar_apply(g.c, tda)
+    C.llama_sampler_apply(s.c, tda)
     for i := range tokens {
         tokens[i].Logit = float32(tds[i].logit)
     }
 }
 
-func (g *Grammar) Accept(token int32) {
-    g.mu.Lock()
-    defer g.mu.Unlock()
-
-    // Check if grammar was freed
-    if g.c == nil {
-        return
-    }
-
-    C.grammar_accept(g.c, C.llama_token(token))
-}
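For reference, a sketch of driving the main-branch grammar API that replaces the rc2 Sampler shown above: build a Grammar from a GBNF string plus the model vocabulary, mask candidate logits in place, and accept the chosen token. The slice arguments and the greedy pick are placeholders that a real runner would supply.

package llamaexample

import "github.com/ollama/ollama/llama"

// constrainGreedy sketches one step of grammar-constrained decoding with the
// main-branch API: Apply masks tokens the grammar currently disallows, and Accept
// advances the grammar state with the token that was chosen.
func constrainGreedy(gbnf string, ids []uint32, pieces []string, eog []uint32, cands []llama.TokenData) int32 {
    g := llama.NewGrammar(gbnf, ids, pieces, eog)
    if g == nil || len(cands) == 0 {
        return -1
    }
    defer g.Free()

    g.Apply(cands)

    // Greedy pick purely for illustration; a real runner would sample here instead.
    best := cands[0]
    for _, td := range cands[1:] {
        if td.Logit > best.Logit {
            best = td
        }
    }

    g.Accept(best.ID)
    return best.ID
}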
Some files were not shown because too many files have changed in this diff.