Compare commits

..

2 Commits

Author SHA1 Message Date
ParthSareen 23e8ac9428 wip? 2025-05-07 19:00:44 -07:00
ParthSareen 611d3a17ed server: add python tool parsing logic 2025-05-02 16:23:54 -07:00
94 changed files with 2431 additions and 4206 deletions
+1 -1
View File
@@ -1,6 +1,6 @@
UPSTREAM=https://github.com/ggerganov/llama.cpp.git
WORKDIR=llama/vendor
FETCH_HEAD=e1e8e0991ffd9e99a445c6812bb519d5bac9f4b5
FETCH_HEAD=2016f07bd106c73699ecbaace80f55db5ed95dac
.PHONY: help
help:
+19 -23
View File
@@ -61,8 +61,6 @@ Here are some example models that can be downloaded:
| QwQ | 32B | 20GB | `ollama run qwq` |
| DeepSeek-R1 | 7B | 4.7GB | `ollama run deepseek-r1` |
| DeepSeek-R1 | 671B | 404GB | `ollama run deepseek-r1:671b` |
| Llama 4 | 109B | 67GB | `ollama run llama4:scout` |
| Llama 4 | 400B | 245GB | `ollama run llama4:maverick` |
| Llama 3.3 | 70B | 43GB | `ollama run llama3.3` |
| Llama 3.2 | 3B | 2.0GB | `ollama run llama3.2` |
| Llama 3.2 | 1B | 1.3GB | `ollama run llama3.2:1b` |
@@ -79,7 +77,7 @@ Here are some example models that can be downloaded:
| Code Llama | 7B | 3.8GB | `ollama run codellama` |
| Llama 2 Uncensored | 7B | 3.8GB | `ollama run llama2-uncensored` |
| LLaVA | 7B | 4.5GB | `ollama run llava` |
| Granite-3.3 | 8B | 4.9GB | `ollama run granite3.3` |
| Granite-3.2 | 8B | 4.9GB | `ollama run granite3.2` |
> [!NOTE]
> You should have at least 8 GB of RAM available to run the 7B models, 16 GB to run the 13B models, and 32 GB to run the 33B models.
@@ -287,7 +285,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Bionic GPT](https://github.com/bionic-gpt/bionic-gpt)
- [HTML UI](https://github.com/rtcfirefly/ollama-ui)
- [Saddle](https://github.com/jikkuatwork/saddle)
- [TagSpaces](https://www.tagspaces.org) (A platform for file-based apps, [utilizing Ollama](https://docs.tagspaces.org/ai/) for the generation of tags and descriptions)
- [TagSpaces](https://www.tagspaces.org) (A platform for file based apps, [utilizing Ollama](https://docs.tagspaces.org/ai/) for the generation of tags and descriptions)
- [Chatbot UI](https://github.com/ivanfioravanti/chatbot-ollama)
- [Chatbot UI v2](https://github.com/mckaywrigley/chatbot-ui)
- [Typescript UI](https://github.com/ollama-interface/Ollama-Gui?tab=readme-ov-file)
@@ -314,7 +312,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Ollama Basic Chat: Uses HyperDiv Reactive UI](https://github.com/rapidarchitect/ollama_basic_chat)
- [Ollama-chats RPG](https://github.com/drazdra/ollama-chats)
- [IntelliBar](https://intellibar.app/) (AI-powered assistant for macOS)
- [Jirapt](https://github.com/AliAhmedNada/jirapt) (Jira Integration to generate issues, tasks, epics)
- [QA-Pilot](https://github.com/reid41/QA-Pilot) (Interactive chat tool that can leverage Ollama models for rapid understanding and navigation of GitHub code repositories)
- [ChatOllama](https://github.com/sugarforever/chat-ollama) (Open Source Chatbot based on Ollama with Knowledge Bases)
- [CRAG Ollama Chat](https://github.com/Nagi-ovo/CRAG-Ollama-Chat) (Simple Web Search with Corrective RAG)
@@ -328,14 +325,14 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [RWKV-Runner](https://github.com/josStorer/RWKV-Runner) (RWKV offline LLM deployment tool, also usable as a client for ChatGPT and Ollama)
- [Ollama Grid Search](https://github.com/dezoito/ollama-grid-search) (app to evaluate and compare models)
- [Olpaka](https://github.com/Otacon/olpaka) (User-friendly Flutter Web App for Ollama)
- [Casibase](https://casibase.org) (An open source AI knowledge base and dialogue system combining the latest RAG, SSO, ollama support, and multiple large language models.)
- [Casibase](https://casibase.org) (An open source AI knowledge base and dialogue system combining the latest RAG, SSO, ollama support and multiple large language models.)
- [OllamaSpring](https://github.com/CrazyNeil/OllamaSpring) (Ollama Client for macOS)
- [LLocal.in](https://github.com/kartikm7/llocal) (Easy to use Electron Desktop Client for Ollama)
- [Shinkai Desktop](https://github.com/dcSpark/shinkai-apps) (Two click install Local AI using Ollama + Files + RAG)
- [AiLama](https://github.com/zeyoyt/ailama) (A Discord User App that allows you to interact with Ollama anywhere in Discord)
- [AiLama](https://github.com/zeyoyt/ailama) (A Discord User App that allows you to interact with Ollama anywhere in discord )
- [Ollama with Google Mesop](https://github.com/rapidarchitect/ollama_mesop/) (Mesop Chat Client implementation with Ollama)
- [R2R](https://github.com/SciPhi-AI/R2R) (Open-source RAG engine)
- [Ollama-Kis](https://github.com/elearningshow/ollama-kis) (A simple easy-to-use GUI with sample custom LLM for Drivers Education)
- [Ollama-Kis](https://github.com/elearningshow/ollama-kis) (A simple easy to use GUI with sample custom LLM for Drivers Education)
- [OpenGPA](https://opengpa.org) (Open-source offline-first Enterprise Agentic Application)
- [Painting Droid](https://github.com/mateuszmigas/painting-droid) (Painting app with AI integrations)
- [Kerlig AI](https://www.kerlig.com/) (AI writing assistant for macOS)
@@ -344,16 +341,16 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [LLMStack](https://github.com/trypromptly/LLMStack) (No-code multi-agent framework to build LLM agents and workflows)
- [BoltAI for Mac](https://boltai.com) (AI Chat Client for Mac)
- [Harbor](https://github.com/av/harbor) (Containerized LLM Toolkit with Ollama as default backend)
- [PyGPT](https://github.com/szczyglis-dev/py-gpt) (AI desktop assistant for Linux, Windows, and Mac)
- [Alpaca](https://github.com/Jeffser/Alpaca) (An Ollama client application for Linux and macOS made with GTK4 and Adwaita)
- [PyGPT](https://github.com/szczyglis-dev/py-gpt) (AI desktop assistant for Linux, Windows and Mac)
- [Alpaca](https://github.com/Jeffser/Alpaca) (An Ollama client application for linux and macos made with GTK4 and Adwaita)
- [AutoGPT](https://github.com/Significant-Gravitas/AutoGPT/blob/master/docs/content/platform/ollama.md) (AutoGPT Ollama integration)
- [Go-CREW](https://www.jonathanhecl.com/go-crew/) (Powerful Offline RAG in Golang)
- [PartCAD](https://github.com/openvmp/partcad/) (CAD model generation with OpenSCAD and CadQuery)
- [Ollama4j Web UI](https://github.com/ollama4j/ollama4j-web-ui) - Java-based Web UI for Ollama built with Vaadin, Spring Boot, and Ollama4j
- [Ollama4j Web UI](https://github.com/ollama4j/ollama4j-web-ui) - Java-based Web UI for Ollama built with Vaadin, Spring Boot and Ollama4j
- [PyOllaMx](https://github.com/kspviswa/pyOllaMx) - macOS application capable of chatting with both Ollama and Apple MLX models.
- [Cline](https://github.com/cline/cline) - Formerly known as Claude Dev is a VSCode extension for multi-file/whole-repo coding
- [Cherry Studio](https://github.com/kangfenmao/cherry-studio) (Desktop client with Ollama support)
- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy-focused LLM chat interface with optional encryption)
- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption)
- [Archyve](https://github.com/nickthecook/archyve) (RAG-enabling document library)
- [crewAI with Mesop](https://github.com/rapidarchitect/ollama-crew-mesop) (Mesop Web Interface to run crewAI with Ollama)
- [Tkinter-based client](https://github.com/chyok/ollama-gui) (Python tkinter-based Client for Ollama)
@@ -371,7 +368,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [DualMind](https://github.com/tcsenpai/dualmind) (Experimental app allowing two models to talk to each other in the terminal or in a web interface)
- [ollamarama-matrix](https://github.com/h1ddenpr0cess20/ollamarama-matrix) (Ollama chatbot for the Matrix chat protocol)
- [ollama-chat-app](https://github.com/anan1213095357/ollama-chat-app) (Flutter-based chat app)
- [Perfect Memory AI](https://www.perfectmemory.ai/) (Productivity AI assists personalized by what you have seen on your screen, heard, and said in the meetings)
- [Perfect Memory AI](https://www.perfectmemory.ai/) (Productivity AI assists personalized by what you have seen on your screen, heard and said in the meetings)
- [Hexabot](https://github.com/hexastack/hexabot) (A conversational AI builder)
- [Reddit Rate](https://github.com/rapidarchitect/reddit_analyzer) (Search and Rate Reddit topics with a weighted summation)
- [OpenTalkGpt](https://github.com/adarshM84/OpenTalkGpt) (Chrome Extension to manage open-source models supported by Ollama, create custom models, and chat with models from a user-friendly UI)
@@ -389,7 +386,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [ChibiChat](https://github.com/CosmicEventHorizon/ChibiChat) (Kotlin-based Android app to chat with Ollama and Koboldcpp API endpoints)
- [LocalLLM](https://github.com/qusaismael/localllm) (Minimal Web-App to run ollama models on it with a GUI)
- [Ollamazing](https://github.com/buiducnhat/ollamazing) (Web extension to run Ollama models)
- [OpenDeepResearcher-via-searxng](https://github.com/benhaotang/OpenDeepResearcher-via-searxng) (A Deep Research equivalent endpoint with Ollama support for running locally)
- [OpenDeepResearcher-via-searxng](https://github.com/benhaotang/OpenDeepResearcher-via-searxng) (A Deep Research equivent endpoint with Ollama support for running locally)
- [AntSK](https://github.com/AIDotNet/AntSK) (Out-of-the-box & Adaptable RAG Chatbot)
- [MaxKB](https://github.com/1Panel-dev/MaxKB/) (Ready-to-use & flexible RAG Chatbot)
- [yla](https://github.com/danielekp/yla) (Web interface to freely interact with your customized models)
@@ -402,7 +399,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Ollamb](https://github.com/hengkysteen/ollamb) (Simple yet rich in features, cross-platform built with Flutter and designed for Ollama. Try the [web demo](https://hengkysteen.github.io/demo/ollamb/).)
- [Writeopia](https://github.com/Writeopia/Writeopia) (Text editor with integration with Ollama)
- [AppFlowy](https://github.com/AppFlowy-IO/AppFlowy) (AI collaborative workspace with Ollama, cross-platform and self-hostable)
- [Lumina](https://github.com/cushydigit/lumina.git) (A lightweight, minimal React.js frontend for interacting with Ollama servers)
### Cloud
@@ -444,7 +440,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [PowershAI](https://github.com/rrg92/powershai) PowerShell module that brings AI to terminal on Windows, including support for Ollama
- [DeepShell](https://github.com/Abyss-c0re/deepshell) Your self-hosted AI assistant. Interactive Shell, Files and Folders analysis.
- [orbiton](https://github.com/xyproto/orbiton) Configuration-free text editor and IDE with support for tab completion with Ollama.
- [orca-cli](https://github.com/molbal/orca-cli) Ollama Registry CLI Application - Browse, pull, and download models from Ollama Registry in your terminal.
- [orca-cli](https://github.com/molbal/orca-cli) Ollama Registry CLI Application - Browse, pull and download models from Ollama Registry in your terminal.
- [GGUF-to-Ollama](https://github.com/jonathanhecl/gguf-to-ollama) - Importing GGUF to Ollama made easy (multiplatform)
### Apple Vision Pro
@@ -472,7 +468,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
### Libraries
- [LangChain](https://python.langchain.com/docs/integrations/chat/ollama/) and [LangChain.js](https://js.langchain.com/docs/integrations/chat/ollama/) with [example](https://js.langchain.com/docs/tutorials/local_rag/)
- [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/integrations/chat/ollama/) with [example](https://js.langchain.com/docs/tutorials/local_rag/)
- [Firebase Genkit](https://firebase.google.com/docs/genkit/plugins/ollama)
- [crewAI](https://github.com/crewAIInc/crewAI)
- [Yacana](https://remembersoftwares.github.io/yacana/) (User-friendly multi-agent framework for brainstorming and executing predetermined flows with built-in tool integration)
@@ -519,7 +515,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Swollama for Swift](https://github.com/marcusziade/Swollama) with [DocC](https://marcusziade.github.io/Swollama/documentation/swollama/)
- [GoLamify](https://github.com/prasad89/golamify)
- [Ollama for Haskell](https://github.com/tusharad/ollama-haskell)
- [multi-llm-ts](https://github.com/nbonamy/multi-llm-ts) (A Typescript/JavaScript library allowing access to different LLM in a unified API)
- [multi-llm-ts](https://github.com/nbonamy/multi-llm-ts) (A Typescript/JavaScript library allowing access to different LLM in unified API)
- [LlmTornado](https://github.com/lofcz/llmtornado) (C# library providing a unified interface for major FOSS & Commercial inference APIs)
- [Ollama for Zig](https://github.com/dravenk/ollama-zig)
- [Abso](https://github.com/lunary-ai/abso) (OpenAI-compatible TypeScript SDK for any LLM provider)
@@ -528,11 +524,11 @@ See the [API documentation](./docs/api.md) for all endpoints.
### Mobile
- [SwiftChat](https://github.com/aws-samples/swift-chat) (Lightning-fast Cross-platform AI chat app with native UI for Android, iOS, and iPad)
- [SwiftChat](https://github.com/aws-samples/swift-chat) (Lightning-fast Cross-platform AI chat app with native UI for Android, iOS and iPad)
- [Enchanted](https://github.com/AugustDev/enchanted)
- [Maid](https://github.com/Mobile-Artificial-Intelligence/maid)
- [Ollama App](https://github.com/JHubi1/ollama-app) (Modern and easy-to-use multi-platform client for Ollama)
- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy-focused LLM chat interface with optional encryption)
- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption)
- [Ollama Android Chat](https://github.com/sunshine0523/OllamaServer) (No need for Termux, start the Ollama service with one click on an Android device)
- [Reins](https://github.com/ibrahimcetin/reins) (Easily tweak parameters, customize system prompts per chat, and enhance your AI experiments with reasoning model support.)
@@ -556,7 +552,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Obsidian Local GPT plugin](https://github.com/pfrankov/obsidian-local-gpt)
- [Open Interpreter](https://docs.openinterpreter.com/language-model-setup/local-models/ollama)
- [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama)
- [Ollama Copilot](https://github.com/bernardo-bruning/ollama-copilot) (Proxy that allows you to use Ollama as a copilot like GitHub Copilot)
- [Ollama Copilot](https://github.com/bernardo-bruning/ollama-copilot) (Proxy that allows you to use ollama as a copilot like Github copilot)
- [twinny](https://github.com/rjmacarthy/twinny) (Copilot and Copilot chat alternative using Ollama)
- [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and Hugging Face)
- [Page Assist](https://github.com/n4ze3m/page-assist) (Chrome Extension)
@@ -566,8 +562,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation)
- [ChatGPTBox: All in one browser extension](https://github.com/josStorer/chatGPTBox) with [Integrating Tutorial](https://github.com/josStorer/chatGPTBox/issues/616#issuecomment-1975186467)
- [Discord AI chat/moderation bot](https://github.com/rapmd73/Companion) Chat/moderation bot written in python. Uses Ollama to create personalities.
- [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depend on ollama server)
- [Terraform AWS Ollama & Open WebUI](https://github.com/xuyangbocn/terraform-aws-self-host-llm) (A Terraform module to deploy on AWS a ready-to-use Ollama service, together with its front-end Open WebUI service.)
- [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depends on ollama server)
- [Terraform AWS Ollama & Open WebUI](https://github.com/xuyangbocn/terraform-aws-self-host-llm) (A Terraform module to deploy on AWS a ready-to-use Ollama service, together with its front end Open WebUI service.)
- [node-red-contrib-ollama](https://github.com/jakubburkiewicz/node-red-contrib-ollama)
- [Local AI Helper](https://github.com/ivostoykov/localAI) (Chrome and Firefox extensions that enable interactions with the active tab and customisable API endpoints. Includes secure storage for user prompts.)
- [vnc-lm](https://github.com/jake83741/vnc-lm) (Discord bot for messaging with LLMs through Ollama and LiteLLM. Seamlessly move between local and flagship models.)
+1
View File
@@ -1407,6 +1407,7 @@ func NewCLI() *cobra.Command {
envVars["OLLAMA_LLM_LIBRARY"],
envVars["OLLAMA_GPU_OVERHEAD"],
envVars["OLLAMA_LOAD_TIMEOUT"],
envVars["OLLAMA_CONTEXT_LENGTH"],
})
default:
appendEnvDocs(cmd, envs)
+3 -3
View File
@@ -20,7 +20,7 @@ Please refer to the [GPU docs](./gpu.md).
## How can I specify the context window size?
By default, Ollama uses a context window size of 4096 tokens.
By default, Ollama uses a context window size of 4096 tokens, unless you have a single GPU with <= 4 GB of VRAM, in which case it will default to 2048 tokens.
This can be overridden with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use:
@@ -31,7 +31,7 @@ OLLAMA_CONTEXT_LENGTH=8192 ollama serve
To change this when using `ollama run`, use `/set parameter`:
```shell
/set parameter num_ctx 4096
/set parameter num_ctx 8192
```
When using the API, specify the `num_ctx` parameter:
@@ -41,7 +41,7 @@ curl http://localhost:11434/api/generate -d '{
"model": "llama3.2",
"prompt": "Why is the sky blue?",
"options": {
"num_ctx": 4096
"num_ctx": 8192
}
}'
```
+16 -2
View File
@@ -169,7 +169,7 @@ var (
// Enable the new Ollama engine
NewEngine = Bool("OLLAMA_NEW_ENGINE")
// ContextLength sets the default context length
ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 4096)
ContextLength = Int64("OLLAMA_CONTEXT_LENGTH", -1)
)
func String(s string) func() string {
@@ -227,6 +227,20 @@ func Uint64(key string, defaultValue uint64) func() uint64 {
}
}
func Int64(key string, defaultValue int64) func() int64 {
return func() int64 {
if s := Var(key); s != "" {
if n, err := strconv.ParseInt(s, 10, 64); err != nil {
slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
} else {
return n
}
}
return defaultValue
}
}
// Set aside VRAM per GPU
var GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)
@@ -255,7 +269,7 @@ func AsMap() map[string]EnvVar {
"OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", AllowedOrigins(), "A comma separated list of allowed origins"},
"OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
"OLLAMA_MULTIUSER_CACHE": {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
"OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 4096)"},
"OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default 4096 or 2048 with low VRAM)"},
"OLLAMA_NEW_ENGINE": {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"},
// Informational
+3 -3
View File
@@ -278,9 +278,9 @@ func TestVar(t *testing.T) {
}
func TestContextLength(t *testing.T) {
cases := map[string]uint{
"": 4096,
"2048": 2048,
cases := map[string]int64{
"": -1,
"4096": 4096,
}
for k, v := range cases {
+1 -2
View File
@@ -531,12 +531,11 @@ func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error {
var s uint64
for _, t := range ts {
t.Offset = s
t.Offset = s + uint64(ggufPadding(int64(s), int64(alignment)))
if err := ggufWriteTensorInfo(ws, t); err != nil {
return err
}
s += t.Size()
s += uint64(ggufPadding(int64(s), int64(alignment)))
}
for _, t := range ts {
-63
View File
@@ -1,63 +0,0 @@
package ggml
import (
"bytes"
"os"
"slices"
"testing"
"github.com/google/go-cmp/cmp"
)
func TestWriteGGUF(t *testing.T) {
w, err := os.CreateTemp(t.TempDir(), "*.bin")
if err != nil {
t.Fatal(err)
}
defer w.Close()
if err := WriteGGUF(w, KV{
"general.alignment": uint32(16),
}, []Tensor{
{Name: "test.0", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
{Name: "test.1", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
{Name: "test.2", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
{Name: "test.3", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
{Name: "test.4", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
{Name: "test.5", Shape: []uint64{2, 3}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 2*3*4))},
}); err != nil {
t.Fatal(err)
}
r, err := os.Open(w.Name())
if err != nil {
t.Fatal(err)
}
defer r.Close()
ff, _, err := Decode(r, 0)
if err != nil {
t.Fatal(err)
}
if diff := cmp.Diff(ff.KV(), KV{
"general.alignment": uint32(16),
"general.parameter_count": uint64(36),
}); diff != "" {
t.Errorf("Mismatch (-want +got):\n%s", diff)
}
if diff := cmp.Diff(ff.Tensors(), Tensors{
Offset: 336,
items: []*Tensor{
{Name: "test.0", Offset: 0, Shape: []uint64{2, 3}},
{Name: "test.1", Offset: 32, Shape: []uint64{2, 3}},
{Name: "test.2", Offset: 64, Shape: []uint64{2, 3}},
{Name: "test.3", Offset: 96, Shape: []uint64{2, 3}},
{Name: "test.4", Offset: 128, Shape: []uint64{2, 3}},
{Name: "test.5", Offset: 160, Shape: []uint64{2, 3}},
},
}, cmp.AllowUnexported(Tensors{})); diff != "" {
t.Errorf("Mismatch (-want +got):\n%s", diff)
}
}
+6 -6
View File
@@ -11,7 +11,7 @@ require (
github.com/spf13/cobra v1.7.0
github.com/stretchr/testify v1.9.0
github.com/x448/float16 v0.8.4
golang.org/x/sync v0.12.0
golang.org/x/sync v0.11.0
)
require (
@@ -70,12 +70,12 @@ require (
github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
github.com/ugorji/go/codec v1.2.12 // indirect
golang.org/x/arch v0.8.0 // indirect
golang.org/x/crypto v0.36.0
golang.org/x/crypto v0.33.0
golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa
golang.org/x/net v0.38.0 // indirect
golang.org/x/sys v0.31.0
golang.org/x/term v0.30.0
golang.org/x/text v0.23.0
golang.org/x/net v0.35.0 // indirect
golang.org/x/sys v0.30.0
golang.org/x/term v0.29.0
golang.org/x/text v0.22.0
google.golang.org/protobuf v1.34.1
gopkg.in/yaml.v3 v3.0.1 // indirect
)
+12 -12
View File
@@ -214,8 +214,8 @@ golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACk
golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/crypto v0.36.0 h1:AnAEvhDddvBdpY+uR+MyHmuZzzNqXSe/GvuDeob5L34=
golang.org/x/crypto v0.36.0/go.mod h1:Y4J0ReaxCR1IMaabaSMugxJES1EpwhBHhv2bDHklZvc=
golang.org/x/crypto v0.33.0 h1:IOBPskki6Lysi0lo9qQvbxiQ+FvsCC/YWOecCHAixus=
golang.org/x/crypto v0.33.0/go.mod h1:bVdXmD7IV/4GdElGPozy6U7lWdRXA4qyRVGJV57uQ5M=
golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
@@ -257,8 +257,8 @@ golang.org/x/net v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81R
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM=
golang.org/x/net v0.0.0-20210614182718-04defd469f4e/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/net v0.38.0 h1:vRMAPTMaeGqVhG5QyLJHqNDwecKTomGeqbnfZyKlBI8=
golang.org/x/net v0.38.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8=
golang.org/x/net v0.35.0 h1:T5GQRQb2y08kTAByq9L4/bz8cipCdA8FbRTXewonqY8=
golang.org/x/net v0.35.0/go.mod h1:EglIi67kWsHKlRzzVMUD93VMSWGFOMSZgxFjparz1Qk=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
@@ -268,8 +268,8 @@ golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJ
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.12.0 h1:MHc5BpPuC30uJk597Ri8TV3CNZcTLu6B6z4lJy+g6Jw=
golang.org/x/sync v0.12.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
golang.org/x/sync v0.11.0 h1:GGz8+XQP4FvTTrjZPzNKTMFtSXH80RAzG+5ghFPgK9w=
golang.org/x/sync v0.11.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
@@ -285,17 +285,17 @@ golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBc
golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.31.0 h1:ioabZlmFYtWhL+TRYpcnNlLwhyxaM9kWTDEmfnprqik=
golang.org/x/sys v0.31.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc=
golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.30.0 h1:PQ39fJZ+mfadBm0y5WlL4vlM7Sx1Hgf13sMIY2+QS9Y=
golang.org/x/term v0.30.0/go.mod h1:NYYFdzHoI5wRh/h5tDMdMqCqPJZEuNqVR5xJLd/n67g=
golang.org/x/term v0.29.0 h1:L6pJp37ocefwRRtYPKSWOWzOtWSxVajvz2ldH/xi3iU=
golang.org/x/term v0.29.0/go.mod h1:6bl4lRlvVuDgSf3179VpIxBF0o10JUpXWOnI7nErv7s=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY=
golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4=
golang.org/x/text v0.22.0 h1:bofq7m3/HAFvbF51jz3Q9wLg3jkvSPuiZu/pD1XwgtM=
golang.org/x/text v0.22.0/go.mod h1:YRoo4H8PVmsu+E3Ou7cqLVH8oXWIHVoX0jqUWALQhfY=
golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
+11 -15
View File
@@ -34,15 +34,13 @@ func cosineSimilarity[V float32 | float64](v1, v2 []V) V {
func TestAllMiniLMEmbeddings(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
req := api.EmbeddingRequest{
Model: "all-minilm",
Prompt: "why is the sky blue?",
}
res, err := embeddingTestHelper(ctx, client, t, req)
res, err := embeddingTestHelper(ctx, t, req)
if err != nil {
t.Fatalf("error: %v", err)
@@ -64,15 +62,13 @@ func TestAllMiniLMEmbeddings(t *testing.T) {
func TestAllMiniLMEmbed(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
req := api.EmbedRequest{
Model: "all-minilm",
Input: "why is the sky blue?",
}
res, err := embedTestHelper(ctx, client, t, req)
res, err := embedTestHelper(ctx, t, req)
if err != nil {
t.Fatalf("error: %v", err)
@@ -102,15 +98,13 @@ func TestAllMiniLMEmbed(t *testing.T) {
func TestAllMiniLMBatchEmbed(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
req := api.EmbedRequest{
Model: "all-minilm",
Input: []string{"why is the sky blue?", "why is the grass green?"},
}
res, err := embedTestHelper(ctx, client, t, req)
res, err := embedTestHelper(ctx, t, req)
if err != nil {
t.Fatalf("error: %v", err)
@@ -150,8 +144,6 @@ func TestAllMiniLMBatchEmbed(t *testing.T) {
func TestAllMiniLMEmbedTruncate(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
truncTrue, truncFalse := true, false
@@ -190,7 +182,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
res := make(map[string]*api.EmbedResponse)
for _, req := range reqs {
response, err := embedTestHelper(ctx, client, t, req.Request)
response, err := embedTestHelper(ctx, t, req.Request)
if err != nil {
t.Fatalf("error: %v", err)
}
@@ -206,7 +198,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
}
// check that truncate set to false returns an error if context length is exceeded
_, err := embedTestHelper(ctx, client, t, api.EmbedRequest{
_, err := embedTestHelper(ctx, t, api.EmbedRequest{
Model: "all-minilm",
Input: "why is the sky blue?",
Truncate: &truncFalse,
@@ -218,7 +210,9 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
}
}
func embeddingTestHelper(ctx context.Context, client *api.Client, t *testing.T, req api.EmbeddingRequest) (*api.EmbeddingResponse, error) {
func embeddingTestHelper(ctx context.Context, t *testing.T, req api.EmbeddingRequest) (*api.EmbeddingResponse, error) {
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
if err := PullIfMissing(ctx, client, req.Model); err != nil {
t.Fatalf("failed to pull model %s: %v", req.Model, err)
}
@@ -232,7 +226,9 @@ func embeddingTestHelper(ctx context.Context, client *api.Client, t *testing.T,
return response, nil
}
func embedTestHelper(ctx context.Context, client *api.Client, t *testing.T, req api.EmbedRequest) (*api.EmbedResponse, error) {
func embedTestHelper(ctx context.Context, t *testing.T, req api.EmbedRequest) (*api.EmbedResponse, error) {
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
if err := PullIfMissing(ctx, client, req.Model); err != nil {
t.Fatalf("failed to pull model %s: %v", req.Model, err)
}
+1 -1
View File
@@ -239,7 +239,7 @@ func (c *Causal) findStartLoc() (int, error) {
}
}
return 0, fmt.Errorf("%w (cache: %v batch: %v)", ErrKvCacheFull, len(c.cells), c.curBatchSize)
return 0, fmt.Errorf("%w (length: %v)", ErrKvCacheFull, len(c.cells))
}
func (c *Causal) updateSlidingWindow() {
Generated Vendored
+1 -1
View File
@@ -1,4 +1,4 @@
int LLAMA_BUILD_NUMBER = 0;
char const *LLAMA_COMMIT = "e1e8e0991ffd9e99a445c6812bb519d5bac9f4b5";
char const *LLAMA_COMMIT = "2016f07bd106c73699ecbaace80f55db5ed95dac";
char const *LLAMA_COMPILER = "";
char const *LLAMA_BUILD_TARGET = "";
-2
View File
@@ -342,8 +342,6 @@ struct common_params {
// multimodal models (see examples/llava)
struct common_params_model mmproj;
bool mmproj_use_gpu = true; // use GPU for multimodal model
bool no_mmproj = false; // explicitly disable multimodal model
std::vector<std::string> image; // path to image file(s)
// embedding
-3
View File
@@ -16,9 +16,6 @@ using json = nlohmann::ordered_json;
static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
auto has_max = max_items != std::numeric_limits<int>::max();
if (max_items == 0) {
return "";
}
if (min_items == 0 && max_items == 1) {
return item_rule + "?";
}
+23 -27
View File
@@ -2,6 +2,8 @@
#include "gguf.h"
#include "clip.h"
#include "clip.h"
#include <climits>
#include <cstdarg>
#include <string>
@@ -15,31 +17,33 @@
#define KEY_FTYPE "general.file_type"
#define KEY_NAME "general.name"
#define KEY_DESCRIPTION "general.description"
#define KEY_HAS_TEXT_ENC "clip.has_text_encoder"
#define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
#define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector"
#define KEY_HAS_GLM_PROJ "clip.has_glm_projector"
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
#define KEY_HAS_QWEN2VL_MERGER "clip.has_qwen2vl_merger"
#define KEY_USE_GELU "clip.use_gelu"
#define KEY_USE_SILU "clip.use_silu"
#define KEY_N_EMBD "clip.vision.embedding_length"
#define KEY_N_FF "clip.vision.feed_forward_length"
#define KEY_N_BLOCK "clip.vision.block_count"
#define KEY_N_HEAD "clip.vision.attention.head_count"
#define KEY_LAYER_NORM_EPS "clip.vision.attention.layer_norm_epsilon"
#define KEY_PROJ_DIM "clip.vision.projection_dim"
#define KEY_N_EMBD "clip.%s.embedding_length"
#define KEY_N_FF "clip.%s.feed_forward_length"
#define KEY_N_BLOCK "clip.%s.block_count"
#define KEY_N_HEAD "clip.%s.attention.head_count"
#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
#define KEY_PROJ_DIM "clip.%s.projection_dim"
#define KEY_TOKENS "tokenizer.ggml.tokens"
#define KEY_N_POSITIONS "clip.text.context_length"
#define KEY_IMAGE_SIZE "clip.vision.image_size"
#define KEY_PATCH_SIZE "clip.vision.patch_size"
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
#define KEY_IMAGE_STD "clip.vision.image_std"
#define KEY_FEATURE_LAYER "clip.vision.feature_layer"
#define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor"
#define KEY_PROJ_TYPE "clip.projector_type"
#define KEY_USE_GLU_MLP "clip.use_glu_mlp" // for qwen2.5vl
#define KEY_USE_RMS_NORM "clip.use_rms_norm" // for qwen2.5vl
#define KEY_FEATURE_LAYER "clip.vision.feature_layer"
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern"
#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size"
//
@@ -56,9 +60,7 @@
#define TN_ATTN_V "%s.blk.%d.attn_v.%s"
#define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s"
#define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s"
#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s"
#define TN_FFN_UP "%s.blk.%d.ffn_up.%s"
#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s"
#define TN_LN_1 "%s.blk.%d.ln1.%s"
#define TN_LN_2 "%s.blk.%d.ln2.%s"
#define TN_LN_PRE "%s.pre_ln.%s"
@@ -70,8 +72,6 @@
#define TN_IMAGE_NEWLINE "model.image_newline"
#define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3
#define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3
#define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3
#define TN_TOK_IMG_BREAK "v.token_embd.img_break" // pixtral
// mimicpmv
#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
@@ -87,19 +87,18 @@
#define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s"
#define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s"
#define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"
#define TN_GLM_BOI_W "adapter.boi"
#define TN_GLM_EOI_W "adapter.eoi"
enum projector_type {
PROJECTOR_TYPE_MLP,
PROJECTOR_TYPE_MLP_NORM,
PROJECTOR_TYPE_LDP,
PROJECTOR_TYPE_LDPV2,
PROJECTOR_TYPE_MINICPMV,
PROJECTOR_TYPE_RESAMPLER,
PROJECTOR_TYPE_GLM_EDGE,
PROJECTOR_TYPE_QWEN2VL,
PROJECTOR_TYPE_MERGER,
PROJECTOR_TYPE_GEMMA3,
PROJECTOR_TYPE_IDEFICS3,
PROJECTOR_TYPE_PIXTRAL,
PROJECTOR_TYPE_QWEN25VL,
PROJECTOR_TYPE_UNKNOWN,
};
@@ -107,13 +106,10 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_MLP, "mlp" },
{ PROJECTOR_TYPE_LDP, "ldp" },
{ PROJECTOR_TYPE_LDPV2, "ldpv2"},
{ PROJECTOR_TYPE_MINICPMV, "resampler"},
{ PROJECTOR_TYPE_RESAMPLER, "resampler"},
{ PROJECTOR_TYPE_GLM_EDGE, "adapter"},
{ PROJECTOR_TYPE_QWEN2VL, "qwen2vl_merger"},
{ PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"},
{ PROJECTOR_TYPE_MERGER, "qwen2vl_merger"},
{ PROJECTOR_TYPE_GEMMA3, "gemma3"},
{ PROJECTOR_TYPE_IDEFICS3, "idefics3"},
{ PROJECTOR_TYPE_PIXTRAL, "pixtral"},
};
static projector_type clip_projector_type_from_string(const std::string & str) {
+434 -1101
View File
File diff suppressed because it is too large Load Diff
+6 -15
View File
@@ -47,7 +47,7 @@ CLIP_API struct clip_ctx * clip_init(const char * fname, struct clip_context_par
CLIP_API void clip_free(struct clip_ctx * ctx);
CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h);
CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w);
CLIP_API int32_t clip_get_image_size (const struct clip_ctx * ctx);
CLIP_API int32_t clip_get_patch_size (const struct clip_ctx * ctx);
@@ -59,20 +59,9 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
CLIP_API size_t get_clip_image_grid_size(const struct clip_ctx * ctx);
GGML_DEPRECATED(CLIP_API int clip_n_patches(const struct clip_ctx * ctx),
"use clip_n_output_tokens instead");
GGML_DEPRECATED(CLIP_API int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img),
"use clip_n_output_tokens instead");
CLIP_API int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img);
// for M-RoPE, this will be the number of token positions in X and Y directions
// for other models, X will be the total number of tokens and Y will be 1
CLIP_API int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img);
CLIP_API int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img);
// this should be equal to the embedding dimension of the text model
CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img);
CLIP_API int clip_n_mmproj_embd (const struct clip_ctx * ctx);
CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
@@ -125,6 +114,8 @@ CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
CLIP_API bool clip_is_llava(const struct clip_ctx * ctx);
CLIP_API bool clip_is_gemma3(const struct clip_ctx * ctx);
CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx);
CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
+7 -8
View File
@@ -112,7 +112,7 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<
}
// Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out, clip_image_f32 * img_input) {
static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
struct {
struct ggml_context * ctx;
} model;
@@ -175,7 +175,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
model.ctx = ggml_init(params);
struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_output_tokens(ctx_clip, img_input), num_images - 1); // example: 4096 x 576 x 4
struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
// ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
// fill it with the image embeddings, ignoring the base
for (size_t i = 1; i < num_images; i++) {
@@ -214,8 +214,8 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
// append without newline tokens (default behavior in llava_arch when not using unpad ):
memcpy(image_embd_out + clip_n_output_tokens(ctx_clip, img_input) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
*n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_output_tokens(ctx_clip, img_input));
memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
*n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_patches(ctx_clip));
// Debug: Test single segments
// Current findings: sending base image, sending a segment embedding all works similar to python
@@ -313,7 +313,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
image_embd_v[i],
clip_embd_nbytes_by_img(ctx_clip, nx, ny));
n_img_pos_out += clip_n_output_tokens(ctx_clip, img_res);
n_img_pos_out += clip_n_patches_by_img(ctx_clip, img_res);
}
*n_img_pos = n_img_pos_out;
for (size_t i = 0; i < image_embd_v.size(); i++) {
@@ -342,8 +342,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
}
else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
// flat / default llava-1.5 type embedding
*n_img_pos = clip_n_patches(ctx_clip);
clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
*n_img_pos = clip_n_output_tokens(ctx_clip, img_res);
bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096
if (!encoded) {
LOG_ERR("Unable to encode image\n");
@@ -381,8 +381,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size);
int n_img_pos_out;
clip_image_f32 * img_input = clip_image_f32_get_img(img_res_v.get(), 0);
clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out, img_input);
clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out);
*n_img_pos = n_img_pos_out;
for (size_t i = 0; i < image_embd_v.size(); i++) {
-2
View File
@@ -111,7 +111,6 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
};
enum llama_rope_type {
@@ -1238,7 +1237,6 @@ extern "C" {
"will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");
/// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
/// Setting k <= 0 makes this a noop
LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
/// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-20
View File
@@ -20,7 +20,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_REFACT, "refact" },
{ LLM_ARCH_BERT, "bert" },
{ LLM_ARCH_NOMIC_BERT, "nomic-bert" },
{ LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" },
{ LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
{ LLM_ARCH_BLOOM, "bloom" },
{ LLM_ARCH_STABLELM, "stablelm" },
@@ -110,7 +109,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
{ LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" },
{ LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" },
{ LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" },
{ LLM_KV_POOLING_TYPE, "%s.pooling_type" },
{ LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
{ LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
@@ -513,24 +511,6 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_NOMIC_BERT_MOE,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
{ LLM_TENSOR_TOKEN_TYPES, "token_types" },
{ LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
},
},
{
LLM_ARCH_JINA_BERT_V2,
{
-2
View File
@@ -24,7 +24,6 @@ enum llm_arch {
LLM_ARCH_REFACT,
LLM_ARCH_BERT,
LLM_ARCH_NOMIC_BERT,
LLM_ARCH_NOMIC_BERT_MOE,
LLM_ARCH_JINA_BERT_V2,
LLM_ARCH_BLOOM,
LLM_ARCH_STABLELM,
@@ -114,7 +113,6 @@ enum llm_kv {
LLM_KV_EXPERT_WEIGHTS_SCALE,
LLM_KV_EXPERT_WEIGHTS_NORM,
LLM_KV_EXPERT_GATING_FUNC,
LLM_KV_MOE_EVERY_N_LAYERS,
LLM_KV_POOLING_TYPE,
LLM_KV_LOGIT_SCALE,
LLM_KV_DECODER_START_TOKEN_ID,
+17 -30
View File
@@ -50,8 +50,8 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
{ "deepseek3", LLM_CHAT_TEMPLATE_DEEPSEEK_3 },
{ "command-r", LLM_CHAT_TEMPLATE_COMMAND_R },
{ "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 },
{ "chatglm3", LLM_CHAT_TEMPLATE_CHATGLM_3 },
{ "chatglm4", LLM_CHAT_TEMPLATE_CHATGLM_4 },
{ "chatglm3", LLM_CHAT_TEMPLATE_CHATGML_3 },
{ "chatglm4", LLM_CHAT_TEMPLATE_CHATGML_4 },
{ "glmedge", LLM_CHAT_TEMPLATE_GLMEDGE },
{ "minicpm", LLM_CHAT_TEMPLATE_MINICPM },
{ "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
@@ -62,7 +62,6 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
{ "yandex", LLM_CHAT_TEMPLATE_YANDEX },
{ "bailing", LLM_CHAT_TEMPLATE_BAILING },
{ "llama4", LLM_CHAT_TEMPLATE_LLAMA4 },
{ "smolvlm", LLM_CHAT_TEMPLATE_SMOLVLM },
};
llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -82,9 +81,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
if (tmpl_contains("<|im_start|>")) {
return tmpl_contains("<|im_sep|>")
? LLM_CHAT_TEMPLATE_PHI_4
: tmpl_contains("<end_of_utterance>")
? LLM_CHAT_TEMPLATE_SMOLVLM // SmolVLM uses <|im_start|> as BOS, but it is NOT chatml
: LLM_CHAT_TEMPLATE_CHATML;
: LLM_CHAT_TEMPLATE_CHATML;
} else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) {
if (tmpl_contains("[SYSTEM_PROMPT]")) {
return LLM_CHAT_TEMPLATE_MISTRAL_V7;
@@ -122,12 +119,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
}
} else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
return LLM_CHAT_TEMPLATE_PHI_3;
} else if (tmpl_contains("[gMASK]<sop>")) {
return LLM_CHAT_TEMPLATE_CHATGLM_4;
} else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
return tmpl_contains("</s>") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE;
} else if (tmpl_contains("<|{{ item['role'] }}|>") && tmpl_contains("<|begin_of_image|>")) {
return LLM_CHAT_TEMPLATE_GLMEDGE;
} else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
return LLM_CHAT_TEMPLATE_ZEPHYR;
} else if (tmpl_contains("bos_token + message['role']")) {
@@ -156,7 +149,9 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
return LLM_CHAT_TEMPLATE_LLAMA_3;
} else if (tmpl_contains("[gMASK]sop")) {
// chatglm3-6b
return LLM_CHAT_TEMPLATE_CHATGLM_3;
return LLM_CHAT_TEMPLATE_CHATGML_3;
} else if (tmpl_contains("[gMASK]<sop>")) {
return LLM_CHAT_TEMPLATE_CHATGML_4;
} else if (tmpl_contains(LU8("<用户>"))) {
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF
return LLM_CHAT_TEMPLATE_MINICPM;
@@ -437,7 +432,7 @@ int32_t llm_chat_apply_template(
if (add_ass) {
ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
}
} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_3) {
} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) {
// chatglm3-6b
ss << "[gMASK]" << "sop";
for (auto message : chat) {
@@ -447,7 +442,7 @@ int32_t llm_chat_apply_template(
if (add_ass) {
ss << "<|assistant|>";
}
} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4 || tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) {
ss << "[gMASK]" << "<sop>";
for (auto message : chat) {
std::string role(message->role);
@@ -456,6 +451,14 @@ int32_t llm_chat_apply_template(
if (add_ass) {
ss << "<|assistant|>";
}
} else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
for (auto message : chat) {
std::string role(message->role);
ss << "<|" << role << "|>" << "\n" << message->content;
}
if (add_ass) {
ss << "<|assistant|>";
}
} else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) {
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF
for (auto message : chat) {
@@ -617,23 +620,7 @@ int32_t llm_chat_apply_template(
if (add_ass) {
ss << "<|header_start|>assistant<|header_end|>\n\n";
}
} else if (tmpl == LLM_CHAT_TEMPLATE_SMOLVLM) {
// SmolVLM
ss << "<|im_start|>"; // uses <|im_start|> as BOS, but the actual content is NOT chatml
for (auto message : chat) {
std::string role(message->role);
if (role == "system") {
ss << message->content << "\n\n";
} else if (role == "user") {
ss << "User: " << message->content << "<end_of_utterance>\n";
} else {
ss << "Assistant: " << message->content << "<end_of_utterance>\n";
}
}
if (add_ass) {
ss << "Assistant:";
}
} else {
} else {
// template not supported
return -1;
}
+2 -3
View File
@@ -29,8 +29,8 @@ enum llm_chat_template {
LLM_CHAT_TEMPLATE_DEEPSEEK_3,
LLM_CHAT_TEMPLATE_COMMAND_R,
LLM_CHAT_TEMPLATE_LLAMA_3,
LLM_CHAT_TEMPLATE_CHATGLM_3,
LLM_CHAT_TEMPLATE_CHATGLM_4,
LLM_CHAT_TEMPLATE_CHATGML_3,
LLM_CHAT_TEMPLATE_CHATGML_4,
LLM_CHAT_TEMPLATE_GLMEDGE,
LLM_CHAT_TEMPLATE_MINICPM,
LLM_CHAT_TEMPLATE_EXAONE_3,
@@ -41,7 +41,6 @@ enum llm_chat_template {
LLM_CHAT_TEMPLATE_YANDEX,
LLM_CHAT_TEMPLATE_BAILING,
LLM_CHAT_TEMPLATE_LLAMA4,
LLM_CHAT_TEMPLATE_SMOLVLM,
LLM_CHAT_TEMPLATE_UNKNOWN,
};
+17 -4
View File
@@ -114,7 +114,7 @@ llama_context::llama_context(
}
if (n_ctx_per_seq > hparams.n_ctx_train) {
LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
LLAMA_LOG_WARN("%s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
__func__, n_ctx_per_seq, hparams.n_ctx_train);
}
@@ -469,7 +469,8 @@ ggml_tensor * llama_context::build_rope_shift(
ggml_tensor * shift,
ggml_tensor * factors,
float freq_base,
float freq_scale) const {
float freq_scale,
ggml_backend_buffer * bbuf) const {
const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;
const auto & yarn_ext_factor = cparams.yarn_ext_factor;
@@ -491,7 +492,17 @@ ggml_tensor * llama_context::build_rope_shift(
// dequantize to f32 -> RoPE -> quantize back
tmp = ggml_cast(ctx0, cur, GGML_TYPE_F32);
tmp = ggml_rope_ext(ctx0, tmp,
if (bbuf) {
for (const auto & backend : backends) {
// Figure out which backend KV cache belongs to
if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(bbuf))) {
ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get());
break;
}
}
}
tmp = ggml_rope_ext_inplace(ctx0, tmp,
shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
@@ -571,7 +582,7 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
0);
ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);
ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l, kv_self->k_l[il]->buffer);
ggml_build_forward_expand(gf, cur);
}
@@ -1499,6 +1510,8 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
// set all ids as invalid (negative)
std::fill(output_ids.begin(), output_ids.end(), -1);
ggml_backend_buffer_clear(buf_output.get(), 0);
this->n_outputs = 0;
this->n_outputs_max = n_outputs_max;
+2 -1
View File
@@ -172,7 +172,8 @@ private:
ggml_tensor * shift,
ggml_tensor * factors,
float freq_base,
float freq_scale) const;
float freq_scale,
ggml_backend_buffer * bbuf) const;
llm_graph_result_ptr build_kv_self_shift(
ggml_context * ctx0,
+16 -42
View File
@@ -55,21 +55,7 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
if (ubatch->pos && pos) {
const int64_t n_tokens = ubatch->n_tokens;
if (ubatch->token && n_pos_per_embd == 4) {
// in case we're using M-RoPE with text tokens, convert the 1D positions to 4D
// the 3 first dims are the same, and 4th dim is all 0
std::vector<llama_pos> pos_data(n_tokens*n_pos_per_embd);
// copy the first dimension
for (int i = 0; i < n_tokens; ++i) {
pos_data[ i] = ubatch->pos[i];
pos_data[ n_tokens + i] = ubatch->pos[i];
pos_data[2 * n_tokens + i] = ubatch->pos[i];
pos_data[3 * n_tokens + i] = 0; // 4th dim is 0
}
ggml_backend_tensor_set(pos, pos_data.data(), 0, pos_data.size()*ggml_element_size(pos));
} else {
ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_embd*ggml_element_size(pos));
}
ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*ggml_element_size(pos));
}
}
@@ -85,7 +71,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
) * f_attn_temp_scale + 1.0;
}
ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*ggml_element_size(attn_scale));
ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*n_pos_per_token*ggml_element_size(attn_scale));
}
}
@@ -612,7 +598,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
res (std::make_unique<llm_graph_result>()) {
}
int64_t llm_graph_context::n_pos_per_embd() const {
int64_t llm_graph_context::n_pos_per_token() const {
return arch == LLM_ARCH_QWEN2VL ? 4 : 1;
}
@@ -823,10 +809,6 @@ ggml_tensor * llm_graph_context::build_ffn(
if (down) {
cur = build_lora_mm(down, cur);
if (arch == LLM_ARCH_GLM4) {
// GLM4 seems to have numerical issues with half-precision accumulators
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
}
}
if (down_b) {
@@ -934,35 +916,28 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
cb(up, "ffn_moe_up", il);
ggml_tensor * experts = nullptr;
if (gate_exps) {
cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
cb(cur, "ffn_moe_gate", il);
} else {
cur = up;
}
ggml_tensor * gate = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
cb(gate, "ffn_moe_gate", il);
switch (type_op) {
case LLM_FFN_SILU:
{
cur = ggml_silu(ctx0, cur);
cb(cur, "ffn_moe_silu", il);
gate = ggml_silu(ctx0, gate);
cb(gate, "ffn_moe_silu", il);
} break;
case LLM_FFN_GELU:
{
cur = ggml_gelu(ctx0, cur);
cb(cur, "ffn_moe_gelu", il);
gate = ggml_gelu(ctx0, gate);
cb(gate, "ffn_moe_gelu", il);
} break;
default:
GGML_ABORT("fatal error");
}
if (gate_exps) {
cur = ggml_mul(ctx0, cur, up); // [n_ff, n_expert_used, n_tokens]
cb(cur, "ffn_moe_gate_par", il);
}
ggml_tensor * par = ggml_mul(ctx0, up, gate); // [n_ff, n_expert_used, n_tokens]
cb(par, "ffn_moe_gate_par", il);
experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
ggml_tensor * experts = build_lora_mm_id(down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
cb(experts, "ffn_moe_down", il);
if (!weight_before_ffn) {
@@ -1045,11 +1020,11 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
}
ggml_tensor * llm_graph_context::build_inp_pos() const {
auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_embd());
auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_token());
auto & cur = inp->pos;
cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_embd());
cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token());
ggml_set_input(cur);
res->add_input(std::move(inp));
@@ -1058,12 +1033,11 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
}
ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
auto inp = std::make_unique<llm_graph_input_attn_temp>(n_pos_per_token(), hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
auto & cur = inp->attn_scale;
// this need to be 1x1xN for broadcasting
cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens);
cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens*n_pos_per_token());
ggml_set_input(cur);
res->add_input(std::move(inp));
+7 -5
View File
@@ -91,27 +91,29 @@ public:
class llm_graph_input_pos : public llm_graph_input_i {
public:
llm_graph_input_pos(int64_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {}
llm_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {}
virtual ~llm_graph_input_pos() = default;
void set_input(const llama_ubatch * ubatch) override;
ggml_tensor * pos = nullptr; // I32 [n_batch]
const int64_t n_pos_per_embd = 1;
const int64_t n_pos_per_token = 1;
};
// temperature tuning, used by llama4
class llm_graph_input_attn_temp : public llm_graph_input_i {
public:
llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
: n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
llm_graph_input_attn_temp(int64_t n_pos_per_token, uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
: n_pos_per_token(n_pos_per_token), n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
virtual ~llm_graph_input_attn_temp() = default;
void set_input(const llama_ubatch * ubatch) override;
ggml_tensor * attn_scale = nullptr; // F32 [n_batch]
const int64_t n_pos_per_token = 1;
const uint32_t n_attn_temp_floor_scale;
const float f_attn_temp_scale;
};
@@ -428,7 +430,7 @@ struct llm_graph_context {
llm_graph_context(const llm_graph_params & params);
int64_t n_pos_per_embd() const;
int64_t n_pos_per_token() const;
void cb(ggml_tensor * cur, const char * name, int il) const;
-1
View File
@@ -72,7 +72,6 @@ struct llama_hparams {
float expert_weights_scale = 0.0;
bool expert_weights_norm = false;
uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
uint32_t moe_every_n_layers = 0;
float f_norm_eps;
float f_norm_rms_eps;
+12 -60
View File
@@ -43,13 +43,11 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_770M: return "770M";
case LLM_TYPE_780M: return "780M";
case LLM_TYPE_0_5B: return "0.5B";
case LLM_TYPE_0_6B: return "0.6B";
case LLM_TYPE_1B: return "1B";
case LLM_TYPE_1_3B: return "1.3B";
case LLM_TYPE_1_4B: return "1.4B";
case LLM_TYPE_1_5B: return "1.5B";
case LLM_TYPE_1_6B: return "1.6B";
case LLM_TYPE_1_7B: return "1.7B";
case LLM_TYPE_1_8B: return "1.8B";
case LLM_TYPE_2B: return "2B";
case LLM_TYPE_2_8B: return "2.8B";
@@ -68,7 +66,6 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_15B: return "15B";
case LLM_TYPE_16B: return "16B";
case LLM_TYPE_20B: return "20B";
case LLM_TYPE_27B: return "27B";
case LLM_TYPE_30B: return "30B";
case LLM_TYPE_32B: return "32B";
case LLM_TYPE_34B: return "34B";
@@ -77,7 +74,6 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_65B: return "65B";
case LLM_TYPE_70B: return "70B";
case LLM_TYPE_236B: return "236B";
case LLM_TYPE_290B: return "290B";
case LLM_TYPE_314B: return "314B";
case LLM_TYPE_671B: return "671B";
case LLM_TYPE_SMALL: return "0.1B";
@@ -92,10 +88,10 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_16x3_8B: return "16x3.8B";
case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B";
case LLM_TYPE_57B_A14B: return "57B.A14B";
case LLM_TYPE_27B: return "27B";
case LLM_TYPE_290B: return "290B";
case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
case LLM_TYPE_30B_A3B: return "30B.A3B";
case LLM_TYPE_235B_A22B: return "235B.A22B";
default: return "?B";
}
}
@@ -713,12 +709,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
}
} break;
case LLM_ARCH_NOMIC_BERT:
case LLM_ARCH_NOMIC_BERT_MOE:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0);
if (hparams.n_layer == 12 && hparams.n_embd == 768) {
type = LLM_TYPE_137M;
@@ -811,10 +805,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
case 40: type = LLM_TYPE_14B; break;
case 64: type = LLM_TYPE_32B; break;
default: type = LLM_TYPE_UNKNOWN;
}
} break;
@@ -824,8 +814,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
case 48: type = LLM_TYPE_30B_A3B; break;
case 94: type = LLM_TYPE_235B_A22B; break;
default: type = LLM_TYPE_UNKNOWN;
}
} break;
@@ -2145,7 +2133,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
} break;
case LLM_ARCH_BERT:
case LLM_ARCH_NOMIC_BERT:
case LLM_ARCH_NOMIC_BERT_MOE:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0);
@@ -2179,31 +2166,20 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
}
if (arch == LLM_ARCH_NOMIC_BERT_MOE) {
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
}
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff, n_expert}, 0);
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
} else {
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE) {
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
} else {
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
}
if (arch == LLM_ARCH_BERT) {
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
} else {
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
}
layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
@@ -6098,11 +6074,6 @@ struct llm_build_bert : public llm_graph_context {
cur = build_lora_mm(model.layers[il].wqkv, cur);
cb(cur, "wqkv", il);
if (model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
cb(cur, "bqkv", il);
}
Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
@@ -6155,29 +6126,13 @@ struct llm_build_bert : public llm_graph_context {
cb(ffn_inp, "ffn_inp", il);
// feed-forward network
if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) {
// MoE branch
cur = build_moe_ffn(cur,
model.layers[il].ffn_gate_inp,
model.layers[il].ffn_up_exps,
nullptr,
model.layers[il].ffn_down_exps,
nullptr,
hparams.n_expert,
hparams.n_expert_used,
LLM_FFN_GELU,
false, false,
0.0f,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
cb(cur, "ffn_moe_out", il);
} else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
if (model.arch == LLM_ARCH_BERT) {
cur = build_ffn(cur,
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
NULL, NULL, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, il);
cb(cur, "ffn_out", il);
} else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
cur = build_ffn(cur,
model.layers[il].ffn_up, NULL, NULL,
@@ -6185,7 +6140,6 @@ struct llm_build_bert : public llm_graph_context {
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_GELU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
} else {
cur = build_ffn(cur,
model.layers[il].ffn_up, NULL, NULL,
@@ -6193,8 +6147,8 @@ struct llm_build_bert : public llm_graph_context {
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
}
cb(cur, "ffn_out", il);
// attentions bypass the intermediate layer
cur = ggml_add(ctx0, cur, ffn_inp);
@@ -13395,7 +13349,6 @@ llm_graph_result_ptr llama_model::build_graph(
case LLM_ARCH_BERT:
case LLM_ARCH_JINA_BERT_V2:
case LLM_ARCH_NOMIC_BERT:
case LLM_ARCH_NOMIC_BERT_MOE:
{
llm = std::make_unique<llm_build_bert>(*this, params, gf);
} break;
@@ -13761,7 +13714,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_DBRX:
case LLM_ARCH_BERT:
case LLM_ARCH_NOMIC_BERT:
case LLM_ARCH_NOMIC_BERT_MOE:
case LLM_ARCH_STABLELM:
case LLM_ARCH_BITNET:
case LLM_ARCH_QWEN:
+2 -6
View File
@@ -40,13 +40,11 @@ enum llm_type {
LLM_TYPE_770M,
LLM_TYPE_780M,
LLM_TYPE_0_5B,
LLM_TYPE_0_6B,
LLM_TYPE_1B,
LLM_TYPE_1_3B,
LLM_TYPE_1_4B,
LLM_TYPE_1_5B,
LLM_TYPE_1_6B,
LLM_TYPE_1_7B,
LLM_TYPE_1_8B,
LLM_TYPE_2B,
LLM_TYPE_2_8B,
@@ -66,7 +64,6 @@ enum llm_type {
LLM_TYPE_16B,
LLM_TYPE_20B,
LLM_TYPE_22B,
LLM_TYPE_27B,
LLM_TYPE_30B,
LLM_TYPE_32B,
LLM_TYPE_34B,
@@ -76,7 +73,6 @@ enum llm_type {
LLM_TYPE_70B,
LLM_TYPE_90B,
LLM_TYPE_236B,
LLM_TYPE_290B,
LLM_TYPE_314B,
LLM_TYPE_671B,
LLM_TYPE_SMALL,
@@ -91,10 +87,10 @@ enum llm_type {
LLM_TYPE_16x3_8B,
LLM_TYPE_10B_128x3_66B,
LLM_TYPE_57B_A14B,
LLM_TYPE_27B,
LLM_TYPE_290B,
LLM_TYPE_17B_16E, // llama4 Scout
LLM_TYPE_17B_128E, // llama4 Maverick
LLM_TYPE_30B_A3B,
LLM_TYPE_235B_A22B,
};
struct llama_layer_posnet {
+1 -2
View File
@@ -232,7 +232,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
// }
if (k <= 0) {
return;
k = cur_p->size;
}
k = std::min(k, (int) cur_p->size);
@@ -298,7 +298,6 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
}
cur_p->sorted = true;
}
cur_p->size = k;
}
+1 -2
View File
@@ -1497,8 +1497,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "llama3" ||
tokenizer_pre == "llama-v3" ||
tokenizer_pre == "llama-bpe"||
tokenizer_pre == "falcon3" ||
tokenizer_pre == "pixtral") {
tokenizer_pre == "falcon3") {
pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
ignore_merges = true;
add_bos = true;
@@ -85,7 +85,7 @@ index e2617b06..242e50a7 100644
/**
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 9fb2134f..04ce764e 100644
index a7febef7..31750b6f 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -534,6 +534,7 @@ struct ggml_backend_cuda_buffer_context {
@@ -125,10 +125,10 @@ index 50579227..2799a0a5 100644
static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index d92392ed..425524d0 100644
index 266d8af4..12886cd3 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -5077,6 +5077,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
@@ -4759,6 +4759,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
}
free(ctx);
@@ -149,10 +149,10 @@ index 05a2f4e6..392cc18d 100644
static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
index 140a775f..e33c4ba0 100644
index a0667b7d..bd83adc5 100644
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
@@ -477,6 +477,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@@ -468,6 +468,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
GGML_ASSERT(status);
delete ctx;
@@ -161,10 +161,10 @@ index 140a775f..e33c4ba0 100644
static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 66b6f2cc..e3e6deae 100644
index 1de34c96..4600f61e 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -317,6 +317,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
@@ -316,6 +316,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
ggml_sycl_set_device(ctx->device);
delete ctx;
@@ -172,7 +172,7 @@ index 66b6f2cc..e3e6deae 100644
}
catch (sycl::exception const &exc) {
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -762,6 +763,7 @@ struct ggml_backend_sycl_split_buffer_context {
@@ -761,6 +762,7 @@ struct ggml_backend_sycl_split_buffer_context {
static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
delete ctx;
@@ -180,7 +180,7 @@ index 66b6f2cc..e3e6deae 100644
}
static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -1096,6 +1098,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
@@ -1095,6 +1097,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_sycl_host_free(buffer->context);
@@ -189,10 +189,10 @@ index 66b6f2cc..e3e6deae 100644
static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index c0bdb9e1..03d03064 100644
index 39f3cd34..c569a8a5 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -8660,6 +8660,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@@ -8653,6 +8653,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
ggml_vk_destroy_buffer(ctx->dev_buffer);
delete ctx;
@@ -200,7 +200,7 @@ index c0bdb9e1..03d03064 100644
}
static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -8803,6 +8804,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
@@ -8796,6 +8797,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
ggml_vk_host_free(vk_instance.devices[0], buffer->context);
+2 -2
View File
@@ -10,7 +10,7 @@ logs instead of throwing an error
1 file changed, 3 insertions(+), 11 deletions(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 50ded286..a9ee9f03 100644
index 48060517..a35b498c 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1491,16 +1491,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
@@ -31,7 +31,7 @@ index 50ded286..a9ee9f03 100644
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
tokenizer_pre == "llama3" ||
@@ -1635,7 +1626,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
@@ -1634,7 +1625,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
clean_spaces = false;
} else {
+4 -4
View File
@@ -11,10 +11,10 @@ instead of forcing one or the error
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 5a2eef9b..9c1fe93f 100644
index 983385f8..32f59819 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1225,7 +1225,7 @@ int llama_context::decode(llama_batch & inp_batch) {
@@ -1236,7 +1236,7 @@ int llama_context::decode(llama_batch & inp_batch) {
int64_t n_outputs_all = 0;
// count outputs
@@ -23,7 +23,7 @@ index 5a2eef9b..9c1fe93f 100644
for (uint32_t i = 0; i < n_tokens_all; ++i) {
n_outputs_all += batch.logits[i] != 0;
}
@@ -1337,7 +1337,7 @@ int llama_context::decode(llama_batch & inp_batch) {
@@ -1348,7 +1348,7 @@ int llama_context::decode(llama_batch & inp_batch) {
// ggml_graph_dump_dot(gf, NULL, "llama.dot");
//}
@@ -32,7 +32,7 @@ index 5a2eef9b..9c1fe93f 100644
auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr;
if (t_embd && res->get_embd_pooled()) {
@@ -1481,7 +1481,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
@@ -1492,7 +1492,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
const auto n_embd = hparams.n_embd;
// TODO: use a per-batch flag for logits presence instead
+5 -5
View File
@@ -10,12 +10,12 @@ filesystems for paths that include wide characters
1 file changed, 39 insertions(+)
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index ad3e7df1..b3218c78 100644
index 75970615..d57b4bd6 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -30,6 +30,19 @@
@@ -29,6 +29,19 @@
#include <limits>
#include <array>
#include <numeric>
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
@@ -33,7 +33,7 @@ index ad3e7df1..b3218c78 100644
struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
//#define CLIP_DEBUG_FUNCTIONS
@@ -1971,7 +1984,29 @@ struct clip_model_loader {
@@ -1430,7 +1443,29 @@ struct clip_model_loader {
{
std::vector<uint8_t> read_buf;
@@ -63,7 +63,7 @@ index ad3e7df1..b3218c78 100644
if (!fin) {
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
}
@@ -1998,7 +2033,11 @@ struct clip_model_loader {
@@ -1457,7 +1492,11 @@ struct clip_model_loader {
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
}
}
+21 -21
View File
@@ -15,10 +15,10 @@ adds support for the Solar Pro architecture
7 files changed, 248 insertions(+)
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index f2bc8ca7..5ab3f572 100644
index 62e1480b..f754bc8f 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -69,6 +69,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
@@ -68,6 +68,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE, "granite" },
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_CHAMELEON, "chameleon" },
@@ -26,7 +26,7 @@ index f2bc8ca7..5ab3f572 100644
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_PLM, "plm" },
{ LLM_ARCH_BAILINGMOE, "bailingmoe" },
@@ -142,6 +143,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
@@ -140,6 +141,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
@@ -34,7 +34,7 @@ index f2bc8ca7..5ab3f572 100644
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
@@ -1502,6 +1504,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
@@ -1482,6 +1484,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
},
},
@@ -59,7 +59,7 @@ index f2bc8ca7..5ab3f572 100644
{
LLM_ARCH_WAVTOKENIZER_DEC,
{
@@ -1680,6 +1700,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
@@ -1660,6 +1680,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
@@ -68,10 +68,10 @@ index f2bc8ca7..5ab3f572 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 41a023da..525c1b7d 100644
index 98ca00a1..439aaeab 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -73,6 +73,7 @@ enum llm_arch {
@@ -72,6 +72,7 @@ enum llm_arch {
LLM_ARCH_GRANITE,
LLM_ARCH_GRANITE_MOE,
LLM_ARCH_CHAMELEON,
@@ -79,7 +79,7 @@ index 41a023da..525c1b7d 100644
LLM_ARCH_WAVTOKENIZER_DEC,
LLM_ARCH_PLM,
LLM_ARCH_BAILINGMOE,
@@ -146,6 +147,7 @@ enum llm_kv {
@@ -144,6 +145,7 @@ enum llm_kv {
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
@@ -87,7 +87,7 @@ index 41a023da..525c1b7d 100644
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
@@ -346,6 +348,7 @@ enum llm_tensor {
@@ -344,6 +346,7 @@ enum llm_tensor {
LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
@@ -115,7 +115,7 @@ index 90dfe7a7..8a667960 100644
if (il < n_layer) {
return n_swa > 0 && n_swa_pattern > 0 && il % n_swa_pattern < (n_swa_pattern - 1);
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 7ee6a5b7..48dce407 100644
index 80fcd65d..6e278945 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -55,6 +55,8 @@ struct llama_hparams {
@@ -127,7 +127,7 @@ index 7ee6a5b7..48dce407 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0;
@@ -154,6 +156,9 @@ struct llama_hparams {
@@ -153,6 +155,9 @@ struct llama_hparams {
// dimension of the recurrent state embeddings
uint32_t n_embd_v_s() const;
@@ -150,10 +150,10 @@ index ea73a8a7..a012aeae 100644
llama_model_loader::llama_model_loader(
const std::string & fname,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 822e2bb2..572378c9 100644
index 6b7bfecf..aba42819 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1386,6 +1386,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -1374,6 +1374,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
@@ -175,7 +175,7 @@ index 822e2bb2..572378c9 100644
case LLM_ARCH_WAVTOKENIZER_DEC:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3741,6 +3756,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
@@ -3717,6 +3732,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
@@ -210,7 +210,7 @@ index 822e2bb2..572378c9 100644
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -12342,6 +12385,165 @@ struct llm_build_chameleon : public llm_graph_context {
@@ -12296,6 +12339,165 @@ struct llm_build_chameleon : public llm_graph_context {
}
};
@@ -376,7 +376,7 @@ index 822e2bb2..572378c9 100644
struct llm_build_wavtokenizer_dec : public llm_graph_context {
llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
ggml_tensor * cur;
@@ -13092,6 +13294,10 @@ llm_graph_result_ptr llama_model::build_graph(
@@ -13045,6 +13247,10 @@ llm_graph_result_ptr llama_model::build_graph(
{
llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
} break;
@@ -387,7 +387,7 @@ index 822e2bb2..572378c9 100644
case LLM_ARCH_WAVTOKENIZER_DEC:
{
llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf);
@@ -13238,6 +13444,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
@@ -13191,6 +13397,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON:
@@ -396,18 +396,18 @@ index 822e2bb2..572378c9 100644
return LLAMA_ROPE_TYPE_NORM;
diff --git a/src/llama-model.h b/src/llama-model.h
index 95eca002..856e6042 100644
index fd82d106..5865d5e9 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -64,6 +64,7 @@ enum llm_type {
@@ -62,6 +62,7 @@ enum llm_type {
LLM_TYPE_15B,
LLM_TYPE_16B,
LLM_TYPE_20B,
+ LLM_TYPE_22B,
LLM_TYPE_27B,
LLM_TYPE_30B,
LLM_TYPE_32B,
@@ -311,6 +312,8 @@ struct llama_layer {
LLM_TYPE_34B,
@@ -307,6 +308,8 @@ struct llama_layer {
struct ggml_tensor * ffn_up_scale = nullptr;
struct ggml_tensor * ffn_down_scale = nullptr;
+88 -66
View File
@@ -5,6 +5,7 @@ Subject: [PATCH] add mllama support
adds support for the llama 3.2 vision architecture
---
examples/llava/gemma3-cli.cpp | 3 +-
examples/llava/llava.cpp | 5 +-
examples/llava/mtmd.cpp | 6 +-
ggml/src/ggml-backend-reg.cpp | 6 +-
@@ -24,13 +25,34 @@ adds support for the llama 3.2 vision architecture
src/llama-model.cpp | 309 +++++++++++++++++++++++++++++++++-
src/llama-model.h | 12 ++
src/llama-quant.cpp | 4 +-
19 files changed, 473 insertions(+), 21 deletions(-)
20 files changed, 475 insertions(+), 22 deletions(-)
diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp
index 3d566475..654d1358 100644
--- a/examples/llava/gemma3-cli.cpp
+++ b/examples/llava/gemma3-cli.cpp
@@ -106,7 +106,7 @@ struct decode_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
- decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -118,6 +118,7 @@ struct decode_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
+ /*n_embd =*/ n_embd,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index c00d16ae..bab027b5 100644
index 03a22cbb..5eb40bcd 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -457,7 +457,7 @@ struct llava_embd_batch {
@@ -456,7 +456,7 @@ struct llava_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
@@ -39,7 +61,7 @@ index c00d16ae..bab027b5 100644
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -469,6 +469,7 @@ struct llava_embd_batch {
@@ -468,6 +468,7 @@ struct llava_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
@@ -47,7 +69,7 @@ index c00d16ae..bab027b5 100644
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
@@ -492,7 +493,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
@@ -491,7 +492,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
n_eval = n_batch;
}
float * embd = image_embed->embed+i*n_embd;
@@ -57,19 +79,19 @@ index c00d16ae..bab027b5 100644
LOG_ERR("%s : failed to eval\n", __func__);
return false;
diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
index 7081fd73..c14ac501 100644
index 3fd5bebc..f0cec596 100644
--- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
@@ -476,7 +476,7 @@ struct decode_embd_batch {
@@ -233,7 +233,7 @@ struct decode_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
- decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
+ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
pos .resize(n_tokens * n_pos_per_embd);
- decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -487,6 +487,7 @@ struct decode_embd_batch {
@@ -245,6 +245,7 @@ struct decode_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
@@ -77,16 +99,16 @@ index 7081fd73..c14ac501 100644
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
@@ -610,7 +611,8 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
int32_t i_batch = 0;
int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
float * embd = mtmd_get_output_embd(ctx);
- decode_embd_batch batch_embd(embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
+ int n_embd = llama_model_n_embd(llama_get_model(lctx));
+ decode_embd_batch batch_embd(embd, n_embd, n_tokens, n_past, 0);
@@ -311,7 +312,8 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
const int nx = mtmd_image_tokens_get_nx(chunk.tokens_image.get());
const int ny = mtmd_image_tokens_get_ny(chunk.tokens_image.get());
int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get());
float * embd = mtmd_get_output_embd(ctx);
- decode_embd_batch batch_img(embd, n_tokens, n_past, 0);
+ int n_embd = llama_model_n_embd(llama_get_model(lctx));
+ decode_embd_batch batch_img(embd, n_embd, n_tokens, n_past, 0);
int64_t t1 = ggml_time_ms();
ret = llama_decode(lctx, batch_img.batch);
if (ret != 0) {
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 405d8e31..82ae1b5b 100644
--- a/ggml/src/ggml-backend-reg.cpp
@@ -105,10 +127,10 @@ index 405d8e31..82ae1b5b 100644
register_backend(ggml_backend_rpc_reg());
#endif
diff --git a/include/llama.h b/include/llama.h
index 06c56395..f1628e88 100644
index 5657fbf0..f91896e4 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -256,6 +256,7 @@ extern "C" {
@@ -255,6 +255,7 @@ extern "C" {
llama_token * token;
float * embd;
@@ -116,7 +138,7 @@ index 06c56395..f1628e88 100644
llama_pos * pos;
int32_t * n_seq_id;
llama_seq_id ** seq_id;
@@ -358,6 +359,7 @@ extern "C" {
@@ -357,6 +358,7 @@ extern "C" {
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
bool no_perf; // whether to measure performance timings
@@ -124,7 +146,7 @@ index 06c56395..f1628e88 100644
// Abort callback
// if it returns true, execution of llama_decode() will be aborted
@@ -459,6 +461,10 @@ extern "C" {
@@ -458,6 +460,10 @@ extern "C" {
struct llama_context_params params),
"use llama_init_from_model instead");
@@ -136,7 +158,7 @@ index 06c56395..f1628e88 100644
LLAMA_API void llama_free(struct llama_context * ctx);
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 5ab3f572..eb7b5325 100644
index f754bc8f..0568565f 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -6,6 +6,7 @@
@@ -147,7 +169,7 @@ index 5ab3f572..eb7b5325 100644
{ LLM_ARCH_LLAMA4, "llama4" },
{ LLM_ARCH_DECI, "deci" },
{ LLM_ARCH_FALCON, "falcon" },
@@ -144,6 +145,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
@@ -142,6 +143,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
@@ -155,7 +177,7 @@ index 5ab3f572..eb7b5325 100644
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
@@ -273,6 +275,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
@@ -271,6 +273,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
},
},
@@ -196,7 +218,7 @@ index 5ab3f572..eb7b5325 100644
{
LLM_ARCH_DECI,
{
@@ -1701,6 +1737,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
@@ -1681,6 +1717,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
{LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
@@ -212,7 +234,7 @@ index 5ab3f572..eb7b5325 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 525c1b7d..bc8a4f0b 100644
index 439aaeab..6a989034 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -11,6 +11,7 @@
@@ -223,7 +245,7 @@ index 525c1b7d..bc8a4f0b 100644
LLM_ARCH_DECI,
LLM_ARCH_FALCON,
LLM_ARCH_BAICHUAN,
@@ -148,6 +149,7 @@ enum llm_kv {
@@ -146,6 +147,7 @@ enum llm_kv {
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
@@ -231,7 +253,7 @@ index 525c1b7d..bc8a4f0b 100644
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
@@ -349,6 +351,14 @@ enum llm_tensor {
@@ -347,6 +349,14 @@ enum llm_tensor {
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
LLM_TENSOR_BSKCN_TV,
@@ -275,10 +297,10 @@ index 01d5ca57..8682b0e6 100644
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
}
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 9c1fe93f..cd06ad91 100644
index 32f59819..0343ba8a 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -851,7 +851,7 @@ float * llama_context::get_logits_ith(int32_t i) {
@@ -862,7 +862,7 @@ float * llama_context::get_logits_ith(int32_t i) {
throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs));
}
@@ -287,7 +309,7 @@ index 9c1fe93f..cd06ad91 100644
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
#ifndef NDEBUG
@@ -972,6 +972,10 @@ void llama_context::set_warmup(bool value) {
@@ -983,6 +983,10 @@ void llama_context::set_warmup(bool value) {
cparams.warmup = value;
}
@@ -298,7 +320,7 @@ index 9c1fe93f..cd06ad91 100644
void llama_context::set_adapter_lora(
llama_adapter_lora * adapter,
float scale) {
@@ -1047,7 +1051,7 @@ int llama_context::encode(llama_batch & inp_batch) {
@@ -1058,7 +1062,7 @@ int llama_context::encode(llama_batch & inp_batch) {
const int64_t n_embd = hparams.n_embd;
@@ -307,7 +329,7 @@ index 9c1fe93f..cd06ad91 100644
const llama_ubatch ubatch = sbatch.split_simple(n_tokens);
@@ -1187,10 +1191,9 @@ int llama_context::decode(llama_batch & inp_batch) {
@@ -1198,10 +1202,9 @@ int llama_context::decode(llama_batch & inp_batch) {
const llama_batch & batch = batch_allocr.batch;
@@ -319,7 +341,7 @@ index 9c1fe93f..cd06ad91 100644
const int64_t n_tokens_all = batch.n_tokens;
const int64_t n_embd = hparams.n_embd;
@@ -1238,7 +1241,7 @@ int llama_context::decode(llama_batch & inp_batch) {
@@ -1249,7 +1252,7 @@ int llama_context::decode(llama_batch & inp_batch) {
const bool logits_all = n_outputs_all == n_tokens_all;
@@ -328,7 +350,7 @@ index 9c1fe93f..cd06ad91 100644
/* simple_split */ !kv_self->recurrent,
/* logits_all */ logits_all);
@@ -1472,12 +1475,11 @@ int llama_context::decode(llama_batch & inp_batch) {
@@ -1483,12 +1486,11 @@ int llama_context::decode(llama_batch & inp_batch) {
int32_t llama_context::output_reserve(int32_t n_outputs) {
const auto & hparams = model.hparams;
@@ -342,7 +364,7 @@ index 9c1fe93f..cd06ad91 100644
const auto n_embd = hparams.n_embd;
// TODO: use a per-batch flag for logits presence instead
@@ -1545,7 +1547,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
@@ -1558,7 +1560,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
void llama_context::output_reorder() {
auto & out_ids = sbatch.out_ids;
if (!out_ids.empty()) {
@@ -351,7 +373,7 @@ index 9c1fe93f..cd06ad91 100644
const uint32_t n_embd = model.hparams.n_embd;
GGML_ASSERT((size_t) n_outputs == out_ids.size());
@@ -2052,7 +2054,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
@@ -2065,7 +2067,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
{
LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__);
@@ -360,7 +382,7 @@ index 9c1fe93f..cd06ad91 100644
io.write(&logits_size, sizeof(logits_size));
@@ -2235,6 +2237,7 @@ llama_context_params llama_context_default_params() {
@@ -2248,6 +2250,7 @@ llama_context_params llama_context_default_params() {
/*.offload_kqv =*/ true,
/*.flash_attn =*/ false,
/*.no_perf =*/ true,
@@ -368,7 +390,7 @@ index 9c1fe93f..cd06ad91 100644
/*.abort_callback =*/ nullptr,
/*.abort_callback_data =*/ nullptr,
};
@@ -2362,6 +2365,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) {
@@ -2375,6 +2378,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) {
ctx->set_warmup(warmup);
}
@@ -380,7 +402,7 @@ index 9c1fe93f..cd06ad91 100644
ctx->synchronize();
}
diff --git a/src/llama-context.h b/src/llama-context.h
index 5457f077..a50c4afa 100644
index 04facb54..baa03276 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -65,6 +65,7 @@ struct llama_context {
@@ -404,10 +426,10 @@ index 30e550f0..85ad91b9 100644
enum llama_pooling_type pooling_type;
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index fabb9ca2..b67216a4 100644
index a85e9728..d740c120 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -560,6 +560,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
@@ -546,6 +546,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
}
}
@@ -420,7 +442,7 @@ index fabb9ca2..b67216a4 100644
//
// llm_graph_context
//
@@ -1532,6 +1538,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
@@ -1506,6 +1512,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
return (llm_graph_input_attn_cross *) res->add_input(std::move(inp));
}
@@ -447,7 +469,7 @@ index fabb9ca2..b67216a4 100644
llm_graph_input_attn_cross * inp,
ggml_cgraph * gf,
diff --git a/src/llama-graph.h b/src/llama-graph.h
index d0c8d321..0fe18150 100644
index d192dc14..260a2af2 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -86,6 +86,7 @@ public:
@@ -458,7 +480,7 @@ index d0c8d321..0fe18150 100644
};
class llm_graph_input_pos : public llm_graph_input_i {
@@ -283,6 +284,16 @@ public:
@@ -285,6 +286,16 @@ public:
const llama_cross * cross = nullptr;
};
@@ -475,7 +497,7 @@ index d0c8d321..0fe18150 100644
//
// llm_graph_result
//
@@ -491,6 +502,7 @@ struct llm_graph_context {
@@ -493,6 +504,7 @@ struct llm_graph_context {
ggml_tensor * build_inp_cls() const;
ggml_tensor * build_inp_s_copy() const;
ggml_tensor * build_inp_s_mask() const;
@@ -496,7 +518,7 @@ index 8a667960..6a02de03 100644
+ return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
+}
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 48dce407..b6fc7e6d 100644
index 6e278945..c8a34d52 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -2,6 +2,8 @@
@@ -524,7 +546,7 @@ index 48dce407..b6fc7e6d 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
@@ -159,6 +163,9 @@ struct llama_hparams {
@@ -158,6 +162,9 @@ struct llama_hparams {
// Block skip connection
bool n_bskcn(uint32_t n, uint32_t il) const;
@@ -571,10 +593,10 @@ index a012aeae..2e11507d 100644
bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
const int kid = gguf_find_key(meta.get(), key.c_str());
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 572378c9..9d099f11 100644
index aba42819..d051696c 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -423,6 +423,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -419,6 +419,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
// get general kv
ml.get_key(LLM_KV_GENERAL_NAME, name, false);
@@ -582,7 +604,7 @@ index 572378c9..9d099f11 100644
// everything past this point is not vocab-related
if (hparams.vocab_only) {
@@ -434,6 +435,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -430,6 +431,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
@@ -590,7 +612,7 @@ index 572378c9..9d099f11 100644
if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
@@ -457,9 +459,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -453,9 +455,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
@@ -602,7 +624,7 @@ index 572378c9..9d099f11 100644
// n_head_kv is optional, default to n_head
hparams.n_head_kv_arr = hparams.n_head_arr;
@@ -512,7 +516,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -508,7 +512,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
@@ -611,7 +633,7 @@ index 572378c9..9d099f11 100644
if (hparams.n_rot != hparams.n_embd_head_k) {
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
}
@@ -575,6 +579,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -571,6 +575,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.use_kq_norm = false;
}
} break;
@@ -628,7 +650,7 @@ index 572378c9..9d099f11 100644
case LLM_ARCH_DECI:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1562,7 +1576,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
@@ -1550,7 +1564,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const int64_t n_embd_head_v = hparams.n_embd_head_v;
const int64_t n_ff = hparams.n_ff();
const int64_t n_embd_gqa = n_embd_v_gqa;
@@ -637,7 +659,7 @@ index 572378c9..9d099f11 100644
const int64_t n_token_types = vocab.n_token_types();
const int64_t n_rot = hparams.n_rot;
const int64_t n_expert = hparams.n_expert;
@@ -1815,6 +1829,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
@@ -1803,6 +1817,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}
}
} break;
@@ -690,7 +712,7 @@ index 572378c9..9d099f11 100644
case LLM_ARCH_DECI:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -4707,6 +4767,246 @@ struct llm_build_llama : public llm_graph_context {
@@ -4683,6 +4743,246 @@ struct llm_build_llama : public llm_graph_context {
}
};
@@ -937,7 +959,7 @@ index 572378c9..9d099f11 100644
struct llm_build_deci : public llm_graph_context {
llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -13063,6 +13363,10 @@ llm_graph_result_ptr llama_model::build_graph(
@@ -13017,6 +13317,10 @@ llm_graph_result_ptr llama_model::build_graph(
{
llm = std::make_unique<llm_build_llama>(*this, params, gf);
} break;
@@ -948,7 +970,7 @@ index 572378c9..9d099f11 100644
case LLM_ARCH_DECI:
{
llm = std::make_unique<llm_build_deci>(*this, params, gf);
@@ -13424,6 +13728,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
@@ -13377,6 +13681,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
// use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA:
case LLM_ARCH_LLAMA4:
@@ -957,7 +979,7 @@ index 572378c9..9d099f11 100644
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER:
diff --git a/src/llama-model.h b/src/llama-model.h
index 856e6042..6be91282 100644
index 5865d5e9..72bab5be 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -11,6 +11,7 @@
@@ -968,15 +990,15 @@ index 856e6042..6be91282 100644
struct llama_cparams;
struct llama_ubatch;
@@ -73,6 +74,7 @@ enum llm_type {
@@ -70,6 +71,7 @@ enum llm_type {
LLM_TYPE_40B,
LLM_TYPE_65B,
LLM_TYPE_70B,
+ LLM_TYPE_90B,
LLM_TYPE_236B,
LLM_TYPE_290B,
LLM_TYPE_314B,
@@ -314,6 +316,16 @@ struct llama_layer {
LLM_TYPE_671B,
@@ -310,6 +312,16 @@ struct llama_layer {
struct ggml_tensor * bskcn_tv = nullptr;
+28 -28
View File
@@ -18,10 +18,10 @@ adds the unpad operator to GGML
10 files changed, 223 insertions(+), 2 deletions(-)
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 1b8603e7..53ef31b2 100644
index 8fcc16df..d19fc167 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -489,6 +489,7 @@ extern "C" {
@@ -488,6 +488,7 @@ extern "C" {
GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD,
GGML_OP_PAD_REFLECT_1D,
@@ -29,7 +29,7 @@ index 1b8603e7..53ef31b2 100644
GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT,
@@ -1777,6 +1778,15 @@ extern "C" {
@@ -1757,6 +1758,15 @@ extern "C" {
int p0,
int p1);
@@ -46,10 +46,10 @@ index 1b8603e7..53ef31b2 100644
// timesteps: [N,]
// return: [N, dim]
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 64405449..34624cca 100644
index 50400328..432942bf 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -1964,6 +1964,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
@@ -1960,6 +1960,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_pad_reflect_1d(params, tensor);
} break;
@@ -60,7 +60,7 @@ index 64405449..34624cca 100644
case GGML_OP_ARANGE:
{
ggml_compute_forward_arange(params, tensor);
@@ -2287,6 +2291,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
@@ -2282,6 +2286,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
@@ -69,10 +69,10 @@ index 64405449..34624cca 100644
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 7413192b..becdae07 100644
index 6050147b..66b8da68 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -6703,6 +6703,61 @@ void ggml_compute_forward_pad_reflect_1d(
@@ -6531,6 +6531,61 @@ void ggml_compute_forward_pad_reflect_1d(
}
}
@@ -135,10 +135,10 @@ index 7413192b..becdae07 100644
static void ggml_compute_forward_arange_f32(
diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h
index dc081b9e..a7125555 100644
index 410a3720..3eca1cf8 100644
--- a/ggml/src/ggml-cpu/ops.h
+++ b/ggml/src/ggml-cpu/ops.h
@@ -72,6 +72,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params
@@ -71,6 +71,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params
void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -147,10 +147,10 @@ index dc081b9e..a7125555 100644
void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 04ce764e..491acccb 100644
index 31750b6f..0fef9522 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2223,6 +2223,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
@@ -2246,6 +2246,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_PAD:
ggml_cuda_op_pad(ctx, dst);
break;
@@ -160,7 +160,7 @@ index 04ce764e..491acccb 100644
case GGML_OP_ARANGE:
ggml_cuda_op_arange(ctx, dst);
break;
@@ -3197,6 +3200,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
@@ -3222,6 +3225,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_UPSCALE:
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
case GGML_OP_PAD:
@@ -233,7 +233,7 @@ index 8fd386b0..e2ededc3 100644
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index 425524d0..112abef6 100644
index 12886cd3..b2e95a66 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -341,6 +341,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
@@ -244,7 +244,7 @@ index 425524d0..112abef6 100644
GGML_METAL_KERNEL_TYPE_ARANGE_F32,
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
@@ -1277,6 +1278,7 @@ @implementation GGMLMetalClass
@@ -1020,6 +1021,7 @@ @implementation GGMLMetalClass
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
@@ -252,7 +252,7 @@ index 425524d0..112abef6 100644
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
@@ -1647,6 +1649,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
@@ -1384,6 +1386,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_OP_POOL_2D:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
@@ -260,7 +260,7 @@ index 425524d0..112abef6 100644
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
case GGML_OP_LEAKY_RELU:
@@ -4047,6 +4050,36 @@ static bool ggml_metal_encode_node(
@@ -3731,6 +3734,36 @@ static void ggml_metal_encode_node(
const int nth = MIN(1024, ne0);
@@ -298,7 +298,7 @@ index 425524d0..112abef6 100644
} break;
case GGML_OP_ARANGE:
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 9f4147e9..6ceb3cef 100644
index 8d6e99e6..71f0f97f 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -2975,6 +2975,51 @@ kernel void kernel_pad_reflect_1d_f32(
@@ -354,10 +354,10 @@ index 9f4147e9..6ceb3cef 100644
device char * dst,
constant ggml_metal_kargs_arange & args,
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 7654ae17..3c57aff8 100644
index 950772c7..2276b631 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -923,6 +923,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
@@ -963,6 +963,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"UPSCALE",
"PAD",
"PAD_REFLECT_1D",
@@ -365,16 +365,16 @@ index 7654ae17..3c57aff8 100644
"ARANGE",
"TIMESTEP_EMBEDDING",
"ARGSORT",
@@ -953,7 +954,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
@@ -993,7 +994,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"OPT_STEP_ADAMW",
};
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@@ -1018,6 +1019,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
@@ -1057,6 +1058,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"upscale(x)",
"pad(x)",
"pad_reflect_1d(x)",
@@ -382,16 +382,16 @@ index 7654ae17..3c57aff8 100644
"arange(start, stop, step)",
"timestep_embedding(timesteps, dim, max_period)",
"argsort(x)",
@@ -1048,7 +1050,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
@@ -1087,7 +1089,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"adamw(x)",
};
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -4270,6 +4272,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
@@ -4262,6 +4264,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
return result;
}
@@ -12,7 +12,7 @@ regex
2 files changed, 22 insertions(+), 1 deletion(-)
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index a9ee9f03..1306864e 100644
index a35b498c..032019c9 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -296,7 +296,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
@@ -8,10 +8,10 @@ Subject: [PATCH] maintain ordering for rules for grammar
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index 5b3059c2..656b3eca 100644
index 90679822..56043678 100644
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -349,7 +349,7 @@ private:
@@ -346,7 +346,7 @@ private:
friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
std::function<json(const std::string &)> _fetch_json;
bool _dotall;
@@ -22,10 +22,10 @@ multiple batches of processing until everything is complete.
4 files changed, 51 insertions(+), 106 deletions(-)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index cd06ad91..77177c5e 100644
index 0343ba8a..4b3e6a83 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -583,13 +583,12 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
@@ -594,13 +594,12 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
llm_graph_result_ptr llama_context::build_kv_self_defrag(
ggml_context * ctx0,
@@ -41,7 +41,7 @@ index cd06ad91..77177c5e 100644
#if 0
// CPU defrag
//
@@ -661,32 +660,20 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
@@ -672,32 +671,20 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
}
#else
@@ -79,7 +79,7 @@ index cd06ad91..77177c5e 100644
ggml_tensor * view_v_src;
ggml_tensor * view_v_dst;
@@ -694,34 +681,30 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
@@ -705,34 +692,30 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
if (cparams.flash_attn) {
// NOTE: the V cache is not transposed when using flash attention
view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il],
@@ -122,7 +122,7 @@ index cd06ad91..77177c5e 100644
#endif
return res;
@@ -730,8 +713,6 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
@@ -741,8 +724,6 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
void llama_context::kv_self_update() {
auto & kv = kv_self;
@@ -131,7 +131,7 @@ index cd06ad91..77177c5e 100644
if (kv->has_shift) {
if (!kv->get_can_shift()) {
GGML_ABORT("The current context does not support K-shift");
@@ -752,8 +733,6 @@ void llama_context::kv_self_update() {
@@ -763,8 +744,6 @@ void llama_context::kv_self_update() {
res->set_inputs(nullptr);
graph_compute(gf, false);
@@ -140,7 +140,7 @@ index cd06ad91..77177c5e 100644
}
{
@@ -768,49 +747,28 @@ void llama_context::kv_self_update() {
@@ -779,49 +758,28 @@ void llama_context::kv_self_update() {
// defragment the KV cache if needed
if (kv->do_defrag) {
LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
@@ -202,7 +202,7 @@ index cd06ad91..77177c5e 100644
}
enum llama_pooling_type llama_context::pooling_type() const {
@@ -1294,9 +1252,12 @@ int llama_context::decode(llama_batch & inp_batch) {
@@ -1305,9 +1263,12 @@ int llama_context::decode(llama_batch & inp_batch) {
// find KV slot
{
if (!kv_self->find_slot(ubatch)) {
@@ -219,7 +219,7 @@ index cd06ad91..77177c5e 100644
if (!kv_self->recurrent) {
diff --git a/src/llama-context.h b/src/llama-context.h
index a50c4afa..30f84bfd 100644
index baa03276..a59ff8fd 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -5,6 +5,7 @@
@@ -230,7 +230,7 @@ index a50c4afa..30f84bfd 100644
#include "ggml-cpp.h"
@@ -179,7 +180,8 @@ private:
@@ -180,7 +181,8 @@ private:
llm_graph_result_ptr build_kv_self_defrag(
ggml_context * ctx0,
@@ -8,10 +8,10 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants
1 file changed, 2 insertions(+)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 43d9fc4f..4c0d3824 100644
index f00700da..91d6a7d5 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -279,6 +279,7 @@ function(ggml_add_cpu_backend_variant tag_name)
@@ -278,6 +278,7 @@ function(ggml_add_cpu_backend_variant tag_name)
endforeach()
ggml_add_cpu_backend_variant_impl(${tag_name})
@@ -19,11 +19,11 @@ index 43d9fc4f..4c0d3824 100644
endfunction()
ggml_add_backend(CPU)
@@ -287,6 +288,7 @@ if (GGML_CPU_ALL_VARIANTS)
@@ -286,6 +287,7 @@ if (GGML_CPU_ALL_VARIANTS)
if (NOT GGML_BACKEND_DL)
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
endif()
+ add_custom_target(ggml-cpu)
ggml_add_cpu_backend_variant(x64)
ggml_add_cpu_backend_variant(sse42 SSE42)
ggml_add_cpu_backend_variant(sandybridge SSE42 AVX)
ggml_add_cpu_backend_variant(sandybridge AVX)
ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 BMI2 FMA)
ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 BMI2 FMA AVX512)
+7 -7
View File
@@ -1,6 +1,6 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Thu, 1 May 2025 15:05:08 -0700
Date: Tue, 8 Apr 2025 20:33:01 -0700
Subject: [PATCH] remove amx
disable amx as it reduces performance on some systems
@@ -9,16 +9,16 @@ disable amx as it reduces performance on some systems
1 file changed, 4 deletions(-)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 4c0d3824..79c26312 100644
index 91d6a7d5..d6b393a2 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -296,10 +296,6 @@ if (GGML_CPU_ALL_VARIANTS)
ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
@@ -293,10 +293,6 @@ if (GGML_CPU_ALL_VARIANTS)
ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 BMI2 FMA AVX512)
ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 BMI2 FMA AVX_VNNI)
- if (NOT MSVC)
- # MSVC doesn't support AMX
- ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
- ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
- endif()
elseif (GGML_CPU)
ggml_add_cpu_backend_variant_impl("")
@@ -53,7 +53,7 @@ index 381a9c7d..e45b453d 100644
}
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 1306864e..d6515ff6 100644
index 032019c9..ba37df35 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1459,7 +1459,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
+2 -2
View File
@@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor
1 file changed, 6 insertions(+)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 34624cca..59bd3c62 100644
index 432942bf..6d4abe4c 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -15,6 +15,8 @@
@@ -20,7 +20,7 @@ index 34624cca..59bd3c62 100644
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -2859,6 +2861,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
@@ -2854,6 +2856,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
ggml_compute_forward(&params, node);
@@ -13,10 +13,10 @@ models not supported in llama.cpp
4 files changed, 24 insertions(+)
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index eb7b5325..df42d1a5 100644
index 0568565f..dd01df60 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -74,6 +74,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
@@ -73,6 +73,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_PLM, "plm" },
{ LLM_ARCH_BAILINGMOE, "bailingmoe" },
@@ -24,7 +24,7 @@ index eb7b5325..df42d1a5 100644
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
@@ -1606,6 +1607,22 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
@@ -1586,6 +1587,22 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
},
},
@@ -48,10 +48,10 @@ index eb7b5325..df42d1a5 100644
LLM_ARCH_UNKNOWN,
{
diff --git a/src/llama-arch.h b/src/llama-arch.h
index bc8a4f0b..bda9d071 100644
index 6a989034..b6227eeb 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -76,6 +76,7 @@ enum llm_arch {
@@ -75,6 +75,7 @@ enum llm_arch {
LLM_ARCH_CHAMELEON,
LLM_ARCH_SOLAR,
LLM_ARCH_WAVTOKENIZER_DEC,
@@ -60,10 +60,10 @@ index bc8a4f0b..bda9d071 100644
LLM_ARCH_BAILINGMOE,
LLM_ARCH_UNKNOWN,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 9d099f11..ef70486d 100644
index d051696c..c8374159 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1437,6 +1437,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -1425,6 +1425,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
@@ -71,7 +71,7 @@ index 9d099f11..ef70486d 100644
default: throw std::runtime_error("unsupported model architecture");
}
@@ -13751,6 +13752,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
@@ -13704,6 +13705,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_CHAMELEON:
case LLM_ARCH_SOLAR:
case LLM_ARCH_BAILINGMOE:
@@ -1,38 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@kernel.org>
Date: Thu, 1 May 2025 13:46:10 -0700
Subject: [PATCH] ggml: Don't assert fail when tensor data changes (#13222)
The following scenario will cause an assertion failure in the graph
allocator:
- Build and allocate a graph containing a tensor with a non-NULL data
pointer
- Build and allocate a new graph where that data is NULL
Result:
ggml-alloc.c:819: GGML_ASSERT(talloc->buffer_id >= 0) failed
This happens during revalidation because we think that memory should
have been previously allocated based on the current graph but in
reality the previous graph was different. In this situation, we
should do a full reallocation pass.
---
ggml/src/ggml-alloc.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index a3d3f690..5fd379f6 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -816,7 +816,10 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
size_t node_size = 0;
if (!node->data && !node->view_src) {
- GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API
+ // If we previously had data but don't now then reallocate
+ if (talloc->buffer_id < 0) {
+ return false;
+ }
node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
}
return talloc->size_max >= node_size;
@@ -184,10 +184,10 @@ index f8c291de..2a3a62db 100644
const char * grammar_root,
bool lazy,
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index c0a5f934..75731053 100644
index d1497985..b1a9dca3 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -1466,7 +1466,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
@@ -1465,7 +1465,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
trigger_patterns_c.push_back(trigger_pattern.pattern.c_str());
}
@@ -196,7 +196,7 @@ index c0a5f934..75731053 100644
ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(),
ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size());
@@ -1548,7 +1548,7 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
@@ -1547,7 +1547,7 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
/* .vocab = */ vocab,
/* .grammar_str = */ grammar_str,
/* .grammar_root = */ grammar_root,
+5 -21
View File
@@ -44,7 +44,6 @@ type LlamaServer interface {
EstimatedVRAM() uint64 // Total VRAM across all GPUs
EstimatedTotal() uint64
EstimatedVRAMByGPU(gpuID string) uint64
Pid() int
}
// llmServer is an instance of the llama.cpp server
@@ -325,23 +324,21 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
pathEnv = "LD_LIBRARY_PATH"
}
// Note: we always put our dependency paths first
// since these are the exact version we compiled/linked against
libraryPaths := []string{discover.LibOllamaPath}
var libraryPaths []string
if libraryPath, ok := os.LookupEnv(pathEnv); ok {
libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
}
ggmlPaths := []string{discover.LibOllamaPath}
if len(compatible) > 0 {
c := compatible[0]
if libpath, ok := libs[c]; ok {
slog.Debug("adding gpu library", "path", libpath)
libraryPaths = append([]string{libpath}, libraryPaths...)
ggmlPaths = append(ggmlPaths, libpath)
libraryPaths = append(libraryPaths, libpath)
}
}
// Note: we always put the dependency path first
// since this was the exact version we compiled/linked against
if gpus[0].DependencyPath != nil {
slog.Debug("adding gpu dependency paths", "paths", gpus[0].DependencyPath)
// assume gpus from the same library have the same dependency path
@@ -372,8 +369,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
s.cmd.Stderr = s.status
s.cmd.SysProcAttr = LlamaServerSysProcAttr
s.cmd.Env = append(s.cmd.Env, "OLLAMA_LIBRARY_PATH="+strings.Join(ggmlPaths, string(filepath.ListSeparator)))
envWorkarounds := [][2]string{}
for _, gpu := range gpus {
envWorkarounds = append(envWorkarounds, gpu.EnvWorkarounds...)
@@ -411,8 +406,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
if envconfig.Debug() {
filteredEnv := []string{}
for _, ev := range s.cmd.Env {
if strings.HasPrefix(ev, "OLLAMA_") ||
strings.HasPrefix(ev, "CUDA_") ||
if strings.HasPrefix(ev, "CUDA_") ||
strings.HasPrefix(ev, "ROCR_") ||
strings.HasPrefix(ev, "ROCM_") ||
strings.HasPrefix(ev, "HIP_") ||
@@ -521,9 +515,6 @@ func (s *llmServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
if errors.Is(err, context.DeadlineExceeded) {
return ServerStatusNotResponding, errors.New("server not responding")
}
if strings.Contains(err.Error(), "connection refused") {
return ServerStatusNotResponding, errors.New("connection refused")
}
return ServerStatusError, fmt.Errorf("health resp: %w", err)
}
defer resp.Body.Close()
@@ -644,13 +635,6 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
}
}
func (s *llmServer) Pid() int {
if s.cmd != nil && s.cmd.Process != nil {
return s.cmd.Process.Pid
}
return -1
}
var grammarJSON = `
root ::= object
value ::= object | array | string | number | ("true" | "false" | "null") ws
+8 -5
View File
@@ -341,11 +341,6 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
var s uint64
for s < t.Size() {
// Stop if either the parent context has been canceled or if any of the other tensors returned an error
if err := ctx.Err(); err != nil {
return err
}
n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Size()-s))])
if err != nil {
slog.Warn("file read error", "file", r.Name(), "error", err)
@@ -368,6 +363,14 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
})
}
// start a goroutine to cancel the errgroup if the parent context is done
go func() {
<-ctx.Done()
g.Go(func() error {
return ctx.Err()
})
}()
if err := g.Wait(); err != nil {
return nil, err
}
-5
View File
@@ -133,11 +133,6 @@ extern "C" {
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);
GGML_BACKEND_API void ggml_cpu_bf16_to_fp32(const ggml_bf16_t *, float *, int64_t);
#ifdef __cplusplus
}
#endif
+1 -1
View File
@@ -7,7 +7,7 @@
extern "C" {
#endif
#define RPC_PROTO_MAJOR_VERSION 2
#define RPC_PROTO_MAJOR_VERSION 1
#define RPC_PROTO_MINOR_VERSION 0
#define RPC_PROTO_PATCH_VERSION 0
#define GGML_RPC_MAX_SERVERS 16
+3 -23
View File
@@ -393,8 +393,8 @@ extern "C" {
// precision
enum ggml_prec {
GGML_PREC_DEFAULT = 0, // stored as ggml_tensor.op_params, 0 by default
GGML_PREC_F32 = 10,
GGML_PREC_DEFAULT,
GGML_PREC_F32,
};
// model file types
@@ -481,7 +481,6 @@ extern "C" {
GGML_OP_CONV_TRANSPOSE_1D,
GGML_OP_IM2COL,
GGML_OP_IM2COL_BACK,
GGML_OP_CONV_2D_DW,
GGML_OP_CONV_TRANSPOSE_2D,
GGML_OP_POOL_1D,
GGML_OP_POOL_2D,
@@ -679,9 +678,6 @@ extern "C" {
GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
// true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
@@ -1665,7 +1661,7 @@ extern "C" {
struct ggml_tensor * a,
struct ggml_tensor * b);
// depthwise (via im2col and mul_mat)
// depthwise
GGML_API struct ggml_tensor * ggml_conv_2d_dw(
struct ggml_context * ctx,
struct ggml_tensor * a, // convolution kernel
@@ -1677,22 +1673,6 @@ extern "C" {
int d0, // dilation dimension 0
int d1); // dilation dimension 1
// Depthwise 2D convolution
// may be faster than ggml_conv_2d_dw, but not available in all backends
// a: KW KH 1 C convolution kernel
// b: W H C N input data
// res: W_out H_out C N
GGML_API struct ggml_tensor * ggml_conv_2d_dw_direct(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int stride0,
int stride1,
int pad0,
int pad1,
int dilation0,
int dilation1);
GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
struct ggml_context * ctx,
struct ggml_tensor * a,
+5 -8
View File
@@ -267,7 +267,6 @@ function(ggml_add_cpu_backend_variant tag_name)
set(GGML_CPU_TAG_NAME ${tag_name})
# other: OPENMP LLAMAFILE CPU_HBM
foreach (feat NATIVE
SSE42
AVX AVX2 BMI2 AVX_VNNI FMA F16C
AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
AMX_TILE AMX_INT8 AMX_BF16)
@@ -289,13 +288,11 @@ if (GGML_CPU_ALL_VARIANTS)
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
endif()
add_custom_target(ggml-cpu)
ggml_add_cpu_backend_variant(x64)
ggml_add_cpu_backend_variant(sse42 SSE42)
ggml_add_cpu_backend_variant(sandybridge SSE42 AVX)
ggml_add_cpu_backend_variant(haswell SSE42 AVX F16C AVX2 BMI2 FMA)
ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
ggml_add_cpu_backend_variant(sandybridge AVX)
ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 BMI2 FMA)
ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 BMI2 FMA AVX512)
ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 BMI2 FMA AVX_VNNI)
elseif (GGML_CPU)
ggml_add_cpu_backend_variant_impl("")
endif()
+1 -4
View File
@@ -816,10 +816,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
size_t node_size = 0;
if (!node->data && !node->view_src) {
// If we previously had data but don't now then reallocate
if (talloc->buffer_id < 0) {
return false;
}
GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API
node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
}
return talloc->size_max >= node_size;
+5 -11
View File
@@ -222,7 +222,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
elseif (GGML_AVX)
list(APPEND ARCH_FLAGS /arch:AVX)
list(APPEND ARCH_DEFINITIONS GGML_AVX)
elseif (GGML_SSE42)
else ()
list(APPEND ARCH_FLAGS /arch:SSE4.2)
list(APPEND ARCH_DEFINITIONS GGML_SSE42)
endif()
@@ -237,10 +237,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
if (GGML_NATIVE)
list(APPEND ARCH_FLAGS -march=native)
else ()
if (GGML_SSE42)
list(APPEND ARCH_FLAGS -msse4.2)
list(APPEND ARCH_DEFINITIONS GGML_SSE42)
endif()
list(APPEND ARCH_FLAGS -msse4.2)
list(APPEND ARCH_DEFINITIONS GGML_SSE42)
if (GGML_F16C)
list(APPEND ARCH_FLAGS -mf16c)
list(APPEND ARCH_DEFINITIONS GGML_F16C)
@@ -352,14 +350,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
# TODO: Separation to determine activation of VX/VXE/VXE2
if (${S390X_M} MATCHES "8561|8562")
message(STATUS "z15 target")
list(APPEND ARCH_FLAGS -march=z15)
list(APPEND ARCH_FLAGS -march=z15 -mtune=z15)
elseif (${S390X_M} MATCHES "3931")
message(STATUS "z16 target")
list(APPEND ARCH_FLAGS -march=z16)
elseif (${S390X_M} MATCHES "9175|9176")
# NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version.
message(STATUS "z17 target")
list(APPEND ARCH_FLAGS -march=z17)
list(APPEND ARCH_FLAGS -march=z16 -mtune=z16)
else()
message(STATUS "Unknown target")
message(WARNING "Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF.")
+1 -1
View File
@@ -263,7 +263,7 @@ void test_x86_is() {
static int ggml_backend_cpu_x86_score() {
// FIXME: this does not check for OS support
int score = 1;
int score = 0;
cpuid_x86 is;
#ifdef GGML_FMA
+2 -94
View File
@@ -217,7 +217,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
.nrows = 1,
},
[GGML_TYPE_F16] = {
.from_float = (ggml_from_float_t) ggml_cpu_fp32_to_fp16,
.from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
.vec_dot_type = GGML_TYPE_F16,
.nrows = 1,
@@ -358,7 +358,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
.from_float = quantize_row_q8_K,
},
[GGML_TYPE_BF16] = {
.from_float = (ggml_from_float_t) ggml_cpu_fp32_to_bf16,
.from_float = (ggml_from_float_t) ggml_fp32_to_bf16_row,
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16,
.vec_dot_type = GGML_TYPE_BF16,
.nrows = 1,
@@ -1934,10 +1934,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_im2col_back_f32(params, tensor);
} break;
case GGML_OP_CONV_2D_DW:
{
ggml_compute_forward_conv_2d_dw(params, tensor);
} break;
case GGML_OP_CONV_TRANSPOSE_2D:
{
ggml_compute_forward_conv_transpose_2d(params, tensor);
@@ -2278,7 +2274,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
} break;
case GGML_OP_IM2COL:
case GGML_OP_IM2COL_BACK:
case GGML_OP_CONV_2D_DW:
case GGML_OP_CONV_TRANSPOSE_1D:
case GGML_OP_CONV_TRANSPOSE_2D:
{
@@ -3177,93 +3172,6 @@ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct g
return ggml_graph_compute(cgraph, &cplan);
}
void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
int64_t i = 0;
#if defined(__F16C__)
#if defined(__AVX512F__)
for (; i + 15 < n; i += 16) {
__m512 x_vec = _mm512_loadu_ps(x + i);
__m256i y_vec = _mm512_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
_mm256_storeu_si256((__m256i *)(y + i), y_vec);
}
#endif
for (; i + 7 < n; i += 8) {
__m256 x_vec = _mm256_loadu_ps(x + i);
__m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
_mm_storeu_si128((__m128i *)(y + i), y_vec);
}
for (; i + 3 < n; i += 4) {
__m128 x_vec = _mm_loadu_ps(x + i);
__m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
_mm_storel_epi64((__m128i *)(y + i), y_vec);
}
#endif
for (; i < n; ++i) {
y[i] = GGML_FP32_TO_FP16(x[i]);
}
}
void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
int64_t i = 0;
#if defined(__F16C__)
#if defined(__AVX512F__)
for (; i + 15 < n; i += 16) {
__m256i x_vec = _mm256_loadu_si256((const __m256i *)(x + i));
__m512 y_vec = _mm512_cvtph_ps(x_vec);
_mm512_storeu_ps(y + i, y_vec);
}
#endif
for (; i + 7 < n; i += 8) {
__m128i x_vec = _mm_loadu_si128((const __m128i *)(x + i));
__m256 y_vec = _mm256_cvtph_ps(x_vec);
_mm256_storeu_ps(y + i, y_vec);
}
for (; i + 3 < n; i += 4) {
__m128i x_vec = _mm_loadl_epi64((const __m128i *)(x + i));
__m128 y_vec = _mm_cvtph_ps(x_vec);
_mm_storeu_ps(y + i, y_vec);
}
#endif
for (; i < n; ++i) {
y[i] = GGML_FP16_TO_FP32(x[i]);
}
}
void ggml_cpu_fp32_to_bf16(const float * x, ggml_bf16_t * y, int64_t n) {
int64_t i = 0;
for (; i < n; ++i) {
y[i] = GGML_FP32_TO_BF16(x[i]);
}
}
void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) {
int64_t i = 0;
#if defined(__AVX2__)
#if defined(__AVX512F__)
for (; i + 15 < n; i += 16) {
_mm512_storeu_ps(y + i,
_mm512_castsi512_ps(
_mm512_slli_epi32(
_mm512_cvtepu16_epi32(
_mm256_loadu_si256(
(const __m256i *)(x + i))),
16)));
}
#endif
for (; i + 7 < n; i += 8) {
_mm256_storeu_ps(y + i,
_mm256_castsi256_ps(
_mm256_slli_epi32(
_mm256_cvtepu16_epi32(
_mm_loadu_si128(
(const __m128i *)(x + i))),
16)));
}
#endif
for (; i < n; i++) {
y[i] = GGML_BF16_TO_FP32(x[i]);
}
}
int ggml_cpu_has_avx(void) {
#if defined(__AVX__)
+2 -174
View File
@@ -4222,7 +4222,7 @@ static void ggml_compute_forward_get_rows_f16(
GGML_ASSERT(i01 >= 0 && i01 < ne01);
ggml_cpu_fp16_to_fp32(
ggml_fp16_to_fp32_row(
(const ggml_fp16_t*) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
}
@@ -4263,7 +4263,7 @@ static void ggml_compute_forward_get_rows_bf16(
GGML_ASSERT(i01 >= 0 && i01 < ne01);
ggml_cpu_bf16_to_fp32(
ggml_bf16_to_fp32_row(
(const ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
}
@@ -6064,178 +6064,6 @@ void ggml_compute_forward_conv_transpose_2d(
}
}
// ggml_compute_forward_conv_2d_dw
struct ggml_conv_2d_dw_params {
int64_t channels;
int64_t batch;
int64_t src_w;
int64_t src_h;
int64_t dst_w;
int64_t dst_h;
int64_t knl_w;
int64_t knl_h;
int stride_x;
int stride_y;
int pad_x;
int pad_y;
int dilation_x;
int dilation_y;
};
static void ggml_compute_forward_conv_2d_dw_cwhn(
const ggml_compute_params * params,
const ggml_tensor * src,
const ggml_tensor * kernel,
ggml_tensor * dst,
const ggml_conv_2d_dw_params & p) {
const int64_t c = p.channels;
const float * knl_data = (const float *)kernel->data;
const int64_t rows_total = p.dst_h * p.batch;
const int64_t rows_per_thread = (rows_total + params->nth - 1) / params->nth;
const int64_t row_start = params->ith * rows_per_thread;
const int64_t row_end = MIN(row_start + rows_per_thread, rows_total);
#ifdef GGML_SIMD
const int64_t pkg_size = GGML_F32_EPR;
const int64_t pkg_count = c / pkg_size;
const int64_t c_pkg_end = pkg_count * pkg_size;
#else
const int64_t c_pkg_end = 0;
#endif
for (int64_t row = row_start; row < row_end; ++row) {
const int64_t dst_y = row % p.dst_h;
const float * src_data = (const float *)src->data + (row / p.dst_h) * p.src_w * p.src_h * c;
for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) {
float * dst_data = (float *)dst->data + (row * p.dst_w + dst_x) * c;
const int64_t src_y_base = dst_y * p.stride_y - p.pad_y;
const int64_t src_x_base = dst_x * p.stride_x - p.pad_x;
#ifdef GGML_SIMD
// Vectorized loop
for (int64_t c_i = 0; c_i < c_pkg_end; c_i += pkg_size) {
GGML_F32_VEC sum = GGML_F32_VEC_ZERO;
for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
const int64_t src_y = src_y_base + knl_y * p.dilation_y;
if (src_y < 0 || src_y >= p.src_h) {
continue;
}
for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
const int64_t src_x = src_x_base + knl_x * p.dilation_x;
if (src_x < 0 || src_x >= p.src_w) {
continue;
}
GGML_F32_VEC k = GGML_F32_VEC_LOAD(knl_data + (knl_y * p.knl_w + knl_x) * c + c_i);
GGML_F32_VEC s = GGML_F32_VEC_LOAD(src_data + (src_y * p.src_w + src_x) * c + c_i);
sum = GGML_F32_VEC_FMA(sum, k, s);
}
}
GGML_F32_VEC_STORE(dst_data + c_i, sum);
}
#endif
// Scalar loop
for (int64_t c_i = c_pkg_end; c_i < c; ++c_i) {
float sum = 0.0f;
for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
const int64_t src_y = src_y_base + knl_y * p.dilation_y;
if (src_y < 0 || src_y >= p.src_h) {
continue;
}
for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
const int64_t src_x = src_x_base + knl_x * p.dilation_x;
if (src_x < 0 || src_x >= p.src_w) {
continue;
}
sum += knl_data[(knl_y * p.knl_w + knl_x) * c + c_i]
* src_data[(src_y * p.src_w + src_x) * c + c_i];
}
}
dst_data[c_i] = sum;
}
}
}
}
static void ggml_compute_forward_conv_2d_dw_whcn(
const ggml_compute_params * params,
const ggml_tensor * src,
const ggml_tensor * kernel,
ggml_tensor * dst,
const ggml_conv_2d_dw_params & p) {
const int64_t n = p.channels * p.batch;
const int64_t per_thread = (n + params->nth - 1) / params->nth;
const int64_t start = params->ith * per_thread;
const int64_t end = MIN(start + per_thread, n);
for (int64_t i = start; i < end; ++i) {
const float * knl_data = (const float *)kernel->data + (i % p.channels) * p.knl_w * p.knl_h;
const float * src_data = (const float *)src->data + i * p.src_w * p.src_h;
float * dst_data = (float *)dst->data + i * p.dst_w * p.dst_h;
for (int64_t dst_y = 0; dst_y < p.dst_h; ++dst_y) {
for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) {
float sum = 0.0f;
for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
const int64_t src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y;
if (src_y < 0 || src_y >= p.src_h) {
continue;
}
for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
const int64_t src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x;
if (src_x < 0 || src_x >= p.src_w) {
continue;
}
sum += knl_data[knl_y * p.knl_w + knl_x]
* src_data[src_y * p.src_w + src_x];
}
}
dst_data[dst_y * p.dst_w + dst_x] = sum;
}
}
}
}
void ggml_compute_forward_conv_2d_dw(
const ggml_compute_params * params,
ggml_tensor * dst) {
const ggml_tensor * kernel = dst->src[0];
const ggml_tensor * src = dst->src[1];
ggml_conv_2d_dw_params p;
p.channels = src->ne[2];
p.batch = src->ne[3];
p.src_w = src->ne[0];
p.src_h = src->ne[1];
p.dst_w = dst->ne[0];
p.dst_h = dst->ne[1];
p.knl_w = kernel->ne[0];
p.knl_h = kernel->ne[1];
p.stride_x = dst->op_params[0];
p.stride_y = dst->op_params[1];
p.pad_x = dst->op_params[2];
p.pad_y = dst->op_params[3];
p.dilation_x = dst->op_params[4];
p.dilation_y = dst->op_params[5];
GGML_ASSERT(kernel->ne[3] == p.channels);
GGML_ASSERT(dst->ne[3] == p.batch);
if (ggml_is_contiguous(src)) {
ggml_compute_forward_conv_2d_dw_whcn(params, src, kernel, dst, p);
} else if (ggml_is_contiguous_channels(src)) {
// kernel should also have channels most contiguous in memory
GGML_ASSERT(kernel->nb[0] >= kernel->nb[2] && kernel->nb[1] >= kernel->nb[0]);
ggml_compute_forward_conv_2d_dw_cwhn(params, src, kernel, dst, p);
} else {
GGML_ABORT("non-contiguous memory layout not supported");
}
}
// ggml_compute_forward_pool_1d_sk_p0
static void ggml_compute_forward_pool_1d_sk_p0(
-1
View File
@@ -65,7 +65,6 @@ void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * p
void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_conv_2d_dw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pool_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+1 -1
View File
@@ -341,7 +341,7 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
#define GGML_F32_EPR 4
#define GGML_F32x4 vector float
#define GGML_F32x4_ZERO {0.0f}
#define GGML_F32x4_ZERO 0.0f
#define GGML_F32x4_SET1 vec_splats
#define GGML_F32x4_LOAD(p) vec_xl(0, p)
#define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p)
+4 -4
View File
@@ -78,13 +78,13 @@
// Moore Threads
#define GGML_CUDA_MUSA_ARCH_IS_QY1 (__MUSA_ARCH__ <= 210)
#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
#define GGML_CUDA_CC_NG (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD
#define GGML_CUDA_CC_QY1 (GGML_MUSA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
#define GGML_CUDA_CC_QY2 (GGML_MUSA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
#define GGML_CUDA_CC_NG (GGML_MUSA_CC_OFFSET_MTHREADS + 0x310) // TBD
#define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS && cc < GGML_CUDA_CC_OFFSET_AMD)
#define GGML_CUDA_CC_IS_QY1(cc) (cc >= GGML_CUDA_CC_QY1 && cc < GGML_CUDA_CC_QY2)
#define GGML_CUDA_CC_IS_QY2(cc) (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NG)
#define GGML_CUDA_CC_IS_QY2(cc) (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NEXT)
#define GGML_CUDA_CC_IS_NG(cc) (cc >= GGML_CUDA_CC_NG)
#ifdef __CUDA_ARCH_LIST__
+12 -41
View File
@@ -1,8 +1,6 @@
#include "convert.cuh"
#include "dequantize.cuh"
#include <cstdint>
#define CUDA_Q8_0_NE_ALIGN 2048
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
@@ -572,46 +570,30 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t
}
template <typename src_t, typename dst_t>
static __global__ void convert_unary(
const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01, const int64_t ne02,
const int64_t s01, const int64_t s02, const int64_t s03) {
const int64_t i00 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
if (i00 >= ne00) {
if (i >= k) {
return;
}
const int64_t i01 = blockIdx.y;
const int64_t i02 = blockIdx.z % ne02;
const int64_t i03 = blockIdx.z / ne02;
const src_t * x = (const src_t *) vx;
const int64_t ix = i03*s03 + i02*s02 + i01*s01 + i00;
const int64_t iy = ((i03*ne02 + i02)*ne01 + i01)*ne00 + i00;
y[iy] = float(x[ix]);
y[i] = float(x[i]);
}
template <typename src_t, typename dst_t>
static void convert_unary_cuda(const void * vx, dst_t * y,
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
const int64_t s01, const int64_t s02, const int64_t s03, cudaStream_t stream) {
const dim3 num_blocks((ne00 + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE, ne01, ne02*ne03);
convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>
(vx, y, ne00, ne01, ne02, s01, s02, s03);
}
template <typename src_t, typename dst_t>
static void convert_unary_cont_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
convert_unary_cuda<src_t>(vx, y, k, 1, 1, 1, k, k, k, stream);
static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
}
to_bf16_cuda_t ggml_get_to_bf16_cuda(ggml_type type) {
switch (type) {
case GGML_TYPE_F32:
return convert_unary_cont_cuda<float>;
return convert_unary_cuda<float>;
case GGML_TYPE_F16:
return convert_unary_cont_cuda<half>;
return convert_unary_cuda<half>;
default:
return nullptr;
}
@@ -661,9 +643,9 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
case GGML_TYPE_IQ3_S:
return dequantize_row_iq3_s_cuda;
case GGML_TYPE_F32:
return convert_unary_cont_cuda<float>;
return convert_unary_cuda<float>;
case GGML_TYPE_BF16:
return convert_unary_cont_cuda<nv_bfloat16>;
return convert_unary_cuda<nv_bfloat16>;
default:
return nullptr;
}
@@ -710,18 +692,7 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
case GGML_TYPE_IQ3_S:
return dequantize_row_iq3_s_cuda;
case GGML_TYPE_F16:
return convert_unary_cont_cuda<half>;
case GGML_TYPE_BF16:
return convert_unary_cont_cuda<nv_bfloat16>;
default:
return nullptr;
}
}
to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type) {
switch (type) {
case GGML_TYPE_F32:
return convert_unary_cuda<float>;
return convert_unary_cuda<half>;
case GGML_TYPE_BF16:
return convert_unary_cuda<nv_bfloat16>;
default:
+1 -11
View File
@@ -3,7 +3,7 @@
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
template<typename T>
using to_t_cuda_t = void (*)(const void * x, T * y, int64_t k, cudaStream_t stream);
using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream);
typedef to_t_cuda_t<float> to_fp32_cuda_t;
typedef to_t_cuda_t<half> to_fp16_cuda_t;
@@ -14,13 +14,3 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type);
to_bf16_cuda_t ggml_get_to_bf16_cuda(ggml_type type);
to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type);
// TODO more general support for non-contiguous inputs
template<typename T>
using to_t_nc_cuda_t = void (*)(const void * x, T * y,
int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03,
int64_t s01, int64_t s02, int64_t s03, cudaStream_t stream);
typedef to_t_nc_cuda_t<half> to_fp16_nc_cuda_t;
to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type);
-2
View File
@@ -639,8 +639,6 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
ctx.cuda_graph->graph_cpynode_index = graph_cpynode_index;
}
#else
GGML_UNUSED(disable_indirection_for_this_node);
#endif
}
+71 -110
View File
@@ -33,8 +33,8 @@ static __global__ void k_get_rows(
dfloat2 v;
dequantize_kernel(src0_row, ib, iqs, v);
dst_row[iybs + iqs + 0] = float(v.x);
dst_row[iybs + iqs + y_offset] = float(v.y);
dst_row[iybs + iqs + 0] = v.x;
dst_row[iybs + iqs + y_offset] = v.y;
}
template<typename src0_t, typename dst_t>
@@ -60,7 +60,7 @@ static __global__ void k_get_rows_float(
dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
const src0_t * src0_row = (const src0_t *)((const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03);
dst_row[i00] = float(src0_row[i00]);
dst_row[i00] = src0_row[i00];
}
template<typename grad_t, typename dst_t>
@@ -86,159 +86,120 @@ static __global__ void k_get_rows_back_float(
dst[dst_row*ncols + col] = sum;
}
template<int qk, int qr, dequantize_kernel_t dq, typename dst_t>
static void get_rows_cuda_q(
const void * src0_d, const int32_t * src1_d, dst_t * dst_d,
const int64_t ne00, const size_t nb01, const size_t nb02, const size_t nb03,
const int64_t ne10, const int64_t ne11, const int64_t ne12, const size_t nb10, const size_t nb11, const size_t nb12,
const size_t nb1, const size_t nb2, const size_t nb3,
cudaStream_t stream) {
template<int qk, int qr, dequantize_kernel_t dq>
static void get_rows_cuda(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
GGML_TENSOR_BINARY_OP_LOCALS
const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
const dim3 block_nums(block_num_x, ne10, ne11*ne12);
// strides in elements
// const size_t s0 = nb0 / sizeof(dst_t);
const size_t s1 = nb1 / sizeof(dst_t);
const size_t s2 = nb2 / sizeof(dst_t);
const size_t s3 = nb3 / sizeof(dst_t);
//const size_t s0 = nb0 / ggml_element_size(dst);
const size_t s1 = nb1 / ggml_element_size(dst);
const size_t s2 = nb2 / ggml_element_size(dst);
const size_t s3 = nb3 / ggml_element_size(dst);
const size_t s10 = nb10 / sizeof(int32_t);
const size_t s11 = nb11 / sizeof(int32_t);
const size_t s12 = nb12 / sizeof(int32_t);
// const size_t s13 = nb13 / sizeof(int32_t);
const size_t s10 = nb10 / ggml_element_size(src1);
const size_t s11 = nb11 / ggml_element_size(src1);
const size_t s12 = nb12 / ggml_element_size(src1);
//const size_t s13 = nb13 / ggml_element_size(src1);
GGML_ASSERT(ne00 % 2 == 0);
k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(
src0_d, src1_d, dst_d,
src0_dd, src1_dd, dst_dd,
ne00, /*ne01, ne02, ne03,*/
/*ne10, ne11,*/ ne12, /*ne13,*/
/* s0,*/ s1, s2, s3,
/* nb00,*/ nb01, nb02, nb03,
s10, s11, s12/*, s13*/);
GGML_UNUSED(dst);
}
template<typename src0_t, typename dst_t>
template<typename src0_t>
static void get_rows_cuda_float(
const src0_t * src0_d, const int32_t * src1_d, dst_t * dst_d,
const int64_t ne00, const size_t nb01, const size_t nb02, const size_t nb03,
const int64_t ne10, const int64_t ne11, const int64_t ne12, const size_t nb10, const size_t nb11, const size_t nb12,
const size_t nb1, const size_t nb2, const size_t nb3,
cudaStream_t stream) {
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT(ne13 == 1);
const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
const dim3 block_nums(block_num_x, ne10, ne11*ne12);
// strides in elements
// const size_t s0 = nb0 / sizeof(dst_t);
const size_t s1 = nb1 / sizeof(dst_t);
const size_t s2 = nb2 / sizeof(dst_t);
const size_t s3 = nb3 / sizeof(dst_t);
//const size_t s0 = nb0 / ggml_element_size(dst);
const size_t s1 = nb1 / ggml_element_size(dst);
const size_t s2 = nb2 / ggml_element_size(dst);
const size_t s3 = nb3 / ggml_element_size(dst);
const size_t s10 = nb10 / sizeof(int32_t);
const size_t s11 = nb11 / sizeof(int32_t);
const size_t s12 = nb12 / sizeof(int32_t);
// const size_t s13 = nb13 / sizeof(int32_t);
const size_t s10 = nb10 / ggml_element_size(src1);
const size_t s11 = nb11 / ggml_element_size(src1);
const size_t s12 = nb12 / ggml_element_size(src1);
//const size_t s13 = nb13 / ggml_element_size(src1);
k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
src0_d, src1_d, dst_d,
src0_dd, src1_dd, dst_dd,
ne00, /*ne01, ne02, ne03,*/
/*ne10, ne11,*/ ne12, /*ne13,*/
/* s0,*/ s1, s2, s3,
/* nb00,*/ nb01, nb02, nb03,
s10, s11, s12/*, s13*/);
}
template <typename dst_t>
static void ggml_cuda_get_rows_switch_src0_type(
const void * src0_d, const ggml_type src0_type, const int32_t * src1_d, dst_t * dst_d,
const int64_t ne00, const size_t nb01, const size_t nb02, const size_t nb03,
const int64_t ne10, const int64_t ne11, const int64_t ne12, const size_t nb10, const size_t nb11, const size_t nb12,
const size_t nb1, const size_t nb2, const size_t nb3,
cudaStream_t stream) {
switch (src0_type) {
case GGML_TYPE_F16:
get_rows_cuda_float((const half *) src0_d, src1_d, dst_d,
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
break;
case GGML_TYPE_F32:
get_rows_cuda_float((const float *) src0_d, src1_d, dst_d,
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
break;
case GGML_TYPE_BF16:
get_rows_cuda_float((const nv_bfloat16 *) src0_d, src1_d, dst_d,
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
break;
case GGML_TYPE_Q4_0:
get_rows_cuda_q<QK4_0, QR4_0, dequantize_q4_0>(src0_d, src1_d, dst_d,
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
break;
case GGML_TYPE_Q4_1:
get_rows_cuda_q<QK4_1, QR4_1, dequantize_q4_1>(src0_d, src1_d, dst_d,
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
break;
case GGML_TYPE_Q5_0:
get_rows_cuda_q<QK5_0, QR5_0, dequantize_q5_0>(src0_d, src1_d, dst_d,
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
break;
case GGML_TYPE_Q5_1:
get_rows_cuda_q<QK5_1, QR5_1, dequantize_q5_1>(src0_d, src1_d, dst_d,
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
break;
case GGML_TYPE_Q8_0:
get_rows_cuda_q<QK8_0, QR8_0, dequantize_q8_0>(src0_d, src1_d, dst_d,
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
break;
default:
// TODO: k-quants
GGML_ABORT("%s: unsupported src0 type: %s\n", __func__, ggml_type_name(src0_type));
break;
}
}
void get_rows_cuda(
const void * src0_d, ggml_type src0_type, const int32_t * src1_d, void * dst_d, ggml_type dst_type,
int64_t ne00, size_t nb01, size_t nb02, size_t nb03,
int64_t ne10, int64_t ne11, int64_t ne12, size_t nb10, size_t nb11, size_t nb12,
size_t nb1, size_t nb2, size_t nb3,
cudaStream_t stream) {
switch (dst_type) {
case GGML_TYPE_F32:
ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (float *) dst_d,
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
break;
case GGML_TYPE_F16:
ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (half *) dst_d,
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
break;
case GGML_TYPE_BF16:
ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (nv_bfloat16 *) dst_d,
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
break;
default:
GGML_ABORT("%s: unsupported dst type: %s\n", __func__, ggml_type_name(dst_type));
break;
}
GGML_UNUSED(dst);
}
void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
const void * src0_d = (const void *) src0->data;
const int32_t * src1_d = (const int32_t *) src1->data;
float * dst_d = (float *) dst->data;
cudaStream_t stream = ctx.stream();
GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT(src1->type == GGML_TYPE_I32);
GGML_ASSERT(ne13 == 1);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));
get_rows_cuda(src0->data, src0->type, (const int32_t *) src1->data, dst->data, dst->type,
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
switch (src0->type) {
case GGML_TYPE_F16:
get_rows_cuda_float(src0, src1, dst, (const half *) src0_d, src1_d, dst_d, stream);
break;
case GGML_TYPE_F32:
get_rows_cuda_float(src0, src1, dst, (const float *) src0_d, src1_d, dst_d, stream);
break;
case GGML_TYPE_Q4_0:
get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
break;
case GGML_TYPE_Q4_1:
get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
break;
case GGML_TYPE_Q5_0:
get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
break;
case GGML_TYPE_Q5_1:
get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
break;
case GGML_TYPE_Q8_0:
get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
break;
default:
// TODO: k-quants
GGML_ABORT("%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
break;
}
}
void ggml_cuda_op_get_rows_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-7
View File
@@ -3,13 +3,6 @@
#define CUDA_GET_ROWS_BLOCK_SIZE 256
#define CUDA_GET_ROWS_BACK_BLOCK_SIZE 256
void get_rows_cuda(
const void * src0_d, ggml_type src0_type, const int32_t * src1_d, void * dst_d, ggml_type dst_type,
int64_t ne00, size_t nb01, size_t nb02, size_t nb03,
int64_t ne10, int64_t ne11, int64_t ne12, size_t nb10, size_t nb11, size_t nb12,
size_t nb1, size_t nb2, size_t nb3,
cudaStream_t stream);
void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_get_rows_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+210 -185
View File
@@ -1413,11 +1413,6 @@ static void ggml_cuda_op_mul_mat(
const int64_t ne0 = dst->ne[0];
const int64_t ne1 = dst->ne[1];
// const int64_t nb10 = src1->nb[0];
const int64_t nb11 = src1->nb[1];
const int64_t nb12 = src1->nb[2];
const int64_t nb13 = src1->nb[3];
const int64_t nb2 = dst->nb[2];
const int64_t nb3 = dst->nb[3];
@@ -1553,10 +1548,7 @@ static void ggml_cuda_op_mul_mat(
dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), src_1_ddq_size);
if (src1_on_device && src1_is_contiguous) {
quantize_src1(
dev[id].src1_ddf, nullptr, dev[id].src1_ddq, src0->type, ne10,
nb11/sizeof(float), nb12/sizeof(float), nb13/sizeof(float),
src1_padded_col_size, ne11, ne12, ne13, stream);
quantize_src1(dev[id].src1_ddf, dev[id].src1_ddq, ne10, ne11, ne12*ne13, src1_padded_col_size, src0->type, stream);
CUDA_CHECK(cudaGetLastError());
}
}
@@ -1651,9 +1643,7 @@ static void ggml_cuda_op_mul_mat(
}
if (quantize_src1 && !src1_is_contiguous) {
quantize_src1(
src1_ddf_i, nullptr, src1_ddq_i, src0->type, ne10, ne10, ne11*ne10, ne12*ne11*ne10,
src1_padded_col_size, src1_ncols, 1, 1, stream);
quantize_src1(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, 1, src1_padded_col_size, src0->type, stream);
CUDA_CHECK(cudaGetLastError());
}
@@ -1723,15 +1713,15 @@ static __global__ void k_compute_batched_ptrs(
size_t nb12, size_t nb13,
size_t nbd2, size_t nbd3,
int64_t r2, int64_t r3) {
const int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;
int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;
if (i13 >= ne13 || i12 >= ne12) {
return;
}
const int64_t i03 = i13 / r3;
const int64_t i02 = i12 / r2;
int64_t i03 = i13 / r3;
int64_t i02 = i12 / r2;
ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
@@ -1745,10 +1735,6 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
GGML_ASSERT(src0->type == GGML_TYPE_F16);
// Byte offsets and tensor dimensions are currently used in an inconsistent way for dst.
// As long as dst is contiguous this does not matter though.
GGML_ASSERT(ggml_is_contiguous(dst));
GGML_TENSOR_BINARY_OP_LOCALS
const int64_t ne_dst = ggml_nelements(dst);
@@ -1757,31 +1743,21 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(), main_stream));
const half * src0_f16 = (const half *) src0->data;
float * dst_ddf = (float *) dst->data;
const half * src1_f16 = (const half *) src1->data;
const size_t ts_src1 = ggml_type_size(src1->type);
GGML_ASSERT(nb10 == ts_src1);
int64_t s11 = nb11 / ts_src1;
int64_t s12 = nb12 / ts_src1;
int64_t s13 = nb13 / ts_src1;
ggml_cuda_pool_alloc<half> src1_f16_alloc(ctx.pool());
void * src0_ddq = src0->data;
half * src0_f16 = (half *) src0_ddq;
float * src1_ddf = (float *) src1->data;
float * dst_ddf = (float *) dst->data;
// convert src1 to fp16
ggml_cuda_pool_alloc<half> src1_f16_alloc(ctx.pool());
if (src1->type != GGML_TYPE_F16) {
const to_fp16_nc_cuda_t to_fp16_cuda = ggml_get_to_fp16_nc_cuda(src1->type);
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
const int64_t ne_src1 = ggml_nelements(src1);
src1_f16_alloc.alloc(ne_src1);
GGML_ASSERT(to_fp16_cuda != nullptr);
to_fp16_cuda(src1_f16, src1_f16_alloc.get(), ne10, ne11, ne12, ne13, s11, s12, s13, main_stream);
src1_f16 = src1_f16_alloc.get();
s11 = ne10;
s12 = ne11*s11;
s13 = ne12*s12;
to_fp16_cuda(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream);
}
half * src1_f16 = src1->type == GGML_TYPE_F16 ? (half *) src1_ddf : src1_f16_alloc.get();
ggml_cuda_pool_alloc<half> dst_f16(ctx.pool());
char * dst_t;
@@ -1841,13 +1817,13 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
int i02 = i12 / r2;
CUBLAS_CHECK(
cublasGemmEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10,
alpha, (const char *) src0_f16 + i03*nb03 + i02*nb02, CUDA_R_16F, nb01/sizeof(half),
src1_f16 + i13*s13 + i12*s12, CUDA_R_16F, s11,
beta, ( char *) dst_t + i13*nbd3 + i12*nbd2, cu_data_type, ne0,
cu_compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10,
alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half),
(const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
beta, ( char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01,
cu_compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
}
}
}
@@ -1858,15 +1834,15 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
CUBLAS_CHECK(
cublasGemmStridedBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10,
alpha, src0_f16, CUDA_R_16F, nb01/nb00, nb02/nb00, // strideA
src1_f16, CUDA_R_16F, s11, s12, // strideB
beta, dst_t, cu_data_type, ne0, ne1*ne0, // strideC
alpha, (const char *) src0_f16, CUDA_R_16F, nb01/nb00, nb02/nb00, // strideA
(const char *) src1_f16, CUDA_R_16F, nb11/nb10, nb12/nb10, // strideB
beta, ( char *) dst_t, cu_data_type, ne01, nb2/nb0, // strideC
ne12*ne13,
cu_compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
} else {
// use cublasGemmBatchedEx
const int64_t ne23 = ne12*ne13;
const int ne23 = ne12*ne13;
ggml_cuda_pool_alloc<const void *> ptrs_src(ctx.pool(), 2*ne23);
ggml_cuda_pool_alloc< void *> ptrs_dst(ctx.pool(), 1*ne23);
@@ -1878,8 +1854,8 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
ne12, ne13,
ne23,
nb02, nb03,
src1->type == GGML_TYPE_F16 ? nb12 : s12*sizeof(half),
src1->type == GGML_TYPE_F16 ? nb13 : s13*sizeof(half),
src1->type == GGML_TYPE_F16 ? nb12 : nb12/2,
src1->type == GGML_TYPE_F16 ? nb13 : nb13/2,
nbd2, nbd3,
r2, r3);
CUDA_CHECK(cudaGetLastError());
@@ -1888,8 +1864,8 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
cublasGemmBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10,
alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/nb00,
(const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, s11,
beta, ( void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne0,
(const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, nb11/nb10,
beta, ( void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne01,
ne23,
cu_compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
@@ -1905,7 +1881,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
bool use_mul_mat_vec = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
bool use_mul_mat_vec = (src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
@@ -1946,16 +1922,12 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
//printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
//printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
if (!split && use_mul_mat_vec && (src0->ne[1] <= MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
if (!split && use_mul_mat_vec && (src0->ne[1] < MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
// the custom F16 vector kernel can be used over batched cuBLAS GEMM
// but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst);
} else if (!split && use_mul_mat_vec_q) {
ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst);
} else if (!split && use_mul_mat_q) {
ggml_cuda_mul_mat_q(ctx, src0, src1, nullptr, dst);
} else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) &&
!ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
ggml_cuda_mul_mat_vec(ctx, src0, src1, dst);
} else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16)
&& !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
// general KQ + KQV multi-batch without FlashAttention
ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
} else if (use_mul_mat_vec) {
@@ -1969,145 +1941,196 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
}
}
struct mmid_row_mapping {
int32_t i1;
int32_t i2;
};
static __global__ void k_copy_src1_to_contiguous(const char * __restrict__ src1_original, char * __restrict__ src1_contiguous,
int * __restrict__ cur_src1_row, mmid_row_mapping * __restrict__ row_mapping,
const char * __restrict ids, int64_t i02, size_t ids_nb1, size_t ids_nb0,
int64_t ne11, int64_t ne10,
size_t nb11, size_t nb12) {
int32_t iid1 = blockIdx.x;
int32_t id = blockIdx.y;
const int32_t row_id_i = *(const int32_t *) (ids + iid1*ids_nb1 + id*ids_nb0);
if (row_id_i != i02) {
return;
}
const int64_t i11 = id % ne11;
const int64_t i12 = iid1;
__shared__ int src1_row;
if (threadIdx.x == 0) {
src1_row = atomicAdd(cur_src1_row, 1);
row_mapping[src1_row] = {id, iid1};
}
__syncthreads();
const float * src1_row_original = (const float *)(src1_original + i11*nb11 + i12*nb12);
float * src1_row_contiguous = (float *)(src1_contiguous + src1_row*nb11);
for (int i = threadIdx.x; i < ne10; i += blockDim.x) {
src1_row_contiguous[i] = src1_row_original[i];
}
}
static __global__ void k_copy_dst_from_contiguous(char * __restrict__ dst_original, const char * __restrict__ dst_contiguous,
const mmid_row_mapping * __restrict__ row_mapping,
int64_t ne0,
size_t nb1, size_t nb2) {
int32_t i = blockIdx.x;
const int32_t i1 = row_mapping[i].i1;
const int32_t i2 = row_mapping[i].i2;
const float * dst_row_contiguous = (const float *)(dst_contiguous + i*nb1);
float * dst_row_original = (float *)(dst_original + i1*nb1 + i2*nb2);
for (int j = threadIdx.x; j < ne0; j += blockDim.x) {
dst_row_original[j] = dst_row_contiguous[j];
}
}
static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
const ggml_tensor * ids = dst->src[2];
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft) && "mul_mat_id does not support split buffers");
GGML_TENSOR_BINARY_OP_LOCALS
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
if (ne2 == 1) {
if (ggml_is_quantized(src0->type)) {
ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst);
} else {
ggml_cuda_mul_mat_vec(ctx, src0, src1, ids, dst);
}
return;
}
if (ggml_cuda_should_use_mmq(src0->type, cc, ne12)) {
ggml_cuda_mul_mat_q(ctx, src0, src1, ids, dst);
return;
}
}
GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft) && "mul_mat_id does not support split buffers");
cudaStream_t stream = ctx.stream();
GGML_ASSERT(nb12 % nb11 == 0);
GGML_ASSERT(nb2 % nb1 == 0);
const ggml_type type_src1_sorted = (src0->type == GGML_TYPE_F16 && !fast_fp16_hardware_available(cc))
|| ggml_is_quantized(src0->type) ? GGML_TYPE_F32 : src0->type;
const ggml_type type_dst_sorted = GGML_TYPE_F32;
const size_t ts_src1_sorted = ggml_type_size(type_src1_sorted);
const size_t ts_dst_sorted = ggml_type_size(type_dst_sorted);
const int64_t n_expert_used = ids->ne[0];
const int64_t ne_get_rows = ne12 * n_expert_used;
std::vector<int32_t> ids_to_sorted_host;
ids_to_sorted_host.reserve(2*ne_get_rows);
std::vector<int32_t> ids_from_sorted_host(ne_get_rows);
ggml_cuda_pool_alloc<int32_t> ids_buf_dev(ctx.pool(), 2*ne_get_rows);
std::vector<int32_t> tokens_per_expert(ne02);
ggml_cuda_pool_alloc<char> src1_sorted(ctx.pool(), ne12*n_expert_used*ne10*ts_src1_sorted);
ggml_cuda_pool_alloc<char> dst_sorted(ctx.pool(), ne2 *n_expert_used* ne0*ts_dst_sorted);
const int64_t n_as = ne02;
const int64_t n_ids = ids->ne[0];
std::vector<char> ids_host(ggml_nbytes(ids));
CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids->data, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
const char * ids_dev = (const char *) ids->data;
CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaStreamSynchronize(stream));
for (int64_t i02 = 0; i02 < ne02; ++i02) { // expert matrices
for (int64_t i12 = 0; i12 < ne12; ++i12) { // tokens
for (int64_t iex = 0; iex < n_expert_used; ++iex) {
const int32_t expert_to_use = *(const int32_t *)(ids_host.data() + i12*ids->nb[1] + iex*ids->nb[0]);
assert(expert_to_use >= 0 && expert_to_use < ne02);
if (expert_to_use == i02) {
ids_from_sorted_host[i12*n_expert_used + iex] = ids_to_sorted_host.size();
ids_to_sorted_host.push_back(i12*ne11 + iex % ne11);
tokens_per_expert[i02]++;
break;
ggml_tensor src0_row = *src0;
ggml_tensor src1_row = *src1;
ggml_tensor dst_row = *dst;
char * src0_original = (char *) src0->data;
char * src1_original = (char *) src1->data;
char * dst_original = (char *) dst->data;
src0_row.ne[2] = 1;
src0_row.ne[3] = 1;
src0_row.nb[3] = nb02;
src1_row.ne[1] = 1;
src1_row.ne[2] = 1;
src1_row.ne[3] = 1;
src1_row.nb[2] = nb11;
src1_row.nb[3] = nb11;
dst_row.ne[1] = 1;
dst_row.ne[2] = 1;
dst_row.ne[3] = 1;
dst_row.nb[2] = nb1;
dst_row.nb[3] = nb1;
if (ne12 == 1) {
for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
for (int64_t id = 0; id < n_ids; id++) {
const int32_t i02 = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
GGML_ASSERT(i02 >= 0 && i02 < n_as);
const int64_t i11 = id % ne11;
const int64_t i12 = iid1;
const int64_t i1 = id;
const int64_t i2 = i12;
src0_row.data = src0_original + i02*nb02;
src1_row.data = src1_original + i11*nb11 + i12*nb12;
dst_row.data = dst_original + i1*nb1 + i2*nb2;
ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
}
}
} else {
ggml_cuda_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
ggml_cuda_pool_alloc<char> dst_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));
src1_row.data = src1_contiguous.get();
dst_row.data = dst_contiguous.get();
for (int64_t i02 = 0; i02 < n_as; i02++) {
int64_t num_src1_rows = 0;
for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
for (int64_t id = 0; id < n_ids; id++) {
const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
if (row_id_i != i02) {
continue;
}
num_src1_rows++;
}
}
if (num_src1_rows == 0) {
continue;
}
ggml_cuda_pool_alloc<int> dev_cur_src1_row(ctx.pool(), 1);
ggml_cuda_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool(), num_src1_rows);
CUDA_CHECK(cudaMemsetAsync(dev_cur_src1_row.get(), 0, sizeof(int), stream));
{
dim3 block_dims(std::min((unsigned int)ne10, 768u));
dim3 grid_dims(ids->ne[1], n_ids);
k_copy_src1_to_contiguous<<<grid_dims, block_dims, 0, stream>>>(
src1_original, src1_contiguous.get(),
dev_cur_src1_row.get(), dev_row_mapping.get(),
ids_dev, i02, ids->nb[1], ids->nb[0],
ne11, ne10,
nb11, nb12);
CUDA_CHECK(cudaGetLastError());
}
src0_row.data = src0_original + i02*nb02;
GGML_ASSERT(nb11 == sizeof(float)*ne10);
GGML_ASSERT(nb1 == sizeof(float)*ne0);
src1_row.ne[1] = num_src1_rows;
src1_row.nb[1] = nb11;
src1_row.nb[2] = num_src1_rows*nb11;
src1_row.nb[3] = num_src1_rows*nb11;
dst_row.ne[1] = num_src1_rows;
dst_row.nb[1] = nb1;
dst_row.nb[2] = num_src1_rows*nb1;
dst_row.nb[3] = num_src1_rows*nb1;
ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
{
dim3 block_dims(std::min((unsigned int)ne0, 768u));
dim3 grid_dims(num_src1_rows);
k_copy_dst_from_contiguous<<<grid_dims, block_dims, 0, stream>>>(
dst_original, dst_contiguous.get(),
dev_row_mapping.get(),
ne0,
nb1, nb2);
CUDA_CHECK(cudaGetLastError());
}
}
}
GGML_ASSERT(ids_to_sorted_host.size() == size_t(ne_get_rows));
ids_to_sorted_host.insert(ids_to_sorted_host.end(), ids_from_sorted_host.begin(), ids_from_sorted_host.end());
CUDA_CHECK(cudaMemcpyAsync(ids_buf_dev.ptr, ids_to_sorted_host.data(), 2*ne_get_rows*sizeof(int32_t), cudaMemcpyHostToDevice, stream));
CUDA_CHECK(cudaStreamSynchronize(stream));
const int32_t * ids_to_sorted = ids_buf_dev.ptr + 0*ne_get_rows;
const int32_t * ids_from_sorted = ids_buf_dev.ptr + 1*ne_get_rows;
get_rows_cuda(src1->data, src1->type, ids_to_sorted, src1_sorted.ptr, type_src1_sorted,
ne10, nb11, nb12, nb13,
ne_get_rows, 1, 1, sizeof(int32_t), ne_get_rows*sizeof(int32_t), ne_get_rows*sizeof(int32_t),
ne10*ts_src1_sorted, ne_get_rows*ne10*ts_src1_sorted, ne_get_rows*ne10*ts_src1_sorted, stream);
CUDA_CHECK(cudaGetLastError());
char * src1_data_cur = (char *) src1_sorted.ptr;
char * dst_data_cur = (char *) dst_sorted.ptr;
for (int64_t i02 = 0; i02 < ne02; ++i02) {
if (tokens_per_expert[i02] == 0) {
continue;
}
ggml_tensor src0_slice = *src0;
src0_slice.ne[2] = 1;
src0_slice.nb[3] = src0_slice.nb[2];
src0_slice.data = (char *) src0->data + i02*nb02;
ggml_tensor src1_slice;
memset(&src1_slice, 0, sizeof(src1_slice));
src1_slice.buffer = src1->buffer;
src1_slice.type = type_src1_sorted;
src1_slice.ne[0] = ne10;
src1_slice.ne[1] = tokens_per_expert[i02];
src1_slice.ne[2] = 1;
src1_slice.ne[3] = 1;
src1_slice.nb[0] = ts_src1_sorted;
src1_slice.nb[1] = src1_slice.ne[0] * src1_slice.nb[0];
src1_slice.nb[2] = src1_slice.ne[1] * src1_slice.nb[1];
src1_slice.nb[3] = src1_slice.ne[2] * src1_slice.nb[2];
src1_slice.data = src1_data_cur;
ggml_tensor dst_slice;
memset(&dst_slice, 0, sizeof(dst_slice));
dst_slice.buffer = dst->buffer;
dst_slice.type = type_dst_sorted;
dst_slice.ne[0] = ne0;
dst_slice.ne[1] = tokens_per_expert[i02];
dst_slice.ne[2] = 1;
dst_slice.ne[3] = 1;
dst_slice.nb[0] = ts_dst_sorted;
dst_slice.nb[1] = dst_slice.ne[0] * dst_slice.nb[0];
dst_slice.nb[2] = dst_slice.ne[1] * dst_slice.nb[1];
dst_slice.nb[3] = dst_slice.ne[2] * dst_slice.nb[2];
dst_slice.data = dst_data_cur;
ggml_cuda_mul_mat(ctx, &src0_slice, &src1_slice, &dst_slice);
CUDA_CHECK(cudaGetLastError());
src1_data_cur += src1_slice.nb[2];
dst_data_cur += dst_slice.nb[2];
}
get_rows_cuda(dst_sorted.ptr, type_dst_sorted, ids_from_sorted, dst->data, dst->type,
ne0, ne0*ts_dst_sorted, ne_get_rows*ne0*ts_dst_sorted, ne_get_rows*ne0*ts_dst_sorted,
ne_get_rows, 1, 1, sizeof(int32_t), ne_get_rows*sizeof(int32_t), ne_get_rows*sizeof(int32_t),
nb1, nb2, nb3, stream);
}
static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
@@ -2472,7 +2495,7 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
#endif
}
if (node->op == GGML_OP_MUL_MAT_ID && node->ne[2] != 1) {
if (node->op == GGML_OP_MUL_MAT_ID) {
use_cuda_graph = false; // This node type is not supported by CUDA graph capture
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported node type\n", __func__);
@@ -3186,7 +3209,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
}
case GGML_OP_ROPE:
case GGML_OP_ROPE_BACK: {
return op->src[0]->nb[0] == ggml_type_size(op->src[0]->type) && ggml_is_contiguous_2(op->src[0]);
const size_t ts = ggml_type_size(op->src[0]->type);
const int64_t ne0_012 = op->src[0]->ne[0] * op->src[0]->ne[1] * op->src[0]->ne[2];
return op->src[0]->nb[0] == ts && op->src[0]->nb[3] == ne0_012*ts;
}
case GGML_OP_IM2COL:
case GGML_OP_POOL_2D:
+31 -189
View File
@@ -1,10 +1,37 @@
#include "mmq.cuh"
#include "quantize.cuh"
#include <vector>
void ggml_cuda_op_mul_mat_q(
ggml_backend_cuda_context & ctx,
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
const int64_t src1_padded_row_size, cudaStream_t stream) {
static void ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
switch (args.type_x) {
const int64_t ne00 = src0->ne[0];
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
GGML_ASSERT(ne10 % QK8_1 == 0);
const int64_t ne0 = dst->ne[0];
const int64_t row_diff = row_high - row_low;
const int64_t stride00 = ne00 / ggml_blck_size(src0->type);
int id = ggml_cuda_get_device();
const int cc = ggml_cuda_info().devices[id].cc;
// the main device has a larger memory buffer to hold the results from all GPUs
// nrows_dst == nrows of the matrix that the kernel writes into
const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
// The stream-k decomposition is only faster for recent NVIDIA GPUs.
// Also its fixup needs to allocate a temporary buffer in the memory pool.
// There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) &&
ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && src1_ncols == ne11;
const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k};
switch (src0->type) {
case GGML_TYPE_Q4_0:
mul_mat_q_case<GGML_TYPE_Q4_0>(ctx, args, stream);
break;
@@ -63,195 +90,10 @@ static void ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, con
GGML_ABORT("fatal error");
break;
}
}
void ggml_cuda_mul_mat_q(
ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
GGML_ASSERT( src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
GGML_ASSERT(!ids || ids->type == GGML_TYPE_I32); // Optional, used for batched GGML_MUL_MAT_ID.
GGML_TENSOR_BINARY_OP_LOCALS;
cudaStream_t stream = ctx.stream();
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
const size_t ts_src0 = ggml_type_size(src0->type);
const size_t ts_src1 = ggml_type_size(src1->type);
const size_t ts_dst = ggml_type_size(dst->type);
GGML_ASSERT( nb00 == ts_src0);
GGML_ASSERT( nb10 == ts_src1);
GGML_ASSERT( nb0 == ts_dst);
GGML_ASSERT(!ids || ids->nb[0] == ggml_type_size(ids->type));
const char * src0_d = (const char *) src0->data;
const float * src1_d = (const float *) src1->data;
float * dst_d = (float *) dst->data;
const int64_t ne10_padded = GGML_PAD(ne10, MATRIX_ROW_PADDING);
const int64_t s01 = src0->nb[1] / ts_src0;
const int64_t s1 = dst->nb[1] / ts_dst;
const int64_t s02 = src0->nb[2] / ts_src0;
const int64_t s2 = dst->nb[2] / ts_dst;
const int64_t s03 = src0->nb[3] / ts_src0;
const int64_t s3 = dst->nb[3] / ts_dst;
const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA;
if (!ids) {
const size_t nbytes_src1_q8_1 = ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1 +
get_mmq_x_max_host(cc)*sizeof(block_q8_1_mmq);
ggml_cuda_pool_alloc<char> src1_q8_1(ctx.pool(), nbytes_src1_q8_1);
{
const int64_t s11 = src1->nb[1] / ts_src1;
const int64_t s12 = src1->nb[2] / ts_src1;
const int64_t s13 = src1->nb[3] / ts_src1;
quantize_mmq_q8_1_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type,
ne10, s11, s12, s13, ne10_padded, ne11, ne12, ne13, stream);
}
const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int));
const int64_t s13 = ne12*s12;
const mmq_args args = {
src0_d, src0->type, (const int *) src1_q8_1.ptr, nullptr, nullptr, dst_d,
ne00, ne01, ne1, s01, s1,
ne02, ne12, s02, s12, s2,
ne03, ne13, s03, s13, s3,
use_stream_k};
ggml_cuda_mul_mat_q_switch_type(ctx, args, stream);
return;
}
GGML_ASSERT(ne13 == 1);
GGML_ASSERT(nb12 % nb11 == 0);
GGML_ASSERT(nb2 % nb1 == 0);
const int64_t n_expert_used = ids->ne[0];
const int64_t ne_get_rows = ne12 * n_expert_used;
std::vector<char> ids_host(ggml_nbytes(ids));
std::vector<int32_t> ids_src1_host;
ids_src1_host.reserve(ne_get_rows);
std::vector<int32_t> ids_dst_host;
ids_dst_host.reserve(ne_get_rows);
std::vector<int32_t> tokens_per_expert_host(ne02);
std::vector<int32_t> expert_bounds_host(ne02 + 1);
ggml_cuda_pool_alloc<int32_t> ids_buf_dev(ctx.pool());
CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids->data, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaStreamSynchronize(stream));
for (int64_t i02 = 0; i02 < ne02; ++i02) { // expert matrices
for (int64_t i12 = 0; i12 < ne12; ++i12) { // tokens
for (int64_t iex = 0; iex < n_expert_used; ++iex) {
const int32_t expert_to_use = *(const int32_t *)(ids_host.data() + i12*ids->nb[1] + iex*ids->nb[0]);
assert(expert_to_use >= 0 && expert_to_use < ne02);
if (expert_to_use == i02) {
ids_src1_host.push_back(i12*(nb12/nb11) + iex % ne11);
ids_dst_host.push_back(i12*ne1 + iex);
tokens_per_expert_host[i02]++;
break;
}
}
}
}
int32_t cumsum = 0;
for (int64_t i = 0; i < ne02; ++i) {
expert_bounds_host[i] = cumsum;
cumsum += tokens_per_expert_host[i];
}
expert_bounds_host[ne02] = cumsum;
std::vector<int32_t> ids_buf_host;
ids_buf_host.reserve(ids_src1_host.size() + ids_dst_host.size() + expert_bounds_host.size());
ids_buf_host.insert(ids_buf_host.end(), ids_src1_host.begin(), ids_src1_host.end());
ids_buf_host.insert(ids_buf_host.end(), ids_dst_host.begin(), ids_dst_host.end());
ids_buf_host.insert(ids_buf_host.end(), expert_bounds_host.begin(), expert_bounds_host.end());
ids_buf_dev.alloc(ids_buf_host.size() + get_mmq_x_max_host(cc)); // Expert bounds are padded on device.
CUDA_CHECK(cudaMemcpyAsync(ids_buf_dev.ptr, ids_buf_host.data(), ids_buf_host.size()*sizeof(int32_t), cudaMemcpyHostToDevice, stream));
CUDA_CHECK(cudaStreamSynchronize(stream));
const int32_t * ids_src1_dev = ids_buf_dev.ptr;
const int32_t * ids_dst_dev = ids_src1_dev + ids_src1_host.size();
const int32_t * expert_bounds_dev = ids_dst_dev + ids_dst_host.size();
const size_t nbytes_src1_q8_1 = ne12*n_expert_used*ne10_padded * sizeof(block_q8_1)/QK8_1 +
get_mmq_x_max_host(cc)*sizeof(block_q8_1_mmq);
ggml_cuda_pool_alloc<char> src1_q8_1(ctx.pool(), nbytes_src1_q8_1);
const int64_t ne11_flat = ne12*n_expert_used;
const int64_t ne12_flat = 1;
const int64_t ne13_flat = 1;
{
const int64_t s11 = src1->nb[1] / ts_src1;
const int64_t s12 = src1->nb[2] / ts_src1;
const int64_t s13 = src1->nb[2] / ts_src1;
quantize_mmq_q8_1_cuda(src1_d, ids_src1_dev, src1_q8_1.get(), src0->type,
ne10, s11, s12, s13, ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream);
}
const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int));
const int64_t s13 = ne12*s12;
// Note that ne02 is used instead of ne12 because the number of y channels determines the z dimension of the CUDA grid.
const mmq_args args = {
src0_d, src0->type, (const int *) src1_q8_1.ptr, ids_dst_dev, expert_bounds_dev, dst_d,
ne00, ne01, ne_get_rows, s01, s1,
ne02, ne02, s02, s12, s2,
ne03, ne13, s03, s13, s3,
use_stream_k};
ggml_cuda_mul_mat_q_switch_type(ctx, args, stream);
}
void ggml_cuda_op_mul_mat_q(
ggml_backend_cuda_context & ctx,
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
const int64_t src1_padded_row_size, cudaStream_t stream) {
const int64_t ne00 = src0->ne[0];
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
GGML_ASSERT(ne10 % QK8_1 == 0);
const int64_t ne0 = dst->ne[0];
const int64_t row_diff = row_high - row_low;
const int64_t stride01 = ne00 / ggml_blck_size(src0->type);
const int id = ggml_cuda_get_device();
const int cc = ggml_cuda_info().devices[id].cc;
// the main device has a larger memory buffer to hold the results from all GPUs
// nrows_dst == nrows of the matrix that the kernel writes into
const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
// The stream-k decomposition is only faster for recent NVIDIA GPUs.
// Also its fixup needs to allocate a temporary buffer in the memory pool.
// There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) &&
ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && src1_ncols == ne11;
const mmq_args args = {
src0_dd_i, src0->type, (const int *) src1_ddq_i, nullptr, nullptr, dst_dd_i,
ne00, row_diff, src1_ncols, stride01, nrows_dst,
1, 1, 0, 0, 0,
1, 1, 0, 0, 0,
use_stream_k};
ggml_cuda_mul_mat_q_switch_type(ctx, args, stream);
GGML_UNUSED(src1);
GGML_UNUSED(dst);
GGML_UNUSED(src1_ddf_i);
GGML_UNUSED(src1_padded_row_size);
}
bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
File diff suppressed because it is too large Load Diff
+70 -109
View File
@@ -4,23 +4,18 @@
template <typename T, typename type_acc, int block_size>
static __global__ void mul_mat_vec(
const T * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, float * __restrict__ dst,
const int64_t ncols2, const int64_t nchannels_y, const int64_t stride_row,
const T * __restrict__ x, const float * __restrict__ y, float * __restrict__ dst, const int64_t ncols2, const int64_t stride_row,
const int64_t channel_ratio, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst,
const int64_t sample_ratio, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst) {
const int64_t row = blockIdx.x;
const int64_t channel_dst = blockIdx.y;
const int64_t channel_x = ids ? ids[channel_dst] : channel_dst / channel_ratio;
const int64_t channel_y = ids ? channel_dst % nchannels_y : channel_dst;
const int64_t sample_dst = blockIdx.z;
const int64_t sample_x = sample_dst / sample_ratio;
const int64_t sample_y = sample_dst;
const int tid = threadIdx.x;
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
const int64_t row = blockIdx.x;
const int64_t channel = blockIdx.y;
const int64_t sample = blockIdx.z;
const int tid = threadIdx.x;
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
x += sample_x *stride_sample_x + channel_x *stride_channel_x + row*stride_row;
y += sample_y *stride_sample_y + channel_y *stride_channel_y;
dst += sample_dst*stride_sample_dst + channel_dst*stride_channel_dst;
x += (sample/sample_ratio)*stride_sample_x + (channel/channel_ratio)*stride_channel_x + row*stride_row;
y += sample *stride_sample_y + channel *stride_channel_y;
dst += sample *stride_sample_dst + channel *stride_channel_dst;
const float2 * y2 = (const float2 *) y;
@@ -36,19 +31,12 @@ static __global__ void mul_mat_vec(
float sumf = 0.0f;
if constexpr (std::is_same<T, float>::value) {
const float2 * x2 = (const float2 *) x;
for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) {
const float2 tmpx = x2[col2];
const float2 tmpy = y2[col2];
sumf += tmpx.x*tmpy.x;
sumf += tmpx.y*tmpy.y;
}
} else if constexpr (std::is_same<T, half>::value) {
if constexpr (std::is_same<T, half>::value) {
const half2 * x2 = (const half2 *) x;
if (std::is_same<type_acc, float>::value) {
sumf = 0.0f;
for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) {
const float2 tmpx = __half22float2(x2[col2]);
const float2 tmpy = y2[col2];
@@ -71,6 +59,8 @@ static __global__ void mul_mat_vec(
}
} else if constexpr (std::is_same<T, nv_bfloat16>::value) {
const int * x2 = (const int *) x;
sumf = 0.0f;
for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) {
const int tmpx = x2[col2];
const float2 tmpy = y2[col2];
@@ -102,17 +92,17 @@ static __global__ void mul_mat_vec(
template <typename T, typename type_acc>
static void launch_mul_mat_vec_cuda(
const T * x, const float * y, const int32_t * ids, float * dst,
const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
const T * x, const float * y, float * dst,
const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y,
const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
const int64_t nsamples_y, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
cudaStream_t stream) {
GGML_ASSERT(ncols % 2 == 0);
GGML_ASSERT(stride_row % 2 == 0);
GGML_ASSERT(ids || nchannels_dst % nchannels_x == 0);
GGML_ASSERT( nsamples_dst % nsamples_x == 0);
const int64_t channel_ratio = nchannels_dst / nchannels_x;
const int64_t sample_ratio = nsamples_dst / nsamples_x;
GGML_ASSERT(nchannels_y % nchannels_x == 0);
GGML_ASSERT(nsamples_y % nsamples_x == 0);
const int64_t channel_ratio = nchannels_y / nchannels_x;
const int64_t sample_ratio = nsamples_y / nsamples_x;
int device;
int warp_size;
@@ -134,48 +124,48 @@ static void launch_mul_mat_vec_cuda(
}
const int smem = warp_size*sizeof(float);
const dim3 block_nums(nrows, nchannels_dst, nsamples_dst);
const dim3 block_nums(nrows, nchannels_y, nsamples_y);
const dim3 block_dims(block_size_best, 1, 1);
switch (block_size_best) {
case 32: {
mul_mat_vec<T, type_acc, 32><<<block_nums, block_dims, smem, stream>>>
(x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
(x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 64: {
mul_mat_vec<T, type_acc, 64><<<block_nums, block_dims, smem, stream>>>
(x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
(x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 96: {
mul_mat_vec<T, type_acc, 96><<<block_nums, block_dims, smem, stream>>>
(x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
(x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 128: {
mul_mat_vec<T, type_acc, 128><<<block_nums, block_dims, smem, stream>>>
(x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
(x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 160: {
mul_mat_vec<T, type_acc, 160><<<block_nums, block_dims, smem, stream>>>
(x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
(x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 192: {
mul_mat_vec<T, type_acc, 192><<<block_nums, block_dims, smem, stream>>>
(x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
(x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 224: {
mul_mat_vec<T, type_acc, 224><<<block_nums, block_dims, smem, stream>>>
(x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
(x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 256: {
mul_mat_vec<T, type_acc, 256><<<block_nums, block_dims, smem, stream>>>
(x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
(x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
default: {
GGML_ABORT("fatal error");
@@ -185,28 +175,28 @@ static void launch_mul_mat_vec_cuda(
template<typename T>
static void mul_mat_vec_cuda(
const T * x, const float * y, const int32_t * ids, float * dst,
const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
const T * x, const float * y, float * dst,
const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y,
const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
const int64_t nsamples_y, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
enum ggml_prec prec, cudaStream_t stream) {
if constexpr(std::is_same<T, half>::value) {
if (prec == GGML_PREC_DEFAULT) {
switch (prec) {
case GGML_PREC_DEFAULT: {
launch_mul_mat_vec_cuda<T, half>
(x, y, ids, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
return;
}
(x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_y, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case GGML_PREC_F32: {
launch_mul_mat_vec_cuda<T, float>
(x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_y, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
}
launch_mul_mat_vec_cuda<T, float>
(x, y, ids, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
}
void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
GGML_ASSERT( src1->type == GGML_TYPE_F32);
GGML_ASSERT(!ids || ids->type == GGML_TYPE_I32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
GGML_TENSOR_BINARY_OP_LOCALS;
@@ -214,24 +204,21 @@ void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor *
const size_t ts_src1 = ggml_type_size(src1->type);
const size_t ts_dst = ggml_type_size(dst->type);
GGML_ASSERT(!ids || ne12 == 1); // Implementation is only correct for batch size 1.
GGML_ASSERT(ne11 == 1);
GGML_ASSERT(ne12 == ne2);
GGML_ASSERT(ne13 == ne3);
GGML_ASSERT( nb00 == ts_src0);
GGML_ASSERT( nb10 == ts_src1);
GGML_ASSERT(!ids || ids->nb[0] == ggml_type_size(ids->type));
GGML_ASSERT( nb0 == ts_dst);
GGML_ASSERT(nb00 == ts_src0);
GGML_ASSERT(nb10 == ts_src1);
GGML_ASSERT(nb0 == ts_dst);
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32;
const float * src1_d = (const float *) src1->data;
const int32_t * ids_d = ids ? (const int32_t *) ids->data : nullptr;
float * dst_d = (float *) dst->data;
const float * src1_d = (const float *) src1->data;
float * dst_d = (float *) dst->data;
const int64_t s01 = src0->nb[1] / ts_src0;
const int64_t s11 = src1->nb[1] / ts_src1;
const int64_t s1 = dst->nb[1] / ts_dst;
const int64_t s02 = src0->nb[2] / ts_src0;
const int64_t s12 = src1->nb[2] / ts_src1;
const int64_t s2 = dst->nb[2] / ts_dst;
@@ -239,33 +226,14 @@ void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor *
const int64_t s13 = src1->nb[3] / ts_src1;
const int64_t s3 = dst->nb[3] / ts_dst;
// For MUL_MAT_ID the memory layout is different than for MUL_MAT:
const int64_t ncols_dst = ids ? ne2 : ne1;
const int64_t nchannels_y = ids ? ne11 : ne12;
const int64_t nchannels_dst = ids ? ne1 : ne2;
const int64_t stride_channel_dst = ids ? s1 : s2;
const int64_t stride_channel_y = ids ? s11 : s12;
GGML_ASSERT(ncols_dst == 1);
switch (src0->type) {
case GGML_TYPE_F32: {
const float * src0_d = (const float *) src0->data;
mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, s01,
ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst,
ne03, ne3, s03, s13, s3, prec, ctx.stream());
} break;
case GGML_TYPE_F16: {
const half * src0_d = (const half *) src0->data;
mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, s01,
ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst,
ne03, ne3, s03, s13, s3, prec, ctx.stream());
mul_mat_vec_cuda(src0_d, src1_d, dst_d, ne00, ne01, s01, ne02, ne12, s02, s12, s2, ne03, ne13, s03, s13, s3, prec, ctx.stream());
} break;
case GGML_TYPE_BF16: {
const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0->data;
mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, s01,
ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst,
ne03, ne3, s03, s13, s3, prec, ctx.stream());
mul_mat_vec_cuda(src0_d, src1_d, dst_d, ne00, ne01, s01, ne02, ne12, s02, s12, s2, ne03, ne13, s03, s13, s3, prec, ctx.stream());
} break;
default:
GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type));
@@ -294,34 +262,27 @@ void ggml_cuda_op_mul_mat_vec(
const int64_t stride_row = ne00;
const int64_t nchannels_x = 1;
const int64_t nchannels_y = 1;
const int64_t nchannels_dst = 1;
const int64_t stride_channel_x = 0;
const int64_t stride_channel_y = 0;
const int64_t stride_channel_dst = 0;
const int64_t nsamples_x = 1;
const int64_t nsamples_dst = 1;
const int64_t nsamples_y = 1;
const int64_t stride_sample_x = 0;
const int64_t stride_sample_y = 0;
const int64_t stride_sample_dst = 0;
switch (src0->type) {
case GGML_TYPE_F32: {
const float * src0_d = (const float *) src0_dd_i;
mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, stride_row,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream);
} break;
case GGML_TYPE_F16: {
const half * src0_d = (const half *) src0_dd_i;
mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, stride_row,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream);
mul_mat_vec_cuda(src0_d, src1_ddf_i, dst_dd_i, ne00, row_diff, stride_row,
nchannels_x, nchannels_y, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_y, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream);
} break;
case GGML_TYPE_BF16: {
const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0_dd_i;
mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, stride_row,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream);
mul_mat_vec_cuda(src0_d, src1_ddf_i, dst_dd_i, ne00, row_diff, stride_row,
nchannels_x, nchannels_y, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_y, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream);
} break;
default:
GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type));
+1 -1
View File
@@ -3,7 +3,7 @@
// maximum number of src0 rows with which to use mul_mat_vec over cuBLAS if FP16 tensor cores are available
#define MMV_MAX_ROWS 512
void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst);
void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
void ggml_cuda_op_mul_mat_vec(
ggml_backend_cuda_context & ctx,
+274 -347
View File
@@ -1,57 +1,50 @@
#include "mmvq.cuh"
#include "quantize.cuh"
#include "vecdotq.cuh"
#include <cstdint>
typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs);
static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type) {
switch (type) {
case GGML_TYPE_Q4_0: return vec_dot_q4_0_q8_1;
case GGML_TYPE_Q4_1: return vec_dot_q4_1_q8_1;
case GGML_TYPE_Q5_0: return vec_dot_q5_0_q8_1;
case GGML_TYPE_Q5_1: return vec_dot_q5_1_q8_1;
case GGML_TYPE_Q8_0: return vec_dot_q8_0_q8_1;
case GGML_TYPE_Q2_K: return vec_dot_q2_K_q8_1;
case GGML_TYPE_Q3_K: return vec_dot_q3_K_q8_1;
case GGML_TYPE_Q4_K: return vec_dot_q4_K_q8_1;
case GGML_TYPE_Q5_K: return vec_dot_q5_K_q8_1;
case GGML_TYPE_Q6_K: return vec_dot_q6_K_q8_1;
case GGML_TYPE_IQ2_XXS: return vec_dot_iq2_xxs_q8_1;
case GGML_TYPE_IQ2_XS: return vec_dot_iq2_xs_q8_1;
case GGML_TYPE_IQ2_S: return vec_dot_iq2_s_q8_1;
case GGML_TYPE_IQ3_XXS: return vec_dot_iq3_xxs_q8_1;
case GGML_TYPE_IQ1_S: return vec_dot_iq1_s_q8_1;
case GGML_TYPE_IQ1_M: return vec_dot_iq1_m_q8_1;
case GGML_TYPE_IQ4_NL: return vec_dot_iq4_nl_q8_1;
case GGML_TYPE_IQ4_XS: return vec_dot_iq4_xs_q8_1;
case GGML_TYPE_IQ3_S: return vec_dot_iq3_s_q8_1;
default: return nullptr;
}
return type == GGML_TYPE_Q4_0 ? vec_dot_q4_0_q8_1 :
type == GGML_TYPE_Q4_1 ? vec_dot_q4_1_q8_1 :
type == GGML_TYPE_Q5_0 ? vec_dot_q5_0_q8_1 :
type == GGML_TYPE_Q5_1 ? vec_dot_q5_1_q8_1 :
type == GGML_TYPE_Q8_0 ? vec_dot_q8_0_q8_1 :
type == GGML_TYPE_Q2_K ? vec_dot_q2_K_q8_1 :
type == GGML_TYPE_Q3_K ? vec_dot_q3_K_q8_1 :
type == GGML_TYPE_Q4_K ? vec_dot_q4_K_q8_1 :
type == GGML_TYPE_Q5_K ? vec_dot_q5_K_q8_1 :
type == GGML_TYPE_Q6_K ? vec_dot_q6_K_q8_1 :
type == GGML_TYPE_IQ2_XXS ? vec_dot_iq2_xxs_q8_1 :
type == GGML_TYPE_IQ2_XS ? vec_dot_iq2_xs_q8_1 :
type == GGML_TYPE_IQ2_S ? vec_dot_iq2_s_q8_1 :
type == GGML_TYPE_IQ3_XXS ? vec_dot_iq3_xxs_q8_1 :
type == GGML_TYPE_IQ1_S ? vec_dot_iq1_s_q8_1 :
type == GGML_TYPE_IQ1_M ? vec_dot_iq1_m_q8_1 :
type == GGML_TYPE_IQ4_NL ? vec_dot_iq4_nl_q8_1 :
type == GGML_TYPE_IQ4_XS ? vec_dot_iq4_xs_q8_1 :
type == GGML_TYPE_IQ3_S ? vec_dot_iq3_s_q8_1 :
nullptr;
}
static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
switch (type) {
case GGML_TYPE_Q4_0: return VDR_Q4_0_Q8_1_MMVQ;
case GGML_TYPE_Q4_1: return VDR_Q4_1_Q8_1_MMVQ;
case GGML_TYPE_Q5_0: return VDR_Q5_0_Q8_1_MMVQ;
case GGML_TYPE_Q5_1: return VDR_Q5_1_Q8_1_MMVQ;
case GGML_TYPE_Q8_0: return VDR_Q8_0_Q8_1_MMVQ;
case GGML_TYPE_Q2_K: return VDR_Q2_K_Q8_1_MMVQ;
case GGML_TYPE_Q3_K: return VDR_Q3_K_Q8_1_MMVQ;
case GGML_TYPE_Q4_K: return VDR_Q4_K_Q8_1_MMVQ;
case GGML_TYPE_Q5_K: return VDR_Q5_K_Q8_1_MMVQ;
case GGML_TYPE_Q6_K: return VDR_Q6_K_Q8_1_MMVQ;
case GGML_TYPE_IQ2_XXS: return VDR_IQ2_XXS_Q8_1_MMVQ;
case GGML_TYPE_IQ2_XS: return VDR_IQ2_XS_Q8_1_MMVQ;
case GGML_TYPE_IQ2_S: return VDR_IQ2_S_Q8_1_MMVQ;
case GGML_TYPE_IQ3_XXS: return VDR_IQ3_XXS_Q8_1_MMVQ;
case GGML_TYPE_IQ3_S: return VDR_IQ3_S_Q8_1_MMVQ;
case GGML_TYPE_IQ4_NL: return VDR_IQ4_NL_Q8_1_MMVQ;
case GGML_TYPE_IQ4_XS: return VDR_IQ4_XS_Q8_1_MMVQ;
default: return 1;
}
return type == GGML_TYPE_Q4_0 ? VDR_Q4_0_Q8_1_MMVQ :
type == GGML_TYPE_Q4_1 ? VDR_Q4_1_Q8_1_MMVQ :
type == GGML_TYPE_Q5_0 ? VDR_Q5_0_Q8_1_MMVQ :
type == GGML_TYPE_Q5_1 ? VDR_Q5_1_Q8_1_MMVQ :
type == GGML_TYPE_Q8_0 ? VDR_Q8_0_Q8_1_MMVQ :
type == GGML_TYPE_Q2_K ? VDR_Q2_K_Q8_1_MMVQ :
type == GGML_TYPE_Q3_K ? VDR_Q3_K_Q8_1_MMVQ :
type == GGML_TYPE_Q4_K ? VDR_Q4_K_Q8_1_MMVQ :
type == GGML_TYPE_Q5_K ? VDR_Q5_K_Q8_1_MMVQ :
type == GGML_TYPE_Q6_K ? VDR_Q6_K_Q8_1_MMVQ :
type == GGML_TYPE_IQ2_XXS ? VDR_IQ2_XXS_Q8_1_MMVQ :
type == GGML_TYPE_IQ2_XS ? VDR_IQ2_XS_Q8_1_MMVQ :
type == GGML_TYPE_IQ2_S ? VDR_IQ2_S_Q8_1_MMVQ :
type == GGML_TYPE_IQ3_XXS ? VDR_IQ3_XXS_Q8_1_MMVQ :
type == GGML_TYPE_IQ3_S ? VDR_IQ3_S_Q8_1_MMVQ :
type == GGML_TYPE_IQ4_NL ? VDR_IQ4_NL_Q8_1_MMVQ :
type == GGML_TYPE_IQ4_XS ? VDR_IQ4_XS_Q8_1_MMVQ :
1;
}
enum mmvq_parameter_table_id {
@@ -80,9 +73,9 @@ static __host__ mmvq_parameter_table_id get_device_table_id(int cc) {
return MMVQ_PARAMETERS_GENERIC;
}
static constexpr __host__ __device__ int calc_nwarps(int ncols_dst, mmvq_parameter_table_id table_id) {
static constexpr __host__ __device__ int calc_nwarps(int ncols_y, mmvq_parameter_table_id table_id) {
if (table_id == MMVQ_PARAMETERS_GENERIC) {
switch (ncols_dst) {
switch (ncols_y) {
case 1:
case 2:
case 3:
@@ -97,7 +90,7 @@ static constexpr __host__ __device__ int calc_nwarps(int ncols_dst, mmvq_parame
return 1;
}
} else if (table_id == MMVQ_PARAMETERS_GCN) {
switch (ncols_dst) {
switch (ncols_y) {
case 1:
case 2:
case 3:
@@ -114,9 +107,9 @@ static constexpr __host__ __device__ int calc_nwarps(int ncols_dst, mmvq_parame
return 1;
}
static constexpr __host__ __device__ int calc_rows_per_block(int ncols_dst, int table_id) {
static constexpr __host__ __device__ int calc_rows_per_block(int ncols_y, int table_id) {
if (table_id == MMVQ_PARAMETERS_GENERIC || table_id == MMVQ_PARAMETERS_GCN) {
switch (ncols_dst) {
switch (ncols_y) {
case 1:
return 1;
case 2:
@@ -134,21 +127,19 @@ static constexpr __host__ __device__ int calc_rows_per_block(int ncols_dst, int
return 1;
}
template <ggml_type type, int ncols_dst>
template <ggml_type type, int ncols_y>
// tell the compiler to use as many registers as it wants, see nwarps definition below
__launch_bounds__(calc_nwarps(ncols_dst, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1)
__launch_bounds__(calc_nwarps(ncols_y, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1)
static __global__ void mul_mat_vec_q(
const void * __restrict__ vx, const void * __restrict__ vy, const int32_t * __restrict__ ids, float * __restrict__ dst,
const int ncols_x, const int nchannels_y, const int stride_row_x, const int stride_col_y, const int stride_col_dst,
const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) {
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) {
constexpr int qk = ggml_cuda_type_traits<type>::qk;
constexpr int qi = ggml_cuda_type_traits<type>::qi;
constexpr int vdr = get_vdr_mmvq(type);
constexpr mmvq_parameter_table_id table_id = get_device_table_id();
constexpr int nwarps = calc_nwarps(ncols_dst, table_id);
constexpr int rows_per_cuda_block = calc_rows_per_block(ncols_dst, table_id);
constexpr int nwarps = calc_nwarps(ncols_y, table_id);
constexpr int rows_per_cuda_block = calc_rows_per_block(ncols_y, table_id);
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type);
@@ -156,21 +147,13 @@ static __global__ void mul_mat_vec_q(
const int tid = warp_size*threadIdx.y + threadIdx.x;
const int row0 = rows_per_cuda_block*blockIdx.x;
const int blocks_per_row_x = ncols_x / qk;
const int blocks_per_col_y = nrows_y / QK8_1;
constexpr int blocks_per_iter = vdr * nwarps*warp_size / qi;
// The MUL_MAT_ID code path with ids != nullptr is only implemented for ncols_dst == 1.
const int channel_dst = blockIdx.y;
const int channel_x = ncols_dst == 1 && ids ? ids[channel_dst] : channel_dst / channel_ratio;
const int channel_y = ncols_dst == 1 && ids ? channel_dst % nchannels_y : channel_dst;
const int sample_dst = blockIdx.z;
const int sample_x = sample_dst / sample_ratio;
const int sample_y = sample_dst;
// partial sum for each thread
float tmp[ncols_dst][rows_per_cuda_block] = {{0.0f}};
float tmp[ncols_y][rows_per_cuda_block] = {{0.0f}};
const block_q8_1 * y = ((const block_q8_1 *) vy) + sample_y*stride_sample_y + channel_y*stride_channel_y;
const int kbx_offset = sample_x*stride_sample_x + channel_x*stride_channel_x + row0*stride_row_x;
const block_q8_1 * y = (const block_q8_1 *) vy;
for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) {
const int kby = kbx * (qk/QK8_1); // y block index that aligns with kbx
@@ -179,19 +162,18 @@ static __global__ void mul_mat_vec_q(
const int kqs = vdr * (tid % (qi/vdr));
#pragma unroll
for (int j = 0; j < ncols_dst; ++j) {
for (int j = 0; j < ncols_y; ++j) {
#pragma unroll
for (int i = 0; i < rows_per_cuda_block; ++i) {
tmp[j][i] += vec_dot_q_cuda(
vx, &y[j*stride_col_y + kby], kbx_offset + i*stride_row_x + kbx, kqs);
tmp[j][i] += vec_dot_q_cuda(vx, &y[j*blocks_per_col_y + kby], (row0 + i)*blocks_per_row_x + kbx, kqs);
}
}
}
__shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_dst][rows_per_cuda_block][warp_size];
__shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][warp_size];
if (threadIdx.y > 0) {
#pragma unroll
for (int j = 0; j < ncols_dst; ++j) {
for (int j = 0; j < ncols_y; ++j) {
#pragma unroll
for (int i = 0; i < rows_per_cuda_block; ++i) {
tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i];
@@ -203,11 +185,9 @@ static __global__ void mul_mat_vec_q(
return;
}
dst += sample_dst*stride_sample_dst + channel_dst*stride_channel_dst + row0;
// sum up partial sums and write back result
#pragma unroll
for (int j = 0; j < ncols_dst; ++j) {
for (int j = 0; j < ncols_y; ++j) {
#pragma unroll
for (int i = 0; i < rows_per_cuda_block; ++i) {
#pragma unroll
@@ -217,121 +197,88 @@ static __global__ void mul_mat_vec_q(
tmp[j][i] = warp_reduce_sum<warp_size>(tmp[j][i]);
}
if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + int(threadIdx.x) < stride_col_dst)) {
dst[j*stride_col_dst + threadIdx.x] = tmp[j][threadIdx.x];
if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + threadIdx.x < (unsigned)nrows_dst)) {
dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x];
}
}
GGML_UNUSED(nrows_x);
}
static std::pair<dim3, dim3> calc_launch_params(
const int ncols_dst, const int nrows_x, const int nchannels_y, const int nsamples_y,
const int warp_size, const mmvq_parameter_table_id table_id) {
const int64_t nblocks = (nrows_x + calc_rows_per_block(ncols_dst, table_id) - 1) / calc_rows_per_block(ncols_dst, table_id);
const dim3 block_nums(nblocks, nchannels_y, nsamples_y);
const dim3 block_dims(warp_size, calc_nwarps(ncols_dst, table_id), 1);
static std::pair<dim3, dim3> calc_launch_params(const int ncols_y, const int nrows_x, const int warp_size, const mmvq_parameter_table_id table_id) {
const int64_t nblocks = (nrows_x + calc_rows_per_block(ncols_y, table_id) - 1) / calc_rows_per_block(ncols_y, table_id);
const dim3 block_nums(nblocks, 1, 1);
const dim3 block_dims(warp_size, calc_nwarps(ncols_y, table_id), 1);
return {block_nums, block_dims};
}
template <ggml_type type>
static void mul_mat_vec_q_switch_ncols_dst(
const void * vx, const void * vy, const int32_t * ids, float * dst,
const int ncols_x, const int nrows_x, const int ncols_dst,
const int stride_row_x, const int stride_col_y, const int stride_col_dst,
const int nchannels_x, const int nchannels_y, const int nchannels_dst,
const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
const int nsamples_x, const int nsamples_dst, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
cudaStream_t stream) {
static void mul_mat_vec_q_cuda(
const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
GGML_ASSERT(ncols_x % ggml_blck_size(type) == 0);
GGML_ASSERT(ncols_dst <= MMVQ_MAX_BATCH_SIZE);
const int channel_ratio = nchannels_dst / nchannels_x;
const int sample_ratio = nsamples_dst / nsamples_x;
GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE);
const int device = ggml_cuda_get_device();
const int warp_size = ggml_cuda_info().devices[device].warp_size;
const mmvq_parameter_table_id table_id = get_device_table_id(ggml_cuda_info().devices[device].cc);
GGML_ASSERT(!ids || ncols_dst == 1);
switch (ncols_dst) {
switch (ncols_y) {
case 1:
{
constexpr int c_ncols_dst = 1;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
(vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
constexpr int c_ncols_y = 1;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
break;
}
case 2:
{
constexpr int c_ncols_dst = 2;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
(vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
constexpr int c_ncols_y = 2;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
break;
}
case 3:
{
constexpr int c_ncols_dst = 3;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
(vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
constexpr int c_ncols_y = 3;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
break;
}
case 4:
{
constexpr int c_ncols_dst = 4;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
(vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
constexpr int c_ncols_y = 4;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
break;
}
case 5:
{
constexpr int c_ncols_dst = 5;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
(vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
constexpr int c_ncols_y = 5;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
break;
}
case 6:
{
constexpr int c_ncols_dst = 6;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
(vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
constexpr int c_ncols_y = 6;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
break;
}
case 7:
{
constexpr int c_ncols_dst = 7;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
(vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
constexpr int c_ncols_y = 7;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
break;
}
case 8:
{
constexpr int c_ncols_dst = 8;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
(vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
constexpr int c_ncols_y = 8;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id);
mul_mat_vec_q<type, c_ncols_y><<<dims.first, dims.second, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
break;
}
default:
@@ -340,213 +287,137 @@ static void mul_mat_vec_q_switch_ncols_dst(
}
}
static void mul_mat_vec_q_switch_type(
const void * vx, const ggml_type type_x, const void * vy, const int32_t * ids, float * dst,
const int ncols_x, const int nrows_x, const int ncols_dst,
const int stride_row_x, const int stride_col_y, const int stride_col_dst,
const int nchannels_x, const int nchannels_y, const int nchannels_dst,
const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
const int nsamples_x, const int nsamples_dst, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
cudaStream_t stream) {
switch (type_x) {
case GGML_TYPE_Q4_0:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q4_0>
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_Q4_1:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q4_1>
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_Q5_0:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q5_0>
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_Q5_1:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q5_1>
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_Q8_0:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q8_0>
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_Q2_K:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q2_K>
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_Q3_K:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q3_K>
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_Q4_K:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q4_K>
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_Q5_K:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q5_K>
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_Q6_K:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q6_K>
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_IQ2_XXS:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ2_XXS>
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_IQ2_XS:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ2_XS>
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_IQ2_S:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ2_S>
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_IQ3_XXS:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ3_XXS>
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_IQ1_S:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ1_S>
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_IQ1_M:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ1_M>
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_IQ4_NL:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ4_NL>
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_IQ4_XS:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ4_XS>
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_IQ3_S:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ3_S>
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
default:
GGML_ABORT("fatal error");
break;
}
static void mul_mat_vec_q4_0_q8_1_cuda(
const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<GGML_TYPE_Q4_0>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
void ggml_cuda_mul_mat_vec_q(
ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
GGML_ASSERT( src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
GGML_ASSERT(!ids || ids->type == GGML_TYPE_I32); // Optional, used for batched GGML_MUL_MAT_ID.
static void mul_mat_vec_q4_1_q8_1_cuda(
const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
GGML_TENSOR_BINARY_OP_LOCALS;
mul_mat_vec_q_cuda<GGML_TYPE_Q4_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
cudaStream_t stream = ctx.stream();
static void mul_mat_vec_q5_0_q8_1_cuda(
const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
const size_t ts_src0 = ggml_type_size(src0->type);
const size_t ts_src1 = ggml_type_size(src1->type);
const size_t ts_dst = ggml_type_size(dst->type);
mul_mat_vec_q_cuda<GGML_TYPE_Q5_0>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
GGML_ASSERT( nb00 == ts_src0);
GGML_ASSERT( nb10 == ts_src1);
GGML_ASSERT( nb0 == ts_dst);
GGML_ASSERT(!ids || ids->nb[0] == ggml_type_size(ids->type));
static void mul_mat_vec_q5_1_q8_1_cuda(
const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
GGML_ASSERT(!ids || ne12 == 1); // Implementation is only correct for batch size 1.
mul_mat_vec_q_cuda<GGML_TYPE_Q5_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
const float * src1_d = (const float *) src1->data;
const int32_t * ids_d = ids ? (const int32_t *) ids->data : nullptr;
float * dst_d = (float *) dst->data;
static void mul_mat_vec_q8_0_q8_1_cuda(
const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
const int64_t ne10_padded = GGML_PAD(ne10, MATRIX_ROW_PADDING);
ggml_cuda_pool_alloc<char> src1_q8_1(ctx.pool(), ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1);
{
const int64_t s11 = src1->nb[1] / ts_src1;
const int64_t s12 = src1->nb[2] / ts_src1;
const int64_t s13 = src1->nb[3] / ts_src1;
quantize_row_q8_1_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type, ne10, s11, s12, s13, ne10_padded, ne11, ne12, ne13, stream);
}
mul_mat_vec_q_cuda<GGML_TYPE_Q8_0>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
const int64_t s01 = src0->nb[1] / ts_src0;
const int64_t s11 = ne10_padded / QK8_1;
const int64_t s1 = dst->nb[1] / ts_dst;
const int64_t s02 = src0->nb[2] / ts_src0;
const int64_t s2 = dst->nb[2] / ts_dst;
const int64_t s03 = src0->nb[3] / ts_src0;
const int64_t s3 = dst->nb[3] / ts_dst;
static void mul_mat_vec_q2_K_q8_1_cuda(
const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
const int64_t s12 = ne11*s11;
const int64_t s13 = ne12*s12;
mul_mat_vec_q_cuda<GGML_TYPE_Q2_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
// For MUL_MAT_ID the memory layout is different than for MUL_MAT:
const int64_t ncols_dst = ids ? ne2 : ne1;
const int64_t nchannels_y = ids ? ne11 : ne12;
const int64_t nchannels_dst = ids ? ne1 : ne2;
const int64_t stride_col_dst = ids ? s2 : s1;
const int64_t stride_col_y = ids ? s12 : s11;
const int64_t stride_channel_dst = ids ? s1 : s2;
const int64_t stride_channel_y = ids ? s11 : s12;
static void mul_mat_vec_q3_K_q8_1_cuda(
const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_switch_type(
src0->data, src0->type, src1_q8_1.get(), ids_d, dst_d, ne00,
ne01, ncols_dst, s01, stride_col_y, stride_col_dst,
ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst,
ne03, ne3, s03, s13, s3, stream);
mul_mat_vec_q_cuda<GGML_TYPE_Q3_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
static void mul_mat_vec_q4_K_q8_1_cuda(
const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<GGML_TYPE_Q4_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
static void mul_mat_vec_q5_K_q8_1_cuda(
const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<GGML_TYPE_Q5_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
static void mul_mat_vec_q6_K_q8_1_cuda(
const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<GGML_TYPE_Q6_K>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
static void mul_mat_vec_iq2_xxs_q8_1_cuda(
const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<GGML_TYPE_IQ2_XXS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
static void mul_mat_vec_iq2_xs_q8_1_cuda(
const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<GGML_TYPE_IQ2_XS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
static void mul_mat_vec_iq2_s_q8_1_cuda(
const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<GGML_TYPE_IQ2_S>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
static void mul_mat_vec_iq3_xxs_q8_1_cuda(
const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<GGML_TYPE_IQ3_XXS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
static void mul_mat_vec_iq1_s_q8_1_cuda(
const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<GGML_TYPE_IQ1_S>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
static void mul_mat_vec_iq1_m_q8_1_cuda(
const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<GGML_TYPE_IQ1_M>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
static void mul_mat_vec_iq4_nl_q8_1_cuda(
const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<GGML_TYPE_IQ4_NL>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
static void mul_mat_vec_iq4_xs_q8_1_cuda(
const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<GGML_TYPE_IQ4_XS>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
static void mul_mat_vec_iq3_s_q8_1_cuda(
const void * vx, const void * vy, float * dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
mul_mat_vec_q_cuda<GGML_TYPE_IQ3_S>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
void ggml_cuda_op_mul_mat_vec_q(
@@ -569,12 +440,68 @@ void ggml_cuda_op_mul_mat_vec_q(
// nrows_dst == nrows of the matrix that the kernel writes into
const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
const int stride_row_x = ne00 / ggml_blck_size(src0->type);
const int stride_col_y = src1_padded_row_size / QK8_1;
mul_mat_vec_q_switch_type(
src0_dd_i, src0->type, src1_ddq_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row_x, stride_col_y, nrows_dst,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, stream);
switch (src0->type) {
case GGML_TYPE_Q4_0:
mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
break;
case GGML_TYPE_Q4_1:
mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
break;
case GGML_TYPE_Q5_0:
mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
break;
case GGML_TYPE_Q5_1:
mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
break;
case GGML_TYPE_Q8_0:
mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
break;
case GGML_TYPE_Q2_K:
mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
break;
case GGML_TYPE_Q3_K:
mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
break;
case GGML_TYPE_Q4_K:
mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
break;
case GGML_TYPE_Q5_K:
mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
break;
case GGML_TYPE_Q6_K:
mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
break;
case GGML_TYPE_IQ2_XXS:
mul_mat_vec_iq2_xxs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
break;
case GGML_TYPE_IQ2_XS:
mul_mat_vec_iq2_xs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
break;
case GGML_TYPE_IQ2_S:
mul_mat_vec_iq2_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
break;
case GGML_TYPE_IQ3_XXS:
mul_mat_vec_iq3_xxs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
break;
case GGML_TYPE_IQ1_S:
mul_mat_vec_iq1_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
break;
case GGML_TYPE_IQ1_M:
mul_mat_vec_iq1_m_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
break;
case GGML_TYPE_IQ4_NL:
mul_mat_vec_iq4_nl_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
break;
case GGML_TYPE_IQ4_XS:
mul_mat_vec_iq4_xs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
break;
case GGML_TYPE_IQ3_S:
mul_mat_vec_iq3_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
break;
default:
GGML_ABORT("fatal error");
break;
}
GGML_UNUSED(src1);
GGML_UNUSED(dst);
-3
View File
@@ -2,9 +2,6 @@
#define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels.
void ggml_cuda_mul_mat_vec_q(ggml_backend_cuda_context & ctx,
const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst);
void ggml_cuda_op_mul_mat_vec_q(
ggml_backend_cuda_context & ctx,
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+38 -57
View File
@@ -1,40 +1,30 @@
#include "quantize.cuh"
#include <cstdint>
static __global__ void quantize_q8_1(
const float * __restrict__ x, void * __restrict__ vy,
const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
const int64_t ne0, const int ne1, const int ne2) {
const int64_t i0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx0_padded) {
const int64_t ix0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
if (i0 >= ne0) {
if (ix0 >= kx0_padded) {
return;
}
const int64_t i1 = blockIdx.y;
const int64_t i2 = blockIdx.z % ne2;
const int64_t i3 = blockIdx.z / ne2;
const int64_t ix1 = blockIdx.y;
const int64_t & i00 = i0;
const int64_t & i01 = i1;
const int64_t & i02 = i2;
const int64_t & i03 = i3;
const int64_t i_cont = ((i3*ne2 + i2) * ne1 + i1) * ne0 + i0;
const int64_t i_padded = ix1*kx0_padded + ix0;
block_q8_1 * y = (block_q8_1 *) vy;
const int64_t ib = i_cont / QK8_1; // block index
const int64_t iqs = i_cont % QK8_1; // quant index
const int64_t ib = i_padded / QK8_1; // block index
const int64_t iqs = i_padded % QK8_1; // quant index
const float xi = i0 < ne00 ? x[i03*s03 + i02*s02 + i01*s01 + i00] : 0.0f;
const float xi = ix0 < kx ? x[ix1*kx + ix0] : 0.0f;
float amax = fabsf(xi);
float sum = xi;
amax = warp_reduce_max(amax);
sum = warp_reduce_sum(sum);
sum = warp_reduce_sum(sum);
const float d = amax / 127;
const float d = amax / 127;
const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
y[ib].qs[iqs] = q;
@@ -49,38 +39,29 @@ static __global__ void quantize_q8_1(
template <mmq_q8_1_ds_layout ds_layout>
static __global__ void quantize_mmq_q8_1(
const float * __restrict__ x, const int32_t * __restrict__ ids, void * __restrict__ vy,
const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
const int64_t ne0, const int ne1, const int ne2) {
const float * __restrict__ x, void * __restrict__ vy, const int64_t kx0, const int64_t kx1, const int64_t kx0_padded) {
constexpr int vals_per_scale = ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6 ? 64 : 32;
constexpr int vals_per_sum = ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6 ? 16 : 32;
const int64_t i0 = ((int64_t)blockDim.x*blockIdx.x + threadIdx.x)*4;
const int64_t ix0 = ((int64_t)blockDim.x*blockIdx.x + threadIdx.x)*4;
if (i0 >= ne0) {
if (ix0 >= kx0_padded) {
return;
}
const int64_t i1 = blockIdx.y;
const int64_t i2 = blockIdx.z % ne2;
const int64_t i3 = blockIdx.z / ne2;
const int64_t i00 = i0;
const int64_t i01 = ids ? ids[i1] : i1;
const int64_t i02 = i2;
const int64_t i03 = i3;
const float4 * x4 = (const float4 *) x;
const int64_t ix1 = kx1*blockIdx.z + blockIdx.y;
block_q8_1_mmq * y = (block_q8_1_mmq *) vy;
const int64_t ib0 = blockIdx.z*((int64_t)gridDim.y*gridDim.x*blockDim.x/QK8_1); // first block of channel
const int64_t ib = ib0 + (i0 / (4*QK8_1))*ne1 + blockIdx.y; // block index in channel
const int64_t iqs = i0 % (4*QK8_1); // quant index in block
const int64_t ib = ib0 + (ix0 / (4*QK8_1))*kx1 + blockIdx.y; // block index in channel
const int64_t iqs = ix0 % (4*QK8_1); // quant index in block
// Load 4 floats per thread and calculate max. abs. value between them:
const float4 xi = i0 < ne00 ? x4[(i03*s03 + i02*s02 + i01*s01 + i00)/4] : make_float4(0.0f, 0.0f, 0.0f, 0.0f);
const float4 xi = ix0 < kx0 ? x4[(ix1*kx0 + ix0)/4] : make_float4(0.0f, 0.0f, 0.0f, 0.0f);
float amax = fabsf(xi.x);
amax = fmaxf(amax, fabsf(xi.y));
amax = fmaxf(amax, fabsf(xi.z));
@@ -96,7 +77,7 @@ static __global__ void quantize_mmq_q8_1(
if (ds_layout != MMQ_Q8_1_DS_LAYOUT_D4) {
sum = xi.x + xi.y + xi.z + xi.w;
// Calculate sums across vals_per_sum/4 threads.
// Exchange calculate sum across vals_per_sum/4 threads.
#pragma unroll
for (int offset = vals_per_sum/8; offset > 0; offset >>= 1) {
sum += __shfl_xor_sync(0xFFFFFFFF, sum, offset, WARP_SIZE);
@@ -146,40 +127,40 @@ static __global__ void quantize_mmq_q8_1(
}
void quantize_row_q8_1_cuda(
const float * x, const int32_t * ids, void * vy, const ggml_type type_src0,
const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) {
GGML_ASSERT(!ids);
GGML_ASSERT(ne0 % QK8_1 == 0);
const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels,
const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream) {
const int64_t block_num_x = (ne0 + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
const dim3 num_blocks(block_num_x, ne1, ne2*ne3);
GGML_ASSERT(kx0_padded % QK8_1 == 0);
const int64_t block_num_x = (kx0_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
const dim3 num_blocks(block_num_x, kx1*channels, 1);
const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, ne00, s01, s02, s03, ne0, ne1, ne2);
GGML_UNUSED(type_src0);
quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx0_padded);
GGML_UNUSED(type_x);
}
void quantize_mmq_q8_1_cuda(
const float * x, const int32_t * ids, void * vy, const ggml_type type_src0,
const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) {
GGML_ASSERT(ne0 % (4*QK8_1) == 0);
const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels,
const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream) {
const int64_t block_num_x = (ne0 + 4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ - 1) / (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ);
const dim3 num_blocks(block_num_x, ne1, ne2*ne3);
GGML_ASSERT(kx0_padded % (4*QK8_1) == 0);
const int64_t block_num_x = (kx0_padded + 4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ - 1) / (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ);
const dim3 num_blocks(block_num_x, kx1, channels);
const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE_MMQ, 1, 1);
switch (mmq_get_q8_1_ds_layout(type_src0)) {
switch (mmq_get_q8_1_ds_layout(type_x)) {
case MMQ_Q8_1_DS_LAYOUT_D4:
quantize_mmq_q8_1<MMQ_Q8_1_DS_LAYOUT_D4>
<<<num_blocks, block_size, 0, stream>>>(x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2);
<<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
break;
case MMQ_Q8_1_DS_LAYOUT_DS4:
quantize_mmq_q8_1<MMQ_Q8_1_DS_LAYOUT_DS4>
<<<num_blocks, block_size, 0, stream>>>(x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2);
<<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
break;
case MMQ_Q8_1_DS_LAYOUT_D2S6:
quantize_mmq_q8_1<MMQ_Q8_1_DS_LAYOUT_D2S6>
<<<num_blocks, block_size, 0, stream>>>(x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2);
<<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
break;
default:
GGML_ABORT("fatal error");
+6 -9
View File
@@ -12,16 +12,13 @@ static_assert(MATRIX_ROW_PADDING % CUDA_QUANTIZE_BLOCK_SIZE == 0, "Risk
static_assert(MATRIX_ROW_PADDING % (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ) == 0, "Risk of out-of-bounds access.");
typedef void (*quantize_cuda_t)(
const float * x, const int32_t * ids, void * vy,
ggml_type type_src0, int64_t ne00, int64_t s01, int64_t s02, int64_t s03,
int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, cudaStream_t stream);
const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
const ggml_type type_x, cudaStream_t stream);
void quantize_row_q8_1_cuda(
const float * x, const int32_t * ids, void * vy,
ggml_type type_src0, int64_t ne00, int64_t s01, int64_t s02, int64_t s03,
int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, cudaStream_t stream);
const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
const ggml_type type_x, cudaStream_t stream);
void quantize_mmq_q8_1_cuda(
const float * x, const int32_t * ids, void * vy,
ggml_type type_src0, int64_t ne00, int64_t s01, int64_t s02, int64_t s03,
int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, cudaStream_t stream);
const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
const ggml_type type_x, cudaStream_t stream);
-2
View File
@@ -1,5 +1,3 @@
#pragma once
#include "common.cuh"
#include <cstdint>
+3 -3
View File
@@ -5690,7 +5690,7 @@ kernel void kernel_flash_attn_ext(
{
float S[Q] = { [0 ... Q-1] = 0.0f };
float M[Q] = { [0 ... Q-1] = -__FLT_MAX__/2 };
float M[Q] = { [0 ... Q-1] = -__FLT16_MAX__/2 };
// thread indices inside the simdgroup
// TODO: see if we can utilize quad-group functions for better performance
@@ -5950,7 +5950,7 @@ kernel void kernel_flash_attn_ext(
// reduce the warps sequentially
for (ushort sg = 1; sg < nsg; ++sg) {
float S = { 0.0f };
float M = { -__FLT_MAX__/2 };
float M = { -__FLT16_MAX__/2 };
threadgroup_barrier(mem_flags::mem_threadgroup);
@@ -6197,7 +6197,7 @@ kernel void kernel_flash_attn_ext_vec(
{
float S = 0.0f;
float M = -__FLT_MAX__/2;
float M = -__FLT16_MAX__/2;
// thread indices inside the simdgroup
const short tx = tiisg%NL;
+40 -366
View File
@@ -44,8 +44,8 @@ static struct ggml_backend_device g_ggml_backend_metal_device;
// note: assumes single GPU device - the default one
// TODO: support multiple GPU devices
static struct ggml_backend_metal_device_context {
id<MTLDevice> mtl_device;
int mtl_device_ref_count;
id<MTLDevice> mtl_device;
int mtl_device_ref_count;
id<MTLLibrary> mtl_library;
bool has_simdgroup_reduction;
@@ -491,259 +491,7 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_COUNT
};
//
// ggml_metal_heap
//
struct ggml_metal_heap {
// number of times the heap was unused
int n_unused;
// total number of buffer allocations in this heap across all computes
int64_t n_alloc;
// current offset in the heap - we reset this after each node in order to reuse the memory
size_t offs;
// the currently allocated MTLBuffer objects in this heap
id<MTLHeap> obj;
NSMutableArray * bufs;
};
static struct ggml_metal_heap * ggml_metal_heap_init(id<MTLDevice> device, size_t size) {
struct ggml_metal_heap * heap = calloc(1, sizeof(struct ggml_metal_heap));
MTLHeapDescriptor * desc = [[MTLHeapDescriptor alloc] init];
desc.storageMode = MTLStorageModePrivate;
desc.cpuCacheMode = MTLCPUCacheModeDefaultCache;
desc.type = MTLHeapTypePlacement;
desc.size = size;
heap->n_unused = 0;
heap->n_alloc = 0;
heap->obj = [device newHeapWithDescriptor:desc];
if (!heap->obj) {
GGML_LOG_ERROR("%s: error: failed to create MTLHeap with size %zu\n", __func__, size);
free(heap);
return false;
}
[desc release];
heap->bufs = [[NSMutableArray alloc] init];
return heap;
}
static void ggml_metal_heap_reset(struct ggml_metal_heap * heap) {
heap->offs = 0;
// count how many graph computes the heap ended up being unused
if ([heap->bufs count] > 0) {
heap->n_unused = 0;
} else {
heap->n_unused++;
}
for (id<MTLBuffer> buf in heap->bufs) {
[buf release];
}
[heap->bufs removeAllObjects];
// tell the OS that it can reuse this memory if needed
// ref: https://developer.apple.com/documentation/metal/mtlpurgeablestate?language=objc
[heap->obj setPurgeableState:MTLPurgeableStateVolatile];
}
static void ggml_metal_heap_free(struct ggml_metal_heap * heap) {
if (heap == nil) {
return;
}
ggml_metal_heap_reset(heap);
[heap->obj release];
[heap->bufs release];
free(heap);
}
@interface ggml_metal_heap_ptr : NSObject
@property (nonatomic, assign) struct ggml_metal_heap * data;
@end
@implementation ggml_metal_heap_ptr
@end
//
// ggml_metal_mem_pool
//
struct ggml_metal_mem_pool {
id<MTLDevice> device;
int n_heaps; // total number of heaps ever created (including those that were removed)
NSMutableArray * heaps;
NSMutableArray * heaps_to_remove;
};
static struct ggml_metal_mem_pool * ggml_metal_mem_pool_init(void) {
struct ggml_metal_mem_pool * mem_pool = calloc(1, sizeof(struct ggml_metal_mem_pool));
mem_pool->n_heaps = 0;
mem_pool->heaps = [[NSMutableArray alloc] init];
mem_pool->heaps_to_remove = [[NSMutableArray alloc] init];
return mem_pool;
}
static void ggml_metal_mem_pool_free(struct ggml_metal_mem_pool * mem_pool) {
GGML_LOG_DEBUG("%s: freeing memory pool, num heaps = %zu (total = %d)\n", __func__, [mem_pool->heaps count], mem_pool->n_heaps);
size_t size_all = 0;
size_t size_cur = 0;
for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) {
GGML_LOG_DEBUG("%s: heap: %p\n", __func__, (void *) ptr.data);
GGML_LOG_DEBUG("%s: n_alloc: %" PRId64 "\n", __func__, ptr.data->n_alloc);
GGML_LOG_DEBUG("%s: n_unused: %d\n", __func__, ptr.data->n_unused);
GGML_LOG_DEBUG("%s: size: %.2f MiB\n", __func__, [ptr.data->obj size] / 1024.0 / 1024.0);
GGML_LOG_DEBUG("%s: bufs: %zu\n", __func__, [ptr.data->bufs count]);
if ([ptr.data->bufs count] > 0) {
size_cur += [ptr.data->obj size];
}
size_all += [ptr.data->obj size];
ggml_metal_heap_free(ptr.data);
[ptr release];
}
[mem_pool->heaps release];
[mem_pool->heaps_to_remove release];
if (size_all > 0) {
GGML_LOG_DEBUG("%s: size_all: %.2f MiB\n", __func__, size_all / 1024.0 / 1024.0);
GGML_LOG_DEBUG("%s: size_cur: %.2f MiB\n", __func__, size_cur / 1024.0 / 1024.0);
}
free(mem_pool);
}
static void ggml_metal_mem_pool_reset(struct ggml_metal_mem_pool * mem_pool) {
for (NSUInteger i = 0; i < [mem_pool->heaps count]; i++) {
ggml_metal_heap_ptr * ptr = [mem_pool->heaps objectAtIndex:i];
struct ggml_metal_heap * heap = ptr.data;
ggml_metal_heap_reset(heap);
// if the heap hasn't been used for a while, remove it
if (heap->n_unused >= 128) {
[mem_pool->heaps_to_remove addObject:@(i)];
}
}
if (mem_pool->heaps_to_remove.count > 0) {
for (NSUInteger i = 0; i < [mem_pool->heaps_to_remove count]; i++) {
NSUInteger index = [[mem_pool->heaps_to_remove objectAtIndex:i] intValue];
ggml_metal_heap_ptr * ptr = [mem_pool->heaps objectAtIndex:index];
struct ggml_metal_heap * heap = ptr.data;
ggml_metal_heap_free(heap);
[mem_pool->heaps removeObjectAtIndex:index];
[ptr release];
}
[mem_pool->heaps_to_remove removeAllObjects];
}
}
static void ggml_metal_mem_pool_clear(struct ggml_metal_mem_pool * mem_pool) {
for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) {
ptr.data->offs = 0;
}
}
static id<MTLBuffer> ggml_metal_mem_pool_alloc(struct ggml_metal_mem_pool * mem_pool, size_t size) {
const size_t alignment = 32;
const size_t size_aligned = GGML_PAD(size, alignment);
// try one of the existing heaps
for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) {
struct ggml_metal_heap * heap = ptr.data;
if (heap->offs + size_aligned <= [heap->obj size]) {
// if this is the first buffer in the heap for the current command buffer, tell the OS that
// it cannot free the memory used by the heap
// ref: https://developer.apple.com/documentation/metal/mtlpurgeablestate?language=objc
if ([heap->bufs count] == 0) {
[heap->obj setPurgeableState:MTLPurgeableStateNonVolatile];
}
id<MTLBuffer> buf = [heap->obj newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate offset:heap->offs];
if (buf == nil) {
GGML_LOG_ERROR("%s: error: failed to create MTLBuffer with size %zu\n", __func__, size_aligned);
return nil;
}
heap->n_alloc++;
heap->offs += size_aligned;
[heap->bufs addObject:buf];
return buf;
}
}
// create a new heap that can fit this buffer
ggml_metal_heap_ptr * heap_ptr = [ggml_metal_heap_ptr new];
struct ggml_metal_heap * heap = ggml_metal_heap_init(mem_pool->device, size_aligned);
if (heap == NULL) {
GGML_LOG_ERROR("%s: error: failed to create heap of size %zu\n", __func__, size_aligned);
return NULL;
}
//GGML_LOG_DEBUG("%s: creating new heap of size %zu, got %zu\n", __func__, size_aligned, [heap->obj size]);
heap_ptr.data = heap;
ggml_metal_heap_reset(heap);
[heap->obj setPurgeableState:MTLPurgeableStateNonVolatile];
id<MTLBuffer> buf = [heap->obj newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate offset:heap->offs];
if (buf == nil) {
GGML_LOG_ERROR("%s: error: failed to create MTLBuffer with size %zu\n", __func__, size_aligned);
return NULL;
}
heap->n_alloc++;
heap->offs += size_aligned;
[heap->bufs addObject:buf];
[mem_pool->heaps addObject:heap_ptr];
mem_pool->n_heaps++;
return buf;
}
struct ggml_metal_command_buffer {
id<MTLCommandBuffer> obj;
// each command buffer has a memory pool from which it can allocate temporary buffers during the compute
struct ggml_metal_mem_pool * mem_pool;
};
struct ggml_backend_metal_context {
id<MTLDevice> device;
id<MTLCommandQueue> queue;
dispatch_queue_t d_queue;
@@ -768,7 +516,7 @@ struct ggml_backend_metal_context {
void (^encode_async)(size_t ith);
// n_cb command buffers + 1 used by the main thread
struct ggml_metal_command_buffer cmd_bufs[GGML_METAL_MAX_COMMAND_BUFFERS + 1];
id<MTLCommandBuffer> command_buffers[GGML_METAL_MAX_COMMAND_BUFFERS + 1];
// abort ggml_metal_graph_compute if callback returns true
ggml_abort_callback abort_callback;
@@ -958,11 +706,9 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
struct ggml_backend_metal_device_context * ctx_dev = dev->context;
id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
ctx->device = device;
ctx->queue = [device newCommandQueue];
ctx->queue = [device newCommandQueue];
if (ctx->queue == nil) {
GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
return NULL;
@@ -1023,10 +769,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
ctx->gf = nil;
ctx->encode_async = nil;
for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) {
ctx->cmd_bufs[i].obj = nil;
ctx->cmd_bufs[i].mem_pool = ggml_metal_mem_pool_init();
ctx->cmd_bufs[i].mem_pool->device = device;
ctx->command_buffers[i] = nil;
}
#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
@@ -1440,12 +1183,6 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
[ctx->queue release];
for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) {
// ctx->cmd_bufs[i].obj is auto released
ggml_metal_mem_pool_free(ctx->cmd_bufs[i].mem_pool);
}
dispatch_release(ctx->d_queue);
free(ctx);
@@ -1752,11 +1489,10 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
}
}
static bool ggml_metal_encode_node(
static void ggml_metal_encode_node(
ggml_backend_t backend,
int idx,
id<MTLComputeCommandEncoder> encoder,
struct ggml_metal_mem_pool * mem_pool) {
id<MTLComputeCommandEncoder> encoder) {
struct ggml_backend_metal_context * ctx = backend->context;
struct ggml_backend_metal_device_context * ctx_dev = backend->device->context;
@@ -1772,7 +1508,7 @@ static bool ggml_metal_encode_node(
struct ggml_tensor * dst = node;
if (ggml_is_empty(dst)) {
return true;
return;
}
switch (dst->op) {
@@ -1783,7 +1519,7 @@ static bool ggml_metal_encode_node(
case GGML_OP_PERMUTE:
{
// noop -> next node
} return true;
} return;
default:
{
} break;
@@ -1794,8 +1530,6 @@ static bool ggml_metal_encode_node(
GGML_ABORT("unsupported op");
}
ggml_metal_mem_pool_clear(mem_pool);
const int64_t ne00 = src0 ? src0->ne[0] : 0;
const int64_t ne01 = src0 ? src0->ne[1] : 0;
const int64_t ne02 = src0 ? src0->ne[2] : 0;
@@ -2442,76 +2176,26 @@ static bool ggml_metal_encode_node(
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
// use this branch to test the ggml_metal_mem_pool functionality
#if 0
// cpy to tmp buffer in MTLHeap
id<MTLBuffer> h_src0 = h_src0 = ggml_metal_mem_pool_alloc(mem_pool, ggml_nbytes(src0));
if (!h_src0) {
GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, ggml_nbytes(src0));
return false;
}
offs_src0 = 0;
ggml_metal_kargs_cpy args_cpy = {
ggml_metal_kargs_soft_max args = {
/*.ne00 =*/ ne00,
/*.ne01 =*/ ne01,
/*.ne02 =*/ ne02,
/*.ne03 =*/ ne03,
/*.nb00 =*/ nb00,
/*.nb01 =*/ nb01,
/*.nb02 =*/ nb02,
/*.nb03 =*/ nb03,
/*.ne0 =*/ ne00,
/*.ne1 =*/ ne01,
/*.ne2 =*/ ne02,
/*.ne3 =*/ ne03,
/*.nb0 =*/ nb00,
/*.nb1 =*/ nb01,
/*.nb2 =*/ nb02,
/*.nb3 =*/ nb03,
};
if (src0->type == GGML_TYPE_F16) {
[encoder setComputePipelineState:ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline];
} else {
[encoder setComputePipelineState:ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline];
}
[encoder setBytes:&args_cpy length:sizeof(args_cpy) atIndex:0];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
[encoder setBuffer:h_src0 offset:0 atIndex:2];
GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0);
int nth_cpy = MIN(1024, ne00 / ggml_blck_size(src0->type));
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth_cpy, 1, 1)];
#else
id<MTLBuffer> h_src0 = id_src0;
#endif
// softmax
ggml_metal_kargs_soft_max args = {
/*.ne00 =*/ ne00,
/*.ne01 =*/ ne01,
/*.ne02 =*/ ne02,
/*.scale =*/ scale,
/*.max_bias =*/ max_bias,
/*.m0 =*/ m0,
/*.m1 =*/ m1,
/*.scale =*/ scale,
/*.max_bias =*/ max_bias,
/*.m0 =*/ m0,
/*.m1 =*/ m1,
/*.n_head_log2 =*/ n_head_log2,
};
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:h_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
if (id_src1) {
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
} else {
[encoder setBuffer:h_src0 offset:offs_src0 atIndex:1];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
}
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
[encoder setBytes:&args length:sizeof(args) atIndex:3];
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
[encoder setBytes:&args length:sizeof(args) atIndex:3];
[encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
@@ -4950,8 +4634,6 @@ static bool ggml_metal_encode_node(
GGML_ABORT("fatal error");
}
}
return true;
}
static enum ggml_status ggml_metal_graph_compute(
@@ -5005,25 +4687,25 @@ static enum ggml_status ggml_metal_graph_compute(
}
// the main thread commits the first few commands immediately
// cmd_buf[n_cb]
// command_buffer[n_cb]
{
id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
ctx->cmd_bufs[n_cb].obj = cmd_buf;
id<MTLCommandBuffer> command_buffer = [ctx->queue commandBufferWithUnretainedReferences];
ctx->command_buffers[n_cb] = command_buffer;
[cmd_buf enqueue];
[command_buffer enqueue];
ctx->encode_async(n_cb);
}
// prepare the rest of the command buffers asynchronously
// cmd_buf[0.. n_cb)
// command_buffer[0.. n_cb)
for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
ctx->cmd_bufs[cb_idx].obj = cmd_buf;
id<MTLCommandBuffer> command_buffer = [ctx->queue commandBufferWithUnretainedReferences];
ctx->command_buffers[cb_idx] = command_buffer;
// always enqueue the first two command buffers
// enqueue all of the command buffers if we don't need to abort
if (cb_idx < 2 || ctx->abort_callback == NULL) {
[cmd_buf enqueue];
[command_buffer enqueue];
}
}
@@ -5032,14 +4714,14 @@ static enum ggml_status ggml_metal_graph_compute(
// wait for completion and check status of each command buffer
// needed to detect if the device ran out-of-memory for example (#1881)
{
id<MTLCommandBuffer> cmd_buf = ctx->cmd_bufs[n_cb].obj;
[cmd_buf waitUntilCompleted];
id<MTLCommandBuffer> command_buffer = ctx->command_buffers[n_cb];
[command_buffer waitUntilCompleted];
MTLCommandBufferStatus status = [cmd_buf status];
MTLCommandBufferStatus status = [command_buffer status];
if (status != MTLCommandBufferStatusCompleted) {
GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status);
if (status == MTLCommandBufferStatusError) {
GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]);
GGML_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]);
}
return GGML_STATUS_FAILED;
@@ -5047,20 +4729,20 @@ static enum ggml_status ggml_metal_graph_compute(
}
for (int i = 0; i < n_cb; ++i) {
id<MTLCommandBuffer> cmd_buf = ctx->cmd_bufs[i].obj;
[cmd_buf waitUntilCompleted];
id<MTLCommandBuffer> command_buffer = ctx->command_buffers[i];
[command_buffer waitUntilCompleted];
MTLCommandBufferStatus status = [cmd_buf status];
MTLCommandBufferStatus status = [command_buffer status];
if (status != MTLCommandBufferStatusCompleted) {
GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
if (status == MTLCommandBufferStatusError) {
GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]);
GGML_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]);
}
return GGML_STATUS_FAILED;
}
id<MTLCommandBuffer> next_buffer = (i + 1 < n_cb ? ctx->cmd_bufs[i + 1].obj : nil);
id<MTLCommandBuffer> next_buffer = (i + 1 < n_cb ? ctx->command_buffers[i + 1] : nil);
if (!next_buffer) {
continue;
}
@@ -5444,9 +5126,8 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
const int n_nodes_per_cb = ctx->n_nodes_per_cb;
id<MTLCommandBuffer> cmd_buf = ctx->cmd_bufs[cb_idx].obj;
id<MTLComputeCommandEncoder> encoder = [cmd_buf computeCommandEncoder];
id<MTLCommandBuffer> command_buffer = ctx->command_buffers[cb_idx];
id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoder];
int node_start = 0;
int node_end = n_nodes_0;
@@ -5458,29 +5139,22 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
const bool should_capture = ctx->capture_next_compute;
struct ggml_metal_mem_pool * mem_pool = ctx->cmd_bufs[cb_idx].mem_pool;
ggml_metal_mem_pool_reset(mem_pool);
for (int idx = node_start; idx < node_end; ++idx) {
if (should_capture) {
[encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]];
}
const bool res = ggml_metal_encode_node(backend, idx, encoder, mem_pool);
ggml_metal_encode_node(backend, idx, encoder);
if (should_capture) {
[encoder popDebugGroup];
}
if (!res) {
break;
}
}
[encoder endEncoding];
if (cb_idx < 2 || ctx->abort_callback == NULL) {
[cmd_buf commit];
[command_buffer commit];
}
});
}
+3 -3
View File
@@ -3237,7 +3237,7 @@ kernel void kernel_flash_attn_ext(
{
float S[Q] = { [0 ... Q-1] = 0.0f };
float M[Q] = { [0 ... Q-1] = -__FLT_MAX__/2 };
float M[Q] = { [0 ... Q-1] = -__FLT16_MAX__/2 };
// thread indices inside the simdgroup
// TODO: see if we can utilize quad-group functions for better performance
@@ -3497,7 +3497,7 @@ kernel void kernel_flash_attn_ext(
// reduce the warps sequentially
for (ushort sg = 1; sg < nsg; ++sg) {
float S = { 0.0f };
float M = { -__FLT_MAX__/2 };
float M = { -__FLT16_MAX__/2 };
threadgroup_barrier(mem_flags::mem_threadgroup);
@@ -3744,7 +3744,7 @@ kernel void kernel_flash_attn_ext_vec(
{
float S = 0.0f;
float M = -__FLT_MAX__/2;
float M = -__FLT16_MAX__/2;
// thread indices inside the simdgroup
const short tx = tiisg%NL;
+48 -56
View File
@@ -4,7 +4,6 @@
#include "ggml-backend.h"
#include "ggml-impl.h"
#include "ggml-threading.h"
#include "ggml-cpu.h"
#include "ggml.h"
// FIXME: required here for quantization functions
@@ -383,16 +382,58 @@ void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) {
}
}
// FIXME: these functions must detect the instruction set at runtime, since they are part of the core ggml library
// currently, the ggml_cpu_has_* functions are entirely compile-time
void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
int i = 0;
for (; i < n; ++i) {
int64_t i = 0;
#if defined(__F16C__)
//if (ggml_cpu_has_f16c()) {
for (; i + 7 < n; i += 8) {
__m256 x_vec = _mm256_loadu_ps(x + i);
__m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
_mm_storeu_si128((__m128i *)(y + i), y_vec);
}
for(; i + 3 < n; i += 4) {
__m128 x_vec = _mm_loadu_ps(x + i);
__m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
_mm_storel_epi64((__m128i *)(y + i), y_vec);
}
//}
#endif
for (; i < n; i++) {
y[i] = GGML_FP32_TO_FP16(x[i]);
}
}
void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
int i = 0;
for (; i < n; ++i) {
int64_t i = 0;
#if defined(__AVX512F__)
//if (ggml_cpu_has_avx512()) {
for (; i + 16 <= n; i += 16) {
_mm512_storeu_ps(y + i,
_mm512_castsi512_ps(
_mm512_slli_epi32(
_mm512_cvtepu16_epi32(
_mm256_loadu_si256(
(const __m256i *)(x + i))),
16)));
}
//}
#endif
#if defined(__AVX2__)
//if (ggml_cpu_has_avx2()) {
for (; i + 8 <= n; i += 8) {
_mm256_storeu_ps(y + i,
_mm256_castsi256_ps(
_mm256_slli_epi32(
_mm256_cvtepu16_epi32(
_mm_loadu_si128(
(const __m128i *)(x + i))),
16)));
}
//}
#endif
for (; i < n; i++) {
y[i] = GGML_BF16_TO_FP32(x[i]);
}
}
@@ -915,7 +956,6 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"CONV_TRANSPOSE_1D",
"IM2COL",
"IM2COL_BACK",
"CONV_2D_DW",
"CONV_TRANSPOSE_2D",
"POOL_1D",
"POOL_2D",
@@ -954,7 +994,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"OPT_STEP_ADAMW",
};
static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@@ -1011,7 +1051,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"conv_transpose_1d(x)",
"im2col(x)",
"im2col_back(x)",
"conv_2d_dw(x)",
"conv_transpose_2d(x)",
"pool_1d(x)",
"pool_2d(x)",
@@ -1050,7 +1089,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"adamw(x)",
};
static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -1307,13 +1346,6 @@ bool ggml_is_permuted(const struct ggml_tensor * tensor) {
return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3];
}
bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor) {
return
tensor->nb[0] > tensor->nb[2] &&
tensor->nb[1] > tensor->nb[0] &&
tensor->nb[2] == ggml_type_size(tensor->type);
}
static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
@@ -4020,46 +4052,6 @@ struct ggml_tensor * ggml_conv_2d_dw(
return result;
}
// ggml_conv_2d_dw_direct
struct ggml_tensor * ggml_conv_2d_dw_direct(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int stride0,
int stride1,
int pad0,
int pad1,
int dilation0,
int dilation1) {
GGML_ASSERT(a->ne[2] == 1);
GGML_ASSERT(a->ne[3] == b->ne[2]);
int64_t ne[4];
ne[0] = ggml_calc_conv_output_size(b->ne[0], a->ne[0], stride0, pad0, dilation0);
ne[1] = ggml_calc_conv_output_size(b->ne[1], a->ne[1], stride1, pad1, dilation1);
ne[2] = b->ne[2];
ne[3] = b->ne[3];
struct ggml_tensor * result = ggml_new_tensor(ctx, b->type, 4, ne);
if (ggml_is_contiguous_channels(b)) {
// Result will be permuted the same way as input (CWHN order)
const int64_t type_size = ggml_type_size(result->type);
GGML_ASSERT(ggml_blck_size(result->type) == 1);
result->nb[0] = result->ne[2] * type_size;
result->nb[1] = result->ne[0] * result->nb[0];
result->nb[2] = type_size;
}
int32_t params[] = { stride0, stride1, pad0, pad1, dilation0, dilation1 };
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_CONV_2D_DW;
result->src[0] = a;
result->src[1] = b;
return result;
}
// ggml_conv_transpose_2d_p0
static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {
+10 -4
View File
@@ -57,20 +57,26 @@ var OnceLoad = sync.OnceFunc(func() {
exe = "."
}
var value string
// PATH, LD_LIBRARY_PATH, and DYLD_LIBRARY_PATH are often
// set by the parent process, however, use a default value
// if the environment variable is not set.
var name, value string
switch runtime.GOOS {
case "darwin":
// On macOS, DYLD_LIBRARY_PATH is often not set, so
// we use the directory of the executable as the default.
name = "DYLD_LIBRARY_PATH"
value = filepath.Dir(exe)
case "windows":
name = "PATH"
value = filepath.Join(filepath.Dir(exe), "lib", "ollama")
default:
name = "LD_LIBRARY_PATH"
value = filepath.Join(filepath.Dir(exe), "..", "lib", "ollama")
}
// Avoid potentially loading incompatible GGML libraries
paths, ok := os.LookupEnv("OLLAMA_LIBRARY_PATH")
paths, ok := os.LookupEnv(name)
if !ok {
slog.Debug("OLLAMA_LIBRARY_PATH not set, falling back to default", "search", value)
paths = value
}
+15 -7
View File
@@ -34,10 +34,14 @@ import (
_ "github.com/ollama/ollama/model/models"
)
type contextList struct {
list []ml.Context
}
type Sequence struct {
// ctxs are used for allocating tensors that last the lifetime of the sequence, such as
// multimodal embeddings
ctxs []ml.Context
ctxs *contextList
// batch index
iBatch int
@@ -173,10 +177,8 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe
// inputs processes the prompt and images into a list of inputs
// by splitting the prompt on [img-<n>] tags, tokenizing text and
// decoding images
func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, []ml.Context, error) {
func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, *contextList, error) {
var inputs []input.Input
var ctxs []ml.Context
var parts []string
var matches [][]string
@@ -190,6 +192,13 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, [
parts = []string{prompt}
}
var contexts contextList
runtime.AddCleanup(&contexts, func(ctxs []ml.Context) {
for _, ctx := range ctxs {
ctx.Close()
}
}, contexts.list)
postTokenize := false
for i, part := range parts {
// text - tokenize
@@ -219,8 +228,7 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, [
}
ctx := s.model.Backend().NewContext()
runtime.SetFinalizer(ctx, func(c ml.Context) { c.Close() })
ctxs = append(ctxs, ctx)
contexts.list = append(contexts.list, ctx)
imageEmbeddings, err := multimodalProcessor.EncodeMultimodal(ctx, images[imageIndex].Data)
if err != nil {
return nil, nil, err
@@ -243,7 +251,7 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, [
}
}
return inputs, ctxs, nil
return inputs, &contexts, nil
}
type Server struct {
+1 -1
View File
@@ -121,7 +121,7 @@ function buildOllama() {
if ($env:HIP_PATH) {
write-host "Building ROCm backend libraries"
if (-Not (get-command -ErrorAction silent ninja)) {
$NINJA_DIR=(gci -path (Get-CimInstance MSFT_VSInstance -Namespace root/cimv2/vs)[0].InstallLocation -r -fi ninja.exe).Directory.FullName
$NINJA_DIR=(gci -path (Get-CimInstance MSFT_VSInstance -Namespace root/cimv2/vs)[0].InstallLocation -r -fi ninja.exe) | split-path -parent
$env:PATH="$NINJA_DIR;$env:PATH"
}
$env:HIPCXX="${env:HIP_PATH}\bin\clang++.exe"
-5
View File
@@ -106,11 +106,6 @@ func (m *Model) Capabilities() []model.Capability {
capabilities = append(capabilities, model.CapabilityInsert)
}
// Check for vision capability in projector-based models
if len(m.ProjectorPaths) > 0 {
capabilities = append(capabilities, model.CapabilityVision)
}
return capabilities
}
+226
View File
@@ -0,0 +1,226 @@
package server
import (
"fmt"
"regexp"
"strconv"
"strings"
"github.com/ollama/ollama/api"
)
var (
pythonFuncRegex = regexp.MustCompile(`(\w+)\((.*?)\)`)
braces = map[rune]rune{
'[': ']',
'{': '}',
'(': ')',
'"': '"',
'\'': '\'',
}
)
// parsePythonValue converts a Python value string to its appropriate Go type
func parsePythonValue(value string) (any, error) {
value = strings.TrimSpace(value)
// string
if (strings.HasPrefix(value, "\"") && strings.HasSuffix(value, "\"")) ||
(strings.HasPrefix(value, "'") && strings.HasSuffix(value, "'")) {
// Remove quotes
result := value[1 : len(value)-1]
return result, nil
}
// bool
switch strings.ToLower(value) {
case "true":
return true, nil
case "false":
return false, nil
case "none":
return nil, nil
}
// int
if i, err := strconv.Atoi(value); err == nil {
return i, nil
}
// float
if f, err := strconv.ParseFloat(value, 64); err == nil {
return f, nil
}
// list
if strings.HasPrefix(value, "[") && strings.HasSuffix(value, "]") {
listStr := value[1 : len(value)-1]
var list []any
stack := []rune{}
start := 0
for i, char := range listStr {
if len(stack) != 0 && char == braces[stack[len(stack)-1]] {
stack = stack[:len(stack)-1]
} else if _, ok := braces[char]; ok {
stack = append(stack, char)
}
if len(stack) == 0 && (char == ',' || i == len(listStr)-1) {
end := i
if i == len(listStr)-1 {
end = i + 1
}
item := strings.TrimSpace(listStr[start:end])
if val, err := parsePythonValue(item); err == nil {
list = append(list, val)
} else {
return nil, fmt.Errorf("invalid list item: %s", item)
}
start = i + 1
}
}
return list, nil
}
// dictionary
if strings.HasPrefix(value, "{") && strings.HasSuffix(value, "}") && strings.Contains(value, ":") {
dictStr := value[1 : len(value)-1]
dict := make(map[any]any)
stack := []rune{}
start := 0
for i, char := range dictStr {
if len(stack) != 0 && char == braces[stack[len(stack)-1]] {
stack = stack[:len(stack)-1]
} else if _, ok := braces[char]; ok {
stack = append(stack, char)
}
if len(stack) == 0 && (char == ',' || i == len(dictStr)-1) {
end := i
if i == len(dictStr)-1 {
end = i + 1
}
item := strings.TrimSpace(dictStr[start:end])
kv := strings.SplitN(item, ":", 2)
if len(kv) != 2 {
return nil, fmt.Errorf("invalid dictionary key-value pair: %s", item)
}
key, err := parsePythonValue(strings.TrimSpace(kv[0]))
if err != nil {
return nil, fmt.Errorf("invalid dictionary key: %s", kv[0])
}
val, err := parsePythonValue(strings.TrimSpace(kv[1]))
if err != nil {
return nil, fmt.Errorf("invalid dictionary value: %s", kv[1])
}
dict[key] = val
start = i + 1
}
}
return dict, nil
}
// sets (stored as lists)
if strings.HasPrefix(value, "{") && strings.HasSuffix(value, "}") {
setStr := value[1 : len(value)-1]
var list []any
stack := []rune{}
start := 0
for i, char := range setStr {
if len(stack) != 0 && char == braces[stack[len(stack)-1]] {
stack = stack[:len(stack)-1]
} else if _, ok := braces[char]; ok {
stack = append(stack, char)
}
if len(stack) == 0 && (char == ',' || i == len(setStr)-1) {
end := i
if i == len(setStr)-1 {
end = i + 1
}
item := strings.TrimSpace(setStr[start:end])
if val, err := parsePythonValue(item); err == nil {
list = append(list, val)
} else {
return nil, fmt.Errorf("invalid set item: %s", item)
}
start = i + 1
}
}
return list, nil
}
return nil, fmt.Errorf("invalid Python value: %s", value)
}
// parsePythonToolCall parses Python function calls from a string
// it supports keyword arguments, as well as multiple functions in a single string
func parsePythonToolCall(s string) ([]api.ToolCall, error) {
matches := pythonFuncRegex.FindAllStringSubmatchIndex(s, -1)
if len(matches) == 0 {
return nil, fmt.Errorf("no Python function calls found")
}
var toolCalls []api.ToolCall
for _, match := range matches {
name := s[match[2]:match[3]]
args := s[match[4]:match[5]]
var arguments api.ToolCallFunctionArguments
if len(args) == 0 {
toolCalls = append(toolCalls, api.ToolCall{
Function: api.ToolCallFunction{
Name: name,
},
})
continue
}
start := 0
stack := []rune{}
for i, char := range args {
if len(stack) != 0 && char == braces[stack[len(stack)-1]] {
stack = stack[:len(stack)-1]
} else if _, ok := braces[char]; ok {
stack = append(stack, char)
}
if len(stack) == 0 && (char == ',' || i == len(args)-1) {
end := i
if i == len(args)-1 {
end = i + 1
}
kv := strings.SplitN(args[start:end], "=", 2)
if len(kv) == 2 {
key := strings.TrimSpace(kv[0])
valueStr := strings.TrimSpace(kv[1])
// Parse the value into appropriate type
value, err := parsePythonValue(valueStr)
if err != nil {
return nil, fmt.Errorf("failed to parse value for key %q: %v", key, err)
}
arguments[key] = value
} else {
return nil, fmt.Errorf("invalid argument format: %q", args[start:end])
}
start = i + 1
}
}
if len(arguments) > 0 {
toolCalls = append(toolCalls, api.ToolCall{
Function: api.ToolCallFunction{
Name: name,
Arguments: arguments,
},
})
}
}
if len(toolCalls) > 0 {
return toolCalls, nil
}
return nil, fmt.Errorf("failed to parse any valid tool calls")
}
+269
View File
@@ -0,0 +1,269 @@
package server
import (
"testing"
"github.com/google/go-cmp/cmp"
"github.com/ollama/ollama/api"
)
func TestParsePythonFunctionCall(t *testing.T) {
t1 := api.ToolCall{
Function: api.ToolCallFunction{
Name: "get_current_weather",
Arguments: api.ToolCallFunctionArguments{
"location": "San Francisco, CA",
"format": "fahrenheit",
},
},
}
t2 := api.ToolCall{
Function: api.ToolCallFunction{
Name: "get_forecast",
Arguments: api.ToolCallFunctionArguments{
"days": 5,
"location": "Seattle",
},
},
}
t3 := api.ToolCall{
Function: api.ToolCallFunction{
Name: "get_current_weather",
Arguments: api.ToolCallFunctionArguments{
"list": []any{1, 2, 3},
"int": -1,
"float": 1.23,
"string": "hello",
},
},
}
t4 := api.ToolCall{
Function: api.ToolCallFunction{
Name: "get_current_weather",
},
}
cases := []struct {
name string
input string
want []api.ToolCall
err bool
}{
{
name: "malformed function call - missing closing paren",
input: "get_current_weather(location=\"San Francisco\"",
err: true,
},
{
name: "empty function call",
input: "get_current_weather()",
want: []api.ToolCall{t4},
err: false,
},
{
name: "single valid function call",
input: "get_current_weather(location=\"San Francisco, CA\", format=\"fahrenheit\")",
want: []api.ToolCall{t1},
},
{
name: "multiple valid function calls",
input: "get_current_weather(location=\"San Francisco, CA\", format=\"fahrenheit\") get_forecast(days=5, location=\"Seattle\")",
want: []api.ToolCall{t1, t2},
},
{
name: "multiple valid function calls with list",
input: "get_current_weather(list=[1,2,3], int=-1, float=1.23, string=\"hello\")",
want: []api.ToolCall{t3},
},
{
name: "positional arguments not supported",
input: "get_current_weather(1, 2, 3)",
err: true,
},
{
name: "invalid argument format without equals",
input: "get_current_weather(\"San Francisco\")",
err: true,
},
{
name: "nested lists",
input: "get_current_weather(data=[[1,2],[3,4]])",
want: []api.ToolCall{{
Function: api.ToolCallFunction{
Name: "get_current_weather",
Arguments: api.ToolCallFunctionArguments{
"data": []any{[]any{1, 2}, []any{3, 4}},
},
},
}},
},
{
name: "boolean and none values",
input: "get_current_weather(active=true, enabled=false, value=None)",
want: []api.ToolCall{{
Function: api.ToolCallFunction{
Name: "get_current_weather",
Arguments: api.ToolCallFunctionArguments{
"active": true,
"enabled": false,
"value": nil,
},
},
}},
},
{
name: "single vs double quotes",
input: "get_current_weather(str1='single', str2=\"double\")",
want: []api.ToolCall{{
Function: api.ToolCallFunction{
Name: "get_current_weather",
Arguments: api.ToolCallFunctionArguments{
"str1": "single",
"str2": "double",
},
},
}},
},
{
name: "whitespace handling",
input: "get_current_weather( location = \"San Francisco\" , temp = 72 )",
want: []api.ToolCall{{
Function: api.ToolCallFunction{
Name: "get_current_weather",
Arguments: api.ToolCallFunctionArguments{
"location": "San Francisco",
"temp": 72,
},
},
}},
},
}
for _, tt := range cases {
t.Run(tt.name, func(t *testing.T) {
got, err := parsePythonToolCall(tt.input)
if (err != nil) != tt.err {
t.Fatalf("expected error: %v, got error: %v", tt.err, err)
}
if tt.err {
return
}
if diff := cmp.Diff(got, tt.want); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
})
}
}
func TestParsePythonValue(t *testing.T) {
cases := []struct {
name string
input string
want any
err bool
}{
{
name: "string with double quotes",
input: "\"hello\"",
want: "hello",
},
{
name: "string with single quotes",
input: "'world'",
want: "world",
},
{
name: "integer",
input: "42",
want: 42,
},
{
name: "float",
input: "3.14",
want: 3.14,
},
{
name: "boolean true",
input: "True",
want: true,
},
{
name: "boolean false",
input: "False",
want: false,
},
{
name: "none/null",
input: "None",
want: nil,
},
{
name: "simple list",
input: "[1, 2, 3]",
want: []any{1, 2, 3},
},
{
name: "nested list",
input: "[1, [2, 3], 4]",
want: []any{1, []any{2, 3}, 4},
},
{
name: "mixed type list",
input: "[1, \"two\", 3.0, true]",
want: []any{1, "two", 3.0, true},
},
{
name: "invalid list",
input: "[1, 2,",
want: nil,
err: true,
},
{
name: "dictionaries",
input: "{'a': 1, 'b': 2}",
want: map[any]any{"a": 1, "b": 2},
err: false,
},
{
name: "int dictionary",
input: "{1: 2}",
want: map[any]any{1: 2},
err: false,
},
{
name: "mixed type dictionary",
input: "{'a': 1, 'b': 2.0, 'c': True}",
want: map[any]any{"a": 1, "b": 2.0, "c": true},
err: false,
},
{
name: "invalid dictionary - missing closing brace",
input: "{'a': 1, 'b': 2",
want: nil,
err: true,
},
{
name: "sets",
input: "{1, 2, 3}",
want: []any{1, 2, 3},
err: false,
},
}
for _, tt := range cases {
t.Run(tt.name, func(t *testing.T) {
got, err := parsePythonValue(tt.input)
if (err != nil) != tt.err {
t.Fatalf("expected error: %v, got error: %v", tt.err, err)
}
if tt.err {
return
}
if diff := cmp.Diff(got, tt.want); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
})
}
}
-22
View File
@@ -18,7 +18,6 @@ import (
"os"
"os/signal"
"path/filepath"
"regexp"
"slices"
"strings"
"syscall"
@@ -1513,7 +1512,6 @@ func (s *Server) ChatHandler(c *gin.Context) {
if req.Messages[0].Role != "system" && m.System != "" {
msgs = append([]api.Message{{Role: "system", Content: m.System}}, msgs...)
}
msgs = filterThinkTags(msgs, m)
prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, msgs, req.Tools)
if err != nil {
@@ -1642,23 +1640,3 @@ func handleScheduleError(c *gin.Context, name string, err error) {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
}
}
var thinkTagRegexp = regexp.MustCompile(`<think>(?s).*?</think>(\n)*`)
func filterThinkTags(msgs []api.Message, m *Model) []api.Message {
if m.Config.ModelFamily == "qwen3" || model.ParseName(m.Name).Model == "deepseek-r1" {
finalUserIndex := -1
for i, msg := range msgs {
if msg.Role == "user" {
finalUserIndex = i
}
}
for i, msg := range msgs {
if msg.Role == "assistant" && i < finalUserIndex {
msgs[i].Content = thinkTagRegexp.ReplaceAllString(msg.Content, "")
}
}
}
return msgs
}
+9
View File
@@ -299,6 +299,9 @@ func TestGenerateChat(t *testing.T) {
{Role: "user", Content: "Hello!"},
},
Stream: &stream,
Options: map[string]any{
"num_ctx": 1024,
},
})
if w.Code != http.StatusOK {
@@ -321,6 +324,9 @@ func TestGenerateChat(t *testing.T) {
{Role: "user", Content: "Hello!"},
},
Stream: &stream,
Options: map[string]any{
"num_ctx": 1024,
},
})
if w.Code != http.StatusOK {
@@ -344,6 +350,9 @@ func TestGenerateChat(t *testing.T) {
{Role: "user", Content: "Help me write tests."},
},
Stream: &stream,
Options: map[string]any{
"num_ctx": 1024,
},
})
if w.Code != http.StatusOK {
-126
View File
@@ -15,7 +15,6 @@ import (
"net/http/httptest"
"os"
"path/filepath"
"reflect"
"sort"
"strings"
"testing"
@@ -747,128 +746,3 @@ func TestNormalize(t *testing.T) {
})
}
}
func TestFilterThinkTags(t *testing.T) {
type testCase struct {
msgs []api.Message
want []api.Message
model *Model
}
testCases := []testCase{
{
msgs: []api.Message{
{Role: "user", Content: "Hello, world!"},
{Role: "assistant", Content: "<think>Thinking... about the answer</think>abc"},
{Role: "user", Content: "What is the answer?"},
},
want: []api.Message{
{Role: "user", Content: "Hello, world!"},
{Role: "assistant", Content: "abc"},
{Role: "user", Content: "What is the answer?"},
},
model: &Model{
Config: ConfigV2{
ModelFamily: "qwen3",
},
},
},
// with newlines inside the think tag aned newlines after
{
msgs: []api.Message{
{Role: "user", Content: "Hello, world!"},
{Role: "assistant", Content: "<think>Thinking... \n\nabout \nthe answer</think>\n\nabc\ndef"},
{Role: "user", Content: "What is the answer?"},
},
want: []api.Message{
{Role: "user", Content: "Hello, world!"},
{Role: "assistant", Content: "abc\ndef"},
{Role: "user", Content: "What is the answer?"},
},
model: &Model{
Config: ConfigV2{
ModelFamily: "qwen3",
},
},
},
// should leave thinking tags if it's after the last user message
{
msgs: []api.Message{
{Role: "user", Content: "Hello, world!"},
{Role: "assistant", Content: "<think>Thinking...</think>after"},
{Role: "user", Content: "What is the answer?"},
{Role: "assistant", Content: "<think>thinking again</think>hjk"},
{Role: "assistant", Content: "<think>thinking yet again</think>hjk"},
},
want: []api.Message{
{Role: "user", Content: "Hello, world!"},
{Role: "assistant", Content: "after"},
{Role: "user", Content: "What is the answer?"},
{Role: "assistant", Content: "<think>thinking again</think>hjk"},
{Role: "assistant", Content: "<think>thinking yet again</think>hjk"},
},
model: &Model{
Config: ConfigV2{
ModelFamily: "qwen3",
},
},
},
{
// shouldn't strip anything because the model family isn't one of the hardcoded ones
msgs: []api.Message{
{Role: "user", Content: "Hello, world!"},
{Role: "assistant", Content: "<think>Thinking... about the answer</think>abc"},
{Role: "user", Content: "What is the answer?"},
},
want: []api.Message{
{Role: "user", Content: "Hello, world!"},
{Role: "assistant", Content: "<think>Thinking... about the answer</think>abc"},
{Role: "user", Content: "What is the answer?"},
},
model: &Model{
Config: ConfigV2{
ModelFamily: "llama3",
},
},
},
{
// deepseek-r1:-prefixed model
msgs: []api.Message{
{Role: "user", Content: "Hello, world!"},
{Role: "assistant", Content: "<think>Thinking... about the answer</think>abc"},
{Role: "user", Content: "What is the answer?"},
},
want: []api.Message{
{Role: "user", Content: "Hello, world!"},
{Role: "assistant", Content: "abc"},
{Role: "user", Content: "What is the answer?"},
},
model: &Model{
Name: "registry.ollama.ai/library/deepseek-r1:latest",
ShortName: "deepseek-r1:7b",
Config: ConfigV2{},
},
},
}
for i, tc := range testCases {
filtered := filterThinkTags(tc.msgs, tc.model)
if !reflect.DeepEqual(filtered, tc.want) {
t.Errorf("messages differ for case %d:", i)
for i := range tc.want {
if i >= len(filtered) {
t.Errorf(" missing message %d: %+v", i, tc.want[i])
continue
}
if !reflect.DeepEqual(filtered[i], tc.want[i]) {
t.Errorf(" message %d:\n want: %+v\n got: %+v", i, tc.want[i], filtered[i])
}
}
if len(filtered) > len(tc.want) {
for i := len(tc.want); i < len(filtered); i++ {
t.Errorf(" extra message %d: %+v", i, filtered[i])
}
}
}
}
}
+38 -58
View File
@@ -81,10 +81,6 @@ func InitScheduler(ctx context.Context) *Scheduler {
// context must be canceled to decrement ref count and release the runner
func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration *api.Duration) (chan *runnerRef, chan error) {
if opts.NumCtx < 4 {
opts.NumCtx = 4
}
req := &LlmRequest{
ctx: c,
model: model,
@@ -114,6 +110,11 @@ func (s *Scheduler) Run(ctx context.Context) {
}()
}
const (
defaultContextLength = 4096
smallGpuContextLength = 2048
)
func (s *Scheduler) processPending(ctx context.Context) {
for {
select {
@@ -147,7 +148,6 @@ func (s *Scheduler) processPending(ctx context.Context) {
s.loadedMu.Unlock()
if runner != nil {
if runner.needsReload(ctx, pending) {
slog.Debug("reloading", "runner", runner)
runnerToExpire = runner
} else {
// Runner is usable, return it
@@ -167,6 +167,17 @@ func (s *Scheduler) processPending(ctx context.Context) {
gpus = s.getGpuFn()
}
if pending.origNumCtx == -1 {
if len(gpus) == 1 && gpus[0].Library != "cpu" && gpus[0].TotalMemory <= 4096*1024*1024 {
slog.Info("GPU is small, limiting default context window", "num_ctx", smallGpuContextLength)
pending.opts.NumCtx = smallGpuContextLength
pending.origNumCtx = smallGpuContextLength
} else {
pending.opts.NumCtx = defaultContextLength
pending.origNumCtx = defaultContextLength
}
}
if envconfig.MaxRunners() <= 0 {
// No user specified MaxRunners, so figure out what automatic setting to use
// If all GPUs have reliable free memory reporting, defaultModelsPerGPU * the number of GPUs
@@ -283,7 +294,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
}
// Trigger an expiration to unload once it's done
runnerToExpire.refMu.Lock()
slog.Debug("resetting model to expire immediately to make room", "runner", runnerToExpire, "refCount", runnerToExpire.refCount)
slog.Debug("resetting model to expire immediately to make room", "modelPath", runnerToExpire.modelPath, "refCount", runnerToExpire.refCount)
if runnerToExpire.expireTimer != nil {
runnerToExpire.expireTimer.Stop()
runnerToExpire.expireTimer = nil
@@ -332,16 +343,16 @@ func (s *Scheduler) processCompleted(ctx context.Context) {
runner.refCount--
if runner.refCount <= 0 {
if runner.sessionDuration <= 0 {
slog.Debug("runner with zero duration has gone idle, expiring to unload", "runner", runner)
slog.Debug("runner with zero duration has gone idle, expiring to unload", "modelPath", runner.modelPath)
if runner.expireTimer != nil {
runner.expireTimer.Stop()
runner.expireTimer = nil
}
s.expiredCh <- runner
} else if runner.expireTimer == nil {
slog.Debug("runner with non-zero duration has gone idle, adding timer", "runner", runner, "duration", runner.sessionDuration)
slog.Debug("runner with non-zero duration has gone idle, adding timer", "modelPath", runner.modelPath, "duration", runner.sessionDuration)
runner.expireTimer = time.AfterFunc(runner.sessionDuration, func() {
slog.Debug("timer expired, expiring to unload", "runner", runner)
slog.Debug("timer expired, expiring to unload", "modelPath", runner.modelPath)
runner.refMu.Lock()
defer runner.refMu.Unlock()
if runner.expireTimer != nil {
@@ -352,18 +363,18 @@ func (s *Scheduler) processCompleted(ctx context.Context) {
})
runner.expiresAt = time.Now().Add(runner.sessionDuration)
} else {
slog.Debug("runner with non-zero duration has gone idle, resetting timer", "runner", runner, "duration", runner.sessionDuration)
slog.Debug("runner with non-zero duration has gone idle, resetting timer", "modelPath", runner.modelPath, "duration", runner.sessionDuration)
runner.expireTimer.Reset(runner.sessionDuration)
runner.expiresAt = time.Now().Add(runner.sessionDuration)
}
}
slog.Debug("after processing request finished event", "runner", runner, "refCount", runner.refCount)
slog.Debug("after processing request finished event", "modelPath", runner.modelPath, "refCount", runner.refCount)
runner.refMu.Unlock()
case runner := <-s.expiredCh:
slog.Debug("runner expired event received", "runner", runner)
slog.Debug("runner expired event received", "modelPath", runner.modelPath)
runner.refMu.Lock()
if runner.refCount > 0 {
slog.Debug("expired event with positive ref count, retrying", "runner", runner, "refCount", runner.refCount)
slog.Debug("expired event with positive ref count, retrying", "modelPath", runner.modelPath, "refCount", runner.refCount)
go func(runner *runnerRef) {
// We can't unload yet, but want to as soon as the current request completes
// So queue up another expired event
@@ -375,16 +386,16 @@ func (s *Scheduler) processCompleted(ctx context.Context) {
}
s.loadedMu.Lock()
slog.Debug("got lock to unload", "runner", runner)
slog.Debug("got lock to unload", "modelPath", runner.modelPath)
finished := runner.waitForVRAMRecovery()
runner.unload()
delete(s.loaded, runner.modelPath)
s.loadedMu.Unlock()
slog.Debug("runner released", "runner", runner)
slog.Debug("runner released", "modelPath", runner.modelPath)
runner.refMu.Unlock()
<-finished
slog.Debug("sending an unloaded event", "runner", runner)
slog.Debug("sending an unloaded event", "modelPath", runner.modelPath)
s.unloadedCh <- struct{}{}
}
}
@@ -407,7 +418,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
pending.successCh <- runner
go func() {
<-pending.ctx.Done()
slog.Debug("context for request finished", "runner", runner)
slog.Debug("context for request finished")
finished <- pending
}()
}
@@ -442,10 +453,10 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
estimatedVRAM: llama.EstimatedVRAM(),
estimatedTotal: llama.EstimatedTotal(),
loading: true,
pid: llama.Pid(),
refCount: 1,
}
runner.numParallel = numParallel
runner.refMu.Lock() // hold lock until running or aborted
runner.refMu.Lock()
s.loadedMu.Lock()
s.loaded[req.model.ModelPath] = runner
@@ -456,16 +467,13 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
defer runner.refMu.Unlock()
if err = llama.WaitUntilRunning(req.ctx); err != nil {
slog.Error("error loading llama server", "error", err)
runner.refCount--
req.errCh <- err
slog.Debug("triggering expiration for failed load", "model", runner.modelPath)
s.expiredCh <- runner
return
}
slog.Debug("finished setting up runner", "model", req.model.ModelPath)
if runner.pid < 0 {
runner.pid = llama.Pid()
}
runner.refCount++
runner.loading = false
go func() {
<-req.ctx.Done()
@@ -483,12 +491,7 @@ func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) {
}
predMap := map[predKey]uint64{} // Sum up the total predicted usage per GPU for all runners
s.loadedMu.Lock()
runners := make([]*runnerRef, 0, len(s.loaded))
for _, r := range s.loaded {
runners = append(runners, r)
}
s.loadedMu.Unlock()
for _, r := range runners {
r.refMu.Lock()
if r.llama != nil {
for _, gpu := range allGpus {
@@ -499,6 +502,7 @@ func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) {
}
r.refMu.Unlock()
}
s.loadedMu.Unlock()
// Now that we've summed up all the GPU usage predictions across all the loaded runners, update the gpu list
for i := range allGpus {
@@ -545,11 +549,12 @@ func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus discover.GpuInfoList)
// TODO consolidate sched_types.go
type runnerRef struct {
refMu sync.Mutex
refMu sync.Mutex
// refCond sync.Cond // Signaled on transition from 1 -> 0 refCount
refCount uint // prevent unloading if > 0
// unloading bool // set to true when we are trying to unload the runner
llama llm.LlamaServer
pid int
loading bool // True only during initial load, then false forever
gpus discover.GpuInfoList // Recorded at time of provisioning
estimatedVRAM uint64
@@ -674,31 +679,6 @@ func (runner *runnerRef) waitForVRAMRecovery() chan any {
return finished
}
func (runner *runnerRef) LogValue() slog.Value {
if runner == nil {
return slog.StringValue("nil")
}
attrs := []slog.Attr{}
if runner.model != nil {
attrs = append(attrs, slog.String("name", runner.model.Name))
}
if len(runner.gpus) > 0 {
attrs = append(attrs,
slog.String("inference", runner.gpus[0].Library),
slog.Int("devices", len(runner.gpus)),
)
}
attrs = append(attrs,
slog.String("size", format.HumanBytes2(runner.estimatedTotal)),
slog.String("vram", format.HumanBytes2(runner.estimatedVRAM)),
slog.Int("num_ctx", runner.NumCtx),
slog.Int("parallel", runner.numParallel),
slog.Int("pid", runner.pid),
slog.String("model", runner.modelPath),
)
return slog.GroupValue(attrs...)
}
type ByDurationAndName []*runnerRef
func (a ByDurationAndName) Len() int { return len(a) }
@@ -821,12 +801,12 @@ func (s *Scheduler) findRunnerToUnload() *runnerRef {
rc := runner.refCount
runner.refMu.Unlock()
if rc == 0 {
slog.Debug("found an idle runner to unload", "runner", runner)
slog.Debug("found an idle runner to unload")
return runner
}
}
// None appear idle, just wait for the one with the shortest duration
slog.Debug("no idle runners, picking the shortest duration", "runner_count", len(runnerList), "runner", runnerList[0])
slog.Debug("no idle runners, picking the shortest duration", "count", len(runnerList))
return runnerList[0]
}
@@ -843,8 +823,8 @@ func (s *Scheduler) unloadAllRunners() {
func (s *Scheduler) expireRunner(model *Model) {
s.loadedMu.Lock()
defer s.loadedMu.Unlock()
runner, ok := s.loaded[model.ModelPath]
s.loadedMu.Unlock()
if ok {
runner.refMu.Lock()
runner.expiresAt = time.Now()
+1 -1
View File
@@ -148,6 +148,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
successCh: make(chan *runnerRef, 1),
errCh: make(chan error, 1),
}
b.req.opts.NumCtx = 4096
b.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
return b
}
@@ -792,4 +793,3 @@ func (s *mockLlm) Close() error {
func (s *mockLlm) EstimatedVRAM() uint64 { return s.estimatedVRAM }
func (s *mockLlm) EstimatedTotal() uint64 { return s.estimatedTotal }
func (s *mockLlm) EstimatedVRAMByGPU(gpuid string) uint64 { return s.estimatedVRAMByGPU[gpuid] }
func (s *mockLlm) Pid() int { return -1 }